]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
treewide: use log_*_errno whenever %m is in the format string
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* How a container run ended.
 * NOTE(review): not referenced in the visible part of this file; presumably
 * tells a plain exit apart from a reboot request -- confirm against the
 * users further down. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
102
/* Journal linking mode, selected with --link-journal= or -j in parse_argv(). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host and try-host */
        LINK_GUEST      /* --link-journal=guest, try-guest and -j */
} LinkJournal;
109
/* Volatile mode, selected with --volatile[=MODE] in parse_argv(). */
typedef enum Volatile {
        VOLATILE_NO,    /* initial value of arg_volatile; both setup_volatile*() helpers do nothing */
        VOLATILE_YES,   /* tmpfs as root with the original /usr bind mounted read-only, see setup_volatile() */
        VOLATILE_STATE, /* tree remounted read-only with a tmpfs over /var, see setup_volatile_state() */
} Volatile;
115
/* Settings filled in by parse_argv() from the command line. */

/* -D/--directory=: canonicalized root directory (heap allocated) */
static char *arg_directory = NULL;
/* -u/--user=: heap allocated copy of the argument */
static char *arg_user = NULL;
/* --uuid= */
static sd_id128_t arg_uuid = {};
/* -M/--machine=: heap allocated, reset to NULL by an empty argument */
static char *arg_machine = NULL;
/* -Z/--selinux-context= and -L/--selinux-apifs-context=: point into argv */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
/* -S/--slice=: points into argv */
static const char *arg_slice = NULL;
/* Set by --private-network and implied by every --network-* option */
static bool arg_private_network = false;
/* --read-only */
static bool arg_read_only = false;
/* -b/--boot */
static bool arg_boot = false;
/* --link-journal=/-j; the "try-" variants additionally set arg_link_journal_try */
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Bit mask of capabilities to retain, one bit per CAP_* number. parse_argv()
 * ORs in --capability= (and CAP_NET_ADMIN when a private network is used)
 * and then masks out --drop-capability=. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* --bind= / --bind-ro=: flat string lists of (source, destination) pairs */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
/* --tmpfs=: flat string list of (path, mount options) pairs */
static char **arg_tmpfs = NULL;
/* --setenv=: NAME=VALUE assignments */
static char **arg_setenv = NULL;
/* -q/--quiet */
static bool arg_quiet = false;
/* --share-system; parse_argv() also forces arg_register to false when set */
static bool arg_share_system = false;
/* --register=BOOLEAN */
static bool arg_register = true;
/* --keep-unit */
static bool arg_keep_unit = false;
/* --network-interface= and --network-macvlan=: interface name lists */
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
/* --network-veth, also implied by --network-bridge= */
static bool arg_network_veth = false;
/* --network-bridge=: points into argv */
static const char *arg_network_bridge = NULL;
/* --personality=; 0xffffffffLU is the value parse_argv() treats as
 * "unknown or unsupported" when returned by personality_from_string() */
static unsigned long arg_personality = 0xffffffffLU;
/* -i/--image=: points into argv */
static const char *arg_image = NULL;
/* --volatile[=MODE] */
static Volatile arg_volatile = VOLATILE_NO;
170
171 static void help(void) {
172 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
173 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
174 " -h --help Show this help\n"
175 " --version Print version string\n"
176 " -q --quiet Do not show status information\n"
177 " -D --directory=PATH Root directory for the container\n"
178 " -i --image=PATH File system device or image for the container\n"
179 " -b --boot Boot up full system (i.e. invoke init)\n"
180 " -u --user=USER Run the command under specified user or uid\n"
181 " -M --machine=NAME Set the machine name for the container\n"
182 " --uuid=UUID Set a specific machine UUID for the container\n"
183 " -S --slice=SLICE Place the container in the specified slice\n"
184 " --private-network Disable network in container\n"
185 " --network-interface=INTERFACE\n"
186 " Assign an existing network interface to the\n"
187 " container\n"
188 " --network-macvlan=INTERFACE\n"
189 " Create a macvlan network interface based on an\n"
190 " existing network interface to the container\n"
191 " --network-veth Add a virtual ethernet connection between host\n"
192 " and container\n"
193 " --network-bridge=INTERFACE\n"
194 " Add a virtual ethernet connection between host\n"
195 " and container and add it to an existing bridge on\n"
196 " the host\n"
197 " -Z --selinux-context=SECLABEL\n"
198 " Set the SELinux security context to be used by\n"
199 " processes in the container\n"
200 " -L --selinux-apifs-context=SECLABEL\n"
201 " Set the SELinux security context to be used by\n"
202 " API/tmpfs file systems in the container\n"
203 " --capability=CAP In addition to the default, retain specified\n"
204 " capability\n"
205 " --drop-capability=CAP Drop the specified capability from the default set\n"
206 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
207 " try-guest, try-host\n"
208 " -j Equivalent to --link-journal=try-guest\n"
209 " --read-only Mount the root directory read-only\n"
210 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
211 " the container\n"
212 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
213 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
214 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
215 " --share-system Share system namespaces with host\n"
216 " --register=BOOLEAN Register container as machine\n"
217 " --keep-unit Do not register a scope for the machine, reuse\n"
218 " the service unit nspawn is running in\n"
219 " --volatile[=MODE] Run the system in volatile mode\n",
220 program_invocation_short_name);
221 }
222
223 static int parse_argv(int argc, char *argv[]) {
224
225 enum {
226 ARG_VERSION = 0x100,
227 ARG_PRIVATE_NETWORK,
228 ARG_UUID,
229 ARG_READ_ONLY,
230 ARG_CAPABILITY,
231 ARG_DROP_CAPABILITY,
232 ARG_LINK_JOURNAL,
233 ARG_BIND,
234 ARG_BIND_RO,
235 ARG_TMPFS,
236 ARG_SETENV,
237 ARG_SHARE_SYSTEM,
238 ARG_REGISTER,
239 ARG_KEEP_UNIT,
240 ARG_NETWORK_INTERFACE,
241 ARG_NETWORK_MACVLAN,
242 ARG_NETWORK_VETH,
243 ARG_NETWORK_BRIDGE,
244 ARG_PERSONALITY,
245 ARG_VOLATILE,
246 };
247
248 static const struct option options[] = {
249 { "help", no_argument, NULL, 'h' },
250 { "version", no_argument, NULL, ARG_VERSION },
251 { "directory", required_argument, NULL, 'D' },
252 { "user", required_argument, NULL, 'u' },
253 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
254 { "boot", no_argument, NULL, 'b' },
255 { "uuid", required_argument, NULL, ARG_UUID },
256 { "read-only", no_argument, NULL, ARG_READ_ONLY },
257 { "capability", required_argument, NULL, ARG_CAPABILITY },
258 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
259 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
260 { "bind", required_argument, NULL, ARG_BIND },
261 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
262 { "tmpfs", required_argument, NULL, ARG_TMPFS },
263 { "machine", required_argument, NULL, 'M' },
264 { "slice", required_argument, NULL, 'S' },
265 { "setenv", required_argument, NULL, ARG_SETENV },
266 { "selinux-context", required_argument, NULL, 'Z' },
267 { "selinux-apifs-context", required_argument, NULL, 'L' },
268 { "quiet", no_argument, NULL, 'q' },
269 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
270 { "register", required_argument, NULL, ARG_REGISTER },
271 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
272 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
273 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
274 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
275 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
276 { "personality", required_argument, NULL, ARG_PERSONALITY },
277 { "image", required_argument, NULL, 'i' },
278 { "volatile", optional_argument, NULL, ARG_VOLATILE },
279 {}
280 };
281
282 int c, r;
283 uint64_t plus = 0, minus = 0;
284
285 assert(argc >= 0);
286 assert(argv);
287
288 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
289
290 switch (c) {
291
292 case 'h':
293 help();
294 return 0;
295
296 case ARG_VERSION:
297 puts(PACKAGE_STRING);
298 puts(SYSTEMD_FEATURES);
299 return 0;
300
301 case 'D':
302 free(arg_directory);
303 arg_directory = canonicalize_file_name(optarg);
304 if (!arg_directory) {
305 log_error_errno(errno, "Invalid root directory: %m");
306 return -ENOMEM;
307 }
308
309 break;
310
311 case 'i':
312 arg_image = optarg;
313 break;
314
315 case 'u':
316 free(arg_user);
317 arg_user = strdup(optarg);
318 if (!arg_user)
319 return log_oom();
320
321 break;
322
323 case ARG_NETWORK_BRIDGE:
324 arg_network_bridge = optarg;
325
326 /* fall through */
327
328 case ARG_NETWORK_VETH:
329 arg_network_veth = true;
330 arg_private_network = true;
331 break;
332
333 case ARG_NETWORK_INTERFACE:
334 if (strv_extend(&arg_network_interfaces, optarg) < 0)
335 return log_oom();
336
337 arg_private_network = true;
338 break;
339
340 case ARG_NETWORK_MACVLAN:
341 if (strv_extend(&arg_network_macvlan, optarg) < 0)
342 return log_oom();
343
344 /* fall through */
345
346 case ARG_PRIVATE_NETWORK:
347 arg_private_network = true;
348 break;
349
350 case 'b':
351 arg_boot = true;
352 break;
353
354 case ARG_UUID:
355 r = sd_id128_from_string(optarg, &arg_uuid);
356 if (r < 0) {
357 log_error("Invalid UUID: %s", optarg);
358 return r;
359 }
360 break;
361
362 case 'S':
363 arg_slice = optarg;
364 break;
365
366 case 'M':
367 if (isempty(optarg)) {
368 free(arg_machine);
369 arg_machine = NULL;
370 } else {
371
372 if (!hostname_is_valid(optarg)) {
373 log_error("Invalid machine name: %s", optarg);
374 return -EINVAL;
375 }
376
377 free(arg_machine);
378 arg_machine = strdup(optarg);
379 if (!arg_machine)
380 return log_oom();
381
382 break;
383 }
384
385 case 'Z':
386 arg_selinux_context = optarg;
387 break;
388
389 case 'L':
390 arg_selinux_apifs_context = optarg;
391 break;
392
393 case ARG_READ_ONLY:
394 arg_read_only = true;
395 break;
396
397 case ARG_CAPABILITY:
398 case ARG_DROP_CAPABILITY: {
399 const char *state, *word;
400 size_t length;
401
402 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
403 _cleanup_free_ char *t;
404 cap_value_t cap;
405
406 t = strndup(word, length);
407 if (!t)
408 return log_oom();
409
410 if (streq(t, "all")) {
411 if (c == ARG_CAPABILITY)
412 plus = (uint64_t) -1;
413 else
414 minus = (uint64_t) -1;
415 } else {
416 if (cap_from_name(t, &cap) < 0) {
417 log_error("Failed to parse capability %s.", t);
418 return -EINVAL;
419 }
420
421 if (c == ARG_CAPABILITY)
422 plus |= 1ULL << (uint64_t) cap;
423 else
424 minus |= 1ULL << (uint64_t) cap;
425 }
426 }
427
428 break;
429 }
430
431 case 'j':
432 arg_link_journal = LINK_GUEST;
433 arg_link_journal_try = true;
434 break;
435
436 case ARG_LINK_JOURNAL:
437 if (streq(optarg, "auto"))
438 arg_link_journal = LINK_AUTO;
439 else if (streq(optarg, "no"))
440 arg_link_journal = LINK_NO;
441 else if (streq(optarg, "guest"))
442 arg_link_journal = LINK_GUEST;
443 else if (streq(optarg, "host"))
444 arg_link_journal = LINK_HOST;
445 else if (streq(optarg, "try-guest")) {
446 arg_link_journal = LINK_GUEST;
447 arg_link_journal_try = true;
448 } else if (streq(optarg, "try-host")) {
449 arg_link_journal = LINK_HOST;
450 arg_link_journal_try = true;
451 } else {
452 log_error("Failed to parse link journal mode %s", optarg);
453 return -EINVAL;
454 }
455
456 break;
457
458 case ARG_BIND:
459 case ARG_BIND_RO: {
460 _cleanup_free_ char *a = NULL, *b = NULL;
461 char *e;
462 char ***x;
463
464 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
465
466 e = strchr(optarg, ':');
467 if (e) {
468 a = strndup(optarg, e - optarg);
469 b = strdup(e + 1);
470 } else {
471 a = strdup(optarg);
472 b = strdup(optarg);
473 }
474
475 if (!a || !b)
476 return log_oom();
477
478 if (!path_is_absolute(a) || !path_is_absolute(b)) {
479 log_error("Invalid bind mount specification: %s", optarg);
480 return -EINVAL;
481 }
482
483 r = strv_extend(x, a);
484 if (r < 0)
485 return log_oom();
486
487 r = strv_extend(x, b);
488 if (r < 0)
489 return log_oom();
490
491 break;
492 }
493
494 case ARG_TMPFS: {
495 _cleanup_free_ char *a = NULL, *b = NULL;
496 char *e;
497
498 e = strchr(optarg, ':');
499 if (e) {
500 a = strndup(optarg, e - optarg);
501 b = strdup(e + 1);
502 } else {
503 a = strdup(optarg);
504 b = strdup("mode=0755");
505 }
506
507 if (!a || !b)
508 return log_oom();
509
510 if (!path_is_absolute(a)) {
511 log_error("Invalid tmpfs specification: %s", optarg);
512 return -EINVAL;
513 }
514
515 r = strv_push(&arg_tmpfs, a);
516 if (r < 0)
517 return log_oom();
518
519 a = NULL;
520
521 r = strv_push(&arg_tmpfs, b);
522 if (r < 0)
523 return log_oom();
524
525 b = NULL;
526
527 break;
528 }
529
530 case ARG_SETENV: {
531 char **n;
532
533 if (!env_assignment_is_valid(optarg)) {
534 log_error("Environment variable assignment '%s' is not valid.", optarg);
535 return -EINVAL;
536 }
537
538 n = strv_env_set(arg_setenv, optarg);
539 if (!n)
540 return log_oom();
541
542 strv_free(arg_setenv);
543 arg_setenv = n;
544 break;
545 }
546
547 case 'q':
548 arg_quiet = true;
549 break;
550
551 case ARG_SHARE_SYSTEM:
552 arg_share_system = true;
553 break;
554
555 case ARG_REGISTER:
556 r = parse_boolean(optarg);
557 if (r < 0) {
558 log_error("Failed to parse --register= argument: %s", optarg);
559 return r;
560 }
561
562 arg_register = r;
563 break;
564
565 case ARG_KEEP_UNIT:
566 arg_keep_unit = true;
567 break;
568
569 case ARG_PERSONALITY:
570
571 arg_personality = personality_from_string(optarg);
572 if (arg_personality == 0xffffffffLU) {
573 log_error("Unknown or unsupported personality '%s'.", optarg);
574 return -EINVAL;
575 }
576
577 break;
578
579 case ARG_VOLATILE:
580
581 if (!optarg)
582 arg_volatile = VOLATILE_YES;
583 else {
584 r = parse_boolean(optarg);
585 if (r < 0) {
586 if (streq(optarg, "state"))
587 arg_volatile = VOLATILE_STATE;
588 else {
589 log_error("Failed to parse --volatile= argument: %s", optarg);
590 return r;
591 }
592 } else
593 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
594 }
595
596 break;
597
598 case '?':
599 return -EINVAL;
600
601 default:
602 assert_not_reached("Unhandled option");
603 }
604
605 if (arg_share_system)
606 arg_register = false;
607
608 if (arg_boot && arg_share_system) {
609 log_error("--boot and --share-system may not be combined.");
610 return -EINVAL;
611 }
612
613 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
614 log_error("--keep-unit may not be used when invoked from a user session.");
615 return -EINVAL;
616 }
617
618 if (arg_directory && arg_image) {
619 log_error("--directory= and --image= may not be combined.");
620 return -EINVAL;
621 }
622
623 if (arg_volatile != VOLATILE_NO && arg_read_only) {
624 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
625 return -EINVAL;
626 }
627
628 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
629
630 return 1;
631 }
632
633 static int mount_all(const char *dest) {
634
635 typedef struct MountPoint {
636 const char *what;
637 const char *where;
638 const char *type;
639 const char *options;
640 unsigned long flags;
641 bool fatal;
642 } MountPoint;
643
644 static const MountPoint mount_table[] = {
645 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
646 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
647 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
648 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
649 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
650 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
651 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
652 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
653 #ifdef HAVE_SELINUX
654 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
655 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
656 #endif
657 };
658
659 unsigned k;
660 int r = 0;
661
662 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
663 _cleanup_free_ char *where = NULL;
664 #ifdef HAVE_SELINUX
665 _cleanup_free_ char *options = NULL;
666 #endif
667 const char *o;
668 int t;
669
670 where = strjoin(dest, "/", mount_table[k].where, NULL);
671 if (!where)
672 return log_oom();
673
674 t = path_is_mount_point(where, true);
675 if (t < 0) {
676 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
677
678 if (r == 0)
679 r = t;
680
681 continue;
682 }
683
684 /* Skip this entry if it is not a remount. */
685 if (mount_table[k].what && t > 0)
686 continue;
687
688 t = mkdir_p(where, 0755);
689 if (t < 0) {
690 if (mount_table[k].fatal) {
691 log_error_errno(t, "Failed to create directory %s: %m", where);
692
693 if (r == 0)
694 r = t;
695 } else
696 log_warning_errno(t, "Failed to create directory %s: %m", where);
697
698 continue;
699 }
700
701 #ifdef HAVE_SELINUX
702 if (arg_selinux_apifs_context &&
703 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
704 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
705 if (!options)
706 return log_oom();
707
708 o = options;
709 } else
710 #endif
711 o = mount_table[k].options;
712
713
714 if (mount(mount_table[k].what,
715 where,
716 mount_table[k].type,
717 mount_table[k].flags,
718 o) < 0) {
719
720 if (mount_table[k].fatal) {
721 log_error_errno(errno, "mount(%s) failed: %m", where);
722
723 if (r == 0)
724 r = -errno;
725 } else
726 log_warning_errno(errno, "mount(%s) failed: %m", where);
727 }
728 }
729
730 return r;
731 }
732
733 static int mount_binds(const char *dest, char **l, bool ro) {
734 char **x, **y;
735
736 STRV_FOREACH_PAIR(x, y, l) {
737 _cleanup_free_ char *where = NULL;
738 struct stat source_st, dest_st;
739 int r;
740
741 if (stat(*x, &source_st) < 0) {
742 log_error_errno(errno, "Failed to stat %s: %m", *x);
743 return -errno;
744 }
745
746 where = strappend(dest, *y);
747 if (!where)
748 return log_oom();
749
750 r = stat(where, &dest_st);
751 if (r == 0) {
752 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
753 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
754 return -EINVAL;
755 }
756 } else if (errno == ENOENT) {
757 r = mkdir_parents_label(where, 0755);
758 if (r < 0)
759 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
760 } else {
761 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
762 return -errno;
763 }
764
765 /* Create the mount point, but be conservative -- refuse to create block
766 * and char devices. */
767 if (S_ISDIR(source_st.st_mode)) {
768 r = mkdir_label(where, 0755);
769 if (r < 0 && errno != EEXIST)
770 return log_error_errno(r, "Failed to create mount point %s: %m", where);
771 } else if (S_ISFIFO(source_st.st_mode)) {
772 r = mkfifo(where, 0644);
773 if (r < 0 && errno != EEXIST) {
774 log_error_errno(errno, "Failed to create mount point %s: %m", where);
775
776 return -errno;
777 }
778 } else if (S_ISSOCK(source_st.st_mode)) {
779 r = mknod(where, 0644 | S_IFSOCK, 0);
780 if (r < 0 && errno != EEXIST) {
781 log_error_errno(errno, "Failed to create mount point %s: %m", where);
782
783 return -errno;
784 }
785 } else if (S_ISREG(source_st.st_mode)) {
786 r = touch(where);
787 if (r < 0)
788 return log_error_errno(r, "Failed to create mount point %s: %m", where);
789 } else {
790 log_error("Refusing to create mountpoint for file: %s", *x);
791 return -ENOTSUP;
792 }
793
794 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
795 log_error_errno(errno, "mount(%s) failed: %m", where);
796 return -errno;
797 }
798
799 if (ro) {
800 r = bind_remount_recursive(where, true);
801 if (r < 0)
802 return log_error_errno(r, "Read-Only bind mount failed: %m");
803 }
804 }
805
806 return 0;
807 }
808
809 static int mount_tmpfs(const char *dest) {
810 char **i, **o;
811
812 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
813 _cleanup_free_ char *where = NULL;
814 int r;
815
816 where = strappend(dest, *i);
817 if (!where)
818 return log_oom();
819
820 r = mkdir_label(where, 0755);
821 if (r < 0 && errno != EEXIST)
822 return log_error_errno(r, "creating mount point for tmpfs %s failed: %m", where);
823
824 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
825 log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
826 return -errno;
827 }
828 }
829
830 return 0;
831 }
832
833 static int setup_timezone(const char *dest) {
834 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
835 char *z, *y;
836 int r;
837
838 assert(dest);
839
840 /* Fix the timezone, if possible */
841 r = readlink_malloc("/etc/localtime", &p);
842 if (r < 0) {
843 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
844 return 0;
845 }
846
847 z = path_startswith(p, "../usr/share/zoneinfo/");
848 if (!z)
849 z = path_startswith(p, "/usr/share/zoneinfo/");
850 if (!z) {
851 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
852 return 0;
853 }
854
855 where = strappend(dest, "/etc/localtime");
856 if (!where)
857 return log_oom();
858
859 r = readlink_malloc(where, &q);
860 if (r >= 0) {
861 y = path_startswith(q, "../usr/share/zoneinfo/");
862 if (!y)
863 y = path_startswith(q, "/usr/share/zoneinfo/");
864
865 /* Already pointing to the right place? Then do nothing .. */
866 if (y && streq(y, z))
867 return 0;
868 }
869
870 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
871 if (!check)
872 return log_oom();
873
874 if (access(check, F_OK) < 0) {
875 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
876 return 0;
877 }
878
879 what = strappend("../usr/share/zoneinfo/", z);
880 if (!what)
881 return log_oom();
882
883 r = mkdir_parents(where, 0755);
884 if (r < 0) {
885 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
886
887 return 0;
888 }
889
890 r = unlink(where);
891 if (r < 0 && errno != ENOENT) {
892 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
893
894 return 0;
895 }
896
897 if (symlink(what, where) < 0) {
898 log_error_errno(errno, "Failed to correct timezone of container: %m");
899 return 0;
900 }
901
902 return 0;
903 }
904
905 static int setup_resolv_conf(const char *dest) {
906 _cleanup_free_ char *where = NULL;
907 int r;
908
909 assert(dest);
910
911 if (arg_private_network)
912 return 0;
913
914 /* Fix resolv.conf, if possible */
915 where = strappend(dest, "/etc/resolv.conf");
916 if (!where)
917 return log_oom();
918
919 /* We don't really care for the results of this really. If it
920 * fails, it fails, but meh... */
921 r = mkdir_parents(where, 0755);
922 if (r < 0) {
923 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
924
925 return 0;
926 }
927
928 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
929 if (r < 0) {
930 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
931
932 return 0;
933 }
934
935 return 0;
936 }
937
938 static int setup_volatile_state(const char *directory) {
939 const char *p;
940 int r;
941
942 assert(directory);
943
944 if (arg_volatile != VOLATILE_STATE)
945 return 0;
946
947 /* --volatile=state means we simply overmount /var
948 with a tmpfs, and the rest read-only. */
949
950 r = bind_remount_recursive(directory, true);
951 if (r < 0)
952 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
953
954 p = strappenda(directory, "/var");
955 r = mkdir(p, 0755);
956 if (r < 0 && errno != EEXIST) {
957 log_error_errno(errno, "Failed to create %s: %m", directory);
958 return -errno;
959 }
960
961 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
962 log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
963 return -errno;
964 }
965
966 return 0;
967 }
968
969 static int setup_volatile(const char *directory) {
970 bool tmpfs_mounted = false, bind_mounted = false;
971 char template[] = "/tmp/nspawn-volatile-XXXXXX";
972 const char *f, *t;
973 int r;
974
975 assert(directory);
976
977 if (arg_volatile != VOLATILE_YES)
978 return 0;
979
980 /* --volatile=yes means we mount a tmpfs to the root dir, and
981 the original /usr to use inside it, and that read-only. */
982
983 if (!mkdtemp(template)) {
984 log_error_errno(errno, "Failed to create temporary directory: %m");
985 return -errno;
986 }
987
988 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
989 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
990 r = -errno;
991 goto fail;
992 }
993
994 tmpfs_mounted = true;
995
996 f = strappenda(directory, "/usr");
997 t = strappenda(template, "/usr");
998
999 r = mkdir(t, 0755);
1000 if (r < 0 && errno != EEXIST) {
1001 log_error_errno(errno, "Failed to create %s: %m", t);
1002 r = -errno;
1003 goto fail;
1004 }
1005
1006 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1007 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1008 r = -errno;
1009 goto fail;
1010 }
1011
1012 bind_mounted = true;
1013
1014 r = bind_remount_recursive(t, true);
1015 if (r < 0) {
1016 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1017 goto fail;
1018 }
1019
1020 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1021 log_error_errno(errno, "Failed to move root mount: %m");
1022 r = -errno;
1023 goto fail;
1024 }
1025
1026 rmdir(template);
1027
1028 return 0;
1029
1030 fail:
1031 if (bind_mounted)
1032 umount(t);
1033 if (tmpfs_mounted)
1034 umount(template);
1035 rmdir(template);
1036 return r;
1037 }
1038
1039 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1040
1041 snprintf(s, 37,
1042 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1043 SD_ID128_FORMAT_VAL(id));
1044
1045 return s;
1046 }
1047
1048 static int setup_boot_id(const char *dest) {
1049 _cleanup_free_ char *from = NULL, *to = NULL;
1050 sd_id128_t rnd = {};
1051 char as_uuid[37];
1052 int r;
1053
1054 assert(dest);
1055
1056 if (arg_share_system)
1057 return 0;
1058
1059 /* Generate a new randomized boot ID, so that each boot-up of
1060 * the container gets a new one */
1061
1062 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1063 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1064 if (!from || !to)
1065 return log_oom();
1066
1067 r = sd_id128_randomize(&rnd);
1068 if (r < 0)
1069 return log_error_errno(r, "Failed to generate random boot id: %m");
1070
1071 id128_format_as_uuid(rnd, as_uuid);
1072
1073 r = write_string_file(from, as_uuid);
1074 if (r < 0)
1075 return log_error_errno(r, "Failed to write boot id: %m");
1076
1077 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1078 log_error_errno(errno, "Failed to bind mount boot id: %m");
1079 r = -errno;
1080 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1081 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1082
1083 unlink(from);
1084 return r;
1085 }
1086
1087 static int copy_devnodes(const char *dest) {
1088
1089 static const char devnodes[] =
1090 "null\0"
1091 "zero\0"
1092 "full\0"
1093 "random\0"
1094 "urandom\0"
1095 "tty\0"
1096 "net/tun\0";
1097
1098 const char *d;
1099 int r = 0;
1100 _cleanup_umask_ mode_t u;
1101
1102 assert(dest);
1103
1104 u = umask(0000);
1105
1106 NULSTR_FOREACH(d, devnodes) {
1107 _cleanup_free_ char *from = NULL, *to = NULL;
1108 struct stat st;
1109
1110 from = strappend("/dev/", d);
1111 to = strjoin(dest, "/dev/", d, NULL);
1112 if (!from || !to)
1113 return log_oom();
1114
1115 if (stat(from, &st) < 0) {
1116
1117 if (errno != ENOENT) {
1118 log_error_errno(errno, "Failed to stat %s: %m", from);
1119 return -errno;
1120 }
1121
1122 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1123
1124 log_error("%s is not a char or block device, cannot copy", from);
1125 return -EIO;
1126
1127 } else {
1128 r = mkdir_parents(to, 0775);
1129 if (r < 0) {
1130 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1131 return -r;
1132 }
1133
1134 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1135 log_error_errno(errno, "mknod(%s) failed: %m", dest);
1136 return -errno;
1137 }
1138 }
1139 }
1140
1141 return r;
1142 }
1143
1144 static int setup_ptmx(const char *dest) {
1145 _cleanup_free_ char *p = NULL;
1146
1147 p = strappend(dest, "/dev/ptmx");
1148 if (!p)
1149 return log_oom();
1150
1151 if (symlink("pts/ptmx", p) < 0) {
1152 log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1153 return -errno;
1154 }
1155
1156 return 0;
1157 }
1158
1159 static int setup_dev_console(const char *dest, const char *console) {
1160 _cleanup_umask_ mode_t u;
1161 const char *to;
1162 struct stat st;
1163 int r;
1164
1165 assert(dest);
1166 assert(console);
1167
1168 u = umask(0000);
1169
1170 if (stat("/dev/null", &st) < 0) {
1171 log_error_errno(errno, "Failed to stat /dev/null: %m");
1172 return -errno;
1173 }
1174
1175 r = chmod_and_chown(console, 0600, 0, 0);
1176 if (r < 0)
1177 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1178
1179 /* We need to bind mount the right tty to /dev/console since
1180 * ptys can only exist on pts file systems. To have something
1181 * to bind mount things on we create a device node first, and
1182 * use /dev/null for that since we the cgroups device policy
1183 * allows us to create that freely, while we cannot create
1184 * /dev/console. (Note that the major minor doesn't actually
1185 * matter here, since we mount it over anyway). */
1186
1187 to = strappenda(dest, "/dev/console");
1188 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1189 log_error_errno(errno, "mknod() for /dev/console failed: %m");
1190 return -errno;
1191 }
1192
1193 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1194 log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1195 return -errno;
1196 }
1197
1198 return 0;
1199 }
1200
1201 static int setup_kmsg(const char *dest, int kmsg_socket) {
1202 _cleanup_free_ char *from = NULL, *to = NULL;
1203 int r, fd, k;
1204 _cleanup_umask_ mode_t u;
1205 union {
1206 struct cmsghdr cmsghdr;
1207 uint8_t buf[CMSG_SPACE(sizeof(int))];
1208 } control = {};
1209 struct msghdr mh = {
1210 .msg_control = &control,
1211 .msg_controllen = sizeof(control),
1212 };
1213 struct cmsghdr *cmsg;
1214
1215 assert(dest);
1216 assert(kmsg_socket >= 0);
1217
1218 u = umask(0000);
1219
1220 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1221 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1222 * on the reading side behave very similar to /proc/kmsg,
1223 * their writing side behaves differently from /dev/kmsg in
1224 * that writing blocks when nothing is reading. In order to
1225 * avoid any problems with containers deadlocking due to this
1226 * we simply make /dev/kmsg unavailable to the container. */
1227 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1228 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1229 return log_oom();
1230
1231 if (mkfifo(from, 0600) < 0) {
1232 log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1233 return -errno;
1234 }
1235
1236 r = chmod_and_chown(from, 0600, 0, 0);
1237 if (r < 0)
1238 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1239
1240 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1241 log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1242 return -errno;
1243 }
1244
1245 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1246 if (fd < 0) {
1247 log_error_errno(errno, "Failed to open fifo: %m");
1248 return -errno;
1249 }
1250
1251 cmsg = CMSG_FIRSTHDR(&mh);
1252 cmsg->cmsg_level = SOL_SOCKET;
1253 cmsg->cmsg_type = SCM_RIGHTS;
1254 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1255 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1256
1257 mh.msg_controllen = cmsg->cmsg_len;
1258
1259 /* Store away the fd in the socket, so that it stays open as
1260 * long as we run the child */
1261 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1262 safe_close(fd);
1263
1264 if (k < 0) {
1265 log_error_errno(errno, "Failed to send FIFO fd: %m");
1266 return -errno;
1267 }
1268
1269 /* And now make the FIFO unavailable as /dev/kmsg... */
1270 unlink(from);
1271 return 0;
1272 }
1273
1274 static int setup_hostname(void) {
1275
1276 if (arg_share_system)
1277 return 0;
1278
1279 if (sethostname_idempotent(arg_machine) < 0)
1280 return -errno;
1281
1282 return 0;
1283 }
1284
1285 static int setup_journal(const char *directory) {
1286 sd_id128_t machine_id, this_id;
1287 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1288 char *id;
1289 int r;
1290
1291 p = strappend(directory, "/etc/machine-id");
1292 if (!p)
1293 return log_oom();
1294
1295 r = read_one_line_file(p, &b);
1296 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1297 return 0;
1298 else if (r < 0)
1299 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1300
1301 id = strstrip(b);
1302 if (isempty(id) && arg_link_journal == LINK_AUTO)
1303 return 0;
1304
1305 /* Verify validity */
1306 r = sd_id128_from_string(id, &machine_id);
1307 if (r < 0)
1308 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1309
1310 r = sd_id128_get_machine(&this_id);
1311 if (r < 0)
1312 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1313
1314 if (sd_id128_equal(machine_id, this_id)) {
1315 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1316 "Host and machine ids are equal (%s): refusing to link journals", id);
1317 if (arg_link_journal == LINK_AUTO)
1318 return 0;
1319 return
1320 -EEXIST;
1321 }
1322
1323 if (arg_link_journal == LINK_NO)
1324 return 0;
1325
1326 free(p);
1327 p = strappend("/var/log/journal/", id);
1328 q = strjoin(directory, "/var/log/journal/", id, NULL);
1329 if (!p || !q)
1330 return log_oom();
1331
1332 if (path_is_mount_point(p, false) > 0) {
1333 if (arg_link_journal != LINK_AUTO) {
1334 log_error("%s: already a mount point, refusing to use for journal", p);
1335 return -EEXIST;
1336 }
1337
1338 return 0;
1339 }
1340
1341 if (path_is_mount_point(q, false) > 0) {
1342 if (arg_link_journal != LINK_AUTO) {
1343 log_error("%s: already a mount point, refusing to use for journal", q);
1344 return -EEXIST;
1345 }
1346
1347 return 0;
1348 }
1349
1350 r = readlink_and_make_absolute(p, &d);
1351 if (r >= 0) {
1352 if ((arg_link_journal == LINK_GUEST ||
1353 arg_link_journal == LINK_AUTO) &&
1354 path_equal(d, q)) {
1355
1356 r = mkdir_p(q, 0755);
1357 if (r < 0)
1358 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1359 return 0;
1360 }
1361
1362 if (unlink(p) < 0) {
1363 log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1364 return -errno;
1365 }
1366 } else if (r == -EINVAL) {
1367
1368 if (arg_link_journal == LINK_GUEST &&
1369 rmdir(p) < 0) {
1370
1371 if (errno == ENOTDIR) {
1372 log_error("%s already exists and is neither a symlink nor a directory", p);
1373 return r;
1374 } else {
1375 log_error_errno(errno, "Failed to remove %s: %m", p);
1376 return -errno;
1377 }
1378 }
1379 } else if (r != -ENOENT) {
1380 log_error_errno(errno, "readlink(%s) failed: %m", p);
1381 return r;
1382 }
1383
1384 if (arg_link_journal == LINK_GUEST) {
1385
1386 if (symlink(q, p) < 0) {
1387 if (arg_link_journal_try) {
1388 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1389 return 0;
1390 } else {
1391 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1392 return -errno;
1393 }
1394 }
1395
1396 r = mkdir_p(q, 0755);
1397 if (r < 0)
1398 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1399 return 0;
1400 }
1401
1402 if (arg_link_journal == LINK_HOST) {
1403 /* don't create parents here -- if the host doesn't have
1404 * permanent journal set up, don't force it here */
1405 r = mkdir(p, 0755);
1406 if (r < 0) {
1407 if (arg_link_journal_try) {
1408 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1409 return 0;
1410 } else {
1411 log_error_errno(errno, "Failed to create %s: %m", p);
1412 return r;
1413 }
1414 }
1415
1416 } else if (access(p, F_OK) < 0)
1417 return 0;
1418
1419 if (dir_is_empty(q) == 0)
1420 log_warning("%s is not empty, proceeding anyway.", q);
1421
1422 r = mkdir_p(q, 0755);
1423 if (r < 0) {
1424 log_error_errno(errno, "Failed to create %s: %m", q);
1425 return r;
1426 }
1427
1428 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1429 log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1430 return -errno;
1431 }
1432
1433 return 0;
1434 }
1435
1436 static int drop_capabilities(void) {
1437 return capability_bounding_set_drop(~arg_retain, false);
1438 }
1439
1440 static int register_machine(pid_t pid, int local_ifindex) {
1441 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1442 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1443 int r;
1444
1445 if (!arg_register)
1446 return 0;
1447
1448 r = sd_bus_default_system(&bus);
1449 if (r < 0)
1450 return log_error_errno(r, "Failed to open system bus: %m");
1451
1452 if (arg_keep_unit) {
1453 r = sd_bus_call_method(
1454 bus,
1455 "org.freedesktop.machine1",
1456 "/org/freedesktop/machine1",
1457 "org.freedesktop.machine1.Manager",
1458 "RegisterMachineWithNetwork",
1459 &error,
1460 NULL,
1461 "sayssusai",
1462 arg_machine,
1463 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1464 "nspawn",
1465 "container",
1466 (uint32_t) pid,
1467 strempty(arg_directory),
1468 local_ifindex > 0 ? 1 : 0, local_ifindex);
1469 } else {
1470 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1471
1472 r = sd_bus_message_new_method_call(
1473 bus,
1474 &m,
1475 "org.freedesktop.machine1",
1476 "/org/freedesktop/machine1",
1477 "org.freedesktop.machine1.Manager",
1478 "CreateMachineWithNetwork");
1479 if (r < 0)
1480 return log_error_errno(r, "Failed to create message: %m");
1481
1482 r = sd_bus_message_append(
1483 m,
1484 "sayssusai",
1485 arg_machine,
1486 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1487 "nspawn",
1488 "container",
1489 (uint32_t) pid,
1490 strempty(arg_directory),
1491 local_ifindex > 0 ? 1 : 0, local_ifindex);
1492 if (r < 0)
1493 return log_error_errno(r, "Failed to append message arguments: %m");
1494
1495 r = sd_bus_message_open_container(m, 'a', "(sv)");
1496 if (r < 0)
1497 return log_error_errno(r, "Failed to open container: %m");
1498
1499 if (!isempty(arg_slice)) {
1500 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1501 if (r < 0)
1502 return log_error_errno(r, "Failed to append slice: %m");
1503 }
1504
1505 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1506 if (r < 0)
1507 return log_error_errno(r, "Failed to add device policy: %m");
1508
1509 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1510 /* Allow the container to
1511 * access and create the API
1512 * device nodes, so that
1513 * PrivateDevices= in the
1514 * container can work
1515 * fine */
1516 "/dev/null", "rwm",
1517 "/dev/zero", "rwm",
1518 "/dev/full", "rwm",
1519 "/dev/random", "rwm",
1520 "/dev/urandom", "rwm",
1521 "/dev/tty", "rwm",
1522 "/dev/net/tun", "rwm",
1523 /* Allow the container
1524 * access to ptys. However,
1525 * do not permit the
1526 * container to ever create
1527 * these device nodes. */
1528 "/dev/pts/ptmx", "rw",
1529 "char-pts", "rw");
1530 if (r < 0)
1531 return log_error_errno(r, "Failed to add device whitelist: %m");
1532
1533 r = sd_bus_message_close_container(m);
1534 if (r < 0)
1535 return log_error_errno(r, "Failed to close container: %m");
1536
1537 r = sd_bus_call(bus, m, 0, &error, NULL);
1538 }
1539
1540 if (r < 0) {
1541 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1542 return r;
1543 }
1544
1545 return 0;
1546 }
1547
1548 static int terminate_machine(pid_t pid) {
1549 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1550 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1551 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1552 const char *path;
1553 int r;
1554
1555 if (!arg_register)
1556 return 0;
1557
1558 r = sd_bus_default_system(&bus);
1559 if (r < 0)
1560 return log_error_errno(r, "Failed to open system bus: %m");
1561
1562 r = sd_bus_call_method(
1563 bus,
1564 "org.freedesktop.machine1",
1565 "/org/freedesktop/machine1",
1566 "org.freedesktop.machine1.Manager",
1567 "GetMachineByPID",
1568 &error,
1569 &reply,
1570 "u",
1571 (uint32_t) pid);
1572 if (r < 0) {
1573 /* Note that the machine might already have been
1574 * cleaned up automatically, hence don't consider it a
1575 * failure if we cannot get the machine object. */
1576 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1577 return 0;
1578 }
1579
1580 r = sd_bus_message_read(reply, "o", &path);
1581 if (r < 0)
1582 return bus_log_parse_error(r);
1583
1584 r = sd_bus_call_method(
1585 bus,
1586 "org.freedesktop.machine1",
1587 path,
1588 "org.freedesktop.machine1.Machine",
1589 "Terminate",
1590 &error,
1591 NULL,
1592 NULL);
1593 if (r < 0) {
1594 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1595 return 0;
1596 }
1597
1598 return 0;
1599 }
1600
1601 static int reset_audit_loginuid(void) {
1602 _cleanup_free_ char *p = NULL;
1603 int r;
1604
1605 if (arg_share_system)
1606 return 0;
1607
1608 r = read_one_line_file("/proc/self/loginuid", &p);
1609 if (r == -ENOENT)
1610 return 0;
1611 if (r < 0)
1612 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1613
1614 /* Already reset? */
1615 if (streq(p, "4294967295"))
1616 return 0;
1617
1618 r = write_string_file("/proc/self/loginuid", "4294967295");
1619 if (r < 0) {
1620 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1621 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1622 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1623 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1624 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1625
1626 sleep(5);
1627 }
1628
1629 return 0;
1630 }
1631
1632 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1633 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1634
1635 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1636 int r;
1637
1638 uint8_t result[8];
1639 size_t l, sz;
1640 uint8_t *v;
1641
1642 l = strlen(arg_machine);
1643 sz = sizeof(sd_id128_t) + l;
1644 v = alloca(sz);
1645
1646 /* fetch some persistent data unique to the host */
1647 r = sd_id128_get_machine((sd_id128_t*) v);
1648 if (r < 0)
1649 return r;
1650
1651 /* combine with some data unique (on this host) to this
1652 * container instance */
1653 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1654
1655 /* Let's hash the host machine ID plus the container name. We
1656 * use a fixed, but originally randomly created hash key here. */
1657 siphash24(result, v, sz, hash_key.bytes);
1658
1659 assert_cc(ETH_ALEN <= sizeof(result));
1660 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1661
1662 /* see eth_random_addr in the kernel */
1663 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1664 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1665
1666 return 0;
1667 }
1668
1669 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1670 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1671 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1672 struct ether_addr mac_host, mac_container;
1673 int r, i;
1674
1675 if (!arg_private_network)
1676 return 0;
1677
1678 if (!arg_network_veth)
1679 return 0;
1680
1681 /* Use two different interface name prefixes depending whether
1682 * we are in bridge mode or not. */
1683 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1684 arg_network_bridge ? "vb" : "ve", arg_machine);
1685
1686 r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1687 if (r < 0) {
1688 log_error("Failed to generate predictable MAC address for container side");
1689 return r;
1690 }
1691
1692 r = generate_mac(&mac_host, HOST_HASH_KEY);
1693 if (r < 0) {
1694 log_error("Failed to generate predictable MAC address for host side");
1695 return r;
1696 }
1697
1698 r = sd_rtnl_open(&rtnl, 0);
1699 if (r < 0)
1700 return log_error_errno(r, "Failed to connect to netlink: %m");
1701
1702 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1703 if (r < 0)
1704 return log_error_errno(r, "Failed to allocate netlink message: %m");
1705
1706 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1707 if (r < 0)
1708 return log_error_errno(r, "Failed to add netlink interface name: %m");
1709
1710 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1711 if (r < 0)
1712 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1713
1714 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1715 if (r < 0)
1716 return log_error_errno(r, "Failed to open netlink container: %m");
1717
1718 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1719 if (r < 0)
1720 return log_error_errno(r, "Failed to open netlink container: %m");
1721
1722 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1723 if (r < 0)
1724 return log_error_errno(r, "Failed to open netlink container: %m");
1725
1726 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1727 if (r < 0)
1728 return log_error_errno(r, "Failed to add netlink interface name: %m");
1729
1730 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1731 if (r < 0)
1732 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1733
1734 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1735 if (r < 0)
1736 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1737
1738 r = sd_rtnl_message_close_container(m);
1739 if (r < 0)
1740 return log_error_errno(r, "Failed to close netlink container: %m");
1741
1742 r = sd_rtnl_message_close_container(m);
1743 if (r < 0)
1744 return log_error_errno(r, "Failed to close netlink container: %m");
1745
1746 r = sd_rtnl_message_close_container(m);
1747 if (r < 0)
1748 return log_error_errno(r, "Failed to close netlink container: %m");
1749
1750 r = sd_rtnl_call(rtnl, m, 0, NULL);
1751 if (r < 0)
1752 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1753
1754 i = (int) if_nametoindex(iface_name);
1755 if (i <= 0) {
1756 log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
1757 return -errno;
1758 }
1759
1760 *ifi = i;
1761
1762 return 0;
1763 }
1764
1765 static int setup_bridge(const char veth_name[], int *ifi) {
1766 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1767 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1768 int r, bridge;
1769
1770 if (!arg_private_network)
1771 return 0;
1772
1773 if (!arg_network_veth)
1774 return 0;
1775
1776 if (!arg_network_bridge)
1777 return 0;
1778
1779 bridge = (int) if_nametoindex(arg_network_bridge);
1780 if (bridge <= 0) {
1781 log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1782 return -errno;
1783 }
1784
1785 *ifi = bridge;
1786
1787 r = sd_rtnl_open(&rtnl, 0);
1788 if (r < 0)
1789 return log_error_errno(r, "Failed to connect to netlink: %m");
1790
1791 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1792 if (r < 0)
1793 return log_error_errno(r, "Failed to allocate netlink message: %m");
1794
1795 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1796 if (r < 0)
1797 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1798
1799 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1800 if (r < 0)
1801 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1802
1803 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1804 if (r < 0)
1805 return log_error_errno(r, "Failed to add netlink master field: %m");
1806
1807 r = sd_rtnl_call(rtnl, m, 0, NULL);
1808 if (r < 0)
1809 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1810
1811 return 0;
1812 }
1813
1814 static int parse_interface(struct udev *udev, const char *name) {
1815 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1816 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1817 int ifi;
1818
1819 ifi = (int) if_nametoindex(name);
1820 if (ifi <= 0) {
1821 log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1822 return -errno;
1823 }
1824
1825 sprintf(ifi_str, "n%i", ifi);
1826 d = udev_device_new_from_device_id(udev, ifi_str);
1827 if (!d) {
1828 log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1829 return -errno;
1830 }
1831
1832 if (udev_device_get_is_initialized(d) <= 0) {
1833 log_error("Network interface %s is not initialized yet.", name);
1834 return -EBUSY;
1835 }
1836
1837 return ifi;
1838 }
1839
1840 static int move_network_interfaces(pid_t pid) {
1841 _cleanup_udev_unref_ struct udev *udev = NULL;
1842 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1843 char **i;
1844 int r;
1845
1846 if (!arg_private_network)
1847 return 0;
1848
1849 if (strv_isempty(arg_network_interfaces))
1850 return 0;
1851
1852 r = sd_rtnl_open(&rtnl, 0);
1853 if (r < 0)
1854 return log_error_errno(r, "Failed to connect to netlink: %m");
1855
1856 udev = udev_new();
1857 if (!udev) {
1858 log_error("Failed to connect to udev.");
1859 return -ENOMEM;
1860 }
1861
1862 STRV_FOREACH(i, arg_network_interfaces) {
1863 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1864 int ifi;
1865
1866 ifi = parse_interface(udev, *i);
1867 if (ifi < 0)
1868 return ifi;
1869
1870 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1871 if (r < 0)
1872 return log_error_errno(r, "Failed to allocate netlink message: %m");
1873
1874 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1875 if (r < 0)
1876 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1877
1878 r = sd_rtnl_call(rtnl, m, 0, NULL);
1879 if (r < 0)
1880 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
1881 }
1882
1883 return 0;
1884 }
1885
1886 static int setup_macvlan(pid_t pid) {
1887 _cleanup_udev_unref_ struct udev *udev = NULL;
1888 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1889 char **i;
1890 int r;
1891
1892 if (!arg_private_network)
1893 return 0;
1894
1895 if (strv_isempty(arg_network_macvlan))
1896 return 0;
1897
1898 r = sd_rtnl_open(&rtnl, 0);
1899 if (r < 0)
1900 return log_error_errno(r, "Failed to connect to netlink: %m");
1901
1902 udev = udev_new();
1903 if (!udev) {
1904 log_error("Failed to connect to udev.");
1905 return -ENOMEM;
1906 }
1907
1908 STRV_FOREACH(i, arg_network_macvlan) {
1909 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1910 _cleanup_free_ char *n = NULL;
1911 int ifi;
1912
1913 ifi = parse_interface(udev, *i);
1914 if (ifi < 0)
1915 return ifi;
1916
1917 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1918 if (r < 0)
1919 return log_error_errno(r, "Failed to allocate netlink message: %m");
1920
1921 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1922 if (r < 0)
1923 return log_error_errno(r, "Failed to add netlink interface index: %m");
1924
1925 n = strappend("mv-", *i);
1926 if (!n)
1927 return log_oom();
1928
1929 strshorten(n, IFNAMSIZ-1);
1930
1931 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1932 if (r < 0)
1933 return log_error_errno(r, "Failed to add netlink interface name: %m");
1934
1935 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1936 if (r < 0)
1937 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1938
1939 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1940 if (r < 0)
1941 return log_error_errno(r, "Failed to open netlink container: %m");
1942
1943 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1944 if (r < 0)
1945 return log_error_errno(r, "Failed to open netlink container: %m");
1946
1947 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1948 if (r < 0)
1949 return log_error_errno(r, "Failed to append macvlan mode: %m");
1950
1951 r = sd_rtnl_message_close_container(m);
1952 if (r < 0)
1953 return log_error_errno(r, "Failed to close netlink container: %m");
1954
1955 r = sd_rtnl_message_close_container(m);
1956 if (r < 0)
1957 return log_error_errno(r, "Failed to close netlink container: %m");
1958
1959 r = sd_rtnl_call(rtnl, m, 0, NULL);
1960 if (r < 0)
1961 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
1962 }
1963
1964 return 0;
1965 }
1966
/* Installs a seccomp filter (default action: allow) that
 *   - makes a fixed list of syscalls fail with EPERM, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
 *     EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op returning 0.
 *
 * Returns the result of the last seccomp call: negative on failure,
 * otherwise whatever seccomp_load() returned. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k], 0);

                /* -EFAULT means: unknown syscall, which is skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers: much of the userspace audit
           hookup fails when running inside of one. We don't care, and
           simply turn off creation of audit sockets.

           As a result socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
           EAFNOSUPPORT, which audit userspace takes as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2046
2047 static int setup_image(char **device_path, int *loop_nr) {
2048 struct loop_info64 info = {
2049 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2050 };
2051 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2052 _cleanup_free_ char* loopdev = NULL;
2053 struct stat st;
2054 int r, nr;
2055
2056 assert(device_path);
2057 assert(loop_nr);
2058
2059 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2060 if (fd < 0) {
2061 log_error_errno(errno, "Failed to open %s: %m", arg_image);
2062 return -errno;
2063 }
2064
2065 if (fstat(fd, &st) < 0) {
2066 log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2067 return -errno;
2068 }
2069
2070 if (S_ISBLK(st.st_mode)) {
2071 char *p;
2072
2073 p = strdup(arg_image);
2074 if (!p)
2075 return log_oom();
2076
2077 *device_path = p;
2078
2079 *loop_nr = -1;
2080
2081 r = fd;
2082 fd = -1;
2083
2084 return r;
2085 }
2086
2087 if (!S_ISREG(st.st_mode)) {
2088 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2089 return -EINVAL;
2090 }
2091
2092 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2093 if (control < 0) {
2094 log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2095 return -errno;
2096 }
2097
2098 nr = ioctl(control, LOOP_CTL_GET_FREE);
2099 if (nr < 0) {
2100 log_error_errno(errno, "Failed to allocate loop device: %m");
2101 return -errno;
2102 }
2103
2104 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2105 return log_oom();
2106
2107 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2108 if (loop < 0) {
2109 log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2110 return -errno;
2111 }
2112
2113 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2114 log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2115 return -errno;
2116 }
2117
2118 if (arg_read_only)
2119 info.lo_flags |= LO_FLAGS_READ_ONLY;
2120
2121 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2122 log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2123 return -errno;
2124 }
2125
2126 *device_path = loopdev;
2127 loopdev = NULL;
2128
2129 *loop_nr = nr;
2130
2131 r = loop;
2132 loop = -1;
2133
2134 return r;
2135 }
2136
#ifdef HAVE_BLKID
/* Records the partition "node" (partition number nr, writability rw)
 * as the current candidate, unless a candidate with a lower partition
 * number is already recorded in *best.
 *
 * Returns 0 if the candidate was kept or replaced, -ENOMEM if the
 * device node path could not be duplicated. */
static int dissect_pick_partition(
                char **best, int *best_nr, bool *best_rw,
                const char *node, int nr, bool rw) {

        if (*best && nr >= *best_nr)
                return 0;

        *best_nr = nr;
        *best_rw = rw;

        free(*best);
        *best = strdup(node);
        if (!*best)
                return -ENOMEM;

        return 0;
}
#endif

/* Looks for root, /home and /srv partitions on the GPT-partitioned
 * block device referred to by fd.
 *
 * The partition table is probed with blkid; the partition block
 * devices are enumerated via udev as children of the device.
 * Partitions flagged GPT_FLAG_NO_AUTO are ignored. For each partition
 * type the one with the lowest partition number wins.
 *
 * On success:
 *   - *root_device / *root_device_rw describe the native root
 *     partition if one exists (*secondary = false), otherwise the
 *     secondary root partition (*secondary = true);
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw
 *     are set only if such a partition was found, and are left
 *     untouched otherwise.
 * The caller owns the returned strings. The *_rw flags are false for
 * partitions carrying GPT_FLAG_READ_ONLY.
 *
 * Returns 0 on success, -EINVAL if no GPT or no root partition is
 * found, another negative error value on other failures. Without
 * HAVE_BLKID, -ENOTSUP is returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before the blkid calls below: if it is
         * still 0 after a failure, the failure is treated as OOM
         * (or as EIO, for the probe itself). */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error_errno(errno, "Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all udev devices below the block device */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                bool read_write;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, and the whole
                 * disk itself */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the device back to its partition table entry */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                read_write = !(flags & GPT_FLAG_READ_ONLY);

                /* Dispatch on the partition type; unknown types are
                 * ignored */
                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_pick_partition(&home, &home_nr, &home_rw, node, nr, read_write);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, read_write);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_pick_partition(&root, &root_nr, &root_rw, node, nr, read_write);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, read_write);
#endif
                else
                        r = 0;

                if (r < 0)
                        return log_oom();
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the results over to the caller. Pointers are reset to
         * NULL afterwards so that the cleanup handlers leave the
         * strings alone. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2384
/* Probe the file system type of the block device "what" with libblkid and
 * mount it. The mount point is "where", or "where" with "directory" appended
 * when a sub-directory is given. The mount is made read-only when rw is false
 * or when arg_read_only is set. Always mounted with MS_NODEV.
 *
 * Returns 0 on success and a negative errno-style value on failure. Without
 * blkid support this always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *fstype, *target;
        unsigned long mount_flags = MS_NODEV;
        int r;

        assert(what);
        assert(where);

        /* The global read-only switch overrides the per-device setting */
        if (arg_read_only || !rw)
                mount_flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strappenda(where, directory);

        /* errno is cleared first so that a NULL result that leaves errno
         * untouched can be reported as out-of-memory */
        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        /* Only the file system type is of interest here */
        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        switch (r) {

        case 0:
                break;

        case -1:
        case 1:
                /* Both results are reported as "type cannot be determined" */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;

        default:
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, target, fstype, mount_flags, NULL) < 0) {
                log_error_errno(errno, "Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2450
2451 static int mount_devices(
2452 const char *where,
2453 const char *root_device, bool root_device_rw,
2454 const char *home_device, bool home_device_rw,
2455 const char *srv_device, bool srv_device_rw) {
2456 int r;
2457
2458 assert(where);
2459
2460 if (root_device) {
2461 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2462 if (r < 0)
2463 return log_error_errno(r, "Failed to mount root directory: %m");
2464 }
2465
2466 if (home_device) {
2467 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2468 if (r < 0)
2469 return log_error_errno(r, "Failed to mount home directory: %m");
2470 }
2471
2472 if (srv_device) {
2473 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2474 if (r < 0)
2475 return log_error_errno(r, "Failed to mount server data directory: %m");
2476 }
2477
2478 return 0;
2479 }
2480
2481 static void loop_remove(int nr, int *image_fd) {
2482 _cleanup_close_ int control = -1;
2483 int r;
2484
2485 if (nr < 0)
2486 return;
2487
2488 if (image_fd && *image_fd >= 0) {
2489 r = ioctl(*image_fd, LOOP_CLR_FD);
2490 if (r < 0)
2491 log_warning_errno(errno, "Failed to close loop image: %m");
2492 *image_fd = safe_close(*image_fd);
2493 }
2494
2495 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2496 if (control < 0) {
2497 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2498 return;
2499 }
2500
2501 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2502 if (r < 0)
2503 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2504 }
2505
2506 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2507 int pipe_fds[2];
2508 pid_t pid;
2509
2510 assert(database);
2511 assert(key);
2512 assert(rpid);
2513
2514 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2515 log_error_errno(errno, "Failed to allocate pipe: %m");
2516 return -errno;
2517 }
2518
2519 pid = fork();
2520 if (pid < 0) {
2521 log_error_errno(errno, "Failed to fork getent child: %m");
2522 return -errno;
2523 } else if (pid == 0) {
2524 int nullfd;
2525 char *empty_env = NULL;
2526
2527 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2528 _exit(EXIT_FAILURE);
2529
2530 if (pipe_fds[0] > 2)
2531 safe_close(pipe_fds[0]);
2532 if (pipe_fds[1] > 2)
2533 safe_close(pipe_fds[1]);
2534
2535 nullfd = open("/dev/null", O_RDWR);
2536 if (nullfd < 0)
2537 _exit(EXIT_FAILURE);
2538
2539 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2540 _exit(EXIT_FAILURE);
2541
2542 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2543 _exit(EXIT_FAILURE);
2544
2545 if (nullfd > 2)
2546 safe_close(nullfd);
2547
2548 reset_all_signal_handlers();
2549 close_all_fds(NULL, 0);
2550
2551 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2552 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2553 _exit(EXIT_FAILURE);
2554 }
2555
2556 pipe_fds[1] = safe_close(pipe_fds[1]);
2557
2558 *rpid = pid;
2559
2560 return pipe_fds[0];
2561 }
2562
2563 static int change_uid_gid(char **_home) {
2564 char line[LINE_MAX], *x, *u, *g, *h;
2565 const char *word, *state;
2566 _cleanup_free_ uid_t *uids = NULL;
2567 _cleanup_free_ char *home = NULL;
2568 _cleanup_fclose_ FILE *f = NULL;
2569 _cleanup_close_ int fd = -1;
2570 unsigned n_uids = 0;
2571 size_t sz = 0, l;
2572 uid_t uid;
2573 gid_t gid;
2574 pid_t pid;
2575 int r;
2576
2577 assert(_home);
2578
2579 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2580 /* Reset everything fully to 0, just in case */
2581
2582 if (setgroups(0, NULL) < 0) {
2583 log_error_errno(errno, "setgroups() failed: %m");
2584 return -errno;
2585 }
2586
2587 if (setresgid(0, 0, 0) < 0) {
2588 log_error_errno(errno, "setregid() failed: %m");
2589 return -errno;
2590 }
2591
2592 if (setresuid(0, 0, 0) < 0) {
2593 log_error_errno(errno, "setreuid() failed: %m");
2594 return -errno;
2595 }
2596
2597 *_home = NULL;
2598 return 0;
2599 }
2600
2601 /* First, get user credentials */
2602 fd = spawn_getent("passwd", arg_user, &pid);
2603 if (fd < 0)
2604 return fd;
2605
2606 f = fdopen(fd, "r");
2607 if (!f)
2608 return log_oom();
2609 fd = -1;
2610
2611 if (!fgets(line, sizeof(line), f)) {
2612
2613 if (!ferror(f)) {
2614 log_error("Failed to resolve user %s.", arg_user);
2615 return -ESRCH;
2616 }
2617
2618 log_error_errno(errno, "Failed to read from getent: %m");
2619 return -errno;
2620 }
2621
2622 truncate_nl(line);
2623
2624 wait_for_terminate_and_warn("getent passwd", pid);
2625
2626 x = strchr(line, ':');
2627 if (!x) {
2628 log_error("/etc/passwd entry has invalid user field.");
2629 return -EIO;
2630 }
2631
2632 u = strchr(x+1, ':');
2633 if (!u) {
2634 log_error("/etc/passwd entry has invalid password field.");
2635 return -EIO;
2636 }
2637
2638 u++;
2639 g = strchr(u, ':');
2640 if (!g) {
2641 log_error("/etc/passwd entry has invalid UID field.");
2642 return -EIO;
2643 }
2644
2645 *g = 0;
2646 g++;
2647 x = strchr(g, ':');
2648 if (!x) {
2649 log_error("/etc/passwd entry has invalid GID field.");
2650 return -EIO;
2651 }
2652
2653 *x = 0;
2654 h = strchr(x+1, ':');
2655 if (!h) {
2656 log_error("/etc/passwd entry has invalid GECOS field.");
2657 return -EIO;
2658 }
2659
2660 h++;
2661 x = strchr(h, ':');
2662 if (!x) {
2663 log_error("/etc/passwd entry has invalid home directory field.");
2664 return -EIO;
2665 }
2666
2667 *x = 0;
2668
2669 r = parse_uid(u, &uid);
2670 if (r < 0) {
2671 log_error("Failed to parse UID of user.");
2672 return -EIO;
2673 }
2674
2675 r = parse_gid(g, &gid);
2676 if (r < 0) {
2677 log_error("Failed to parse GID of user.");
2678 return -EIO;
2679 }
2680
2681 home = strdup(h);
2682 if (!home)
2683 return log_oom();
2684
2685 /* Second, get group memberships */
2686 fd = spawn_getent("initgroups", arg_user, &pid);
2687 if (fd < 0)
2688 return fd;
2689
2690 fclose(f);
2691 f = fdopen(fd, "r");
2692 if (!f)
2693 return log_oom();
2694 fd = -1;
2695
2696 if (!fgets(line, sizeof(line), f)) {
2697 if (!ferror(f)) {
2698 log_error("Failed to resolve user %s.", arg_user);
2699 return -ESRCH;
2700 }
2701
2702 log_error_errno(errno, "Failed to read from getent: %m");
2703 return -errno;
2704 }
2705
2706 truncate_nl(line);
2707
2708 wait_for_terminate_and_warn("getent initgroups", pid);
2709
2710 /* Skip over the username and subsequent separator whitespace */
2711 x = line;
2712 x += strcspn(x, WHITESPACE);
2713 x += strspn(x, WHITESPACE);
2714
2715 FOREACH_WORD(word, l, x, state) {
2716 char c[l+1];
2717
2718 memcpy(c, word, l);
2719 c[l] = 0;
2720
2721 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2722 return log_oom();
2723
2724 r = parse_uid(c, &uids[n_uids++]);
2725 if (r < 0) {
2726 log_error("Failed to parse group data from getent.");
2727 return -EIO;
2728 }
2729 }
2730
2731 r = mkdir_parents(home, 0775);
2732 if (r < 0)
2733 return log_error_errno(r, "Failed to make home root directory: %m");
2734
2735 r = mkdir_safe(home, 0755, uid, gid);
2736 if (r < 0 && r != -EEXIST)
2737 return log_error_errno(r, "Failed to make home directory: %m");
2738
2739 fchown(STDIN_FILENO, uid, gid);
2740 fchown(STDOUT_FILENO, uid, gid);
2741 fchown(STDERR_FILENO, uid, gid);
2742
2743 if (setgroups(n_uids, uids) < 0) {
2744 log_error_errno(errno, "Failed to set auxiliary groups: %m");
2745 return -errno;
2746 }
2747
2748 if (setresgid(gid, gid, gid) < 0) {
2749 log_error_errno(errno, "setregid() failed: %m");
2750 return -errno;
2751 }
2752
2753 if (setresuid(uid, uid, uid) < 0) {
2754 log_error_errno(errno, "setreuid() failed: %m");
2755 return -errno;
2756 }
2757
2758 if (_home) {
2759 *_home = home;
2760 home = NULL;
2761 }
2762
2763 return 0;
2764 }
2765
2766 /*
2767 * Return values:
2768 * < 0 : wait_for_terminate() failed to get the state of the
2769 * container, the container was terminated by a signal, or
2770 * failed for an unknown reason. No change is made to the
2771 * container argument.
2772 * > 0 : The program executed in the container terminated with an
2773 * error. The exit code of the program executed in the
2774 * container is returned. The container argument has been set
2775 * to CONTAINER_TERMINATED.
2776 * 0 : The container is being rebooted, has been shut down or exited
2777 * successfully. The container argument has been set to either
2778 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2779 *
2780 * That is, success is indicated by a return value of zero, and an
2781 * error is indicated by a non-zero value.
2782 */
2783 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2784 siginfo_t status;
2785 int r;
2786
2787 r = wait_for_terminate(pid, &status);
2788 if (r < 0)
2789 return log_warning_errno(r, "Failed to wait for container: %m");
2790
2791 switch (status.si_code) {
2792
2793 case CLD_EXITED:
2794 if (status.si_status == 0) {
2795 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2796
2797 } else
2798 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2799
2800 *container = CONTAINER_TERMINATED;
2801 return status.si_status;
2802
2803 case CLD_KILLED:
2804 if (status.si_status == SIGINT) {
2805
2806 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2807 *container = CONTAINER_TERMINATED;
2808 return 0;
2809
2810 } else if (status.si_status == SIGHUP) {
2811
2812 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2813 *container = CONTAINER_REBOOTED;
2814 return 0;
2815 }
2816
2817 /* CLD_KILLED fallthrough */
2818
2819 case CLD_DUMPED:
2820 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2821 return -EIO;
2822
2823 default:
2824 log_error("Container %s failed due to unknown reason.", arg_machine);
2825 return -EIO;
2826 }
2827
2828 return r;
2829 }
2830
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal has a handler instead of its default
 * disposition. */
static void nop_handler(int sig) {
}
2832
2833 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2834 pid_t pid;
2835
2836 pid = PTR_TO_UINT32(userdata);
2837 if (pid > 0) {
2838 if (kill(pid, SIGRTMIN+3) >= 0) {
2839 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2840 sd_event_source_set_userdata(s, NULL);
2841 return 0;
2842 }
2843 }
2844
2845 sd_event_exit(sd_event_source_get_event(s), 0);
2846 return 0;
2847 }
2848
2849 int main(int argc, char *argv[]) {
2850
2851 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2852 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2853 _cleanup_close_ int master = -1, image_fd = -1;
2854 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2855 _cleanup_fdset_free_ FDSet *fds = NULL;
2856 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2857 const char *console = NULL;
2858 char veth_name[IFNAMSIZ];
2859 bool secondary = false;
2860 sigset_t mask, mask_chld;
2861 pid_t pid = 0;
2862
2863 log_parse_environment();
2864 log_open();
2865
2866 k = parse_argv(argc, argv);
2867 if (k < 0)
2868 goto finish;
2869 else if (k == 0) {
2870 r = EXIT_SUCCESS;
2871 goto finish;
2872 }
2873
2874 if (!arg_image) {
2875 if (arg_directory) {
2876 char *p;
2877
2878 p = path_make_absolute_cwd(arg_directory);
2879 free(arg_directory);
2880 arg_directory = p;
2881 } else
2882 arg_directory = get_current_dir_name();
2883
2884 if (!arg_directory) {
2885 log_error("Failed to determine path, please use -D.");
2886 goto finish;
2887 }
2888 path_kill_slashes(arg_directory);
2889 }
2890
2891 if (!arg_machine) {
2892 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2893 if (!arg_machine) {
2894 log_oom();
2895 goto finish;
2896 }
2897
2898 hostname_cleanup(arg_machine, false);
2899 if (isempty(arg_machine)) {
2900 log_error("Failed to determine machine name automatically, please use -M.");
2901 goto finish;
2902 }
2903 }
2904
2905 if (geteuid() != 0) {
2906 log_error("Need to be root.");
2907 goto finish;
2908 }
2909
2910 if (sd_booted() <= 0) {
2911 log_error("Not running on a systemd system.");
2912 goto finish;
2913 }
2914
2915 log_close();
2916 n_fd_passed = sd_listen_fds(false);
2917 if (n_fd_passed > 0) {
2918 k = fdset_new_listen_fds(&fds, false);
2919 if (k < 0) {
2920 log_error_errno(k, "Failed to collect file descriptors: %m");
2921 goto finish;
2922 }
2923 }
2924 fdset_close_others(fds);
2925 log_open();
2926
2927 if (arg_directory) {
2928 if (path_equal(arg_directory, "/")) {
2929 log_error("Spawning container on root directory not supported.");
2930 goto finish;
2931 }
2932
2933 if (arg_boot) {
2934 if (path_is_os_tree(arg_directory) <= 0) {
2935 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2936 goto finish;
2937 }
2938 } else {
2939 const char *p;
2940
2941 p = strappenda(arg_directory,
2942 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2943 if (access(p, F_OK) < 0) {
2944 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2945 goto finish;
2946
2947 }
2948 }
2949 } else {
2950 char template[] = "/tmp/nspawn-root-XXXXXX";
2951
2952 if (!mkdtemp(template)) {
2953 log_error_errno(errno, "Failed to create temporary directory: %m");
2954 r = -errno;
2955 goto finish;
2956 }
2957
2958 arg_directory = strdup(template);
2959 if (!arg_directory) {
2960 r = log_oom();
2961 goto finish;
2962 }
2963
2964 image_fd = setup_image(&device_path, &loop_nr);
2965 if (image_fd < 0) {
2966 r = image_fd;
2967 goto finish;
2968 }
2969
2970 r = dissect_image(image_fd,
2971 &root_device, &root_device_rw,
2972 &home_device, &home_device_rw,
2973 &srv_device, &srv_device_rw,
2974 &secondary);
2975 if (r < 0)
2976 goto finish;
2977 }
2978
2979 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2980 if (master < 0) {
2981 log_error_errno(errno, "Failed to acquire pseudo tty: %m");
2982 goto finish;
2983 }
2984
2985 console = ptsname(master);
2986 if (!console) {
2987 log_error_errno(errno, "Failed to determine tty name: %m");
2988 goto finish;
2989 }
2990
2991 if (!arg_quiet)
2992 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2993 arg_machine, arg_image ? arg_image : arg_directory);
2994
2995 if (unlockpt(master) < 0) {
2996 log_error_errno(errno, "Failed to unlock tty: %m");
2997 goto finish;
2998 }
2999
3000 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3001 log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3002 goto finish;
3003 }
3004
3005 sd_notify(false,
3006 "READY=1\n"
3007 "STATUS=Container running.");
3008
3009 assert_se(sigemptyset(&mask) == 0);
3010 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3011 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3012
3013 assert_se(sigemptyset(&mask_chld) == 0);
3014 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3015
3016 for (;;) {
3017 ContainerStatus container_status;
3018 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3019 struct sigaction sa = {
3020 .sa_handler = nop_handler,
3021 .sa_flags = SA_NOCLDSTOP,
3022 };
3023
3024 r = barrier_create(&barrier);
3025 if (r < 0) {
3026 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3027 goto finish;
3028 }
3029
3030 /* Child can be killed before execv(), so handle SIGCHLD
3031 * in order to interrupt parent's blocking calls and
3032 * give it a chance to call wait() and terminate. */
3033 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3034 if (r < 0) {
3035 log_error_errno(errno, "Failed to change the signal mask: %m");
3036 goto finish;
3037 }
3038
3039 r = sigaction(SIGCHLD, &sa, NULL);
3040 if (r < 0) {
3041 log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3042 goto finish;
3043 }
3044
3045 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3046 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3047 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3048 if (pid < 0) {
3049 if (errno == EINVAL)
3050 log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3051 else
3052 log_error_errno(errno, "clone() failed: %m");
3053
3054 r = pid;
3055 goto finish;
3056 }
3057
3058 if (pid == 0) {
3059 /* child */
3060 _cleanup_free_ char *home = NULL;
3061 unsigned n_env = 2;
3062 const char *envp[] = {
3063 "PATH=" DEFAULT_PATH_SPLIT_USR,
3064 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3065 NULL, /* TERM */
3066 NULL, /* HOME */
3067 NULL, /* USER */
3068 NULL, /* LOGNAME */
3069 NULL, /* container_uuid */
3070 NULL, /* LISTEN_FDS */
3071 NULL, /* LISTEN_PID */
3072 NULL
3073 };
3074 char **env_use;
3075
3076 barrier_set_role(&barrier, BARRIER_CHILD);
3077
3078 envp[n_env] = strv_find_prefix(environ, "TERM=");
3079 if (envp[n_env])
3080 n_env ++;
3081
3082 master = safe_close(master);
3083
3084 close_nointr(STDIN_FILENO);
3085 close_nointr(STDOUT_FILENO);
3086 close_nointr(STDERR_FILENO);
3087
3088 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3089
3090 reset_all_signal_handlers();
3091 reset_signal_mask();
3092
3093 k = open_terminal(console, O_RDWR);
3094 if (k != STDIN_FILENO) {
3095 if (k >= 0) {
3096 safe_close(k);
3097 k = -EINVAL;
3098 }
3099
3100 log_error_errno(k, "Failed to open console: %m");
3101 _exit(EXIT_FAILURE);
3102 }
3103
3104 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3105 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3106 log_error_errno(errno, "Failed to duplicate console: %m");
3107 _exit(EXIT_FAILURE);
3108 }
3109
3110 if (setsid() < 0) {
3111 log_error_errno(errno, "setsid() failed: %m");
3112 _exit(EXIT_FAILURE);
3113 }
3114
3115 if (reset_audit_loginuid() < 0)
3116 _exit(EXIT_FAILURE);
3117
3118 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3119 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3120 _exit(EXIT_FAILURE);
3121 }
3122
3123 /* Mark everything as slave, so that we still
3124 * receive mounts from the real root, but don't
3125 * propagate mounts to the real root. */
3126 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3127 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3128 _exit(EXIT_FAILURE);
3129 }
3130
3131 if (mount_devices(arg_directory,
3132 root_device, root_device_rw,
3133 home_device, home_device_rw,
3134 srv_device, srv_device_rw) < 0)
3135 _exit(EXIT_FAILURE);
3136
3137 /* Turn directory into bind mount */
3138 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3139 log_error_errno(errno, "Failed to make bind mount: %m");
3140 _exit(EXIT_FAILURE);
3141 }
3142
3143 r = setup_volatile(arg_directory);
3144 if (r < 0)
3145 _exit(EXIT_FAILURE);
3146
3147 if (setup_volatile_state(arg_directory) < 0)
3148 _exit(EXIT_FAILURE);
3149
3150 r = base_filesystem_create(arg_directory);
3151 if (r < 0)
3152 _exit(EXIT_FAILURE);
3153
3154 if (arg_read_only) {
3155 k = bind_remount_recursive(arg_directory, true);
3156 if (k < 0) {
3157 log_error_errno(k, "Failed to make tree read-only: %m");
3158 _exit(EXIT_FAILURE);
3159 }
3160 }
3161
3162 if (mount_all(arg_directory) < 0)
3163 _exit(EXIT_FAILURE);
3164
3165 if (copy_devnodes(arg_directory) < 0)
3166 _exit(EXIT_FAILURE);
3167
3168 if (setup_ptmx(arg_directory) < 0)
3169 _exit(EXIT_FAILURE);
3170
3171 dev_setup(arg_directory);
3172
3173 if (setup_seccomp() < 0)
3174 _exit(EXIT_FAILURE);
3175
3176 if (setup_dev_console(arg_directory, console) < 0)
3177 _exit(EXIT_FAILURE);
3178
3179 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3180 _exit(EXIT_FAILURE);
3181
3182 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3183
3184 if (setup_boot_id(arg_directory) < 0)
3185 _exit(EXIT_FAILURE);
3186
3187 if (setup_timezone(arg_directory) < 0)
3188 _exit(EXIT_FAILURE);
3189
3190 if (setup_resolv_conf(arg_directory) < 0)
3191 _exit(EXIT_FAILURE);
3192
3193 if (setup_journal(arg_directory) < 0)
3194 _exit(EXIT_FAILURE);
3195
3196 if (mount_binds(arg_directory, arg_bind, false) < 0)
3197 _exit(EXIT_FAILURE);
3198
3199 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3200 _exit(EXIT_FAILURE);
3201
3202 if (mount_tmpfs(arg_directory) < 0)
3203 _exit(EXIT_FAILURE);
3204
3205 /* Tell the parent that we are ready, and that
3206 * it can cgroupify us to that we lack access
3207 * to certain devices and resources. */
3208 (void)barrier_place(&barrier);
3209
3210 if (chdir(arg_directory) < 0) {
3211 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3212 _exit(EXIT_FAILURE);
3213 }
3214
3215 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3216 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3217 _exit(EXIT_FAILURE);
3218 }
3219
3220 if (chroot(".") < 0) {
3221 log_error_errno(errno, "chroot() failed: %m");
3222 _exit(EXIT_FAILURE);
3223 }
3224
3225 if (chdir("/") < 0) {
3226 log_error_errno(errno, "chdir() failed: %m");
3227 _exit(EXIT_FAILURE);
3228 }
3229
3230 umask(0022);
3231
3232 if (arg_private_network)
3233 loopback_setup();
3234
3235 if (drop_capabilities() < 0) {
3236 log_error_errno(errno, "drop_capabilities() failed: %m");
3237 _exit(EXIT_FAILURE);
3238 }
3239
3240 r = change_uid_gid(&home);
3241 if (r < 0)
3242 _exit(EXIT_FAILURE);
3243
3244 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3245 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3246 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3247 log_oom();
3248 _exit(EXIT_FAILURE);
3249 }
3250
3251 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3252 char as_uuid[37];
3253
3254 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3255 log_oom();
3256 _exit(EXIT_FAILURE);
3257 }
3258 }
3259
3260 if (fdset_size(fds) > 0) {
3261 k = fdset_cloexec(fds, false);
3262 if (k < 0) {
3263 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3264 _exit(EXIT_FAILURE);
3265 }
3266
3267 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3268 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3269 log_oom();
3270 _exit(EXIT_FAILURE);
3271 }
3272 }
3273
3274 setup_hostname();
3275
3276 if (arg_personality != 0xffffffffLU) {
3277 if (personality(arg_personality) < 0) {
3278 log_error_errno(errno, "personality() failed: %m");
3279 _exit(EXIT_FAILURE);
3280 }
3281 } else if (secondary) {
3282 if (personality(PER_LINUX32) < 0) {
3283 log_error_errno(errno, "personality() failed: %m");
3284 _exit(EXIT_FAILURE);
3285 }
3286 }
3287
3288 #ifdef HAVE_SELINUX
3289 if (arg_selinux_context)
3290 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3291 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3292 _exit(EXIT_FAILURE);
3293 }
3294 #endif
3295
3296 if (!strv_isempty(arg_setenv)) {
3297 char **n;
3298
3299 n = strv_env_merge(2, envp, arg_setenv);
3300 if (!n) {
3301 log_oom();
3302 _exit(EXIT_FAILURE);
3303 }
3304
3305 env_use = n;
3306 } else
3307 env_use = (char**) envp;
3308
3309 /* Wait until the parent is ready with the setup, too... */
3310 if (!barrier_place_and_sync(&barrier))
3311 _exit(EXIT_FAILURE);
3312
3313 if (arg_boot) {
3314 char **a;
3315 size_t l;
3316
3317 /* Automatically search for the init system */
3318
3319 l = 1 + argc - optind;
3320 a = newa(char*, l + 1);
3321 memcpy(a + 1, argv + optind, l * sizeof(char*));
3322
3323 a[0] = (char*) "/usr/lib/systemd/systemd";
3324 execve(a[0], a, env_use);
3325
3326 a[0] = (char*) "/lib/systemd/systemd";
3327 execve(a[0], a, env_use);
3328
3329 a[0] = (char*) "/sbin/init";
3330 execve(a[0], a, env_use);
3331 } else if (argc > optind)
3332 execvpe(argv[optind], argv + optind, env_use);
3333 else {
3334 chdir(home ? home : "/root");
3335 execle("/bin/bash", "-bash", NULL, env_use);
3336 execle("/bin/sh", "-sh", NULL, env_use);
3337 }
3338
3339 log_error_errno(errno, "execv() failed: %m");
3340 _exit(EXIT_FAILURE);
3341 }
3342
3343 barrier_set_role(&barrier, BARRIER_PARENT);
3344 fdset_free(fds);
3345 fds = NULL;
3346
3347 /* wait for child-setup to be done */
3348 if (barrier_place_and_sync(&barrier)) {
3349 _cleanup_event_unref_ sd_event *event = NULL;
3350 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3351 int ifi = 0;
3352
3353 r = move_network_interfaces(pid);
3354 if (r < 0)
3355 goto finish;
3356
3357 r = setup_veth(pid, veth_name, &ifi);
3358 if (r < 0)
3359 goto finish;
3360
3361 r = setup_bridge(veth_name, &ifi);
3362 if (r < 0)
3363 goto finish;
3364
3365 r = setup_macvlan(pid);
3366 if (r < 0)
3367 goto finish;
3368
3369 r = register_machine(pid, ifi);
3370 if (r < 0)
3371 goto finish;
3372
3373 /* Block SIGCHLD here, before notifying child.
3374 * process_pty() will handle it with the other signals. */
3375 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3376 if (r < 0)
3377 goto finish;
3378
3379 /* Reset signal to default */
3380 r = default_signals(SIGCHLD, -1);
3381 if (r < 0)
3382 goto finish;
3383
3384 /* Notify the child that the parent is ready with all
3385 * its setup, and that the child can now hand over
3386 * control to the code to run inside the container. */
3387 (void)barrier_place(&barrier);
3388
3389 r = sd_event_new(&event);
3390 if (r < 0) {
3391 log_error_errno(r, "Failed to get default event source: %m");
3392 goto finish;
3393 }
3394
3395 if (arg_boot) {
3396 /* Try to kill the init system on SIGINT or SIGTERM */
3397 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3398 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3399 } else {
3400 /* Immediately exit */
3401 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3402 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3403 }
3404
3405 /* simply exit on sigchld */
3406 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3407
3408 r = pty_forward_new(event, master, &forward);
3409 if (r < 0) {
3410 log_error_errno(r, "Failed to create PTY forwarder: %m");
3411 goto finish;
3412 }
3413
3414 r = sd_event_loop(event);
3415 if (r < 0)
3416 return log_error_errno(r, "Failed to run event loop: %m");
3417
3418 forward = pty_forward_free(forward);
3419
3420 if (!arg_quiet)
3421 putc('\n', stdout);
3422
3423 /* Kill if it is not dead yet anyway */
3424 terminate_machine(pid);
3425 }
3426
3427 /* Normally redundant, but better safe than sorry */
3428 kill(pid, SIGKILL);
3429
3430 r = wait_for_container(pid, &container_status);
3431 pid = 0;
3432
3433 if (r < 0) {
3434 /* We failed to wait for the container, or the
3435 * container exited abnormally */
3436 r = EXIT_FAILURE;
3437 break;
3438 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3439 /* The container exited with a non-zero
3440 * status, or with zero status and no reboot
3441 * was requested. */
3442 break;
3443
3444 /* CONTAINER_REBOOTED, loop again */
3445
3446 if (arg_keep_unit) {
3447 /* Special handling if we are running as a
3448 * service: instead of simply restarting the
3449 * machine we want to restart the entire
3450 * service, so let's inform systemd about this
3451 * with the special exit code 133. The service
3452 * file uses RestartForceExitStatus=133 so
3453 * that this results in a full nspawn
3454 * restart. This is necessary since we might
3455 * have cgroup parameters set we want to have
3456 * flushed out. */
3457 r = 133;
3458 break;
3459 }
3460 }
3461
3462 finish:
3463 sd_notify(false,
3464 "STOPPING=1\n"
3465 "STATUS=Terminating...");
3466
3467 loop_remove(loop_nr, &image_fd);
3468
3469 if (pid > 0)
3470 kill(pid, SIGKILL);
3471
3472 free(arg_directory);
3473 free(arg_machine);
3474 free(arg_user);
3475 strv_free(arg_setenv);
3476 strv_free(arg_network_interfaces);
3477 strv_free(arg_network_macvlan);
3478 strv_free(arg_bind);
3479 strv_free(arg_bind_ro);
3480 strv_free(arg_tmpfs);
3481
3482 return r;
3483 }