]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
5af89c9b3238560e064d51fff46795ddb2a8e4bd
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92
93 #ifdef HAVE_SECCOMP
94 #include "seccomp-util.h"
95 #endif
96
/* How a container run came to an end.
 * NOTE(review): the code consuming these values is outside this part of the
 * file; the descriptions below are taken from the enumerator names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,       /* presumably: container exited for good */
        CONTAINER_REBOOTED   = 1        /* presumably: container asked to be restarted */
} ContainerStatus;
101
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,         /* --link-journal=no */
        LINK_AUTO  = 1,         /* --link-journal=auto (the default) */
        LINK_HOST  = 2,         /* --link-journal=host */
        LINK_GUEST = 3          /* --link-journal=guest */
} LinkJournal;
108
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,     /* default, or --volatile=<false> */
        VOLATILE_YES   = 1,     /* --volatile without argument, or --volatile=<true> */
        VOLATILE_STATE = 2      /* --volatile=state */
} Volatile;
114
115 static char *arg_directory = NULL;
116 static char *arg_user = NULL;
117 static sd_id128_t arg_uuid = {};
118 static char *arg_machine = NULL;
119 static const char *arg_selinux_context = NULL;
120 static const char *arg_selinux_apifs_context = NULL;
121 static const char *arg_slice = NULL;
122 static bool arg_private_network = false;
123 static bool arg_read_only = false;
124 static bool arg_boot = false;
125 static LinkJournal arg_link_journal = LINK_AUTO;
126 static uint64_t arg_retain =
127 (1ULL << CAP_CHOWN) |
128 (1ULL << CAP_DAC_OVERRIDE) |
129 (1ULL << CAP_DAC_READ_SEARCH) |
130 (1ULL << CAP_FOWNER) |
131 (1ULL << CAP_FSETID) |
132 (1ULL << CAP_IPC_OWNER) |
133 (1ULL << CAP_KILL) |
134 (1ULL << CAP_LEASE) |
135 (1ULL << CAP_LINUX_IMMUTABLE) |
136 (1ULL << CAP_NET_BIND_SERVICE) |
137 (1ULL << CAP_NET_BROADCAST) |
138 (1ULL << CAP_NET_RAW) |
139 (1ULL << CAP_SETGID) |
140 (1ULL << CAP_SETFCAP) |
141 (1ULL << CAP_SETPCAP) |
142 (1ULL << CAP_SETUID) |
143 (1ULL << CAP_SYS_ADMIN) |
144 (1ULL << CAP_SYS_CHROOT) |
145 (1ULL << CAP_SYS_NICE) |
146 (1ULL << CAP_SYS_PTRACE) |
147 (1ULL << CAP_SYS_TTY_CONFIG) |
148 (1ULL << CAP_SYS_RESOURCE) |
149 (1ULL << CAP_SYS_BOOT) |
150 (1ULL << CAP_AUDIT_WRITE) |
151 (1ULL << CAP_AUDIT_CONTROL) |
152 (1ULL << CAP_MKNOD);
153 static char **arg_bind = NULL;
154 static char **arg_bind_ro = NULL;
155 static char **arg_tmpfs = NULL;
156 static char **arg_setenv = NULL;
157 static bool arg_quiet = false;
158 static bool arg_share_system = false;
159 static bool arg_register = true;
160 static bool arg_keep_unit = false;
161 static char **arg_network_interfaces = NULL;
162 static char **arg_network_macvlan = NULL;
163 static bool arg_network_veth = false;
164 static const char *arg_network_bridge = NULL;
165 static unsigned long arg_personality = 0xffffffffLU;
166 static const char *arg_image = NULL;
167 static Volatile arg_volatile = VOLATILE_NO;
168
169 static void help(void) {
170 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
171 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
172 " -h --help Show this help\n"
173 " --version Print version string\n"
174 " -q --quiet Do not show status information\n"
175 " -D --directory=PATH Root directory for the container\n"
176 " -i --image=PATH File system device or image for the container\n"
177 " -b --boot Boot up full system (i.e. invoke init)\n"
178 " -u --user=USER Run the command under specified user or uid\n"
179 " -M --machine=NAME Set the machine name for the container\n"
180 " --uuid=UUID Set a specific machine UUID for the container\n"
181 " -S --slice=SLICE Place the container in the specified slice\n"
182 " --private-network Disable network in container\n"
183 " --network-interface=INTERFACE\n"
184 " Assign an existing network interface to the\n"
185 " container\n"
186 " --network-macvlan=INTERFACE\n"
187 " Create a macvlan network interface based on an\n"
188 " existing network interface to the container\n"
189 " --network-veth Add a virtual ethernet connection between host\n"
190 " and container\n"
191 " --network-bridge=INTERFACE\n"
192 " Add a virtual ethernet connection between host\n"
193 " and container and add it to an existing bridge on\n"
194 " the host\n"
195 " -Z --selinux-context=SECLABEL\n"
196 " Set the SELinux security context to be used by\n"
197 " processes in the container\n"
198 " -L --selinux-apifs-context=SECLABEL\n"
199 " Set the SELinux security context to be used by\n"
200 " API/tmpfs file systems in the container\n"
201 " --capability=CAP In addition to the default, retain specified\n"
202 " capability\n"
203 " --drop-capability=CAP Drop the specified capability from the default set\n"
204 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
205 " -j Equivalent to --link-journal=host\n"
206 " --read-only Mount the root directory read-only\n"
207 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
208 " the container\n"
209 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
210 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
211 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
212 " --share-system Share system namespaces with host\n"
213 " --register=BOOLEAN Register container as machine\n"
214 " --keep-unit Do not register a scope for the machine, reuse\n"
215 " the service unit nspawn is running in\n"
216 " --volatile[=MODE] Run the system in volatile mode\n",
217 program_invocation_short_name);
218 }
219
220 static int parse_argv(int argc, char *argv[]) {
221
222 enum {
223 ARG_VERSION = 0x100,
224 ARG_PRIVATE_NETWORK,
225 ARG_UUID,
226 ARG_READ_ONLY,
227 ARG_CAPABILITY,
228 ARG_DROP_CAPABILITY,
229 ARG_LINK_JOURNAL,
230 ARG_BIND,
231 ARG_BIND_RO,
232 ARG_TMPFS,
233 ARG_SETENV,
234 ARG_SHARE_SYSTEM,
235 ARG_REGISTER,
236 ARG_KEEP_UNIT,
237 ARG_NETWORK_INTERFACE,
238 ARG_NETWORK_MACVLAN,
239 ARG_NETWORK_VETH,
240 ARG_NETWORK_BRIDGE,
241 ARG_PERSONALITY,
242 ARG_VOLATILE,
243 };
244
245 static const struct option options[] = {
246 { "help", no_argument, NULL, 'h' },
247 { "version", no_argument, NULL, ARG_VERSION },
248 { "directory", required_argument, NULL, 'D' },
249 { "user", required_argument, NULL, 'u' },
250 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
251 { "boot", no_argument, NULL, 'b' },
252 { "uuid", required_argument, NULL, ARG_UUID },
253 { "read-only", no_argument, NULL, ARG_READ_ONLY },
254 { "capability", required_argument, NULL, ARG_CAPABILITY },
255 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
256 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
257 { "bind", required_argument, NULL, ARG_BIND },
258 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
259 { "tmpfs", required_argument, NULL, ARG_TMPFS },
260 { "machine", required_argument, NULL, 'M' },
261 { "slice", required_argument, NULL, 'S' },
262 { "setenv", required_argument, NULL, ARG_SETENV },
263 { "selinux-context", required_argument, NULL, 'Z' },
264 { "selinux-apifs-context", required_argument, NULL, 'L' },
265 { "quiet", no_argument, NULL, 'q' },
266 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
267 { "register", required_argument, NULL, ARG_REGISTER },
268 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
269 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
270 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
271 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
272 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
273 { "personality", required_argument, NULL, ARG_PERSONALITY },
274 { "image", required_argument, NULL, 'i' },
275 { "volatile", optional_argument, NULL, ARG_VOLATILE },
276 {}
277 };
278
279 int c, r;
280 uint64_t plus = 0, minus = 0;
281
282 assert(argc >= 0);
283 assert(argv);
284
285 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
286
287 switch (c) {
288
289 case 'h':
290 help();
291 return 0;
292
293 case ARG_VERSION:
294 puts(PACKAGE_STRING);
295 puts(SYSTEMD_FEATURES);
296 return 0;
297
298 case 'D':
299 free(arg_directory);
300 arg_directory = canonicalize_file_name(optarg);
301 if (!arg_directory) {
302 log_error("Invalid root directory: %m");
303 return -ENOMEM;
304 }
305
306 break;
307
308 case 'i':
309 arg_image = optarg;
310 break;
311
312 case 'u':
313 free(arg_user);
314 arg_user = strdup(optarg);
315 if (!arg_user)
316 return log_oom();
317
318 break;
319
320 case ARG_NETWORK_BRIDGE:
321 arg_network_bridge = optarg;
322
323 /* fall through */
324
325 case ARG_NETWORK_VETH:
326 arg_network_veth = true;
327 arg_private_network = true;
328 break;
329
330 case ARG_NETWORK_INTERFACE:
331 if (strv_extend(&arg_network_interfaces, optarg) < 0)
332 return log_oom();
333
334 arg_private_network = true;
335 break;
336
337 case ARG_NETWORK_MACVLAN:
338 if (strv_extend(&arg_network_macvlan, optarg) < 0)
339 return log_oom();
340
341 /* fall through */
342
343 case ARG_PRIVATE_NETWORK:
344 arg_private_network = true;
345 break;
346
347 case 'b':
348 arg_boot = true;
349 break;
350
351 case ARG_UUID:
352 r = sd_id128_from_string(optarg, &arg_uuid);
353 if (r < 0) {
354 log_error("Invalid UUID: %s", optarg);
355 return r;
356 }
357 break;
358
359 case 'S':
360 arg_slice = optarg;
361 break;
362
363 case 'M':
364 if (isempty(optarg)) {
365 free(arg_machine);
366 arg_machine = NULL;
367 } else {
368
369 if (!hostname_is_valid(optarg)) {
370 log_error("Invalid machine name: %s", optarg);
371 return -EINVAL;
372 }
373
374 free(arg_machine);
375 arg_machine = strdup(optarg);
376 if (!arg_machine)
377 return log_oom();
378
379 break;
380 }
381
382 case 'Z':
383 arg_selinux_context = optarg;
384 break;
385
386 case 'L':
387 arg_selinux_apifs_context = optarg;
388 break;
389
390 case ARG_READ_ONLY:
391 arg_read_only = true;
392 break;
393
394 case ARG_CAPABILITY:
395 case ARG_DROP_CAPABILITY: {
396 const char *state, *word;
397 size_t length;
398
399 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
400 _cleanup_free_ char *t;
401 cap_value_t cap;
402
403 t = strndup(word, length);
404 if (!t)
405 return log_oom();
406
407 if (streq(t, "all")) {
408 if (c == ARG_CAPABILITY)
409 plus = (uint64_t) -1;
410 else
411 minus = (uint64_t) -1;
412 } else {
413 if (cap_from_name(t, &cap) < 0) {
414 log_error("Failed to parse capability %s.", t);
415 return -EINVAL;
416 }
417
418 if (c == ARG_CAPABILITY)
419 plus |= 1ULL << (uint64_t) cap;
420 else
421 minus |= 1ULL << (uint64_t) cap;
422 }
423 }
424
425 break;
426 }
427
428 case 'j':
429 arg_link_journal = LINK_GUEST;
430 break;
431
432 case ARG_LINK_JOURNAL:
433 if (streq(optarg, "auto"))
434 arg_link_journal = LINK_AUTO;
435 else if (streq(optarg, "no"))
436 arg_link_journal = LINK_NO;
437 else if (streq(optarg, "guest"))
438 arg_link_journal = LINK_GUEST;
439 else if (streq(optarg, "host"))
440 arg_link_journal = LINK_HOST;
441 else {
442 log_error("Failed to parse link journal mode %s", optarg);
443 return -EINVAL;
444 }
445
446 break;
447
448 case ARG_BIND:
449 case ARG_BIND_RO: {
450 _cleanup_free_ char *a = NULL, *b = NULL;
451 char *e;
452 char ***x;
453
454 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
455
456 e = strchr(optarg, ':');
457 if (e) {
458 a = strndup(optarg, e - optarg);
459 b = strdup(e + 1);
460 } else {
461 a = strdup(optarg);
462 b = strdup(optarg);
463 }
464
465 if (!a || !b)
466 return log_oom();
467
468 if (!path_is_absolute(a) || !path_is_absolute(b)) {
469 log_error("Invalid bind mount specification: %s", optarg);
470 return -EINVAL;
471 }
472
473 r = strv_extend(x, a);
474 if (r < 0)
475 return log_oom();
476
477 r = strv_extend(x, b);
478 if (r < 0)
479 return log_oom();
480
481 break;
482 }
483
484 case ARG_TMPFS: {
485 _cleanup_free_ char *a = NULL, *b = NULL;
486 char *e;
487
488 e = strchr(optarg, ':');
489 if (e) {
490 a = strndup(optarg, e - optarg);
491 b = strdup(e + 1);
492 } else {
493 a = strdup(optarg);
494 b = strdup("mode=0755");
495 }
496
497 if (!a || !b)
498 return log_oom();
499
500 if (!path_is_absolute(a)) {
501 log_error("Invalid tmpfs specification: %s", optarg);
502 return -EINVAL;
503 }
504
505 r = strv_push(&arg_tmpfs, a);
506 if (r < 0)
507 return log_oom();
508
509 a = NULL;
510
511 r = strv_push(&arg_tmpfs, b);
512 if (r < 0)
513 return log_oom();
514
515 b = NULL;
516
517 break;
518 }
519
520 case ARG_SETENV: {
521 char **n;
522
523 if (!env_assignment_is_valid(optarg)) {
524 log_error("Environment variable assignment '%s' is not valid.", optarg);
525 return -EINVAL;
526 }
527
528 n = strv_env_set(arg_setenv, optarg);
529 if (!n)
530 return log_oom();
531
532 strv_free(arg_setenv);
533 arg_setenv = n;
534 break;
535 }
536
537 case 'q':
538 arg_quiet = true;
539 break;
540
541 case ARG_SHARE_SYSTEM:
542 arg_share_system = true;
543 break;
544
545 case ARG_REGISTER:
546 r = parse_boolean(optarg);
547 if (r < 0) {
548 log_error("Failed to parse --register= argument: %s", optarg);
549 return r;
550 }
551
552 arg_register = r;
553 break;
554
555 case ARG_KEEP_UNIT:
556 arg_keep_unit = true;
557 break;
558
559 case ARG_PERSONALITY:
560
561 arg_personality = personality_from_string(optarg);
562 if (arg_personality == 0xffffffffLU) {
563 log_error("Unknown or unsupported personality '%s'.", optarg);
564 return -EINVAL;
565 }
566
567 break;
568
569 case ARG_VOLATILE:
570
571 if (!optarg)
572 arg_volatile = VOLATILE_YES;
573 else {
574 r = parse_boolean(optarg);
575 if (r < 0) {
576 if (streq(optarg, "state"))
577 arg_volatile = VOLATILE_STATE;
578 else {
579 log_error("Failed to parse --volatile= argument: %s", optarg);
580 return r;
581 }
582 } else
583 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
584 }
585
586 break;
587
588 case '?':
589 return -EINVAL;
590
591 default:
592 assert_not_reached("Unhandled option");
593 }
594
595 if (arg_share_system)
596 arg_register = false;
597
598 if (arg_boot && arg_share_system) {
599 log_error("--boot and --share-system may not be combined.");
600 return -EINVAL;
601 }
602
603 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
604 log_error("--keep-unit may not be used when invoked from a user session.");
605 return -EINVAL;
606 }
607
608 if (arg_directory && arg_image) {
609 log_error("--directory= and --image= may not be combined.");
610 return -EINVAL;
611 }
612
613 if (arg_volatile != VOLATILE_NO && arg_read_only) {
614 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
615 return -EINVAL;
616 }
617
618 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
619
620 return 1;
621 }
622
623 static int mount_all(const char *dest) {
624
625 typedef struct MountPoint {
626 const char *what;
627 const char *where;
628 const char *type;
629 const char *options;
630 unsigned long flags;
631 bool fatal;
632 } MountPoint;
633
634 static const MountPoint mount_table[] = {
635 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
636 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
637 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
638 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
639 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
640 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
641 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
642 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
643 #ifdef HAVE_SELINUX
644 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
645 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
646 #endif
647 };
648
649 unsigned k;
650 int r = 0;
651
652 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
653 _cleanup_free_ char *where = NULL;
654 #ifdef HAVE_SELINUX
655 _cleanup_free_ char *options = NULL;
656 #endif
657 const char *o;
658 int t;
659
660 where = strjoin(dest, "/", mount_table[k].where, NULL);
661 if (!where)
662 return log_oom();
663
664 t = path_is_mount_point(where, true);
665 if (t < 0) {
666 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
667
668 if (r == 0)
669 r = t;
670
671 continue;
672 }
673
674 /* Skip this entry if it is not a remount. */
675 if (mount_table[k].what && t > 0)
676 continue;
677
678 mkdir_p(where, 0755);
679
680 #ifdef HAVE_SELINUX
681 if (arg_selinux_apifs_context &&
682 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
683 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
684 if (!options)
685 return log_oom();
686
687 o = options;
688 } else
689 #endif
690 o = mount_table[k].options;
691
692
693 if (mount(mount_table[k].what,
694 where,
695 mount_table[k].type,
696 mount_table[k].flags,
697 o) < 0 &&
698 mount_table[k].fatal) {
699
700 log_error("mount(%s) failed: %m", where);
701
702 if (r == 0)
703 r = -errno;
704 }
705 }
706
707 return r;
708 }
709
710 static int mount_binds(const char *dest, char **l, bool ro) {
711 char **x, **y;
712
713 STRV_FOREACH_PAIR(x, y, l) {
714 _cleanup_free_ char *where = NULL;
715 struct stat source_st, dest_st;
716 int r;
717
718 if (stat(*x, &source_st) < 0) {
719 log_error("Failed to stat %s: %m", *x);
720 return -errno;
721 }
722
723 where = strappend(dest, *y);
724 if (!where)
725 return log_oom();
726
727 r = stat(where, &dest_st);
728 if (r == 0) {
729 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
730 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
731 return -EINVAL;
732 }
733 } else if (errno == ENOENT) {
734 r = mkdir_parents_label(where, 0755);
735 if (r < 0) {
736 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
737 return r;
738 }
739 } else {
740 log_error("Failed to bind mount %s: %m", *x);
741 return -errno;
742 }
743
744 /* Create the mount point, but be conservative -- refuse to create block
745 * and char devices. */
746 if (S_ISDIR(source_st.st_mode))
747 mkdir_label(where, 0755);
748 else if (S_ISFIFO(source_st.st_mode))
749 mkfifo(where, 0644);
750 else if (S_ISSOCK(source_st.st_mode))
751 mknod(where, 0644 | S_IFSOCK, 0);
752 else if (S_ISREG(source_st.st_mode))
753 touch(where);
754 else {
755 log_error("Refusing to create mountpoint for file: %s", *x);
756 return -ENOTSUP;
757 }
758
759 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
760 log_error("mount(%s) failed: %m", where);
761 return -errno;
762 }
763
764 if (ro) {
765 r = bind_remount_recursive(where, true);
766 if (r < 0) {
767 log_error("Read-Only bind mount failed: %s", strerror(-r));
768 return r;
769 }
770 }
771 }
772
773 return 0;
774 }
775
776 static int mount_tmpfs(const char *dest) {
777 char **i, **o;
778
779 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
780 _cleanup_free_ char *where = NULL;
781
782 where = strappend(dest, *i);
783 if (!where)
784 return log_oom();
785
786 mkdir_label(where, 0755);
787
788 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
789 log_error("tmpfs mount to %s failed: %m", where);
790 return -errno;
791 }
792 }
793
794 return 0;
795 }
796
797 static int setup_timezone(const char *dest) {
798 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
799 char *z, *y;
800 int r;
801
802 assert(dest);
803
804 /* Fix the timezone, if possible */
805 r = readlink_malloc("/etc/localtime", &p);
806 if (r < 0) {
807 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
808 return 0;
809 }
810
811 z = path_startswith(p, "../usr/share/zoneinfo/");
812 if (!z)
813 z = path_startswith(p, "/usr/share/zoneinfo/");
814 if (!z) {
815 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
816 return 0;
817 }
818
819 where = strappend(dest, "/etc/localtime");
820 if (!where)
821 return log_oom();
822
823 r = readlink_malloc(where, &q);
824 if (r >= 0) {
825 y = path_startswith(q, "../usr/share/zoneinfo/");
826 if (!y)
827 y = path_startswith(q, "/usr/share/zoneinfo/");
828
829 /* Already pointing to the right place? Then do nothing .. */
830 if (y && streq(y, z))
831 return 0;
832 }
833
834 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
835 if (!check)
836 return log_oom();
837
838 if (access(check, F_OK) < 0) {
839 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
840 return 0;
841 }
842
843 what = strappend("../usr/share/zoneinfo/", z);
844 if (!what)
845 return log_oom();
846
847 mkdir_parents(where, 0755);
848 unlink(where);
849
850 if (symlink(what, where) < 0) {
851 log_error("Failed to correct timezone of container: %m");
852 return 0;
853 }
854
855 return 0;
856 }
857
858 static int setup_resolv_conf(const char *dest) {
859 _cleanup_free_ char *where = NULL;
860
861 assert(dest);
862
863 if (arg_private_network)
864 return 0;
865
866 /* Fix resolv.conf, if possible */
867 where = strappend(dest, "/etc/resolv.conf");
868 if (!where)
869 return log_oom();
870
871 /* We don't really care for the results of this really. If it
872 * fails, it fails, but meh... */
873 mkdir_parents(where, 0755);
874 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
875
876 return 0;
877 }
878
879 static int setup_volatile_state(const char *directory) {
880 const char *p;
881 int r;
882
883 assert(directory);
884
885 if (arg_volatile != VOLATILE_STATE)
886 return 0;
887
888 /* --volatile=state means we simply overmount /var
889 with a tmpfs, and the rest read-only. */
890
891 r = bind_remount_recursive(directory, true);
892 if (r < 0) {
893 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
894 return r;
895 }
896
897 p = strappenda(directory, "/var");
898 mkdir(p, 0755);
899
900 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
901 log_error("Failed to mount tmpfs to /var: %m");
902 return -errno;
903 }
904
905 return 0;
906 }
907
908 static int setup_volatile(const char *directory) {
909 bool tmpfs_mounted = false, bind_mounted = false;
910 char template[] = "/tmp/nspawn-volatile-XXXXXX";
911 const char *f, *t;
912 int r;
913
914 assert(directory);
915
916 if (arg_volatile != VOLATILE_YES)
917 return 0;
918
919 /* --volatile=yes means we mount a tmpfs to the root dir, and
920 the original /usr to use inside it, and that read-only. */
921
922 if (!mkdtemp(template)) {
923 log_error("Failed to create temporary directory: %m");
924 return -errno;
925 }
926
927 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
928 log_error("Failed to mount tmpfs for root directory: %m");
929 r = -errno;
930 goto fail;
931 }
932
933 tmpfs_mounted = true;
934
935 f = strappenda(directory, "/usr");
936 t = strappenda(template, "/usr");
937
938 mkdir(t, 0755);
939 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
940 log_error("Failed to create /usr bind mount: %m");
941 r = -errno;
942 goto fail;
943 }
944
945 bind_mounted = true;
946
947 r = bind_remount_recursive(t, true);
948 if (r < 0) {
949 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
950 goto fail;
951 }
952
953 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
954 log_error("Failed to move root mount: %m");
955 r = -errno;
956 goto fail;
957 }
958
959 rmdir(template);
960
961 return 0;
962
963 fail:
964 if (bind_mounted)
965 umount(t);
966 if (tmpfs_mounted)
967 umount(template);
968 rmdir(template);
969 return r;
970 }
971
972 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
973
974 snprintf(s, 37,
975 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
976 SD_ID128_FORMAT_VAL(id));
977
978 return s;
979 }
980
981 static int setup_boot_id(const char *dest) {
982 _cleanup_free_ char *from = NULL, *to = NULL;
983 sd_id128_t rnd = {};
984 char as_uuid[37];
985 int r;
986
987 assert(dest);
988
989 if (arg_share_system)
990 return 0;
991
992 /* Generate a new randomized boot ID, so that each boot-up of
993 * the container gets a new one */
994
995 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
996 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
997 if (!from || !to)
998 return log_oom();
999
1000 r = sd_id128_randomize(&rnd);
1001 if (r < 0) {
1002 log_error("Failed to generate random boot id: %s", strerror(-r));
1003 return r;
1004 }
1005
1006 id128_format_as_uuid(rnd, as_uuid);
1007
1008 r = write_string_file(from, as_uuid);
1009 if (r < 0) {
1010 log_error("Failed to write boot id: %s", strerror(-r));
1011 return r;
1012 }
1013
1014 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1015 log_error("Failed to bind mount boot id: %m");
1016 r = -errno;
1017 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1018 log_warning("Failed to make boot id read-only: %m");
1019
1020 unlink(from);
1021 return r;
1022 }
1023
1024 static int copy_devnodes(const char *dest) {
1025
1026 static const char devnodes[] =
1027 "null\0"
1028 "zero\0"
1029 "full\0"
1030 "random\0"
1031 "urandom\0"
1032 "tty\0";
1033
1034 const char *d;
1035 int r = 0;
1036 _cleanup_umask_ mode_t u;
1037
1038 assert(dest);
1039
1040 u = umask(0000);
1041
1042 NULSTR_FOREACH(d, devnodes) {
1043 _cleanup_free_ char *from = NULL, *to = NULL;
1044 struct stat st;
1045
1046 from = strappend("/dev/", d);
1047 to = strjoin(dest, "/dev/", d, NULL);
1048 if (!from || !to)
1049 return log_oom();
1050
1051 if (stat(from, &st) < 0) {
1052
1053 if (errno != ENOENT) {
1054 log_error("Failed to stat %s: %m", from);
1055 return -errno;
1056 }
1057
1058 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1059
1060 log_error("%s is not a char or block device, cannot copy", from);
1061 return -EIO;
1062
1063 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1064
1065 log_error("mknod(%s) failed: %m", dest);
1066 return -errno;
1067 }
1068 }
1069
1070 return r;
1071 }
1072
1073 static int setup_ptmx(const char *dest) {
1074 _cleanup_free_ char *p = NULL;
1075
1076 p = strappend(dest, "/dev/ptmx");
1077 if (!p)
1078 return log_oom();
1079
1080 if (symlink("pts/ptmx", p) < 0) {
1081 log_error("Failed to create /dev/ptmx symlink: %m");
1082 return -errno;
1083 }
1084
1085 return 0;
1086 }
1087
1088 static int setup_dev_console(const char *dest, const char *console) {
1089 _cleanup_umask_ mode_t u;
1090 const char *to;
1091 struct stat st;
1092 int r;
1093
1094 assert(dest);
1095 assert(console);
1096
1097 u = umask(0000);
1098
1099 if (stat("/dev/null", &st) < 0) {
1100 log_error("Failed to stat /dev/null: %m");
1101 return -errno;
1102 }
1103
1104 r = chmod_and_chown(console, 0600, 0, 0);
1105 if (r < 0) {
1106 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1107 return r;
1108 }
1109
1110 /* We need to bind mount the right tty to /dev/console since
1111 * ptys can only exist on pts file systems. To have something
1112 * to bind mount things on we create a device node first, and
1113 * use /dev/null for that since we the cgroups device policy
1114 * allows us to create that freely, while we cannot create
1115 * /dev/console. (Note that the major minor doesn't actually
1116 * matter here, since we mount it over anyway). */
1117
1118 to = strappenda(dest, "/dev/console");
1119 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1120 log_error("mknod() for /dev/console failed: %m");
1121 return -errno;
1122 }
1123
1124 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1125 log_error("Bind mount for /dev/console failed: %m");
1126 return -errno;
1127 }
1128
1129 return 0;
1130 }
1131
1132 static int setup_kmsg(const char *dest, int kmsg_socket) {
1133 _cleanup_free_ char *from = NULL, *to = NULL;
1134 int r, fd, k;
1135 _cleanup_umask_ mode_t u;
1136 union {
1137 struct cmsghdr cmsghdr;
1138 uint8_t buf[CMSG_SPACE(sizeof(int))];
1139 } control = {};
1140 struct msghdr mh = {
1141 .msg_control = &control,
1142 .msg_controllen = sizeof(control),
1143 };
1144 struct cmsghdr *cmsg;
1145
1146 assert(dest);
1147 assert(kmsg_socket >= 0);
1148
1149 u = umask(0000);
1150
1151 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1152 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1153 * on the reading side behave very similar to /proc/kmsg,
1154 * their writing side behaves differently from /dev/kmsg in
1155 * that writing blocks when nothing is reading. In order to
1156 * avoid any problems with containers deadlocking due to this
1157 * we simply make /dev/kmsg unavailable to the container. */
1158 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1159 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1160 return log_oom();
1161
1162 if (mkfifo(from, 0600) < 0) {
1163 log_error("mkfifo() for /dev/kmsg failed: %m");
1164 return -errno;
1165 }
1166
1167 r = chmod_and_chown(from, 0600, 0, 0);
1168 if (r < 0) {
1169 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1170 return r;
1171 }
1172
1173 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1174 log_error("Bind mount for /proc/kmsg failed: %m");
1175 return -errno;
1176 }
1177
1178 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1179 if (fd < 0) {
1180 log_error("Failed to open fifo: %m");
1181 return -errno;
1182 }
1183
1184 cmsg = CMSG_FIRSTHDR(&mh);
1185 cmsg->cmsg_level = SOL_SOCKET;
1186 cmsg->cmsg_type = SCM_RIGHTS;
1187 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1188 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1189
1190 mh.msg_controllen = cmsg->cmsg_len;
1191
1192 /* Store away the fd in the socket, so that it stays open as
1193 * long as we run the child */
1194 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1195 safe_close(fd);
1196
1197 if (k < 0) {
1198 log_error("Failed to send FIFO fd: %m");
1199 return -errno;
1200 }
1201
1202 /* And now make the FIFO unavailable as /dev/kmsg... */
1203 unlink(from);
1204 return 0;
1205 }
1206
1207 static int setup_hostname(void) {
1208
1209 if (arg_share_system)
1210 return 0;
1211
1212 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1213 return -errno;
1214
1215 return 0;
1216 }
1217
1218 static int setup_journal(const char *directory) {
1219 sd_id128_t machine_id, this_id;
1220 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1221 char *id;
1222 int r;
1223
1224 p = strappend(directory, "/etc/machine-id");
1225 if (!p)
1226 return log_oom();
1227
1228 r = read_one_line_file(p, &b);
1229 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1230 return 0;
1231 else if (r < 0) {
1232 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1233 return r;
1234 }
1235
1236 id = strstrip(b);
1237 if (isempty(id) && arg_link_journal == LINK_AUTO)
1238 return 0;
1239
1240 /* Verify validity */
1241 r = sd_id128_from_string(id, &machine_id);
1242 if (r < 0) {
1243 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1244 return r;
1245 }
1246
1247 r = sd_id128_get_machine(&this_id);
1248 if (r < 0) {
1249 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1250 return r;
1251 }
1252
1253 if (sd_id128_equal(machine_id, this_id)) {
1254 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1255 "Host and machine ids are equal (%s): refusing to link journals", id);
1256 if (arg_link_journal == LINK_AUTO)
1257 return 0;
1258 return
1259 -EEXIST;
1260 }
1261
1262 if (arg_link_journal == LINK_NO)
1263 return 0;
1264
1265 free(p);
1266 p = strappend("/var/log/journal/", id);
1267 q = strjoin(directory, "/var/log/journal/", id, NULL);
1268 if (!p || !q)
1269 return log_oom();
1270
1271 if (path_is_mount_point(p, false) > 0) {
1272 if (arg_link_journal != LINK_AUTO) {
1273 log_error("%s: already a mount point, refusing to use for journal", p);
1274 return -EEXIST;
1275 }
1276
1277 return 0;
1278 }
1279
1280 if (path_is_mount_point(q, false) > 0) {
1281 if (arg_link_journal != LINK_AUTO) {
1282 log_error("%s: already a mount point, refusing to use for journal", q);
1283 return -EEXIST;
1284 }
1285
1286 return 0;
1287 }
1288
1289 r = readlink_and_make_absolute(p, &d);
1290 if (r >= 0) {
1291 if ((arg_link_journal == LINK_GUEST ||
1292 arg_link_journal == LINK_AUTO) &&
1293 path_equal(d, q)) {
1294
1295 r = mkdir_p(q, 0755);
1296 if (r < 0)
1297 log_warning("failed to create directory %s: %m", q);
1298 return 0;
1299 }
1300
1301 if (unlink(p) < 0) {
1302 log_error("Failed to remove symlink %s: %m", p);
1303 return -errno;
1304 }
1305 } else if (r == -EINVAL) {
1306
1307 if (arg_link_journal == LINK_GUEST &&
1308 rmdir(p) < 0) {
1309
1310 if (errno == ENOTDIR) {
1311 log_error("%s already exists and is neither a symlink nor a directory", p);
1312 return r;
1313 } else {
1314 log_error("Failed to remove %s: %m", p);
1315 return -errno;
1316 }
1317 }
1318 } else if (r != -ENOENT) {
1319 log_error("readlink(%s) failed: %m", p);
1320 return r;
1321 }
1322
1323 if (arg_link_journal == LINK_GUEST) {
1324
1325 if (symlink(q, p) < 0) {
1326 log_error("Failed to symlink %s to %s: %m", q, p);
1327 return -errno;
1328 }
1329
1330 r = mkdir_p(q, 0755);
1331 if (r < 0)
1332 log_warning("failed to create directory %s: %m", q);
1333 return 0;
1334 }
1335
1336 if (arg_link_journal == LINK_HOST) {
1337 r = mkdir_p(p, 0755);
1338 if (r < 0) {
1339 log_error("Failed to create %s: %m", p);
1340 return r;
1341 }
1342
1343 } else if (access(p, F_OK) < 0)
1344 return 0;
1345
1346 if (dir_is_empty(q) == 0)
1347 log_warning("%s is not empty, proceeding anyway.", q);
1348
1349 r = mkdir_p(q, 0755);
1350 if (r < 0) {
1351 log_error("Failed to create %s: %m", q);
1352 return r;
1353 }
1354
1355 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1356 log_error("Failed to bind mount journal from host into guest: %m");
1357 return -errno;
1358 }
1359
1360 return 0;
1361 }
1362
1363 static int setup_kdbus(const char *dest, const char *path) {
1364 const char *p;
1365
1366 if (!path)
1367 return 0;
1368
1369 p = strappenda(dest, "/dev/kdbus");
1370 if (mkdir(p, 0755) < 0) {
1371 log_error("Failed to create kdbus path: %m");
1372 return -errno;
1373 }
1374
1375 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1376 log_error("Failed to mount kdbus domain path: %m");
1377 return -errno;
1378 }
1379
1380 return 0;
1381 }
1382
1383 static int drop_capabilities(void) {
1384 return capability_bounding_set_drop(~arg_retain, false);
1385 }
1386
1387 static int register_machine(pid_t pid, int local_ifindex) {
1388 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1389 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1390 int r;
1391
1392 if (!arg_register)
1393 return 0;
1394
1395 r = sd_bus_default_system(&bus);
1396 if (r < 0) {
1397 log_error("Failed to open system bus: %s", strerror(-r));
1398 return r;
1399 }
1400
1401 if (arg_keep_unit) {
1402 r = sd_bus_call_method(
1403 bus,
1404 "org.freedesktop.machine1",
1405 "/org/freedesktop/machine1",
1406 "org.freedesktop.machine1.Manager",
1407 "RegisterMachineWithNetwork",
1408 &error,
1409 NULL,
1410 "sayssusai",
1411 arg_machine,
1412 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1413 "nspawn",
1414 "container",
1415 (uint32_t) pid,
1416 strempty(arg_directory),
1417 local_ifindex > 0 ? 1 : 0, local_ifindex);
1418 } else {
1419 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1420
1421 r = sd_bus_message_new_method_call(
1422 bus,
1423 &m,
1424 "org.freedesktop.machine1",
1425 "/org/freedesktop/machine1",
1426 "org.freedesktop.machine1.Manager",
1427 "CreateMachineWithNetwork");
1428 if (r < 0) {
1429 log_error("Failed to create message: %s", strerror(-r));
1430 return r;
1431 }
1432
1433 r = sd_bus_message_append(
1434 m,
1435 "sayssusai",
1436 arg_machine,
1437 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1438 "nspawn",
1439 "container",
1440 (uint32_t) pid,
1441 strempty(arg_directory),
1442 local_ifindex > 0 ? 1 : 0, local_ifindex);
1443 if (r < 0) {
1444 log_error("Failed to append message arguments: %s", strerror(-r));
1445 return r;
1446 }
1447
1448 r = sd_bus_message_open_container(m, 'a', "(sv)");
1449 if (r < 0) {
1450 log_error("Failed to open container: %s", strerror(-r));
1451 return r;
1452 }
1453
1454 if (!isempty(arg_slice)) {
1455 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1456 if (r < 0) {
1457 log_error("Failed to append slice: %s", strerror(-r));
1458 return r;
1459 }
1460 }
1461
1462 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1463 if (r < 0) {
1464 log_error("Failed to add device policy: %s", strerror(-r));
1465 return r;
1466 }
1467
1468 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1469 /* Allow the container to
1470 * access and create the API
1471 * device nodes, so that
1472 * PrivateDevices= in the
1473 * container can work
1474 * fine */
1475 "/dev/null", "rwm",
1476 "/dev/zero", "rwm",
1477 "/dev/full", "rwm",
1478 "/dev/random", "rwm",
1479 "/dev/urandom", "rwm",
1480 "/dev/tty", "rwm",
1481 /* Allow the container
1482 * access to ptys. However,
1483 * do not permit the
1484 * container to ever create
1485 * these device nodes. */
1486 "/dev/pts/ptmx", "rw",
1487 "char-pts", "rw",
1488 /* Allow the container
1489 * access to all kdbus
1490 * devices. Again, the
1491 * container cannot create
1492 * these nodes, only use
1493 * them. We use a pretty
1494 * open match here, so that
1495 * the kernel API can still
1496 * change. */
1497 "char-kdbus", "rw",
1498 "char-kdbus/*", "rw");
1499 if (r < 0) {
1500 log_error("Failed to add device whitelist: %s", strerror(-r));
1501 return r;
1502 }
1503
1504 r = sd_bus_message_close_container(m);
1505 if (r < 0) {
1506 log_error("Failed to close container: %s", strerror(-r));
1507 return r;
1508 }
1509
1510 r = sd_bus_call(bus, m, 0, &error, NULL);
1511 }
1512
1513 if (r < 0) {
1514 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1515 return r;
1516 }
1517
1518 return 0;
1519 }
1520
1521 static int terminate_machine(pid_t pid) {
1522 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1523 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1524 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1525 const char *path;
1526 int r;
1527
1528 if (!arg_register)
1529 return 0;
1530
1531 r = sd_bus_default_system(&bus);
1532 if (r < 0) {
1533 log_error("Failed to open system bus: %s", strerror(-r));
1534 return r;
1535 }
1536
1537 r = sd_bus_call_method(
1538 bus,
1539 "org.freedesktop.machine1",
1540 "/org/freedesktop/machine1",
1541 "org.freedesktop.machine1.Manager",
1542 "GetMachineByPID",
1543 &error,
1544 &reply,
1545 "u",
1546 (uint32_t) pid);
1547 if (r < 0) {
1548 /* Note that the machine might already have been
1549 * cleaned up automatically, hence don't consider it a
1550 * failure if we cannot get the machine object. */
1551 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1552 return 0;
1553 }
1554
1555 r = sd_bus_message_read(reply, "o", &path);
1556 if (r < 0)
1557 return bus_log_parse_error(r);
1558
1559 r = sd_bus_call_method(
1560 bus,
1561 "org.freedesktop.machine1",
1562 path,
1563 "org.freedesktop.machine1.Machine",
1564 "Terminate",
1565 &error,
1566 NULL,
1567 NULL);
1568 if (r < 0) {
1569 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1570 return 0;
1571 }
1572
1573 return 0;
1574 }
1575
1576 static int reset_audit_loginuid(void) {
1577 _cleanup_free_ char *p = NULL;
1578 int r;
1579
1580 if (arg_share_system)
1581 return 0;
1582
1583 r = read_one_line_file("/proc/self/loginuid", &p);
1584 if (r == -ENOENT)
1585 return 0;
1586 if (r < 0) {
1587 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1588 return r;
1589 }
1590
1591 /* Already reset? */
1592 if (streq(p, "4294967295"))
1593 return 0;
1594
1595 r = write_string_file("/proc/self/loginuid", "4294967295");
1596 if (r < 0) {
1597 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1598 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1599 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1600 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1601 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1602
1603 sleep(5);
1604 }
1605
1606 return 0;
1607 }
1608
1609 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1610 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1611
1612 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1613 int r;
1614
1615 uint8_t result[8];
1616 size_t l, sz;
1617 uint8_t *v;
1618
1619 l = strlen(arg_machine);
1620 sz = sizeof(sd_id128_t) + l;
1621 v = alloca(sz);
1622
1623 /* fetch some persistent data unique to the host */
1624 r = sd_id128_get_machine((sd_id128_t*) v);
1625 if (r < 0)
1626 return r;
1627
1628 /* combine with some data unique (on this host) to this
1629 * container instance */
1630 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1631
1632 /* Let's hash the host machine ID plus the container name. We
1633 * use a fixed, but originally randomly created hash key here. */
1634 siphash24(result, v, sz, hash_key.bytes);
1635
1636 assert_cc(ETH_ALEN <= sizeof(result));
1637 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1638
1639 /* see eth_random_addr in the kernel */
1640 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1641 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1642
1643 return 0;
1644 }
1645
1646 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1647 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1648 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1649 struct ether_addr mac_host, mac_container;
1650 int r, i;
1651
1652 if (!arg_private_network)
1653 return 0;
1654
1655 if (!arg_network_veth)
1656 return 0;
1657
1658 /* Use two different interface name prefixes depending whether
1659 * we are in bridge mode or not. */
1660 snprintf(iface_name, IFNAMSIZ, "%s-%s",
1661 arg_network_bridge ? "vb" : "ve", arg_machine);
1662
1663 r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1664 if (r < 0) {
1665 log_error("Failed to generate predictable MAC address for container side");
1666 return r;
1667 }
1668
1669 r = generate_mac(&mac_host, HOST_HASH_KEY);
1670 if (r < 0) {
1671 log_error("Failed to generate predictable MAC address for host side");
1672 return r;
1673 }
1674
1675 r = sd_rtnl_open(&rtnl, 0);
1676 if (r < 0) {
1677 log_error("Failed to connect to netlink: %s", strerror(-r));
1678 return r;
1679 }
1680
1681 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1682 if (r < 0) {
1683 log_error("Failed to allocate netlink message: %s", strerror(-r));
1684 return r;
1685 }
1686
1687 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1688 if (r < 0) {
1689 log_error("Failed to add netlink interface name: %s", strerror(-r));
1690 return r;
1691 }
1692
1693 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1694 if (r < 0) {
1695 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1696 return r;
1697 }
1698
1699 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1700 if (r < 0) {
1701 log_error("Failed to open netlink container: %s", strerror(-r));
1702 return r;
1703 }
1704
1705 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1706 if (r < 0) {
1707 log_error("Failed to open netlink container: %s", strerror(-r));
1708 return r;
1709 }
1710
1711 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1712 if (r < 0) {
1713 log_error("Failed to open netlink container: %s", strerror(-r));
1714 return r;
1715 }
1716
1717 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1718 if (r < 0) {
1719 log_error("Failed to add netlink interface name: %s", strerror(-r));
1720 return r;
1721 }
1722
1723 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1724 if (r < 0) {
1725 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1726 return r;
1727 }
1728
1729 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1730 if (r < 0) {
1731 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1732 return r;
1733 }
1734
1735 r = sd_rtnl_message_close_container(m);
1736 if (r < 0) {
1737 log_error("Failed to close netlink container: %s", strerror(-r));
1738 return r;
1739 }
1740
1741 r = sd_rtnl_message_close_container(m);
1742 if (r < 0) {
1743 log_error("Failed to close netlink container: %s", strerror(-r));
1744 return r;
1745 }
1746
1747 r = sd_rtnl_message_close_container(m);
1748 if (r < 0) {
1749 log_error("Failed to close netlink container: %s", strerror(-r));
1750 return r;
1751 }
1752
1753 r = sd_rtnl_call(rtnl, m, 0, NULL);
1754 if (r < 0) {
1755 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1756 return r;
1757 }
1758
1759 i = (int) if_nametoindex(iface_name);
1760 if (i <= 0) {
1761 log_error("Failed to resolve interface %s: %m", iface_name);
1762 return -errno;
1763 }
1764
1765 *ifi = i;
1766
1767 return 0;
1768 }
1769
1770 static int setup_bridge(const char veth_name[], int *ifi) {
1771 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1772 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1773 int r, bridge;
1774
1775 if (!arg_private_network)
1776 return 0;
1777
1778 if (!arg_network_veth)
1779 return 0;
1780
1781 if (!arg_network_bridge)
1782 return 0;
1783
1784 bridge = (int) if_nametoindex(arg_network_bridge);
1785 if (bridge <= 0) {
1786 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1787 return -errno;
1788 }
1789
1790 *ifi = bridge;
1791
1792 r = sd_rtnl_open(&rtnl, 0);
1793 if (r < 0) {
1794 log_error("Failed to connect to netlink: %s", strerror(-r));
1795 return r;
1796 }
1797
1798 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1799 if (r < 0) {
1800 log_error("Failed to allocate netlink message: %s", strerror(-r));
1801 return r;
1802 }
1803
1804 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1805 if (r < 0) {
1806 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1807 return r;
1808 }
1809
1810 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1811 if (r < 0) {
1812 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1813 return r;
1814 }
1815
1816 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1817 if (r < 0) {
1818 log_error("Failed to add netlink master field: %s", strerror(-r));
1819 return r;
1820 }
1821
1822 r = sd_rtnl_call(rtnl, m, 0, NULL);
1823 if (r < 0) {
1824 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1825 return r;
1826 }
1827
1828 return 0;
1829 }
1830
1831 static int parse_interface(struct udev *udev, const char *name) {
1832 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1833 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1834 int ifi;
1835
1836 ifi = (int) if_nametoindex(name);
1837 if (ifi <= 0) {
1838 log_error("Failed to resolve interface %s: %m", name);
1839 return -errno;
1840 }
1841
1842 sprintf(ifi_str, "n%i", ifi);
1843 d = udev_device_new_from_device_id(udev, ifi_str);
1844 if (!d) {
1845 log_error("Failed to get udev device for interface %s: %m", name);
1846 return -errno;
1847 }
1848
1849 if (udev_device_get_is_initialized(d) <= 0) {
1850 log_error("Network interface %s is not initialized yet.", name);
1851 return -EBUSY;
1852 }
1853
1854 return ifi;
1855 }
1856
1857 static int move_network_interfaces(pid_t pid) {
1858 _cleanup_udev_unref_ struct udev *udev = NULL;
1859 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1860 char **i;
1861 int r;
1862
1863 if (!arg_private_network)
1864 return 0;
1865
1866 if (strv_isempty(arg_network_interfaces))
1867 return 0;
1868
1869 r = sd_rtnl_open(&rtnl, 0);
1870 if (r < 0) {
1871 log_error("Failed to connect to netlink: %s", strerror(-r));
1872 return r;
1873 }
1874
1875 udev = udev_new();
1876 if (!udev) {
1877 log_error("Failed to connect to udev.");
1878 return -ENOMEM;
1879 }
1880
1881 STRV_FOREACH(i, arg_network_interfaces) {
1882 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1883 int ifi;
1884
1885 ifi = parse_interface(udev, *i);
1886 if (ifi < 0)
1887 return ifi;
1888
1889 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1890 if (r < 0) {
1891 log_error("Failed to allocate netlink message: %s", strerror(-r));
1892 return r;
1893 }
1894
1895 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1896 if (r < 0) {
1897 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1898 return r;
1899 }
1900
1901 r = sd_rtnl_call(rtnl, m, 0, NULL);
1902 if (r < 0) {
1903 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1904 return r;
1905 }
1906 }
1907
1908 return 0;
1909 }
1910
1911 static int setup_macvlan(pid_t pid) {
1912 _cleanup_udev_unref_ struct udev *udev = NULL;
1913 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1914 char **i;
1915 int r;
1916
1917 if (!arg_private_network)
1918 return 0;
1919
1920 if (strv_isempty(arg_network_macvlan))
1921 return 0;
1922
1923 r = sd_rtnl_open(&rtnl, 0);
1924 if (r < 0) {
1925 log_error("Failed to connect to netlink: %s", strerror(-r));
1926 return r;
1927 }
1928
1929 udev = udev_new();
1930 if (!udev) {
1931 log_error("Failed to connect to udev.");
1932 return -ENOMEM;
1933 }
1934
1935 STRV_FOREACH(i, arg_network_macvlan) {
1936 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1937 _cleanup_free_ char *n = NULL;
1938 int ifi;
1939
1940 ifi = parse_interface(udev, *i);
1941 if (ifi < 0)
1942 return ifi;
1943
1944 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1945 if (r < 0) {
1946 log_error("Failed to allocate netlink message: %s", strerror(-r));
1947 return r;
1948 }
1949
1950 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1951 if (r < 0) {
1952 log_error("Failed to add netlink interface index: %s", strerror(-r));
1953 return r;
1954 }
1955
1956 n = strappend("mv-", *i);
1957 if (!n)
1958 return log_oom();
1959
1960 strshorten(n, IFNAMSIZ-1);
1961
1962 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1963 if (r < 0) {
1964 log_error("Failed to add netlink interface name: %s", strerror(-r));
1965 return r;
1966 }
1967
1968 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1969 if (r < 0) {
1970 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1971 return r;
1972 }
1973
1974 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1975 if (r < 0) {
1976 log_error("Failed to open netlink container: %s", strerror(-r));
1977 return r;
1978 }
1979
1980 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1981 if (r < 0) {
1982 log_error("Failed to open netlink container: %s", strerror(-r));
1983 return r;
1984 }
1985
1986 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1987 if (r < 0) {
1988 log_error("Failed to append macvlan mode: %s", strerror(-r));
1989 return r;
1990 }
1991
1992 r = sd_rtnl_message_close_container(m);
1993 if (r < 0) {
1994 log_error("Failed to close netlink container: %s", strerror(-r));
1995 return r;
1996 }
1997
1998 r = sd_rtnl_message_close_container(m);
1999 if (r < 0) {
2000 log_error("Failed to close netlink container: %s", strerror(-r));
2001 return r;
2002 }
2003
2004 r = sd_rtnl_call(rtnl, m, 0, NULL);
2005 if (r < 0) {
2006 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
2007 return r;
2008 }
2009 }
2010
2011 return 0;
2012 }
2013
/* Installs a seccomp filter with default action "allow" that
 *
 *   - makes a fixed list of system calls fail with EPERM, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
 *     EAFNOSUPPORT.
 *
 * Compiled to a no-op returning 0 without HAVE_SECCOMP. Otherwise
 * returns the result of the last libseccomp call (negative on
 * failure); the filter context is released in either case. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[n], 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        /* Set the filter's NNP attribute to 0 before loading it */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2093
2094 static int setup_image(char **device_path, int *loop_nr) {
2095 struct loop_info64 info = {
2096 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2097 };
2098 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2099 _cleanup_free_ char* loopdev = NULL;
2100 struct stat st;
2101 int r, nr;
2102
2103 assert(device_path);
2104 assert(loop_nr);
2105
2106 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2107 if (fd < 0) {
2108 log_error("Failed to open %s: %m", arg_image);
2109 return -errno;
2110 }
2111
2112 if (fstat(fd, &st) < 0) {
2113 log_error("Failed to stat %s: %m", arg_image);
2114 return -errno;
2115 }
2116
2117 if (S_ISBLK(st.st_mode)) {
2118 char *p;
2119
2120 p = strdup(arg_image);
2121 if (!p)
2122 return log_oom();
2123
2124 *device_path = p;
2125
2126 *loop_nr = -1;
2127
2128 r = fd;
2129 fd = -1;
2130
2131 return r;
2132 }
2133
2134 if (!S_ISREG(st.st_mode)) {
2135 log_error("%s is not a regular file or block device: %m", arg_image);
2136 return -EINVAL;
2137 }
2138
2139 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2140 if (control < 0) {
2141 log_error("Failed to open /dev/loop-control: %m");
2142 return -errno;
2143 }
2144
2145 nr = ioctl(control, LOOP_CTL_GET_FREE);
2146 if (nr < 0) {
2147 log_error("Failed to allocate loop device: %m");
2148 return -errno;
2149 }
2150
2151 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2152 return log_oom();
2153
2154 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2155 if (loop < 0) {
2156 log_error("Failed to open loop device %s: %m", loopdev);
2157 return -errno;
2158 }
2159
2160 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2161 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2162 return -errno;
2163 }
2164
2165 if (arg_read_only)
2166 info.lo_flags |= LO_FLAGS_READ_ONLY;
2167
2168 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2169 log_error("Failed to set loopback settings on %s: %m", loopdev);
2170 return -errno;
2171 }
2172
2173 *device_path = loopdev;
2174 loopdev = NULL;
2175
2176 *loop_nr = nr;
2177
2178 r = loop;
2179 loop = -1;
2180
2181 return r;
2182 }
2183
/* Probes the GPT partition table of the block device open as 'fd' and
 * picks the partitions to use, by partition type ID: GPT_HOME,
 * GPT_SRV and, where defined at build time, GPT_ROOT_NATIVE and
 * GPT_ROOT_SECONDARY. Partitions flagged GPT_FLAG_NO_AUTO are ignored;
 * if several partitions of one type exist, the one with the lowest
 * partition number is used.
 *
 * On success the device node paths (newly allocated, owned by the
 * caller) are stored in *root_device, *home_device and *srv_device,
 * and the matching *_rw flags tell whether GPT_FLAG_READ_ONLY was
 * unset. home/srv outputs are only written if such a partition was
 * found. *secondary is set to true if the root partition is of the
 * secondary type.
 *
 * Returns 0 on success, a negative errno-style value on failure
 * (-EINVAL if no GPT or no root partition is found, -ENOTSUP if built
 * without HAVE_BLKID). */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        /* Step 1: let blkid read the partition table */
        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        /* Step 2: enumerate the child devices of the disk via udev */
        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        /* Step 3: match each udev device with its blkid partition
         * entry and remember the best candidate per type */
        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip devices without a device number, and the
                 * whole-disk device itself */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0 || st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME)) {

                        /* Keep the candidate with the lowest number */
                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        /* Step 4: hand the results over to the caller */
        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* The native root partition wins over the secondary one */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2433
/*
 * Probe the block device 'what' with libblkid to find out its file system
 * type and mount it.
 *
 * what:      path of the block device to mount (must not be NULL)
 * where:     base directory of the mount point (must not be NULL)
 * directory: optional suffix appended to 'where' to form the mount point,
 *            or NULL to mount directly on 'where'
 * rw:        mount writable if true; forced to false when arg_read_only is set
 *
 * Returns 0 on success and a negative errno-style value on failure. When
 * built without HAVE_BLKID this always fails with -ENOTSUP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* The global read-only switch overrides the per-device setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence reset it first and
         * treat "still zero" as an allocation failure */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing detected, -2: ambivalent result. In both
                 * cases there is no usable file system type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* -1 (or anything else unexpected): a real probing error */
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2499
/*
 * Mount the root, /home and /srv devices (each optional, NULL means "skip")
 * below the directory 'where'.
 *
 * The *_rw flags select whether the respective device is mounted writable.
 *
 * Returns 0 on success, or the negative errno-style value of the first
 * mount_device() call that failed.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Root goes first, so that /home and /srv end up on top of it */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2535
2536 static void loop_remove(int nr, int *image_fd) {
2537 _cleanup_close_ int control = -1;
2538
2539 if (nr < 0)
2540 return;
2541
2542 if (image_fd && *image_fd >= 0) {
2543 ioctl(*image_fd, LOOP_CLR_FD);
2544 *image_fd = safe_close(*image_fd);
2545 }
2546
2547 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2548 if (control < 0)
2549 return;
2550
2551 ioctl(control, LOOP_CTL_REMOVE, nr);
2552 }
2553
2554 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2555 int pipe_fds[2];
2556 pid_t pid;
2557
2558 assert(database);
2559 assert(key);
2560 assert(rpid);
2561
2562 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2563 log_error("Failed to allocate pipe: %m");
2564 return -errno;
2565 }
2566
2567 pid = fork();
2568 if (pid < 0) {
2569 log_error("Failed to fork getent child: %m");
2570 return -errno;
2571 } else if (pid == 0) {
2572 int nullfd;
2573 char *empty_env = NULL;
2574
2575 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2576 _exit(EXIT_FAILURE);
2577
2578 if (pipe_fds[0] > 2)
2579 safe_close(pipe_fds[0]);
2580 if (pipe_fds[1] > 2)
2581 safe_close(pipe_fds[1]);
2582
2583 nullfd = open("/dev/null", O_RDWR);
2584 if (nullfd < 0)
2585 _exit(EXIT_FAILURE);
2586
2587 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2588 _exit(EXIT_FAILURE);
2589
2590 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2591 _exit(EXIT_FAILURE);
2592
2593 if (nullfd > 2)
2594 safe_close(nullfd);
2595
2596 reset_all_signal_handlers();
2597 close_all_fds(NULL, 0);
2598
2599 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2600 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2601 _exit(EXIT_FAILURE);
2602 }
2603
2604 pipe_fds[1] = safe_close(pipe_fds[1]);
2605
2606 *rpid = pid;
2607
2608 return pipe_fds[0];
2609 }
2610
2611 static int change_uid_gid(char **_home) {
2612 char line[LINE_MAX], *x, *u, *g, *h;
2613 const char *word, *state;
2614 _cleanup_free_ uid_t *uids = NULL;
2615 _cleanup_free_ char *home = NULL;
2616 _cleanup_fclose_ FILE *f = NULL;
2617 _cleanup_close_ int fd = -1;
2618 unsigned n_uids = 0;
2619 size_t sz = 0, l;
2620 uid_t uid;
2621 gid_t gid;
2622 pid_t pid;
2623 int r;
2624
2625 assert(_home);
2626
2627 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2628 /* Reset everything fully to 0, just in case */
2629
2630 if (setgroups(0, NULL) < 0) {
2631 log_error("setgroups() failed: %m");
2632 return -errno;
2633 }
2634
2635 if (setresgid(0, 0, 0) < 0) {
2636 log_error("setregid() failed: %m");
2637 return -errno;
2638 }
2639
2640 if (setresuid(0, 0, 0) < 0) {
2641 log_error("setreuid() failed: %m");
2642 return -errno;
2643 }
2644
2645 *_home = NULL;
2646 return 0;
2647 }
2648
2649 /* First, get user credentials */
2650 fd = spawn_getent("passwd", arg_user, &pid);
2651 if (fd < 0)
2652 return fd;
2653
2654 f = fdopen(fd, "r");
2655 if (!f)
2656 return log_oom();
2657 fd = -1;
2658
2659 if (!fgets(line, sizeof(line), f)) {
2660
2661 if (!ferror(f)) {
2662 log_error("Failed to resolve user %s.", arg_user);
2663 return -ESRCH;
2664 }
2665
2666 log_error("Failed to read from getent: %m");
2667 return -errno;
2668 }
2669
2670 truncate_nl(line);
2671
2672 wait_for_terminate_and_warn("getent passwd", pid);
2673
2674 x = strchr(line, ':');
2675 if (!x) {
2676 log_error("/etc/passwd entry has invalid user field.");
2677 return -EIO;
2678 }
2679
2680 u = strchr(x+1, ':');
2681 if (!u) {
2682 log_error("/etc/passwd entry has invalid password field.");
2683 return -EIO;
2684 }
2685
2686 u++;
2687 g = strchr(u, ':');
2688 if (!g) {
2689 log_error("/etc/passwd entry has invalid UID field.");
2690 return -EIO;
2691 }
2692
2693 *g = 0;
2694 g++;
2695 x = strchr(g, ':');
2696 if (!x) {
2697 log_error("/etc/passwd entry has invalid GID field.");
2698 return -EIO;
2699 }
2700
2701 *x = 0;
2702 h = strchr(x+1, ':');
2703 if (!h) {
2704 log_error("/etc/passwd entry has invalid GECOS field.");
2705 return -EIO;
2706 }
2707
2708 h++;
2709 x = strchr(h, ':');
2710 if (!x) {
2711 log_error("/etc/passwd entry has invalid home directory field.");
2712 return -EIO;
2713 }
2714
2715 *x = 0;
2716
2717 r = parse_uid(u, &uid);
2718 if (r < 0) {
2719 log_error("Failed to parse UID of user.");
2720 return -EIO;
2721 }
2722
2723 r = parse_gid(g, &gid);
2724 if (r < 0) {
2725 log_error("Failed to parse GID of user.");
2726 return -EIO;
2727 }
2728
2729 home = strdup(h);
2730 if (!home)
2731 return log_oom();
2732
2733 /* Second, get group memberships */
2734 fd = spawn_getent("initgroups", arg_user, &pid);
2735 if (fd < 0)
2736 return fd;
2737
2738 fclose(f);
2739 f = fdopen(fd, "r");
2740 if (!f)
2741 return log_oom();
2742 fd = -1;
2743
2744 if (!fgets(line, sizeof(line), f)) {
2745 if (!ferror(f)) {
2746 log_error("Failed to resolve user %s.", arg_user);
2747 return -ESRCH;
2748 }
2749
2750 log_error("Failed to read from getent: %m");
2751 return -errno;
2752 }
2753
2754 truncate_nl(line);
2755
2756 wait_for_terminate_and_warn("getent initgroups", pid);
2757
2758 /* Skip over the username and subsequent separator whitespace */
2759 x = line;
2760 x += strcspn(x, WHITESPACE);
2761 x += strspn(x, WHITESPACE);
2762
2763 FOREACH_WORD(word, l, x, state) {
2764 char c[l+1];
2765
2766 memcpy(c, word, l);
2767 c[l] = 0;
2768
2769 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2770 return log_oom();
2771
2772 r = parse_uid(c, &uids[n_uids++]);
2773 if (r < 0) {
2774 log_error("Failed to parse group data from getent.");
2775 return -EIO;
2776 }
2777 }
2778
2779 r = mkdir_parents(home, 0775);
2780 if (r < 0) {
2781 log_error("Failed to make home root directory: %s", strerror(-r));
2782 return r;
2783 }
2784
2785 r = mkdir_safe(home, 0755, uid, gid);
2786 if (r < 0 && r != -EEXIST) {
2787 log_error("Failed to make home directory: %s", strerror(-r));
2788 return r;
2789 }
2790
2791 fchown(STDIN_FILENO, uid, gid);
2792 fchown(STDOUT_FILENO, uid, gid);
2793 fchown(STDERR_FILENO, uid, gid);
2794
2795 if (setgroups(n_uids, uids) < 0) {
2796 log_error("Failed to set auxiliary groups: %m");
2797 return -errno;
2798 }
2799
2800 if (setresgid(gid, gid, gid) < 0) {
2801 log_error("setregid() failed: %m");
2802 return -errno;
2803 }
2804
2805 if (setresuid(uid, uid, uid) < 0) {
2806 log_error("setreuid() failed: %m");
2807 return -errno;
2808 }
2809
2810 if (_home) {
2811 *_home = home;
2812 home = NULL;
2813 }
2814
2815 return 0;
2816 }
2817
2818 /*
2819 * Return values:
2820 * < 0 : wait_for_terminate() failed to get the state of the
2821 * container, the container was terminated by a signal, or
2822 * failed for an unknown reason. No change is made to the
2823 * container argument.
2824 * > 0 : The program executed in the container terminated with an
2825 * error. The exit code of the program executed in the
2826 * container is returned. No change is made to the container
2827 * argument.
2828 * 0 : The container is being rebooted, has been shut down or exited
2829 * successfully. The container argument has been set to either
2830 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2831 *
2832 * That is, success is indicated by a return value of zero, and an
2833 * error is indicated by a non-zero value.
2834 */
2835 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2836 int r;
2837 siginfo_t status;
2838
2839 r = wait_for_terminate(pid, &status);
2840 if (r < 0) {
2841 log_warning("Failed to wait for container: %s", strerror(-r));
2842 return r;
2843 }
2844
2845 switch (status.si_code) {
2846 case CLD_EXITED:
2847 r = status.si_status;
2848 if (r == 0) {
2849 if (!arg_quiet)
2850 log_debug("Container %s exited successfully.",
2851 arg_machine);
2852
2853 *container = CONTAINER_TERMINATED;
2854 } else {
2855 log_error("Container %s failed with error code %i.",
2856 arg_machine, status.si_status);
2857 }
2858 break;
2859
2860 case CLD_KILLED:
2861 if (status.si_status == SIGINT) {
2862 if (!arg_quiet)
2863 log_info("Container %s has been shut down.",
2864 arg_machine);
2865
2866 *container = CONTAINER_TERMINATED;
2867 r = 0;
2868 break;
2869 } else if (status.si_status == SIGHUP) {
2870 if (!arg_quiet)
2871 log_info("Container %s is being rebooted.",
2872 arg_machine);
2873
2874 *container = CONTAINER_REBOOTED;
2875 r = 0;
2876 break;
2877 }
2878 /* CLD_KILLED fallthrough */
2879
2880 case CLD_DUMPED:
2881 log_error("Container %s terminated by signal %s.",
2882 arg_machine, signal_to_string(status.si_status));
2883 r = -1;
2884 break;
2885
2886 default:
2887 log_error("Container %s failed due to unknown reason.",
2888 arg_machine);
2889 r = -1;
2890 break;
2891 }
2892
2893 return r;
2894 }
2895
/* Signal handler that intentionally does nothing; the signal number is ignored. */
static void nop_handler(int sig) {
        (void) sig;
}
2897
2898 int main(int argc, char *argv[]) {
2899
2900 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2901 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2902 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2903 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2904 _cleanup_fdset_free_ FDSet *fds = NULL;
2905 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2906 const char *console = NULL;
2907 char veth_name[IFNAMSIZ];
2908 bool secondary = false;
2909 sigset_t mask, mask_chld;
2910 pid_t pid = 0;
2911
2912 log_parse_environment();
2913 log_open();
2914
2915 k = parse_argv(argc, argv);
2916 if (k < 0)
2917 goto finish;
2918 else if (k == 0) {
2919 r = EXIT_SUCCESS;
2920 goto finish;
2921 }
2922
2923 if (!arg_image) {
2924 if (arg_directory) {
2925 char *p;
2926
2927 p = path_make_absolute_cwd(arg_directory);
2928 free(arg_directory);
2929 arg_directory = p;
2930 } else
2931 arg_directory = get_current_dir_name();
2932
2933 if (!arg_directory) {
2934 log_error("Failed to determine path, please use -D.");
2935 goto finish;
2936 }
2937 path_kill_slashes(arg_directory);
2938 }
2939
2940 if (!arg_machine) {
2941 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2942 if (!arg_machine) {
2943 log_oom();
2944 goto finish;
2945 }
2946
2947 hostname_cleanup(arg_machine, false);
2948 if (isempty(arg_machine)) {
2949 log_error("Failed to determine machine name automatically, please use -M.");
2950 goto finish;
2951 }
2952 }
2953
2954 if (geteuid() != 0) {
2955 log_error("Need to be root.");
2956 goto finish;
2957 }
2958
2959 if (sd_booted() <= 0) {
2960 log_error("Not running on a systemd system.");
2961 goto finish;
2962 }
2963
2964 log_close();
2965 n_fd_passed = sd_listen_fds(false);
2966 if (n_fd_passed > 0) {
2967 k = fdset_new_listen_fds(&fds, false);
2968 if (k < 0) {
2969 log_error("Failed to collect file descriptors: %s", strerror(-k));
2970 goto finish;
2971 }
2972 }
2973 fdset_close_others(fds);
2974 log_open();
2975
2976 if (arg_directory) {
2977 if (path_equal(arg_directory, "/")) {
2978 log_error("Spawning container on root directory not supported.");
2979 goto finish;
2980 }
2981
2982 if (arg_boot) {
2983 if (path_is_os_tree(arg_directory) <= 0) {
2984 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2985 goto finish;
2986 }
2987 } else {
2988 const char *p;
2989
2990 p = strappenda(arg_directory,
2991 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2992 if (access(p, F_OK) < 0) {
2993 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2994 goto finish;
2995
2996 }
2997 }
2998 } else {
2999 char template[] = "/tmp/nspawn-root-XXXXXX";
3000
3001 if (!mkdtemp(template)) {
3002 log_error("Failed to create temporary directory: %m");
3003 r = -errno;
3004 goto finish;
3005 }
3006
3007 arg_directory = strdup(template);
3008 if (!arg_directory) {
3009 r = log_oom();
3010 goto finish;
3011 }
3012
3013 image_fd = setup_image(&device_path, &loop_nr);
3014 if (image_fd < 0) {
3015 r = image_fd;
3016 goto finish;
3017 }
3018
3019 r = dissect_image(image_fd,
3020 &root_device, &root_device_rw,
3021 &home_device, &home_device_rw,
3022 &srv_device, &srv_device_rw,
3023 &secondary);
3024 if (r < 0)
3025 goto finish;
3026 }
3027
3028 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3029 if (master < 0) {
3030 log_error("Failed to acquire pseudo tty: %m");
3031 goto finish;
3032 }
3033
3034 console = ptsname(master);
3035 if (!console) {
3036 log_error("Failed to determine tty name: %m");
3037 goto finish;
3038 }
3039
3040 if (!arg_quiet)
3041 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3042 arg_machine, arg_image ? arg_image : arg_directory);
3043
3044 if (unlockpt(master) < 0) {
3045 log_error("Failed to unlock tty: %m");
3046 goto finish;
3047 }
3048
3049 if (access("/dev/kdbus/control", F_OK) >= 0) {
3050
3051 if (arg_share_system) {
3052 kdbus_domain = strdup("/dev/kdbus");
3053 if (!kdbus_domain) {
3054 log_oom();
3055 goto finish;
3056 }
3057 } else {
3058 const char *ns;
3059
3060 ns = strappenda("machine-", arg_machine);
3061 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3062 if (r < 0)
3063 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3064 else
3065 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3066 }
3067 }
3068
3069 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3070 log_error("Failed to create kmsg socket pair: %m");
3071 goto finish;
3072 }
3073
3074 sd_notify(false,
3075 "READY=1\n"
3076 "STATUS=Container running.");
3077
3078 assert_se(sigemptyset(&mask) == 0);
3079 assert_se(sigemptyset(&mask_chld) == 0);
3080 sigaddset(&mask_chld, SIGCHLD);
3081 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3082 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3083
3084 for (;;) {
3085 ContainerStatus container_status;
3086 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3087 struct sigaction sa = {
3088 .sa_handler = nop_handler,
3089 .sa_flags = SA_NOCLDSTOP,
3090 };
3091
3092 r = barrier_create(&barrier);
3093 if (r < 0) {
3094 log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3095 goto finish;
3096 }
3097
3098 /* Child can be killed before execv(), so handle SIGCHLD
3099 * in order to interrupt parent's blocking calls and
3100 * give it a chance to call wait() and terminate. */
3101 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3102 if (r < 0) {
3103 log_error("Failed to change the signal mask: %m");
3104 goto finish;
3105 }
3106
3107 r = sigaction(SIGCHLD, &sa, NULL);
3108 if (r < 0) {
3109 log_error("Failed to install SIGCHLD handler: %m");
3110 goto finish;
3111 }
3112
3113 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3114 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3115 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3116 if (pid < 0) {
3117 if (errno == EINVAL)
3118 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3119 else
3120 log_error("clone() failed: %m");
3121
3122 r = pid;
3123 goto finish;
3124 }
3125
3126 if (pid == 0) {
3127 /* child */
3128 _cleanup_free_ char *home = NULL;
3129 unsigned n_env = 2;
3130 const char *envp[] = {
3131 "PATH=" DEFAULT_PATH_SPLIT_USR,
3132 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3133 NULL, /* TERM */
3134 NULL, /* HOME */
3135 NULL, /* USER */
3136 NULL, /* LOGNAME */
3137 NULL, /* container_uuid */
3138 NULL, /* LISTEN_FDS */
3139 NULL, /* LISTEN_PID */
3140 NULL
3141 };
3142 char **env_use;
3143
3144 barrier_set_role(&barrier, BARRIER_CHILD);
3145
3146 envp[n_env] = strv_find_prefix(environ, "TERM=");
3147 if (envp[n_env])
3148 n_env ++;
3149
3150 master = safe_close(master);
3151
3152 close_nointr(STDIN_FILENO);
3153 close_nointr(STDOUT_FILENO);
3154 close_nointr(STDERR_FILENO);
3155
3156 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3157
3158 reset_all_signal_handlers();
3159 reset_signal_mask();
3160
3161 k = open_terminal(console, O_RDWR);
3162 if (k != STDIN_FILENO) {
3163 if (k >= 0) {
3164 safe_close(k);
3165 k = -EINVAL;
3166 }
3167
3168 log_error("Failed to open console: %s", strerror(-k));
3169 _exit(EXIT_FAILURE);
3170 }
3171
3172 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3173 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3174 log_error("Failed to duplicate console: %m");
3175 _exit(EXIT_FAILURE);
3176 }
3177
3178 if (setsid() < 0) {
3179 log_error("setsid() failed: %m");
3180 _exit(EXIT_FAILURE);
3181 }
3182
3183 if (reset_audit_loginuid() < 0)
3184 _exit(EXIT_FAILURE);
3185
3186 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3187 log_error("PR_SET_PDEATHSIG failed: %m");
3188 _exit(EXIT_FAILURE);
3189 }
3190
3191 /* Mark everything as slave, so that we still
3192 * receive mounts from the real root, but don't
3193 * propagate mounts to the real root. */
3194 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3195 log_error("MS_SLAVE|MS_REC failed: %m");
3196 _exit(EXIT_FAILURE);
3197 }
3198
3199 if (mount_devices(arg_directory,
3200 root_device, root_device_rw,
3201 home_device, home_device_rw,
3202 srv_device, srv_device_rw) < 0)
3203 _exit(EXIT_FAILURE);
3204
3205 /* Turn directory into bind mount */
3206 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3207 log_error("Failed to make bind mount: %m");
3208 _exit(EXIT_FAILURE);
3209 }
3210
3211 r = setup_volatile(arg_directory);
3212 if (r < 0)
3213 _exit(EXIT_FAILURE);
3214
3215 if (setup_volatile_state(arg_directory) < 0)
3216 _exit(EXIT_FAILURE);
3217
3218 r = base_filesystem_create(arg_directory);
3219 if (r < 0)
3220 _exit(EXIT_FAILURE);
3221
3222 if (arg_read_only) {
3223 k = bind_remount_recursive(arg_directory, true);
3224 if (k < 0) {
3225 log_error("Failed to make tree read-only: %s", strerror(-k));
3226 _exit(EXIT_FAILURE);
3227 }
3228 }
3229
3230 if (mount_all(arg_directory) < 0)
3231 _exit(EXIT_FAILURE);
3232
3233 if (copy_devnodes(arg_directory) < 0)
3234 _exit(EXIT_FAILURE);
3235
3236 if (setup_ptmx(arg_directory) < 0)
3237 _exit(EXIT_FAILURE);
3238
3239 dev_setup(arg_directory);
3240
3241 if (setup_seccomp() < 0)
3242 _exit(EXIT_FAILURE);
3243
3244 if (setup_dev_console(arg_directory, console) < 0)
3245 _exit(EXIT_FAILURE);
3246
3247 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3248 _exit(EXIT_FAILURE);
3249
3250 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3251
3252 if (setup_boot_id(arg_directory) < 0)
3253 _exit(EXIT_FAILURE);
3254
3255 if (setup_timezone(arg_directory) < 0)
3256 _exit(EXIT_FAILURE);
3257
3258 if (setup_resolv_conf(arg_directory) < 0)
3259 _exit(EXIT_FAILURE);
3260
3261 if (setup_journal(arg_directory) < 0)
3262 _exit(EXIT_FAILURE);
3263
3264 if (mount_binds(arg_directory, arg_bind, false) < 0)
3265 _exit(EXIT_FAILURE);
3266
3267 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3268 _exit(EXIT_FAILURE);
3269
3270 if (mount_tmpfs(arg_directory) < 0)
3271 _exit(EXIT_FAILURE);
3272
3273 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3274 _exit(EXIT_FAILURE);
3275
3276 /* Tell the parent that we are ready, and that
3277 * it can cgroupify us to that we lack access
3278 * to certain devices and resources. */
3279 barrier_place(&barrier);
3280
3281 if (chdir(arg_directory) < 0) {
3282 log_error("chdir(%s) failed: %m", arg_directory);
3283 _exit(EXIT_FAILURE);
3284 }
3285
3286 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3287 log_error("mount(MS_MOVE) failed: %m");
3288 _exit(EXIT_FAILURE);
3289 }
3290
3291 if (chroot(".") < 0) {
3292 log_error("chroot() failed: %m");
3293 _exit(EXIT_FAILURE);
3294 }
3295
3296 if (chdir("/") < 0) {
3297 log_error("chdir() failed: %m");
3298 _exit(EXIT_FAILURE);
3299 }
3300
3301 umask(0022);
3302
3303 if (arg_private_network)
3304 loopback_setup();
3305
3306 if (drop_capabilities() < 0) {
3307 log_error("drop_capabilities() failed: %m");
3308 _exit(EXIT_FAILURE);
3309 }
3310
3311 r = change_uid_gid(&home);
3312 if (r < 0)
3313 _exit(EXIT_FAILURE);
3314
3315 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3316 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3317 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3318 log_oom();
3319 _exit(EXIT_FAILURE);
3320 }
3321
3322 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3323 char as_uuid[37];
3324
3325 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3326 log_oom();
3327 _exit(EXIT_FAILURE);
3328 }
3329 }
3330
3331 if (fdset_size(fds) > 0) {
3332 k = fdset_cloexec(fds, false);
3333 if (k < 0) {
3334 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3335 _exit(EXIT_FAILURE);
3336 }
3337
3338 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3339 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3340 log_oom();
3341 _exit(EXIT_FAILURE);
3342 }
3343 }
3344
3345 setup_hostname();
3346
3347 if (arg_personality != 0xffffffffLU) {
3348 if (personality(arg_personality) < 0) {
3349 log_error("personality() failed: %m");
3350 _exit(EXIT_FAILURE);
3351 }
3352 } else if (secondary) {
3353 if (personality(PER_LINUX32) < 0) {
3354 log_error("personality() failed: %m");
3355 _exit(EXIT_FAILURE);
3356 }
3357 }
3358
3359 #ifdef HAVE_SELINUX
3360 if (arg_selinux_context)
3361 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3362 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3363 _exit(EXIT_FAILURE);
3364 }
3365 #endif
3366
3367 if (!strv_isempty(arg_setenv)) {
3368 char **n;
3369
3370 n = strv_env_merge(2, envp, arg_setenv);
3371 if (!n) {
3372 log_oom();
3373 _exit(EXIT_FAILURE);
3374 }
3375
3376 env_use = n;
3377 } else
3378 env_use = (char**) envp;
3379
3380 /* Wait until the parent is ready with the setup, too... */
3381 if (!barrier_place_and_sync(&barrier))
3382 _exit(EXIT_FAILURE);
3383
3384 if (arg_boot) {
3385 char **a;
3386 size_t l;
3387
3388 /* Automatically search for the init system */
3389
3390 l = 1 + argc - optind;
3391 a = newa(char*, l + 1);
3392 memcpy(a + 1, argv + optind, l * sizeof(char*));
3393
3394 a[0] = (char*) "/usr/lib/systemd/systemd";
3395 execve(a[0], a, env_use);
3396
3397 a[0] = (char*) "/lib/systemd/systemd";
3398 execve(a[0], a, env_use);
3399
3400 a[0] = (char*) "/sbin/init";
3401 execve(a[0], a, env_use);
3402 } else if (argc > optind)
3403 execvpe(argv[optind], argv + optind, env_use);
3404 else {
3405 chdir(home ? home : "/root");
3406 execle("/bin/bash", "-bash", NULL, env_use);
3407 execle("/bin/sh", "-sh", NULL, env_use);
3408 }
3409
3410 log_error("execv() failed: %m");
3411 _exit(EXIT_FAILURE);
3412 }
3413
3414 barrier_set_role(&barrier, BARRIER_PARENT);
3415 fdset_free(fds);
3416 fds = NULL;
3417
3418 /* wait for child-setup to be done */
3419 if (barrier_place_and_sync(&barrier)) {
3420 int ifi = 0;
3421
3422 r = move_network_interfaces(pid);
3423 if (r < 0)
3424 goto finish;
3425
3426 r = setup_veth(pid, veth_name, &ifi);
3427 if (r < 0)
3428 goto finish;
3429
3430 r = setup_bridge(veth_name, &ifi);
3431 if (r < 0)
3432 goto finish;
3433
3434 r = setup_macvlan(pid);
3435 if (r < 0)
3436 goto finish;
3437
3438 r = register_machine(pid, ifi);
3439 if (r < 0)
3440 goto finish;
3441
3442 /* Block SIGCHLD here, before notifying child.
3443 * process_pty() will handle it with the other signals. */
3444 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3445 if (r < 0)
3446 goto finish;
3447
3448 /* Reset signal to default */
3449 r = default_signals(SIGCHLD, -1);
3450 if (r < 0)
3451 goto finish;
3452
3453 /* Notify the child that the parent is ready with all
3454 * its setup, and that the child can now hand over
3455 * control to the code to run inside the container. */
3456 barrier_place(&barrier);
3457
3458 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3459 if (k < 0) {
3460 r = EXIT_FAILURE;
3461 break;
3462 }
3463
3464 if (!arg_quiet)
3465 putc('\n', stdout);
3466
3467 /* Kill if it is not dead yet anyway */
3468 terminate_machine(pid);
3469 }
3470
3471 /* Normally redundant, but better safe than sorry */
3472 kill(pid, SIGKILL);
3473
3474 r = wait_for_container(pid, &container_status);
3475 pid = 0;
3476
3477 if (r < 0) {
3478 /* We failed to wait for the container, or the
3479 * container exited abnormally */
3480 r = EXIT_FAILURE;
3481 break;
3482 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3483 /* The container exited with a non-zero
3484 * status, or with zero status and no reboot
3485 * was requested. */
3486 break;
3487
3488 /* CONTAINER_REBOOTED, loop again */
3489
3490 if (arg_keep_unit) {
3491 /* Special handling if we are running as a
3492 * service: instead of simply restarting the
3493 * machine we want to restart the entire
3494 * service, so let's inform systemd about this
3495 * with the special exit code 133. The service
3496 * file uses RestartForceExitStatus=133 so
3497 * that this results in a full nspawn
3498 * restart. This is necessary since we might
3499 * have cgroup parameters set we want to have
3500 * flushed out. */
3501 r = 133;
3502 break;
3503 }
3504 }
3505
3506 finish:
3507 sd_notify(false,
3508 "STOPPING=1\n"
3509 "STATUS=Terminating...");
3510
3511 loop_remove(loop_nr, &image_fd);
3512
3513 if (pid > 0)
3514 kill(pid, SIGKILL);
3515
3516 free(arg_directory);
3517 free(arg_machine);
3518 free(arg_user);
3519 strv_free(arg_setenv);
3520 strv_free(arg_network_interfaces);
3521 strv_free(arg_network_macvlan);
3522 strv_free(arg_bind);
3523 strv_free(arg_bind_ro);
3524 strv_free(arg_tmpfs);
3525
3526 return r;
3527 }