]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: fix DeviceAllow list
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92
93 #ifdef HAVE_SECCOMP
94 #include "seccomp-util.h"
95 #endif
96
/* How a container run ended.
 * NOTE(review): the code that produces and consumes these values is outside
 * this excerpt; the meaning is taken from the enumerator names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
101
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; the default value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also what -j selects */
} LinkJournal;
108
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,    /* default: no volatile setup is done */
        VOLATILE_YES,   /* tmpfs as root directory, original /usr bind mounted read-only into it */
        VOLATILE_STATE  /* tree remounted read-only, tmpfs mounted over /var */
} Volatile;
114
115 static char *arg_directory = NULL;
116 static char *arg_user = NULL;
117 static sd_id128_t arg_uuid = {};
118 static char *arg_machine = NULL;
119 static const char *arg_selinux_context = NULL;
120 static const char *arg_selinux_apifs_context = NULL;
121 static const char *arg_slice = NULL;
122 static bool arg_private_network = false;
123 static bool arg_read_only = false;
124 static bool arg_boot = false;
125 static LinkJournal arg_link_journal = LINK_AUTO;
126 static uint64_t arg_retain =
127 (1ULL << CAP_CHOWN) |
128 (1ULL << CAP_DAC_OVERRIDE) |
129 (1ULL << CAP_DAC_READ_SEARCH) |
130 (1ULL << CAP_FOWNER) |
131 (1ULL << CAP_FSETID) |
132 (1ULL << CAP_IPC_OWNER) |
133 (1ULL << CAP_KILL) |
134 (1ULL << CAP_LEASE) |
135 (1ULL << CAP_LINUX_IMMUTABLE) |
136 (1ULL << CAP_NET_BIND_SERVICE) |
137 (1ULL << CAP_NET_BROADCAST) |
138 (1ULL << CAP_NET_RAW) |
139 (1ULL << CAP_SETGID) |
140 (1ULL << CAP_SETFCAP) |
141 (1ULL << CAP_SETPCAP) |
142 (1ULL << CAP_SETUID) |
143 (1ULL << CAP_SYS_ADMIN) |
144 (1ULL << CAP_SYS_CHROOT) |
145 (1ULL << CAP_SYS_NICE) |
146 (1ULL << CAP_SYS_PTRACE) |
147 (1ULL << CAP_SYS_TTY_CONFIG) |
148 (1ULL << CAP_SYS_RESOURCE) |
149 (1ULL << CAP_SYS_BOOT) |
150 (1ULL << CAP_AUDIT_WRITE) |
151 (1ULL << CAP_AUDIT_CONTROL) |
152 (1ULL << CAP_MKNOD);
153 static char **arg_bind = NULL;
154 static char **arg_bind_ro = NULL;
155 static char **arg_tmpfs = NULL;
156 static char **arg_setenv = NULL;
157 static bool arg_quiet = false;
158 static bool arg_share_system = false;
159 static bool arg_register = true;
160 static bool arg_keep_unit = false;
161 static char **arg_network_interfaces = NULL;
162 static char **arg_network_macvlan = NULL;
163 static bool arg_network_veth = false;
164 static const char *arg_network_bridge = NULL;
165 static unsigned long arg_personality = 0xffffffffLU;
166 static const char *arg_image = NULL;
167 static Volatile arg_volatile = VOLATILE_NO;
168
169 static void help(void) {
170 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
171 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
172 " -h --help Show this help\n"
173 " --version Print version string\n"
174 " -q --quiet Do not show status information\n"
175 " -D --directory=PATH Root directory for the container\n"
176 " -i --image=PATH File system device or image for the container\n"
177 " -b --boot Boot up full system (i.e. invoke init)\n"
178 " -u --user=USER Run the command under specified user or uid\n"
179 " -M --machine=NAME Set the machine name for the container\n"
180 " --uuid=UUID Set a specific machine UUID for the container\n"
181 " -S --slice=SLICE Place the container in the specified slice\n"
182 " --private-network Disable network in container\n"
183 " --network-interface=INTERFACE\n"
184 " Assign an existing network interface to the\n"
185 " container\n"
186 " --network-macvlan=INTERFACE\n"
187 " Create a macvlan network interface based on an\n"
188 " existing network interface to the container\n"
189 " --network-veth Add a virtual ethernet connection between host\n"
190 " and container\n"
191 " --network-bridge=INTERFACE\n"
192 " Add a virtual ethernet connection between host\n"
193 " and container and add it to an existing bridge on\n"
194 " the host\n"
195 " -Z --selinux-context=SECLABEL\n"
196 " Set the SELinux security context to be used by\n"
197 " processes in the container\n"
198 " -L --selinux-apifs-context=SECLABEL\n"
199 " Set the SELinux security context to be used by\n"
200 " API/tmpfs file systems in the container\n"
201 " --capability=CAP In addition to the default, retain specified\n"
202 " capability\n"
203 " --drop-capability=CAP Drop the specified capability from the default set\n"
204 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
205 " -j Equivalent to --link-journal=host\n"
206 " --read-only Mount the root directory read-only\n"
207 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
208 " the container\n"
209 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
210 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
211 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
212 " --share-system Share system namespaces with host\n"
213 " --register=BOOLEAN Register container as machine\n"
214 " --keep-unit Do not register a scope for the machine, reuse\n"
215 " the service unit nspawn is running in\n"
216 " --volatile[=MODE] Run the system in volatile mode\n",
217 program_invocation_short_name);
218 }
219
220 static int parse_argv(int argc, char *argv[]) {
221
222 enum {
223 ARG_VERSION = 0x100,
224 ARG_PRIVATE_NETWORK,
225 ARG_UUID,
226 ARG_READ_ONLY,
227 ARG_CAPABILITY,
228 ARG_DROP_CAPABILITY,
229 ARG_LINK_JOURNAL,
230 ARG_BIND,
231 ARG_BIND_RO,
232 ARG_TMPFS,
233 ARG_SETENV,
234 ARG_SHARE_SYSTEM,
235 ARG_REGISTER,
236 ARG_KEEP_UNIT,
237 ARG_NETWORK_INTERFACE,
238 ARG_NETWORK_MACVLAN,
239 ARG_NETWORK_VETH,
240 ARG_NETWORK_BRIDGE,
241 ARG_PERSONALITY,
242 ARG_VOLATILE,
243 };
244
245 static const struct option options[] = {
246 { "help", no_argument, NULL, 'h' },
247 { "version", no_argument, NULL, ARG_VERSION },
248 { "directory", required_argument, NULL, 'D' },
249 { "user", required_argument, NULL, 'u' },
250 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
251 { "boot", no_argument, NULL, 'b' },
252 { "uuid", required_argument, NULL, ARG_UUID },
253 { "read-only", no_argument, NULL, ARG_READ_ONLY },
254 { "capability", required_argument, NULL, ARG_CAPABILITY },
255 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
256 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
257 { "bind", required_argument, NULL, ARG_BIND },
258 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
259 { "tmpfs", required_argument, NULL, ARG_TMPFS },
260 { "machine", required_argument, NULL, 'M' },
261 { "slice", required_argument, NULL, 'S' },
262 { "setenv", required_argument, NULL, ARG_SETENV },
263 { "selinux-context", required_argument, NULL, 'Z' },
264 { "selinux-apifs-context", required_argument, NULL, 'L' },
265 { "quiet", no_argument, NULL, 'q' },
266 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
267 { "register", required_argument, NULL, ARG_REGISTER },
268 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
269 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
270 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
271 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
272 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
273 { "personality", required_argument, NULL, ARG_PERSONALITY },
274 { "image", required_argument, NULL, 'i' },
275 { "volatile", optional_argument, NULL, ARG_VOLATILE },
276 {}
277 };
278
279 int c, r;
280 uint64_t plus = 0, minus = 0;
281
282 assert(argc >= 0);
283 assert(argv);
284
285 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
286
287 switch (c) {
288
289 case 'h':
290 help();
291 return 0;
292
293 case ARG_VERSION:
294 puts(PACKAGE_STRING);
295 puts(SYSTEMD_FEATURES);
296 return 0;
297
298 case 'D':
299 free(arg_directory);
300 arg_directory = canonicalize_file_name(optarg);
301 if (!arg_directory) {
302 log_error("Invalid root directory: %m");
303 return -ENOMEM;
304 }
305
306 break;
307
308 case 'i':
309 arg_image = optarg;
310 break;
311
312 case 'u':
313 free(arg_user);
314 arg_user = strdup(optarg);
315 if (!arg_user)
316 return log_oom();
317
318 break;
319
320 case ARG_NETWORK_BRIDGE:
321 arg_network_bridge = optarg;
322
323 /* fall through */
324
325 case ARG_NETWORK_VETH:
326 arg_network_veth = true;
327 arg_private_network = true;
328 break;
329
330 case ARG_NETWORK_INTERFACE:
331 if (strv_extend(&arg_network_interfaces, optarg) < 0)
332 return log_oom();
333
334 arg_private_network = true;
335 break;
336
337 case ARG_NETWORK_MACVLAN:
338 if (strv_extend(&arg_network_macvlan, optarg) < 0)
339 return log_oom();
340
341 /* fall through */
342
343 case ARG_PRIVATE_NETWORK:
344 arg_private_network = true;
345 break;
346
347 case 'b':
348 arg_boot = true;
349 break;
350
351 case ARG_UUID:
352 r = sd_id128_from_string(optarg, &arg_uuid);
353 if (r < 0) {
354 log_error("Invalid UUID: %s", optarg);
355 return r;
356 }
357 break;
358
359 case 'S':
360 arg_slice = optarg;
361 break;
362
363 case 'M':
364 if (isempty(optarg)) {
365 free(arg_machine);
366 arg_machine = NULL;
367 } else {
368
369 if (!hostname_is_valid(optarg)) {
370 log_error("Invalid machine name: %s", optarg);
371 return -EINVAL;
372 }
373
374 free(arg_machine);
375 arg_machine = strdup(optarg);
376 if (!arg_machine)
377 return log_oom();
378
379 break;
380 }
381
382 case 'Z':
383 arg_selinux_context = optarg;
384 break;
385
386 case 'L':
387 arg_selinux_apifs_context = optarg;
388 break;
389
390 case ARG_READ_ONLY:
391 arg_read_only = true;
392 break;
393
394 case ARG_CAPABILITY:
395 case ARG_DROP_CAPABILITY: {
396 const char *state, *word;
397 size_t length;
398
399 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
400 _cleanup_free_ char *t;
401 cap_value_t cap;
402
403 t = strndup(word, length);
404 if (!t)
405 return log_oom();
406
407 if (streq(t, "all")) {
408 if (c == ARG_CAPABILITY)
409 plus = (uint64_t) -1;
410 else
411 minus = (uint64_t) -1;
412 } else {
413 if (cap_from_name(t, &cap) < 0) {
414 log_error("Failed to parse capability %s.", t);
415 return -EINVAL;
416 }
417
418 if (c == ARG_CAPABILITY)
419 plus |= 1ULL << (uint64_t) cap;
420 else
421 minus |= 1ULL << (uint64_t) cap;
422 }
423 }
424
425 break;
426 }
427
428 case 'j':
429 arg_link_journal = LINK_GUEST;
430 break;
431
432 case ARG_LINK_JOURNAL:
433 if (streq(optarg, "auto"))
434 arg_link_journal = LINK_AUTO;
435 else if (streq(optarg, "no"))
436 arg_link_journal = LINK_NO;
437 else if (streq(optarg, "guest"))
438 arg_link_journal = LINK_GUEST;
439 else if (streq(optarg, "host"))
440 arg_link_journal = LINK_HOST;
441 else {
442 log_error("Failed to parse link journal mode %s", optarg);
443 return -EINVAL;
444 }
445
446 break;
447
448 case ARG_BIND:
449 case ARG_BIND_RO: {
450 _cleanup_free_ char *a = NULL, *b = NULL;
451 char *e;
452 char ***x;
453
454 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
455
456 e = strchr(optarg, ':');
457 if (e) {
458 a = strndup(optarg, e - optarg);
459 b = strdup(e + 1);
460 } else {
461 a = strdup(optarg);
462 b = strdup(optarg);
463 }
464
465 if (!a || !b)
466 return log_oom();
467
468 if (!path_is_absolute(a) || !path_is_absolute(b)) {
469 log_error("Invalid bind mount specification: %s", optarg);
470 return -EINVAL;
471 }
472
473 r = strv_extend(x, a);
474 if (r < 0)
475 return log_oom();
476
477 r = strv_extend(x, b);
478 if (r < 0)
479 return log_oom();
480
481 break;
482 }
483
484 case ARG_TMPFS: {
485 _cleanup_free_ char *a = NULL, *b = NULL;
486 char *e;
487
488 e = strchr(optarg, ':');
489 if (e) {
490 a = strndup(optarg, e - optarg);
491 b = strdup(e + 1);
492 } else {
493 a = strdup(optarg);
494 b = strdup("mode=0755");
495 }
496
497 if (!a || !b)
498 return log_oom();
499
500 if (!path_is_absolute(a)) {
501 log_error("Invalid tmpfs specification: %s", optarg);
502 return -EINVAL;
503 }
504
505 r = strv_push(&arg_tmpfs, a);
506 if (r < 0)
507 return log_oom();
508
509 a = NULL;
510
511 r = strv_push(&arg_tmpfs, b);
512 if (r < 0)
513 return log_oom();
514
515 b = NULL;
516
517 break;
518 }
519
520 case ARG_SETENV: {
521 char **n;
522
523 if (!env_assignment_is_valid(optarg)) {
524 log_error("Environment variable assignment '%s' is not valid.", optarg);
525 return -EINVAL;
526 }
527
528 n = strv_env_set(arg_setenv, optarg);
529 if (!n)
530 return log_oom();
531
532 strv_free(arg_setenv);
533 arg_setenv = n;
534 break;
535 }
536
537 case 'q':
538 arg_quiet = true;
539 break;
540
541 case ARG_SHARE_SYSTEM:
542 arg_share_system = true;
543 break;
544
545 case ARG_REGISTER:
546 r = parse_boolean(optarg);
547 if (r < 0) {
548 log_error("Failed to parse --register= argument: %s", optarg);
549 return r;
550 }
551
552 arg_register = r;
553 break;
554
555 case ARG_KEEP_UNIT:
556 arg_keep_unit = true;
557 break;
558
559 case ARG_PERSONALITY:
560
561 arg_personality = personality_from_string(optarg);
562 if (arg_personality == 0xffffffffLU) {
563 log_error("Unknown or unsupported personality '%s'.", optarg);
564 return -EINVAL;
565 }
566
567 break;
568
569 case ARG_VOLATILE:
570
571 if (!optarg)
572 arg_volatile = VOLATILE_YES;
573 else {
574 r = parse_boolean(optarg);
575 if (r < 0) {
576 if (streq(optarg, "state"))
577 arg_volatile = VOLATILE_STATE;
578 else {
579 log_error("Failed to parse --volatile= argument: %s", optarg);
580 return r;
581 }
582 } else
583 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
584 }
585
586 break;
587
588 case '?':
589 return -EINVAL;
590
591 default:
592 assert_not_reached("Unhandled option");
593 }
594
595 if (arg_share_system)
596 arg_register = false;
597
598 if (arg_boot && arg_share_system) {
599 log_error("--boot and --share-system may not be combined.");
600 return -EINVAL;
601 }
602
603 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
604 log_error("--keep-unit may not be used when invoked from a user session.");
605 return -EINVAL;
606 }
607
608 if (arg_directory && arg_image) {
609 log_error("--directory= and --image= may not be combined.");
610 return -EINVAL;
611 }
612
613 if (arg_volatile != VOLATILE_NO && arg_read_only) {
614 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
615 return -EINVAL;
616 }
617
618 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
619
620 return 1;
621 }
622
623 static int mount_all(const char *dest) {
624
625 typedef struct MountPoint {
626 const char *what;
627 const char *where;
628 const char *type;
629 const char *options;
630 unsigned long flags;
631 bool fatal;
632 } MountPoint;
633
634 static const MountPoint mount_table[] = {
635 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
636 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
637 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
638 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
639 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
640 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
641 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
642 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
643 #ifdef HAVE_SELINUX
644 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
645 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
646 #endif
647 };
648
649 unsigned k;
650 int r = 0;
651
652 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
653 _cleanup_free_ char *where = NULL;
654 #ifdef HAVE_SELINUX
655 _cleanup_free_ char *options = NULL;
656 #endif
657 const char *o;
658 int t;
659
660 where = strjoin(dest, "/", mount_table[k].where, NULL);
661 if (!where)
662 return log_oom();
663
664 t = path_is_mount_point(where, true);
665 if (t < 0) {
666 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
667
668 if (r == 0)
669 r = t;
670
671 continue;
672 }
673
674 /* Skip this entry if it is not a remount. */
675 if (mount_table[k].what && t > 0)
676 continue;
677
678 t = mkdir_p(where, 0755);
679 if (t < 0) {
680 if (mount_table[k].fatal) {
681 log_error("Failed to create directory %s: %s", where, strerror(-t));
682
683 if (r == 0)
684 r = t;
685 } else
686 log_warning("Failed to create directory %s: %s", where, strerror(-t));
687
688 continue;
689 }
690
691 #ifdef HAVE_SELINUX
692 if (arg_selinux_apifs_context &&
693 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
694 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
695 if (!options)
696 return log_oom();
697
698 o = options;
699 } else
700 #endif
701 o = mount_table[k].options;
702
703
704 if (mount(mount_table[k].what,
705 where,
706 mount_table[k].type,
707 mount_table[k].flags,
708 o) < 0) {
709
710 if (mount_table[k].fatal) {
711 log_error("mount(%s) failed: %m", where);
712
713 if (r == 0)
714 r = -errno;
715 } else
716 log_warning("mount(%s) failed: %m", where);
717 }
718 }
719
720 return r;
721 }
722
723 static int mount_binds(const char *dest, char **l, bool ro) {
724 char **x, **y;
725
726 STRV_FOREACH_PAIR(x, y, l) {
727 _cleanup_free_ char *where = NULL;
728 struct stat source_st, dest_st;
729 int r;
730
731 if (stat(*x, &source_st) < 0) {
732 log_error("Failed to stat %s: %m", *x);
733 return -errno;
734 }
735
736 where = strappend(dest, *y);
737 if (!where)
738 return log_oom();
739
740 r = stat(where, &dest_st);
741 if (r == 0) {
742 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
743 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
744 return -EINVAL;
745 }
746 } else if (errno == ENOENT) {
747 r = mkdir_parents_label(where, 0755);
748 if (r < 0) {
749 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
750 return r;
751 }
752 } else {
753 log_error("Failed to bind mount %s: %m", *x);
754 return -errno;
755 }
756
757 /* Create the mount point, but be conservative -- refuse to create block
758 * and char devices. */
759 if (S_ISDIR(source_st.st_mode)) {
760 r = mkdir_label(where, 0755);
761 if (r < 0) {
762 log_error("Failed to create mount point %s: %s", where, strerror(-r));
763
764 return r;
765 }
766 } else if (S_ISFIFO(source_st.st_mode)) {
767 r = mkfifo(where, 0644);
768 if (r < 0 && errno != EEXIST) {
769 log_error("Failed to create mount point %s: %m", where);
770
771 return -errno;
772 }
773 } else if (S_ISSOCK(source_st.st_mode)) {
774 r = mknod(where, 0644 | S_IFSOCK, 0);
775 if (r < 0 && errno != EEXIST) {
776 log_error("Failed to create mount point %s: %m", where);
777
778 return -errno;
779 }
780 } else if (S_ISREG(source_st.st_mode)) {
781 r = touch(where);
782 if (r < 0) {
783 log_error("Failed to create mount point %s: %s", where, strerror(-r));
784
785 return r;
786 }
787 } else {
788 log_error("Refusing to create mountpoint for file: %s", *x);
789 return -ENOTSUP;
790 }
791
792 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
793 log_error("mount(%s) failed: %m", where);
794 return -errno;
795 }
796
797 if (ro) {
798 r = bind_remount_recursive(where, true);
799 if (r < 0) {
800 log_error("Read-Only bind mount failed: %s", strerror(-r));
801 return r;
802 }
803 }
804 }
805
806 return 0;
807 }
808
809 static int mount_tmpfs(const char *dest) {
810 char **i, **o;
811
812 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
813 _cleanup_free_ char *where = NULL;
814 int r;
815
816 where = strappend(dest, *i);
817 if (!where)
818 return log_oom();
819
820 r = mkdir_label(where, 0755);
821 if (r < 0) {
822 log_error("creating mount point for tmpfs %s failed: %s", where, strerror(-r));
823
824 return r;
825 }
826
827 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
828 log_error("tmpfs mount to %s failed: %m", where);
829 return -errno;
830 }
831 }
832
833 return 0;
834 }
835
836 static int setup_timezone(const char *dest) {
837 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
838 char *z, *y;
839 int r;
840
841 assert(dest);
842
843 /* Fix the timezone, if possible */
844 r = readlink_malloc("/etc/localtime", &p);
845 if (r < 0) {
846 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
847 return 0;
848 }
849
850 z = path_startswith(p, "../usr/share/zoneinfo/");
851 if (!z)
852 z = path_startswith(p, "/usr/share/zoneinfo/");
853 if (!z) {
854 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
855 return 0;
856 }
857
858 where = strappend(dest, "/etc/localtime");
859 if (!where)
860 return log_oom();
861
862 r = readlink_malloc(where, &q);
863 if (r >= 0) {
864 y = path_startswith(q, "../usr/share/zoneinfo/");
865 if (!y)
866 y = path_startswith(q, "/usr/share/zoneinfo/");
867
868 /* Already pointing to the right place? Then do nothing .. */
869 if (y && streq(y, z))
870 return 0;
871 }
872
873 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
874 if (!check)
875 return log_oom();
876
877 if (access(check, F_OK) < 0) {
878 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
879 return 0;
880 }
881
882 what = strappend("../usr/share/zoneinfo/", z);
883 if (!what)
884 return log_oom();
885
886 r = mkdir_parents(where, 0755);
887 if (r < 0) {
888 log_error("Failed to create directory for timezone info %s in container: %s", where, strerror(-r));
889
890 return 0;
891 }
892
893 r = unlink(where);
894 if (r < 0 && errno != ENOENT) {
895 log_error("Failed to remove existing timezone info %s in container: %m", where);
896
897 return 0;
898 }
899
900 if (symlink(what, where) < 0) {
901 log_error("Failed to correct timezone of container: %m");
902 return 0;
903 }
904
905 return 0;
906 }
907
908 static int setup_resolv_conf(const char *dest) {
909 _cleanup_free_ char *where = NULL;
910 int r;
911
912 assert(dest);
913
914 if (arg_private_network)
915 return 0;
916
917 /* Fix resolv.conf, if possible */
918 where = strappend(dest, "/etc/resolv.conf");
919 if (!where)
920 return log_oom();
921
922 /* We don't really care for the results of this really. If it
923 * fails, it fails, but meh... */
924 r = mkdir_parents(where, 0755);
925 if (r < 0) {
926 log_warning("Failed to create parent directory for resolv.conf %s: %s", where, strerror(-r));
927
928 return 0;
929 }
930
931 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
932 if (r < 0) {
933 log_warning("Failed to copy /etc/resolv.conf to %s: %s", where, strerror(-r));
934
935 return 0;
936 }
937
938 return 0;
939 }
940
941 static int setup_volatile_state(const char *directory) {
942 const char *p;
943 int r;
944
945 assert(directory);
946
947 if (arg_volatile != VOLATILE_STATE)
948 return 0;
949
950 /* --volatile=state means we simply overmount /var
951 with a tmpfs, and the rest read-only. */
952
953 r = bind_remount_recursive(directory, true);
954 if (r < 0) {
955 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
956 return r;
957 }
958
959 p = strappenda(directory, "/var");
960 r = mkdir(p, 0755);
961 if (r < 0 && errno != EEXIST) {
962 log_error("Failed to create %s: %m", directory);
963 return -errno;
964 }
965
966 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
967 log_error("Failed to mount tmpfs to /var: %m");
968 return -errno;
969 }
970
971 return 0;
972 }
973
974 static int setup_volatile(const char *directory) {
975 bool tmpfs_mounted = false, bind_mounted = false;
976 char template[] = "/tmp/nspawn-volatile-XXXXXX";
977 const char *f, *t;
978 int r;
979
980 assert(directory);
981
982 if (arg_volatile != VOLATILE_YES)
983 return 0;
984
985 /* --volatile=yes means we mount a tmpfs to the root dir, and
986 the original /usr to use inside it, and that read-only. */
987
988 if (!mkdtemp(template)) {
989 log_error("Failed to create temporary directory: %m");
990 return -errno;
991 }
992
993 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
994 log_error("Failed to mount tmpfs for root directory: %m");
995 r = -errno;
996 goto fail;
997 }
998
999 tmpfs_mounted = true;
1000
1001 f = strappenda(directory, "/usr");
1002 t = strappenda(template, "/usr");
1003
1004 r = mkdir(t, 0755);
1005 if (r < 0 && errno != EEXIST) {
1006 log_error("Failed to create %s: %m", t);
1007 r = -errno;
1008 goto fail;
1009 }
1010
1011 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1012 log_error("Failed to create /usr bind mount: %m");
1013 r = -errno;
1014 goto fail;
1015 }
1016
1017 bind_mounted = true;
1018
1019 r = bind_remount_recursive(t, true);
1020 if (r < 0) {
1021 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
1022 goto fail;
1023 }
1024
1025 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1026 log_error("Failed to move root mount: %m");
1027 r = -errno;
1028 goto fail;
1029 }
1030
1031 rmdir(template);
1032
1033 return 0;
1034
1035 fail:
1036 if (bind_mounted)
1037 umount(t);
1038 if (tmpfs_mounted)
1039 umount(template);
1040 rmdir(template);
1041 return r;
1042 }
1043
1044 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1045
1046 snprintf(s, 37,
1047 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1048 SD_ID128_FORMAT_VAL(id));
1049
1050 return s;
1051 }
1052
1053 static int setup_boot_id(const char *dest) {
1054 _cleanup_free_ char *from = NULL, *to = NULL;
1055 sd_id128_t rnd = {};
1056 char as_uuid[37];
1057 int r;
1058
1059 assert(dest);
1060
1061 if (arg_share_system)
1062 return 0;
1063
1064 /* Generate a new randomized boot ID, so that each boot-up of
1065 * the container gets a new one */
1066
1067 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1068 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1069 if (!from || !to)
1070 return log_oom();
1071
1072 r = sd_id128_randomize(&rnd);
1073 if (r < 0) {
1074 log_error("Failed to generate random boot id: %s", strerror(-r));
1075 return r;
1076 }
1077
1078 id128_format_as_uuid(rnd, as_uuid);
1079
1080 r = write_string_file(from, as_uuid);
1081 if (r < 0) {
1082 log_error("Failed to write boot id: %s", strerror(-r));
1083 return r;
1084 }
1085
1086 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1087 log_error("Failed to bind mount boot id: %m");
1088 r = -errno;
1089 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1090 log_warning("Failed to make boot id read-only: %m");
1091
1092 unlink(from);
1093 return r;
1094 }
1095
1096 static int copy_devnodes(const char *dest) {
1097
1098 static const char devnodes[] =
1099 "null\0"
1100 "zero\0"
1101 "full\0"
1102 "random\0"
1103 "urandom\0"
1104 "tty\0"
1105 "net/tun\0";
1106
1107 const char *d;
1108 int r = 0;
1109 _cleanup_umask_ mode_t u;
1110
1111 assert(dest);
1112
1113 u = umask(0000);
1114
1115 NULSTR_FOREACH(d, devnodes) {
1116 _cleanup_free_ char *from = NULL, *to = NULL;
1117 struct stat st;
1118
1119 from = strappend("/dev/", d);
1120 to = strjoin(dest, "/dev/", d, NULL);
1121 if (!from || !to)
1122 return log_oom();
1123
1124 if (stat(from, &st) < 0) {
1125
1126 if (errno != ENOENT) {
1127 log_error("Failed to stat %s: %m", from);
1128 return -errno;
1129 }
1130
1131 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1132
1133 log_error("%s is not a char or block device, cannot copy", from);
1134 return -EIO;
1135
1136 } else {
1137 r = mkdir_parents(to, 0775);
1138 if (r < 0) {
1139 log_error("Failed to create parent directory of %s: %s", to, strerror(-r));
1140 return -r;
1141 }
1142
1143 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1144 log_error("mknod(%s) failed: %m", dest);
1145 return -errno;
1146 }
1147 }
1148 }
1149
1150 return r;
1151 }
1152
1153 static int setup_ptmx(const char *dest) {
1154 _cleanup_free_ char *p = NULL;
1155
1156 p = strappend(dest, "/dev/ptmx");
1157 if (!p)
1158 return log_oom();
1159
1160 if (symlink("pts/ptmx", p) < 0) {
1161 log_error("Failed to create /dev/ptmx symlink: %m");
1162 return -errno;
1163 }
1164
1165 return 0;
1166 }
1167
1168 static int setup_dev_console(const char *dest, const char *console) {
1169 _cleanup_umask_ mode_t u;
1170 const char *to;
1171 struct stat st;
1172 int r;
1173
1174 assert(dest);
1175 assert(console);
1176
1177 u = umask(0000);
1178
1179 if (stat("/dev/null", &st) < 0) {
1180 log_error("Failed to stat /dev/null: %m");
1181 return -errno;
1182 }
1183
1184 r = chmod_and_chown(console, 0600, 0, 0);
1185 if (r < 0) {
1186 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1187 return r;
1188 }
1189
1190 /* We need to bind mount the right tty to /dev/console since
1191 * ptys can only exist on pts file systems. To have something
1192 * to bind mount things on we create a device node first, and
1193 * use /dev/null for that since we the cgroups device policy
1194 * allows us to create that freely, while we cannot create
1195 * /dev/console. (Note that the major minor doesn't actually
1196 * matter here, since we mount it over anyway). */
1197
1198 to = strappenda(dest, "/dev/console");
1199 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1200 log_error("mknod() for /dev/console failed: %m");
1201 return -errno;
1202 }
1203
1204 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1205 log_error("Bind mount for /dev/console failed: %m");
1206 return -errno;
1207 }
1208
1209 return 0;
1210 }
1211
1212 static int setup_kmsg(const char *dest, int kmsg_socket) {
1213 _cleanup_free_ char *from = NULL, *to = NULL;
1214 int r, fd, k;
1215 _cleanup_umask_ mode_t u;
1216 union {
1217 struct cmsghdr cmsghdr;
1218 uint8_t buf[CMSG_SPACE(sizeof(int))];
1219 } control = {};
1220 struct msghdr mh = {
1221 .msg_control = &control,
1222 .msg_controllen = sizeof(control),
1223 };
1224 struct cmsghdr *cmsg;
1225
1226 assert(dest);
1227 assert(kmsg_socket >= 0);
1228
1229 u = umask(0000);
1230
1231 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1232 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1233 * on the reading side behave very similar to /proc/kmsg,
1234 * their writing side behaves differently from /dev/kmsg in
1235 * that writing blocks when nothing is reading. In order to
1236 * avoid any problems with containers deadlocking due to this
1237 * we simply make /dev/kmsg unavailable to the container. */
1238 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1239 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1240 return log_oom();
1241
1242 if (mkfifo(from, 0600) < 0) {
1243 log_error("mkfifo() for /dev/kmsg failed: %m");
1244 return -errno;
1245 }
1246
1247 r = chmod_and_chown(from, 0600, 0, 0);
1248 if (r < 0) {
1249 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1250 return r;
1251 }
1252
1253 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1254 log_error("Bind mount for /proc/kmsg failed: %m");
1255 return -errno;
1256 }
1257
1258 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1259 if (fd < 0) {
1260 log_error("Failed to open fifo: %m");
1261 return -errno;
1262 }
1263
1264 cmsg = CMSG_FIRSTHDR(&mh);
1265 cmsg->cmsg_level = SOL_SOCKET;
1266 cmsg->cmsg_type = SCM_RIGHTS;
1267 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1268 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1269
1270 mh.msg_controllen = cmsg->cmsg_len;
1271
1272 /* Store away the fd in the socket, so that it stays open as
1273 * long as we run the child */
1274 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1275 safe_close(fd);
1276
1277 if (k < 0) {
1278 log_error("Failed to send FIFO fd: %m");
1279 return -errno;
1280 }
1281
1282 /* And now make the FIFO unavailable as /dev/kmsg... */
1283 unlink(from);
1284 return 0;
1285 }
1286
1287 static int setup_hostname(void) {
1288
1289 if (arg_share_system)
1290 return 0;
1291
1292 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1293 return -errno;
1294
1295 return 0;
1296 }
1297
1298 static int setup_journal(const char *directory) {
1299 sd_id128_t machine_id, this_id;
1300 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1301 char *id;
1302 int r;
1303
1304 p = strappend(directory, "/etc/machine-id");
1305 if (!p)
1306 return log_oom();
1307
1308 r = read_one_line_file(p, &b);
1309 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1310 return 0;
1311 else if (r < 0) {
1312 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1313 return r;
1314 }
1315
1316 id = strstrip(b);
1317 if (isempty(id) && arg_link_journal == LINK_AUTO)
1318 return 0;
1319
1320 /* Verify validity */
1321 r = sd_id128_from_string(id, &machine_id);
1322 if (r < 0) {
1323 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1324 return r;
1325 }
1326
1327 r = sd_id128_get_machine(&this_id);
1328 if (r < 0) {
1329 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1330 return r;
1331 }
1332
1333 if (sd_id128_equal(machine_id, this_id)) {
1334 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1335 "Host and machine ids are equal (%s): refusing to link journals", id);
1336 if (arg_link_journal == LINK_AUTO)
1337 return 0;
1338 return
1339 -EEXIST;
1340 }
1341
1342 if (arg_link_journal == LINK_NO)
1343 return 0;
1344
1345 free(p);
1346 p = strappend("/var/log/journal/", id);
1347 q = strjoin(directory, "/var/log/journal/", id, NULL);
1348 if (!p || !q)
1349 return log_oom();
1350
1351 if (path_is_mount_point(p, false) > 0) {
1352 if (arg_link_journal != LINK_AUTO) {
1353 log_error("%s: already a mount point, refusing to use for journal", p);
1354 return -EEXIST;
1355 }
1356
1357 return 0;
1358 }
1359
1360 if (path_is_mount_point(q, false) > 0) {
1361 if (arg_link_journal != LINK_AUTO) {
1362 log_error("%s: already a mount point, refusing to use for journal", q);
1363 return -EEXIST;
1364 }
1365
1366 return 0;
1367 }
1368
1369 r = readlink_and_make_absolute(p, &d);
1370 if (r >= 0) {
1371 if ((arg_link_journal == LINK_GUEST ||
1372 arg_link_journal == LINK_AUTO) &&
1373 path_equal(d, q)) {
1374
1375 r = mkdir_p(q, 0755);
1376 if (r < 0)
1377 log_warning("Failed to create directory %s: %m", q);
1378 return 0;
1379 }
1380
1381 if (unlink(p) < 0) {
1382 log_error("Failed to remove symlink %s: %m", p);
1383 return -errno;
1384 }
1385 } else if (r == -EINVAL) {
1386
1387 if (arg_link_journal == LINK_GUEST &&
1388 rmdir(p) < 0) {
1389
1390 if (errno == ENOTDIR) {
1391 log_error("%s already exists and is neither a symlink nor a directory", p);
1392 return r;
1393 } else {
1394 log_error("Failed to remove %s: %m", p);
1395 return -errno;
1396 }
1397 }
1398 } else if (r != -ENOENT) {
1399 log_error("readlink(%s) failed: %m", p);
1400 return r;
1401 }
1402
1403 if (arg_link_journal == LINK_GUEST) {
1404
1405 if (symlink(q, p) < 0) {
1406 log_error("Failed to symlink %s to %s: %m", q, p);
1407 return -errno;
1408 }
1409
1410 r = mkdir_p(q, 0755);
1411 if (r < 0)
1412 log_warning("Failed to create directory %s: %m", q);
1413 return 0;
1414 }
1415
1416 if (arg_link_journal == LINK_HOST) {
1417 r = mkdir_p(p, 0755);
1418 if (r < 0) {
1419 log_error("Failed to create %s: %m", p);
1420 return r;
1421 }
1422
1423 } else if (access(p, F_OK) < 0)
1424 return 0;
1425
1426 if (dir_is_empty(q) == 0)
1427 log_warning("%s is not empty, proceeding anyway.", q);
1428
1429 r = mkdir_p(q, 0755);
1430 if (r < 0) {
1431 log_error("Failed to create %s: %m", q);
1432 return r;
1433 }
1434
1435 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1436 log_error("Failed to bind mount journal from host into guest: %m");
1437 return -errno;
1438 }
1439
1440 return 0;
1441 }
1442
1443 static int setup_kdbus(const char *dest, const char *path) {
1444 const char *p;
1445
1446 if (!path)
1447 return 0;
1448
1449 p = strappenda(dest, "/dev/kdbus");
1450 if (mkdir(p, 0755) < 0) {
1451 log_error("Failed to create kdbus path: %m");
1452 return -errno;
1453 }
1454
1455 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1456 log_error("Failed to mount kdbus domain path: %m");
1457 return -errno;
1458 }
1459
1460 return 0;
1461 }
1462
1463 static int drop_capabilities(void) {
1464 return capability_bounding_set_drop(~arg_retain, false);
1465 }
1466
1467 static int register_machine(pid_t pid, int local_ifindex) {
1468 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1469 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1470 int r;
1471
1472 if (!arg_register)
1473 return 0;
1474
1475 r = sd_bus_default_system(&bus);
1476 if (r < 0) {
1477 log_error("Failed to open system bus: %s", strerror(-r));
1478 return r;
1479 }
1480
1481 if (arg_keep_unit) {
1482 r = sd_bus_call_method(
1483 bus,
1484 "org.freedesktop.machine1",
1485 "/org/freedesktop/machine1",
1486 "org.freedesktop.machine1.Manager",
1487 "RegisterMachineWithNetwork",
1488 &error,
1489 NULL,
1490 "sayssusai",
1491 arg_machine,
1492 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1493 "nspawn",
1494 "container",
1495 (uint32_t) pid,
1496 strempty(arg_directory),
1497 local_ifindex > 0 ? 1 : 0, local_ifindex);
1498 } else {
1499 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1500
1501 r = sd_bus_message_new_method_call(
1502 bus,
1503 &m,
1504 "org.freedesktop.machine1",
1505 "/org/freedesktop/machine1",
1506 "org.freedesktop.machine1.Manager",
1507 "CreateMachineWithNetwork");
1508 if (r < 0) {
1509 log_error("Failed to create message: %s", strerror(-r));
1510 return r;
1511 }
1512
1513 r = sd_bus_message_append(
1514 m,
1515 "sayssusai",
1516 arg_machine,
1517 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1518 "nspawn",
1519 "container",
1520 (uint32_t) pid,
1521 strempty(arg_directory),
1522 local_ifindex > 0 ? 1 : 0, local_ifindex);
1523 if (r < 0) {
1524 log_error("Failed to append message arguments: %s", strerror(-r));
1525 return r;
1526 }
1527
1528 r = sd_bus_message_open_container(m, 'a', "(sv)");
1529 if (r < 0) {
1530 log_error("Failed to open container: %s", strerror(-r));
1531 return r;
1532 }
1533
1534 if (!isempty(arg_slice)) {
1535 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1536 if (r < 0) {
1537 log_error("Failed to append slice: %s", strerror(-r));
1538 return r;
1539 }
1540 }
1541
1542 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1543 if (r < 0) {
1544 log_error("Failed to add device policy: %s", strerror(-r));
1545 return r;
1546 }
1547
1548 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 11,
1549 /* Allow the container to
1550 * access and create the API
1551 * device nodes, so that
1552 * PrivateDevices= in the
1553 * container can work
1554 * fine */
1555 "/dev/null", "rwm",
1556 "/dev/zero", "rwm",
1557 "/dev/full", "rwm",
1558 "/dev/random", "rwm",
1559 "/dev/urandom", "rwm",
1560 "/dev/tty", "rwm",
1561 "/dev/net/tun", "rwm",
1562 /* Allow the container
1563 * access to ptys. However,
1564 * do not permit the
1565 * container to ever create
1566 * these device nodes. */
1567 "/dev/pts/ptmx", "rw",
1568 "char-pts", "rw",
1569 /* Allow the container
1570 * access to all kdbus
1571 * devices. Again, the
1572 * container cannot create
1573 * these nodes, only use
1574 * them. We use a pretty
1575 * open match here, so that
1576 * the kernel API can still
1577 * change. */
1578 "char-kdbus", "rw",
1579 "char-kdbus/*", "rw");
1580 if (r < 0) {
1581 log_error("Failed to add device whitelist: %s", strerror(-r));
1582 return r;
1583 }
1584
1585 r = sd_bus_message_close_container(m);
1586 if (r < 0) {
1587 log_error("Failed to close container: %s", strerror(-r));
1588 return r;
1589 }
1590
1591 r = sd_bus_call(bus, m, 0, &error, NULL);
1592 }
1593
1594 if (r < 0) {
1595 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1596 return r;
1597 }
1598
1599 return 0;
1600 }
1601
1602 static int terminate_machine(pid_t pid) {
1603 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1604 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1605 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1606 const char *path;
1607 int r;
1608
1609 if (!arg_register)
1610 return 0;
1611
1612 r = sd_bus_default_system(&bus);
1613 if (r < 0) {
1614 log_error("Failed to open system bus: %s", strerror(-r));
1615 return r;
1616 }
1617
1618 r = sd_bus_call_method(
1619 bus,
1620 "org.freedesktop.machine1",
1621 "/org/freedesktop/machine1",
1622 "org.freedesktop.machine1.Manager",
1623 "GetMachineByPID",
1624 &error,
1625 &reply,
1626 "u",
1627 (uint32_t) pid);
1628 if (r < 0) {
1629 /* Note that the machine might already have been
1630 * cleaned up automatically, hence don't consider it a
1631 * failure if we cannot get the machine object. */
1632 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1633 return 0;
1634 }
1635
1636 r = sd_bus_message_read(reply, "o", &path);
1637 if (r < 0)
1638 return bus_log_parse_error(r);
1639
1640 r = sd_bus_call_method(
1641 bus,
1642 "org.freedesktop.machine1",
1643 path,
1644 "org.freedesktop.machine1.Machine",
1645 "Terminate",
1646 &error,
1647 NULL,
1648 NULL);
1649 if (r < 0) {
1650 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1651 return 0;
1652 }
1653
1654 return 0;
1655 }
1656
1657 static int reset_audit_loginuid(void) {
1658 _cleanup_free_ char *p = NULL;
1659 int r;
1660
1661 if (arg_share_system)
1662 return 0;
1663
1664 r = read_one_line_file("/proc/self/loginuid", &p);
1665 if (r == -ENOENT)
1666 return 0;
1667 if (r < 0) {
1668 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1669 return r;
1670 }
1671
1672 /* Already reset? */
1673 if (streq(p, "4294967295"))
1674 return 0;
1675
1676 r = write_string_file("/proc/self/loginuid", "4294967295");
1677 if (r < 0) {
1678 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1679 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1680 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1681 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1682 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1683
1684 sleep(5);
1685 }
1686
1687 return 0;
1688 }
1689
1690 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1691 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1692
1693 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1694 int r;
1695
1696 uint8_t result[8];
1697 size_t l, sz;
1698 uint8_t *v;
1699
1700 l = strlen(arg_machine);
1701 sz = sizeof(sd_id128_t) + l;
1702 v = alloca(sz);
1703
1704 /* fetch some persistent data unique to the host */
1705 r = sd_id128_get_machine((sd_id128_t*) v);
1706 if (r < 0)
1707 return r;
1708
1709 /* combine with some data unique (on this host) to this
1710 * container instance */
1711 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1712
1713 /* Let's hash the host machine ID plus the container name. We
1714 * use a fixed, but originally randomly created hash key here. */
1715 siphash24(result, v, sz, hash_key.bytes);
1716
1717 assert_cc(ETH_ALEN <= sizeof(result));
1718 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1719
1720 /* see eth_random_addr in the kernel */
1721 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1722 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1723
1724 return 0;
1725 }
1726
1727 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1728 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1729 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1730 struct ether_addr mac_host, mac_container;
1731 int r, i;
1732
1733 if (!arg_private_network)
1734 return 0;
1735
1736 if (!arg_network_veth)
1737 return 0;
1738
1739 /* Use two different interface name prefixes depending whether
1740 * we are in bridge mode or not. */
1741 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1742 arg_network_bridge ? "vb" : "ve", arg_machine);
1743
1744 r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1745 if (r < 0) {
1746 log_error("Failed to generate predictable MAC address for container side");
1747 return r;
1748 }
1749
1750 r = generate_mac(&mac_host, HOST_HASH_KEY);
1751 if (r < 0) {
1752 log_error("Failed to generate predictable MAC address for host side");
1753 return r;
1754 }
1755
1756 r = sd_rtnl_open(&rtnl, 0);
1757 if (r < 0) {
1758 log_error("Failed to connect to netlink: %s", strerror(-r));
1759 return r;
1760 }
1761
1762 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1763 if (r < 0) {
1764 log_error("Failed to allocate netlink message: %s", strerror(-r));
1765 return r;
1766 }
1767
1768 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1769 if (r < 0) {
1770 log_error("Failed to add netlink interface name: %s", strerror(-r));
1771 return r;
1772 }
1773
1774 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1775 if (r < 0) {
1776 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1777 return r;
1778 }
1779
1780 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1781 if (r < 0) {
1782 log_error("Failed to open netlink container: %s", strerror(-r));
1783 return r;
1784 }
1785
1786 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1787 if (r < 0) {
1788 log_error("Failed to open netlink container: %s", strerror(-r));
1789 return r;
1790 }
1791
1792 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1793 if (r < 0) {
1794 log_error("Failed to open netlink container: %s", strerror(-r));
1795 return r;
1796 }
1797
1798 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1799 if (r < 0) {
1800 log_error("Failed to add netlink interface name: %s", strerror(-r));
1801 return r;
1802 }
1803
1804 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1805 if (r < 0) {
1806 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1807 return r;
1808 }
1809
1810 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1811 if (r < 0) {
1812 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1813 return r;
1814 }
1815
1816 r = sd_rtnl_message_close_container(m);
1817 if (r < 0) {
1818 log_error("Failed to close netlink container: %s", strerror(-r));
1819 return r;
1820 }
1821
1822 r = sd_rtnl_message_close_container(m);
1823 if (r < 0) {
1824 log_error("Failed to close netlink container: %s", strerror(-r));
1825 return r;
1826 }
1827
1828 r = sd_rtnl_message_close_container(m);
1829 if (r < 0) {
1830 log_error("Failed to close netlink container: %s", strerror(-r));
1831 return r;
1832 }
1833
1834 r = sd_rtnl_call(rtnl, m, 0, NULL);
1835 if (r < 0) {
1836 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1837 return r;
1838 }
1839
1840 i = (int) if_nametoindex(iface_name);
1841 if (i <= 0) {
1842 log_error("Failed to resolve interface %s: %m", iface_name);
1843 return -errno;
1844 }
1845
1846 *ifi = i;
1847
1848 return 0;
1849 }
1850
1851 static int setup_bridge(const char veth_name[], int *ifi) {
1852 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1853 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1854 int r, bridge;
1855
1856 if (!arg_private_network)
1857 return 0;
1858
1859 if (!arg_network_veth)
1860 return 0;
1861
1862 if (!arg_network_bridge)
1863 return 0;
1864
1865 bridge = (int) if_nametoindex(arg_network_bridge);
1866 if (bridge <= 0) {
1867 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1868 return -errno;
1869 }
1870
1871 *ifi = bridge;
1872
1873 r = sd_rtnl_open(&rtnl, 0);
1874 if (r < 0) {
1875 log_error("Failed to connect to netlink: %s", strerror(-r));
1876 return r;
1877 }
1878
1879 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1880 if (r < 0) {
1881 log_error("Failed to allocate netlink message: %s", strerror(-r));
1882 return r;
1883 }
1884
1885 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1886 if (r < 0) {
1887 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1888 return r;
1889 }
1890
1891 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1892 if (r < 0) {
1893 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1894 return r;
1895 }
1896
1897 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1898 if (r < 0) {
1899 log_error("Failed to add netlink master field: %s", strerror(-r));
1900 return r;
1901 }
1902
1903 r = sd_rtnl_call(rtnl, m, 0, NULL);
1904 if (r < 0) {
1905 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1906 return r;
1907 }
1908
1909 return 0;
1910 }
1911
1912 static int parse_interface(struct udev *udev, const char *name) {
1913 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1914 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1915 int ifi;
1916
1917 ifi = (int) if_nametoindex(name);
1918 if (ifi <= 0) {
1919 log_error("Failed to resolve interface %s: %m", name);
1920 return -errno;
1921 }
1922
1923 sprintf(ifi_str, "n%i", ifi);
1924 d = udev_device_new_from_device_id(udev, ifi_str);
1925 if (!d) {
1926 log_error("Failed to get udev device for interface %s: %m", name);
1927 return -errno;
1928 }
1929
1930 if (udev_device_get_is_initialized(d) <= 0) {
1931 log_error("Network interface %s is not initialized yet.", name);
1932 return -EBUSY;
1933 }
1934
1935 return ifi;
1936 }
1937
1938 static int move_network_interfaces(pid_t pid) {
1939 _cleanup_udev_unref_ struct udev *udev = NULL;
1940 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1941 char **i;
1942 int r;
1943
1944 if (!arg_private_network)
1945 return 0;
1946
1947 if (strv_isempty(arg_network_interfaces))
1948 return 0;
1949
1950 r = sd_rtnl_open(&rtnl, 0);
1951 if (r < 0) {
1952 log_error("Failed to connect to netlink: %s", strerror(-r));
1953 return r;
1954 }
1955
1956 udev = udev_new();
1957 if (!udev) {
1958 log_error("Failed to connect to udev.");
1959 return -ENOMEM;
1960 }
1961
1962 STRV_FOREACH(i, arg_network_interfaces) {
1963 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1964 int ifi;
1965
1966 ifi = parse_interface(udev, *i);
1967 if (ifi < 0)
1968 return ifi;
1969
1970 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1971 if (r < 0) {
1972 log_error("Failed to allocate netlink message: %s", strerror(-r));
1973 return r;
1974 }
1975
1976 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1977 if (r < 0) {
1978 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1979 return r;
1980 }
1981
1982 r = sd_rtnl_call(rtnl, m, 0, NULL);
1983 if (r < 0) {
1984 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1985 return r;
1986 }
1987 }
1988
1989 return 0;
1990 }
1991
1992 static int setup_macvlan(pid_t pid) {
1993 _cleanup_udev_unref_ struct udev *udev = NULL;
1994 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1995 char **i;
1996 int r;
1997
1998 if (!arg_private_network)
1999 return 0;
2000
2001 if (strv_isempty(arg_network_macvlan))
2002 return 0;
2003
2004 r = sd_rtnl_open(&rtnl, 0);
2005 if (r < 0) {
2006 log_error("Failed to connect to netlink: %s", strerror(-r));
2007 return r;
2008 }
2009
2010 udev = udev_new();
2011 if (!udev) {
2012 log_error("Failed to connect to udev.");
2013 return -ENOMEM;
2014 }
2015
2016 STRV_FOREACH(i, arg_network_macvlan) {
2017 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2018 _cleanup_free_ char *n = NULL;
2019 int ifi;
2020
2021 ifi = parse_interface(udev, *i);
2022 if (ifi < 0)
2023 return ifi;
2024
2025 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2026 if (r < 0) {
2027 log_error("Failed to allocate netlink message: %s", strerror(-r));
2028 return r;
2029 }
2030
2031 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2032 if (r < 0) {
2033 log_error("Failed to add netlink interface index: %s", strerror(-r));
2034 return r;
2035 }
2036
2037 n = strappend("mv-", *i);
2038 if (!n)
2039 return log_oom();
2040
2041 strshorten(n, IFNAMSIZ-1);
2042
2043 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2044 if (r < 0) {
2045 log_error("Failed to add netlink interface name: %s", strerror(-r));
2046 return r;
2047 }
2048
2049 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2050 if (r < 0) {
2051 log_error("Failed to add netlink namespace field: %s", strerror(-r));
2052 return r;
2053 }
2054
2055 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2056 if (r < 0) {
2057 log_error("Failed to open netlink container: %s", strerror(-r));
2058 return r;
2059 }
2060
2061 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2062 if (r < 0) {
2063 log_error("Failed to open netlink container: %s", strerror(-r));
2064 return r;
2065 }
2066
2067 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2068 if (r < 0) {
2069 log_error("Failed to append macvlan mode: %s", strerror(-r));
2070 return r;
2071 }
2072
2073 r = sd_rtnl_message_close_container(m);
2074 if (r < 0) {
2075 log_error("Failed to close netlink container: %s", strerror(-r));
2076 return r;
2077 }
2078
2079 r = sd_rtnl_message_close_container(m);
2080 if (r < 0) {
2081 log_error("Failed to close netlink container: %s", strerror(-r));
2082 return r;
2083 }
2084
2085 r = sd_rtnl_call(rtnl, m, 0, NULL);
2086 if (r < 0) {
2087 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
2088 return r;
2089 }
2090 }
2091
2092 return 0;
2093 }
2094
/* Installs a seccomp filter that allows everything by default, makes
 * the syscalls listed in blacklist[] fail with EPERM, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 * Compiles to a no-op returning 0 without HAVE_SECCOMP. Returns 0 on
 * success and a negative errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[n], 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace
         * audit hookup fails when run inside one. We do not care and
         * simply turn off the creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2174
2175 static int setup_image(char **device_path, int *loop_nr) {
2176 struct loop_info64 info = {
2177 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2178 };
2179 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2180 _cleanup_free_ char* loopdev = NULL;
2181 struct stat st;
2182 int r, nr;
2183
2184 assert(device_path);
2185 assert(loop_nr);
2186
2187 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2188 if (fd < 0) {
2189 log_error("Failed to open %s: %m", arg_image);
2190 return -errno;
2191 }
2192
2193 if (fstat(fd, &st) < 0) {
2194 log_error("Failed to stat %s: %m", arg_image);
2195 return -errno;
2196 }
2197
2198 if (S_ISBLK(st.st_mode)) {
2199 char *p;
2200
2201 p = strdup(arg_image);
2202 if (!p)
2203 return log_oom();
2204
2205 *device_path = p;
2206
2207 *loop_nr = -1;
2208
2209 r = fd;
2210 fd = -1;
2211
2212 return r;
2213 }
2214
2215 if (!S_ISREG(st.st_mode)) {
2216 log_error("%s is not a regular file or block device: %m", arg_image);
2217 return -EINVAL;
2218 }
2219
2220 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2221 if (control < 0) {
2222 log_error("Failed to open /dev/loop-control: %m");
2223 return -errno;
2224 }
2225
2226 nr = ioctl(control, LOOP_CTL_GET_FREE);
2227 if (nr < 0) {
2228 log_error("Failed to allocate loop device: %m");
2229 return -errno;
2230 }
2231
2232 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2233 return log_oom();
2234
2235 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2236 if (loop < 0) {
2237 log_error("Failed to open loop device %s: %m", loopdev);
2238 return -errno;
2239 }
2240
2241 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2242 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2243 return -errno;
2244 }
2245
2246 if (arg_read_only)
2247 info.lo_flags |= LO_FLAGS_READ_ONLY;
2248
2249 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2250 log_error("Failed to set loopback settings on %s: %m", loopdev);
2251 return -errno;
2252 }
2253
2254 *device_path = loopdev;
2255 loopdev = NULL;
2256
2257 *loop_nr = nr;
2258
2259 r = loop;
2260 loop = -1;
2261
2262 return r;
2263 }
2264
2265 static int dissect_image(
2266 int fd,
2267 char **root_device, bool *root_device_rw,
2268 char **home_device, bool *home_device_rw,
2269 char **srv_device, bool *srv_device_rw,
2270 bool *secondary) {
2271
2272 #ifdef HAVE_BLKID
2273 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2274 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2275 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2276 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2277 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2278 _cleanup_udev_unref_ struct udev *udev = NULL;
2279 struct udev_list_entry *first, *item;
2280 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2281 const char *pttype = NULL;
2282 blkid_partlist pl;
2283 struct stat st;
2284 int r;
2285
2286 assert(fd >= 0);
2287 assert(root_device);
2288 assert(home_device);
2289 assert(srv_device);
2290 assert(secondary);
2291
2292 b = blkid_new_probe();
2293 if (!b)
2294 return log_oom();
2295
2296 errno = 0;
2297 r = blkid_probe_set_device(b, fd, 0, 0);
2298 if (r != 0) {
2299 if (errno == 0)
2300 return log_oom();
2301
2302 log_error("Failed to set device on blkid probe: %m");
2303 return -errno;
2304 }
2305
2306 blkid_probe_enable_partitions(b, 1);
2307 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2308
2309 errno = 0;
2310 r = blkid_do_safeprobe(b);
2311 if (r == -2 || r == 1) {
2312 log_error("Failed to identify any partition table on %s.\n"
2313 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2314 return -EINVAL;
2315 } else if (r != 0) {
2316 if (errno == 0)
2317 errno = EIO;
2318 log_error("Failed to probe: %m");
2319 return -errno;
2320 }
2321
2322 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2323 if (!streq_ptr(pttype, "gpt")) {
2324 log_error("Image %s does not carry a GUID Partition Table.\n"
2325 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2326 return -EINVAL;
2327 }
2328
2329 errno = 0;
2330 pl = blkid_probe_get_partitions(b);
2331 if (!pl) {
2332 if (errno == 0)
2333 return log_oom();
2334
2335 log_error("Failed to list partitions of %s", arg_image);
2336 return -errno;
2337 }
2338
2339 udev = udev_new();
2340 if (!udev)
2341 return log_oom();
2342
2343 if (fstat(fd, &st) < 0) {
2344 log_error("Failed to stat block device: %m");
2345 return -errno;
2346 }
2347
2348 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2349 if (!d)
2350 return log_oom();
2351
2352 e = udev_enumerate_new(udev);
2353 if (!e)
2354 return log_oom();
2355
2356 r = udev_enumerate_add_match_parent(e, d);
2357 if (r < 0)
2358 return log_oom();
2359
2360 r = udev_enumerate_scan_devices(e);
2361 if (r < 0) {
2362 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2363 return r;
2364 }
2365
2366 first = udev_enumerate_get_list_entry(e);
2367 udev_list_entry_foreach(item, first) {
2368 _cleanup_udev_device_unref_ struct udev_device *q;
2369 const char *stype, *node;
2370 unsigned long long flags;
2371 sd_id128_t type_id;
2372 blkid_partition pp;
2373 dev_t qn;
2374 int nr;
2375
2376 errno = 0;
2377 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2378 if (!q) {
2379 if (!errno)
2380 errno = ENOMEM;
2381
2382 log_error("Failed to get partition device of %s: %m", arg_image);
2383 return -errno;
2384 }
2385
2386 qn = udev_device_get_devnum(q);
2387 if (major(qn) == 0)
2388 continue;
2389
2390 if (st.st_rdev == qn)
2391 continue;
2392
2393 node = udev_device_get_devnode(q);
2394 if (!node)
2395 continue;
2396
2397 pp = blkid_partlist_devno_to_partition(pl, qn);
2398 if (!pp)
2399 continue;
2400
2401 flags = blkid_partition_get_flags(pp);
2402 if (flags & GPT_FLAG_NO_AUTO)
2403 continue;
2404
2405 nr = blkid_partition_get_partno(pp);
2406 if (nr < 0)
2407 continue;
2408
2409 stype = blkid_partition_get_type_string(pp);
2410 if (!stype)
2411 continue;
2412
2413 if (sd_id128_from_string(stype, &type_id) < 0)
2414 continue;
2415
2416 if (sd_id128_equal(type_id, GPT_HOME)) {
2417
2418 if (home && nr >= home_nr)
2419 continue;
2420
2421 home_nr = nr;
2422 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2423
2424 free(home);
2425 home = strdup(node);
2426 if (!home)
2427 return log_oom();
2428 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2429
2430 if (srv && nr >= srv_nr)
2431 continue;
2432
2433 srv_nr = nr;
2434 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2435
2436 free(srv);
2437 srv = strdup(node);
2438 if (!srv)
2439 return log_oom();
2440 }
2441 #ifdef GPT_ROOT_NATIVE
2442 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2443
2444 if (root && nr >= root_nr)
2445 continue;
2446
2447 root_nr = nr;
2448 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2449
2450 free(root);
2451 root = strdup(node);
2452 if (!root)
2453 return log_oom();
2454 }
2455 #endif
2456 #ifdef GPT_ROOT_SECONDARY
2457 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2458
2459 if (secondary_root && nr >= secondary_root_nr)
2460 continue;
2461
2462 secondary_root_nr = nr;
2463 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2464
2465
2466 free(secondary_root);
2467 secondary_root = strdup(node);
2468 if (!secondary_root)
2469 return log_oom();
2470 }
2471 #endif
2472 }
2473
2474 if (!root && !secondary_root) {
2475 log_error("Failed to identify root partition in disk image %s.\n"
2476 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2477 return -EINVAL;
2478 }
2479
2480 if (root) {
2481 *root_device = root;
2482 root = NULL;
2483
2484 *root_device_rw = root_rw;
2485 *secondary = false;
2486 } else if (secondary_root) {
2487 *root_device = secondary_root;
2488 secondary_root = NULL;
2489
2490 *root_device_rw = secondary_root_rw;
2491 *secondary = true;
2492 }
2493
2494 if (home) {
2495 *home_device = home;
2496 home = NULL;
2497
2498 *home_device_rw = home_rw;
2499 }
2500
2501 if (srv) {
2502 *srv_device = srv;
2503 srv = NULL;
2504
2505 *srv_device_rw = srv_rw;
2506 }
2507
2508 return 0;
2509 #else
2510 log_error("--image= is not supported, compiled without blkid support.");
2511 return -ENOTSUP;
2512 #endif
2513 }
2514
/* Probes the file system type of the block device 'what' with libblkid and
 * mounts it with that type and MS_NODEV.
 *
 *   what      - device node to mount (must not be NULL)
 *   where     - base mount point (must not be NULL)
 *   directory - optional suffix appended to 'where'; NULL mounts on 'where'
 *   rw        - mount writable; forced to false when arg_read_only is set
 *
 * Devices whose detected type is "crypto_LUKS" are refused with -ENOTSUP.
 * Returns 0 on success and a negative errno-style value on failure. When
 * compiled without HAVE_BLKID the function always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence clear it first so
         * that an untouched errno can be told apart afterwards */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if
         * the result is ambiguous; both mean we cannot tell the file system
         * type. -1 is a genuine probing error and is handled below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2580
/* Mounts the partitions found in a disk image below 'where': the root
 * device on 'where' itself, the home device on "<where>/home" and the srv
 * device on "<where>/srv". Each device is optional (NULL means skip) and
 * comes with a flag saying whether it may be mounted writable.
 *
 * Returns 0 on success, or the negative error returned by mount_device()
 * for the first mount that fails. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the directory we were handed rather than
         * reaching for the global arg_directory behind the caller's back. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2616
2617 static void loop_remove(int nr, int *image_fd) {
2618 _cleanup_close_ int control = -1;
2619 int r;
2620
2621 if (nr < 0)
2622 return;
2623
2624 if (image_fd && *image_fd >= 0) {
2625 r = ioctl(*image_fd, LOOP_CLR_FD);
2626 if (r < 0)
2627 log_warning("Failed to close loop image: %m");
2628 *image_fd = safe_close(*image_fd);
2629 }
2630
2631 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2632 if (control < 0) {
2633 log_warning("Failed to open /dev/loop-control: %m");
2634 return;
2635 }
2636
2637 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2638 if (r < 0)
2639 log_warning("Failed to remove loop %d: %m", nr);
2640 }
2641
2642 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2643 int pipe_fds[2];
2644 pid_t pid;
2645
2646 assert(database);
2647 assert(key);
2648 assert(rpid);
2649
2650 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2651 log_error("Failed to allocate pipe: %m");
2652 return -errno;
2653 }
2654
2655 pid = fork();
2656 if (pid < 0) {
2657 log_error("Failed to fork getent child: %m");
2658 return -errno;
2659 } else if (pid == 0) {
2660 int nullfd;
2661 char *empty_env = NULL;
2662
2663 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2664 _exit(EXIT_FAILURE);
2665
2666 if (pipe_fds[0] > 2)
2667 safe_close(pipe_fds[0]);
2668 if (pipe_fds[1] > 2)
2669 safe_close(pipe_fds[1]);
2670
2671 nullfd = open("/dev/null", O_RDWR);
2672 if (nullfd < 0)
2673 _exit(EXIT_FAILURE);
2674
2675 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2676 _exit(EXIT_FAILURE);
2677
2678 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2679 _exit(EXIT_FAILURE);
2680
2681 if (nullfd > 2)
2682 safe_close(nullfd);
2683
2684 reset_all_signal_handlers();
2685 close_all_fds(NULL, 0);
2686
2687 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2688 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2689 _exit(EXIT_FAILURE);
2690 }
2691
2692 pipe_fds[1] = safe_close(pipe_fds[1]);
2693
2694 *rpid = pid;
2695
2696 return pipe_fds[0];
2697 }
2698
2699 static int change_uid_gid(char **_home) {
2700 char line[LINE_MAX], *x, *u, *g, *h;
2701 const char *word, *state;
2702 _cleanup_free_ uid_t *uids = NULL;
2703 _cleanup_free_ char *home = NULL;
2704 _cleanup_fclose_ FILE *f = NULL;
2705 _cleanup_close_ int fd = -1;
2706 unsigned n_uids = 0;
2707 size_t sz = 0, l;
2708 uid_t uid;
2709 gid_t gid;
2710 pid_t pid;
2711 int r;
2712
2713 assert(_home);
2714
2715 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2716 /* Reset everything fully to 0, just in case */
2717
2718 if (setgroups(0, NULL) < 0) {
2719 log_error("setgroups() failed: %m");
2720 return -errno;
2721 }
2722
2723 if (setresgid(0, 0, 0) < 0) {
2724 log_error("setregid() failed: %m");
2725 return -errno;
2726 }
2727
2728 if (setresuid(0, 0, 0) < 0) {
2729 log_error("setreuid() failed: %m");
2730 return -errno;
2731 }
2732
2733 *_home = NULL;
2734 return 0;
2735 }
2736
2737 /* First, get user credentials */
2738 fd = spawn_getent("passwd", arg_user, &pid);
2739 if (fd < 0)
2740 return fd;
2741
2742 f = fdopen(fd, "r");
2743 if (!f)
2744 return log_oom();
2745 fd = -1;
2746
2747 if (!fgets(line, sizeof(line), f)) {
2748
2749 if (!ferror(f)) {
2750 log_error("Failed to resolve user %s.", arg_user);
2751 return -ESRCH;
2752 }
2753
2754 log_error("Failed to read from getent: %m");
2755 return -errno;
2756 }
2757
2758 truncate_nl(line);
2759
2760 wait_for_terminate_and_warn("getent passwd", pid);
2761
2762 x = strchr(line, ':');
2763 if (!x) {
2764 log_error("/etc/passwd entry has invalid user field.");
2765 return -EIO;
2766 }
2767
2768 u = strchr(x+1, ':');
2769 if (!u) {
2770 log_error("/etc/passwd entry has invalid password field.");
2771 return -EIO;
2772 }
2773
2774 u++;
2775 g = strchr(u, ':');
2776 if (!g) {
2777 log_error("/etc/passwd entry has invalid UID field.");
2778 return -EIO;
2779 }
2780
2781 *g = 0;
2782 g++;
2783 x = strchr(g, ':');
2784 if (!x) {
2785 log_error("/etc/passwd entry has invalid GID field.");
2786 return -EIO;
2787 }
2788
2789 *x = 0;
2790 h = strchr(x+1, ':');
2791 if (!h) {
2792 log_error("/etc/passwd entry has invalid GECOS field.");
2793 return -EIO;
2794 }
2795
2796 h++;
2797 x = strchr(h, ':');
2798 if (!x) {
2799 log_error("/etc/passwd entry has invalid home directory field.");
2800 return -EIO;
2801 }
2802
2803 *x = 0;
2804
2805 r = parse_uid(u, &uid);
2806 if (r < 0) {
2807 log_error("Failed to parse UID of user.");
2808 return -EIO;
2809 }
2810
2811 r = parse_gid(g, &gid);
2812 if (r < 0) {
2813 log_error("Failed to parse GID of user.");
2814 return -EIO;
2815 }
2816
2817 home = strdup(h);
2818 if (!home)
2819 return log_oom();
2820
2821 /* Second, get group memberships */
2822 fd = spawn_getent("initgroups", arg_user, &pid);
2823 if (fd < 0)
2824 return fd;
2825
2826 fclose(f);
2827 f = fdopen(fd, "r");
2828 if (!f)
2829 return log_oom();
2830 fd = -1;
2831
2832 if (!fgets(line, sizeof(line), f)) {
2833 if (!ferror(f)) {
2834 log_error("Failed to resolve user %s.", arg_user);
2835 return -ESRCH;
2836 }
2837
2838 log_error("Failed to read from getent: %m");
2839 return -errno;
2840 }
2841
2842 truncate_nl(line);
2843
2844 wait_for_terminate_and_warn("getent initgroups", pid);
2845
2846 /* Skip over the username and subsequent separator whitespace */
2847 x = line;
2848 x += strcspn(x, WHITESPACE);
2849 x += strspn(x, WHITESPACE);
2850
2851 FOREACH_WORD(word, l, x, state) {
2852 char c[l+1];
2853
2854 memcpy(c, word, l);
2855 c[l] = 0;
2856
2857 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2858 return log_oom();
2859
2860 r = parse_uid(c, &uids[n_uids++]);
2861 if (r < 0) {
2862 log_error("Failed to parse group data from getent.");
2863 return -EIO;
2864 }
2865 }
2866
2867 r = mkdir_parents(home, 0775);
2868 if (r < 0) {
2869 log_error("Failed to make home root directory: %s", strerror(-r));
2870 return r;
2871 }
2872
2873 r = mkdir_safe(home, 0755, uid, gid);
2874 if (r < 0 && r != -EEXIST) {
2875 log_error("Failed to make home directory: %s", strerror(-r));
2876 return r;
2877 }
2878
2879 fchown(STDIN_FILENO, uid, gid);
2880 fchown(STDOUT_FILENO, uid, gid);
2881 fchown(STDERR_FILENO, uid, gid);
2882
2883 if (setgroups(n_uids, uids) < 0) {
2884 log_error("Failed to set auxiliary groups: %m");
2885 return -errno;
2886 }
2887
2888 if (setresgid(gid, gid, gid) < 0) {
2889 log_error("setregid() failed: %m");
2890 return -errno;
2891 }
2892
2893 if (setresuid(uid, uid, uid) < 0) {
2894 log_error("setreuid() failed: %m");
2895 return -errno;
2896 }
2897
2898 if (_home) {
2899 *_home = home;
2900 home = NULL;
2901 }
2902
2903 return 0;
2904 }
2905
2906 /*
2907 * Return values:
2908 * < 0 : wait_for_terminate() failed to get the state of the
2909 * container, the container was terminated by a signal, or
2910 * failed for an unknown reason. No change is made to the
2911 * container argument.
2912 * > 0 : The program executed in the container terminated with an
2913 * error. The exit code of the program executed in the
2914 * container is returned. No change is made to the container
2915 * argument.
2916 * 0 : The container is being rebooted, has been shut down or exited
2917 * successfully. The container argument has been set to either
2918 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2919 *
2920 * That is, success is indicated by a return value of zero, and an
2921 * error is indicated by a non-zero value.
2922 */
2923 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2924 int r;
2925 siginfo_t status;
2926
2927 r = wait_for_terminate(pid, &status);
2928 if (r < 0) {
2929 log_warning("Failed to wait for container: %s", strerror(-r));
2930 return r;
2931 }
2932
2933 switch (status.si_code) {
2934 case CLD_EXITED:
2935 r = status.si_status;
2936 if (r == 0) {
2937 if (!arg_quiet)
2938 log_debug("Container %s exited successfully.",
2939 arg_machine);
2940
2941 *container = CONTAINER_TERMINATED;
2942 } else {
2943 log_error("Container %s failed with error code %i.",
2944 arg_machine, status.si_status);
2945 }
2946 break;
2947
2948 case CLD_KILLED:
2949 if (status.si_status == SIGINT) {
2950 if (!arg_quiet)
2951 log_info("Container %s has been shut down.",
2952 arg_machine);
2953
2954 *container = CONTAINER_TERMINATED;
2955 r = 0;
2956 break;
2957 } else if (status.si_status == SIGHUP) {
2958 if (!arg_quiet)
2959 log_info("Container %s is being rebooted.",
2960 arg_machine);
2961
2962 *container = CONTAINER_REBOOTED;
2963 r = 0;
2964 break;
2965 }
2966 /* CLD_KILLED fallthrough */
2967
2968 case CLD_DUMPED:
2969 log_error("Container %s terminated by signal %s.",
2970 arg_machine, signal_to_string(status.si_status));
2971 r = -1;
2972 break;
2973
2974 default:
2975 log_error("Container %s failed due to unknown reason.",
2976 arg_machine);
2977 r = -1;
2978 break;
2979 }
2980
2981 return r;
2982 }
2983
/* Signal handler that intentionally does nothing; main() installs it for
 * SIGCHLD. */
static void nop_handler(int sig) {
        (void) sig;
}
2985
2986 int main(int argc, char *argv[]) {
2987
2988 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2989 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2990 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2991 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2992 _cleanup_fdset_free_ FDSet *fds = NULL;
2993 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2994 const char *console = NULL;
2995 char veth_name[IFNAMSIZ];
2996 bool secondary = false;
2997 sigset_t mask, mask_chld;
2998 pid_t pid = 0;
2999
3000 log_parse_environment();
3001 log_open();
3002
3003 k = parse_argv(argc, argv);
3004 if (k < 0)
3005 goto finish;
3006 else if (k == 0) {
3007 r = EXIT_SUCCESS;
3008 goto finish;
3009 }
3010
3011 if (!arg_image) {
3012 if (arg_directory) {
3013 char *p;
3014
3015 p = path_make_absolute_cwd(arg_directory);
3016 free(arg_directory);
3017 arg_directory = p;
3018 } else
3019 arg_directory = get_current_dir_name();
3020
3021 if (!arg_directory) {
3022 log_error("Failed to determine path, please use -D.");
3023 goto finish;
3024 }
3025 path_kill_slashes(arg_directory);
3026 }
3027
3028 if (!arg_machine) {
3029 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
3030 if (!arg_machine) {
3031 log_oom();
3032 goto finish;
3033 }
3034
3035 hostname_cleanup(arg_machine, false);
3036 if (isempty(arg_machine)) {
3037 log_error("Failed to determine machine name automatically, please use -M.");
3038 goto finish;
3039 }
3040 }
3041
3042 if (geteuid() != 0) {
3043 log_error("Need to be root.");
3044 goto finish;
3045 }
3046
3047 if (sd_booted() <= 0) {
3048 log_error("Not running on a systemd system.");
3049 goto finish;
3050 }
3051
3052 log_close();
3053 n_fd_passed = sd_listen_fds(false);
3054 if (n_fd_passed > 0) {
3055 k = fdset_new_listen_fds(&fds, false);
3056 if (k < 0) {
3057 log_error("Failed to collect file descriptors: %s", strerror(-k));
3058 goto finish;
3059 }
3060 }
3061 fdset_close_others(fds);
3062 log_open();
3063
3064 if (arg_directory) {
3065 if (path_equal(arg_directory, "/")) {
3066 log_error("Spawning container on root directory not supported.");
3067 goto finish;
3068 }
3069
3070 if (arg_boot) {
3071 if (path_is_os_tree(arg_directory) <= 0) {
3072 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3073 goto finish;
3074 }
3075 } else {
3076 const char *p;
3077
3078 p = strappenda(arg_directory,
3079 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3080 if (access(p, F_OK) < 0) {
3081 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3082 goto finish;
3083
3084 }
3085 }
3086 } else {
3087 char template[] = "/tmp/nspawn-root-XXXXXX";
3088
3089 if (!mkdtemp(template)) {
3090 log_error("Failed to create temporary directory: %m");
3091 r = -errno;
3092 goto finish;
3093 }
3094
3095 arg_directory = strdup(template);
3096 if (!arg_directory) {
3097 r = log_oom();
3098 goto finish;
3099 }
3100
3101 image_fd = setup_image(&device_path, &loop_nr);
3102 if (image_fd < 0) {
3103 r = image_fd;
3104 goto finish;
3105 }
3106
3107 r = dissect_image(image_fd,
3108 &root_device, &root_device_rw,
3109 &home_device, &home_device_rw,
3110 &srv_device, &srv_device_rw,
3111 &secondary);
3112 if (r < 0)
3113 goto finish;
3114 }
3115
3116 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3117 if (master < 0) {
3118 log_error("Failed to acquire pseudo tty: %m");
3119 goto finish;
3120 }
3121
3122 console = ptsname(master);
3123 if (!console) {
3124 log_error("Failed to determine tty name: %m");
3125 goto finish;
3126 }
3127
3128 if (!arg_quiet)
3129 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3130 arg_machine, arg_image ? arg_image : arg_directory);
3131
3132 if (unlockpt(master) < 0) {
3133 log_error("Failed to unlock tty: %m");
3134 goto finish;
3135 }
3136
3137 if (access("/dev/kdbus/control", F_OK) >= 0) {
3138
3139 if (arg_share_system) {
3140 kdbus_domain = strdup("/dev/kdbus");
3141 if (!kdbus_domain) {
3142 log_oom();
3143 goto finish;
3144 }
3145 } else {
3146 const char *ns;
3147
3148 ns = strappenda("machine-", arg_machine);
3149 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3150 if (r < 0)
3151 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3152 else
3153 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3154 }
3155 }
3156
3157 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3158 log_error("Failed to create kmsg socket pair: %m");
3159 goto finish;
3160 }
3161
3162 sd_notify(false,
3163 "READY=1\n"
3164 "STATUS=Container running.");
3165
3166 assert_se(sigemptyset(&mask) == 0);
3167 assert_se(sigemptyset(&mask_chld) == 0);
3168 sigaddset(&mask_chld, SIGCHLD);
3169 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3170 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3171
3172 for (;;) {
3173 ContainerStatus container_status;
3174 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3175 struct sigaction sa = {
3176 .sa_handler = nop_handler,
3177 .sa_flags = SA_NOCLDSTOP,
3178 };
3179
3180 r = barrier_create(&barrier);
3181 if (r < 0) {
3182 log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3183 goto finish;
3184 }
3185
3186 /* Child can be killed before execv(), so handle SIGCHLD
3187 * in order to interrupt parent's blocking calls and
3188 * give it a chance to call wait() and terminate. */
3189 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3190 if (r < 0) {
3191 log_error("Failed to change the signal mask: %m");
3192 goto finish;
3193 }
3194
3195 r = sigaction(SIGCHLD, &sa, NULL);
3196 if (r < 0) {
3197 log_error("Failed to install SIGCHLD handler: %m");
3198 goto finish;
3199 }
3200
3201 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3202 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3203 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3204 if (pid < 0) {
3205 if (errno == EINVAL)
3206 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3207 else
3208 log_error("clone() failed: %m");
3209
3210 r = pid;
3211 goto finish;
3212 }
3213
3214 if (pid == 0) {
3215 /* child */
3216 _cleanup_free_ char *home = NULL;
3217 unsigned n_env = 2;
3218 const char *envp[] = {
3219 "PATH=" DEFAULT_PATH_SPLIT_USR,
3220 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3221 NULL, /* TERM */
3222 NULL, /* HOME */
3223 NULL, /* USER */
3224 NULL, /* LOGNAME */
3225 NULL, /* container_uuid */
3226 NULL, /* LISTEN_FDS */
3227 NULL, /* LISTEN_PID */
3228 NULL
3229 };
3230 char **env_use;
3231
3232 barrier_set_role(&barrier, BARRIER_CHILD);
3233
3234 envp[n_env] = strv_find_prefix(environ, "TERM=");
3235 if (envp[n_env])
3236 n_env ++;
3237
3238 master = safe_close(master);
3239
3240 close_nointr(STDIN_FILENO);
3241 close_nointr(STDOUT_FILENO);
3242 close_nointr(STDERR_FILENO);
3243
3244 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3245
3246 reset_all_signal_handlers();
3247 reset_signal_mask();
3248
3249 k = open_terminal(console, O_RDWR);
3250 if (k != STDIN_FILENO) {
3251 if (k >= 0) {
3252 safe_close(k);
3253 k = -EINVAL;
3254 }
3255
3256 log_error("Failed to open console: %s", strerror(-k));
3257 _exit(EXIT_FAILURE);
3258 }
3259
3260 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3261 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3262 log_error("Failed to duplicate console: %m");
3263 _exit(EXIT_FAILURE);
3264 }
3265
3266 if (setsid() < 0) {
3267 log_error("setsid() failed: %m");
3268 _exit(EXIT_FAILURE);
3269 }
3270
3271 if (reset_audit_loginuid() < 0)
3272 _exit(EXIT_FAILURE);
3273
3274 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3275 log_error("PR_SET_PDEATHSIG failed: %m");
3276 _exit(EXIT_FAILURE);
3277 }
3278
3279 /* Mark everything as slave, so that we still
3280 * receive mounts from the real root, but don't
3281 * propagate mounts to the real root. */
3282 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3283 log_error("MS_SLAVE|MS_REC failed: %m");
3284 _exit(EXIT_FAILURE);
3285 }
3286
3287 if (mount_devices(arg_directory,
3288 root_device, root_device_rw,
3289 home_device, home_device_rw,
3290 srv_device, srv_device_rw) < 0)
3291 _exit(EXIT_FAILURE);
3292
3293 /* Turn directory into bind mount */
3294 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3295 log_error("Failed to make bind mount: %m");
3296 _exit(EXIT_FAILURE);
3297 }
3298
3299 r = setup_volatile(arg_directory);
3300 if (r < 0)
3301 _exit(EXIT_FAILURE);
3302
3303 if (setup_volatile_state(arg_directory) < 0)
3304 _exit(EXIT_FAILURE);
3305
3306 r = base_filesystem_create(arg_directory);
3307 if (r < 0)
3308 _exit(EXIT_FAILURE);
3309
3310 if (arg_read_only) {
3311 k = bind_remount_recursive(arg_directory, true);
3312 if (k < 0) {
3313 log_error("Failed to make tree read-only: %s", strerror(-k));
3314 _exit(EXIT_FAILURE);
3315 }
3316 }
3317
3318 if (mount_all(arg_directory) < 0)
3319 _exit(EXIT_FAILURE);
3320
3321 if (copy_devnodes(arg_directory) < 0)
3322 _exit(EXIT_FAILURE);
3323
3324 if (setup_ptmx(arg_directory) < 0)
3325 _exit(EXIT_FAILURE);
3326
3327 dev_setup(arg_directory);
3328
3329 if (setup_seccomp() < 0)
3330 _exit(EXIT_FAILURE);
3331
3332 if (setup_dev_console(arg_directory, console) < 0)
3333 _exit(EXIT_FAILURE);
3334
3335 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3336 _exit(EXIT_FAILURE);
3337
3338 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3339
3340 if (setup_boot_id(arg_directory) < 0)
3341 _exit(EXIT_FAILURE);
3342
3343 if (setup_timezone(arg_directory) < 0)
3344 _exit(EXIT_FAILURE);
3345
3346 if (setup_resolv_conf(arg_directory) < 0)
3347 _exit(EXIT_FAILURE);
3348
3349 if (setup_journal(arg_directory) < 0)
3350 _exit(EXIT_FAILURE);
3351
3352 if (mount_binds(arg_directory, arg_bind, false) < 0)
3353 _exit(EXIT_FAILURE);
3354
3355 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3356 _exit(EXIT_FAILURE);
3357
3358 if (mount_tmpfs(arg_directory) < 0)
3359 _exit(EXIT_FAILURE);
3360
3361 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3362 _exit(EXIT_FAILURE);
3363
3364 /* Tell the parent that we are ready, and that
3365 * it can cgroupify us to that we lack access
3366 * to certain devices and resources. */
3367 barrier_place(&barrier);
3368
3369 if (chdir(arg_directory) < 0) {
3370 log_error("chdir(%s) failed: %m", arg_directory);
3371 _exit(EXIT_FAILURE);
3372 }
3373
3374 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3375 log_error("mount(MS_MOVE) failed: %m");
3376 _exit(EXIT_FAILURE);
3377 }
3378
3379 if (chroot(".") < 0) {
3380 log_error("chroot() failed: %m");
3381 _exit(EXIT_FAILURE);
3382 }
3383
3384 if (chdir("/") < 0) {
3385 log_error("chdir() failed: %m");
3386 _exit(EXIT_FAILURE);
3387 }
3388
3389 umask(0022);
3390
3391 if (arg_private_network)
3392 loopback_setup();
3393
3394 if (drop_capabilities() < 0) {
3395 log_error("drop_capabilities() failed: %m");
3396 _exit(EXIT_FAILURE);
3397 }
3398
3399 r = change_uid_gid(&home);
3400 if (r < 0)
3401 _exit(EXIT_FAILURE);
3402
3403 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3404 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3405 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3406 log_oom();
3407 _exit(EXIT_FAILURE);
3408 }
3409
3410 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3411 char as_uuid[37];
3412
3413 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3414 log_oom();
3415 _exit(EXIT_FAILURE);
3416 }
3417 }
3418
3419 if (fdset_size(fds) > 0) {
3420 k = fdset_cloexec(fds, false);
3421 if (k < 0) {
3422 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3423 _exit(EXIT_FAILURE);
3424 }
3425
3426 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3427 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3428 log_oom();
3429 _exit(EXIT_FAILURE);
3430 }
3431 }
3432
3433 setup_hostname();
3434
3435 if (arg_personality != 0xffffffffLU) {
3436 if (personality(arg_personality) < 0) {
3437 log_error("personality() failed: %m");
3438 _exit(EXIT_FAILURE);
3439 }
3440 } else if (secondary) {
3441 if (personality(PER_LINUX32) < 0) {
3442 log_error("personality() failed: %m");
3443 _exit(EXIT_FAILURE);
3444 }
3445 }
3446
3447 #ifdef HAVE_SELINUX
3448 if (arg_selinux_context)
3449 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3450 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3451 _exit(EXIT_FAILURE);
3452 }
3453 #endif
3454
3455 if (!strv_isempty(arg_setenv)) {
3456 char **n;
3457
3458 n = strv_env_merge(2, envp, arg_setenv);
3459 if (!n) {
3460 log_oom();
3461 _exit(EXIT_FAILURE);
3462 }
3463
3464 env_use = n;
3465 } else
3466 env_use = (char**) envp;
3467
3468 /* Wait until the parent is ready with the setup, too... */
3469 if (!barrier_place_and_sync(&barrier))
3470 _exit(EXIT_FAILURE);
3471
3472 if (arg_boot) {
3473 char **a;
3474 size_t l;
3475
3476 /* Automatically search for the init system */
3477
3478 l = 1 + argc - optind;
3479 a = newa(char*, l + 1);
3480 memcpy(a + 1, argv + optind, l * sizeof(char*));
3481
3482 a[0] = (char*) "/usr/lib/systemd/systemd";
3483 execve(a[0], a, env_use);
3484
3485 a[0] = (char*) "/lib/systemd/systemd";
3486 execve(a[0], a, env_use);
3487
3488 a[0] = (char*) "/sbin/init";
3489 execve(a[0], a, env_use);
3490 } else if (argc > optind)
3491 execvpe(argv[optind], argv + optind, env_use);
3492 else {
3493 chdir(home ? home : "/root");
3494 execle("/bin/bash", "-bash", NULL, env_use);
3495 execle("/bin/sh", "-sh", NULL, env_use);
3496 }
3497
3498 log_error("execv() failed: %m");
3499 _exit(EXIT_FAILURE);
3500 }
3501
3502 barrier_set_role(&barrier, BARRIER_PARENT);
3503 fdset_free(fds);
3504 fds = NULL;
3505
3506 /* wait for child-setup to be done */
3507 if (barrier_place_and_sync(&barrier)) {
3508 int ifi = 0;
3509
3510 r = move_network_interfaces(pid);
3511 if (r < 0)
3512 goto finish;
3513
3514 r = setup_veth(pid, veth_name, &ifi);
3515 if (r < 0)
3516 goto finish;
3517
3518 r = setup_bridge(veth_name, &ifi);
3519 if (r < 0)
3520 goto finish;
3521
3522 r = setup_macvlan(pid);
3523 if (r < 0)
3524 goto finish;
3525
3526 r = register_machine(pid, ifi);
3527 if (r < 0)
3528 goto finish;
3529
3530 /* Block SIGCHLD here, before notifying child.
3531 * process_pty() will handle it with the other signals. */
3532 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3533 if (r < 0)
3534 goto finish;
3535
3536 /* Reset signal to default */
3537 r = default_signals(SIGCHLD, -1);
3538 if (r < 0)
3539 goto finish;
3540
3541 /* Notify the child that the parent is ready with all
3542 * its setup, and that the child can now hand over
3543 * control to the code to run inside the container. */
3544 barrier_place(&barrier);
3545
3546 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3547 if (k < 0) {
3548 r = EXIT_FAILURE;
3549 break;
3550 }
3551
3552 if (!arg_quiet)
3553 putc('\n', stdout);
3554
3555 /* Kill if it is not dead yet anyway */
3556 terminate_machine(pid);
3557 }
3558
3559 /* Normally redundant, but better safe than sorry */
3560 kill(pid, SIGKILL);
3561
3562 r = wait_for_container(pid, &container_status);
3563 pid = 0;
3564
3565 if (r < 0) {
3566 /* We failed to wait for the container, or the
3567 * container exited abnormally */
3568 r = EXIT_FAILURE;
3569 break;
3570 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3571 /* The container exited with a non-zero
3572 * status, or with zero status and no reboot
3573 * was requested. */
3574 break;
3575
3576 /* CONTAINER_REBOOTED, loop again */
3577
3578 if (arg_keep_unit) {
3579 /* Special handling if we are running as a
3580 * service: instead of simply restarting the
3581 * machine we want to restart the entire
3582 * service, so let's inform systemd about this
3583 * with the special exit code 133. The service
3584 * file uses RestartForceExitStatus=133 so
3585 * that this results in a full nspawn
3586 * restart. This is necessary since we might
3587 * have cgroup parameters set we want to have
3588 * flushed out. */
3589 r = 133;
3590 break;
3591 }
3592 }
3593
3594 finish:
3595 sd_notify(false,
3596 "STOPPING=1\n"
3597 "STATUS=Terminating...");
3598
3599 loop_remove(loop_nr, &image_fd);
3600
3601 if (pid > 0)
3602 kill(pid, SIGKILL);
3603
3604 free(arg_directory);
3605 free(arg_machine);
3606 free(arg_user);
3607 strv_free(arg_setenv);
3608 strv_free(arg_network_interfaces);
3609 strv_free(arg_network_macvlan);
3610 strv_free(arg_bind);
3611 strv_free(arg_bind_ro);
3612 strv_free(arg_tmpfs);
3613
3614 return r;
3615 }