]> git.ipfire.org Git - thirdparty/systemd.git/blame_incremental - src/nspawn/nspawn.c
os-release: define /usr/lib/os-release as fallback for /etc/os-release
[thirdparty/systemd.git] / src / nspawn / nspawn.c
... / ...
CommitLineData
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
36#include <termios.h>
37#include <sys/signalfd.h>
38#include <grp.h>
39#include <linux/fs.h>
40#include <sys/un.h>
41#include <sys/socket.h>
42#include <linux/netlink.h>
43#include <sys/eventfd.h>
44#include <net/if.h>
45#include <linux/veth.h>
46#include <sys/personality.h>
47#include <linux/loop.h>
48
49#ifdef HAVE_SELINUX
50#include <selinux/selinux.h>
51#endif
52
53#ifdef HAVE_SECCOMP
54#include <seccomp.h>
55#endif
56
57#ifdef HAVE_BLKID
58#include <blkid/blkid.h>
59#endif
60
61#include "sd-daemon.h"
62#include "sd-bus.h"
63#include "sd-id128.h"
64#include "sd-rtnl.h"
65#include "log.h"
66#include "util.h"
67#include "mkdir.h"
68#include "macro.h"
69#include "audit.h"
70#include "missing.h"
71#include "cgroup-util.h"
72#include "strv.h"
73#include "path-util.h"
74#include "loopback-setup.h"
75#include "dev-setup.h"
76#include "fdset.h"
77#include "build.h"
78#include "fileio.h"
79#include "bus-util.h"
80#include "bus-error.h"
81#include "ptyfwd.h"
82#include "bus-kernel.h"
83#include "env-util.h"
84#include "def.h"
85#include "rtnl-util.h"
86#include "udev-util.h"
87#include "eventfd-util.h"
88#include "blkid-util.h"
89#include "gpt.h"
90#include "siphash24.h"
91#include "copy.h"
92
93#ifdef HAVE_SECCOMP
94#include "seccomp-util.h"
95#endif
96
/* Final state of a container run.
 * NOTE(review): the code consuming these values lies outside the visible part of this file;
 * going by the names, they distinguish a plain exit from a reboot request -- confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
101
/* Journal linking mode, selected with --link-journal= (or -j) and acted upon in setup_journal(). */
typedef enum LinkJournal {
        LINK_NO,        /* never link host and guest journal directories */
        LINK_AUTO,      /* default; setup_journal() quietly gives up in several situations */
        LINK_HOST,      /* create the journal directory on the host and bind mount it into the guest */
        LINK_GUEST      /* make the host path a symlink to the guest's journal directory */
} LinkJournal;
108
109static char *arg_directory = NULL;
110static char *arg_user = NULL;
111static sd_id128_t arg_uuid = {};
112static char *arg_machine = NULL;
113static const char *arg_selinux_context = NULL;
114static const char *arg_selinux_apifs_context = NULL;
115static const char *arg_slice = NULL;
116static bool arg_private_network = false;
117static bool arg_read_only = false;
118static bool arg_boot = false;
119static LinkJournal arg_link_journal = LINK_AUTO;
120static uint64_t arg_retain =
121 (1ULL << CAP_CHOWN) |
122 (1ULL << CAP_DAC_OVERRIDE) |
123 (1ULL << CAP_DAC_READ_SEARCH) |
124 (1ULL << CAP_FOWNER) |
125 (1ULL << CAP_FSETID) |
126 (1ULL << CAP_IPC_OWNER) |
127 (1ULL << CAP_KILL) |
128 (1ULL << CAP_LEASE) |
129 (1ULL << CAP_LINUX_IMMUTABLE) |
130 (1ULL << CAP_NET_BIND_SERVICE) |
131 (1ULL << CAP_NET_BROADCAST) |
132 (1ULL << CAP_NET_RAW) |
133 (1ULL << CAP_SETGID) |
134 (1ULL << CAP_SETFCAP) |
135 (1ULL << CAP_SETPCAP) |
136 (1ULL << CAP_SETUID) |
137 (1ULL << CAP_SYS_ADMIN) |
138 (1ULL << CAP_SYS_CHROOT) |
139 (1ULL << CAP_SYS_NICE) |
140 (1ULL << CAP_SYS_PTRACE) |
141 (1ULL << CAP_SYS_TTY_CONFIG) |
142 (1ULL << CAP_SYS_RESOURCE) |
143 (1ULL << CAP_SYS_BOOT) |
144 (1ULL << CAP_AUDIT_WRITE) |
145 (1ULL << CAP_AUDIT_CONTROL) |
146 (1ULL << CAP_MKNOD);
147static char **arg_bind = NULL;
148static char **arg_bind_ro = NULL;
149static char **arg_tmpfs = NULL;
150static char **arg_setenv = NULL;
151static bool arg_quiet = false;
152static bool arg_share_system = false;
153static bool arg_register = true;
154static bool arg_keep_unit = false;
155static char **arg_network_interfaces = NULL;
156static char **arg_network_macvlan = NULL;
157static bool arg_network_veth = false;
158static const char *arg_network_bridge = NULL;
159static unsigned long arg_personality = 0xffffffffLU;
160static const char *arg_image = NULL;
161
162static int help(void) {
163
164 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
165 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
166 " -h --help Show this help\n"
167 " --version Print version string\n"
168 " -q --quiet Do not show status information\n"
169 " -D --directory=PATH Root directory for the container\n"
170 " -i --image=PATH File system device or image for the container\n"
171 " -b --boot Boot up full system (i.e. invoke init)\n"
172 " -u --user=USER Run the command under specified user or uid\n"
173 " -M --machine=NAME Set the machine name for the container\n"
174 " --uuid=UUID Set a specific machine UUID for the container\n"
175 " -S --slice=SLICE Place the container in the specified slice\n"
176 " --private-network Disable network in container\n"
177 " --network-interface=INTERFACE\n"
178 " Assign an existing network interface to the\n"
179 " container\n"
180 " --network-macvlan=INTERFACE\n"
181 " Create a macvlan network interface based on an\n"
182 " existing network interface to the container\n"
183 " --network-veth Add a virtual ethernet connection between host\n"
184 " and container\n"
185 " --network-bridge=INTERFACE\n"
186 " Add a virtual ethernet connection between host\n"
187 " and container and add it to an existing bridge on\n"
188 " the host\n"
189 " -Z --selinux-context=SECLABEL\n"
190 " Set the SELinux security context to be used by\n"
191 " processes in the container\n"
192 " -L --selinux-apifs-context=SECLABEL\n"
193 " Set the SELinux security context to be used by\n"
194 " API/tmpfs file systems in the container\n"
195 " --capability=CAP In addition to the default, retain specified\n"
196 " capability\n"
197 " --drop-capability=CAP Drop the specified capability from the default set\n"
198 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
199 " -j Equivalent to --link-journal=host\n"
200 " --read-only Mount the root directory read-only\n"
201 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
202 " the container\n"
203 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
204 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
205 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
206 " --share-system Share system namespaces with host\n"
207 " --register=BOOLEAN Register container as machine\n"
208 " --keep-unit Do not register a scope for the machine, reuse\n"
209 " the service unit nspawn is running in\n",
210 program_invocation_short_name);
211
212 return 0;
213}
214
215static int parse_argv(int argc, char *argv[]) {
216
217 enum {
218 ARG_VERSION = 0x100,
219 ARG_PRIVATE_NETWORK,
220 ARG_UUID,
221 ARG_READ_ONLY,
222 ARG_CAPABILITY,
223 ARG_DROP_CAPABILITY,
224 ARG_LINK_JOURNAL,
225 ARG_BIND,
226 ARG_BIND_RO,
227 ARG_TMPFS,
228 ARG_SETENV,
229 ARG_SHARE_SYSTEM,
230 ARG_REGISTER,
231 ARG_KEEP_UNIT,
232 ARG_NETWORK_INTERFACE,
233 ARG_NETWORK_MACVLAN,
234 ARG_NETWORK_VETH,
235 ARG_NETWORK_BRIDGE,
236 ARG_PERSONALITY,
237 };
238
239 static const struct option options[] = {
240 { "help", no_argument, NULL, 'h' },
241 { "version", no_argument, NULL, ARG_VERSION },
242 { "directory", required_argument, NULL, 'D' },
243 { "user", required_argument, NULL, 'u' },
244 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
245 { "boot", no_argument, NULL, 'b' },
246 { "uuid", required_argument, NULL, ARG_UUID },
247 { "read-only", no_argument, NULL, ARG_READ_ONLY },
248 { "capability", required_argument, NULL, ARG_CAPABILITY },
249 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
250 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
251 { "bind", required_argument, NULL, ARG_BIND },
252 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
253 { "tmpfs", required_argument, NULL, ARG_TMPFS },
254 { "machine", required_argument, NULL, 'M' },
255 { "slice", required_argument, NULL, 'S' },
256 { "setenv", required_argument, NULL, ARG_SETENV },
257 { "selinux-context", required_argument, NULL, 'Z' },
258 { "selinux-apifs-context", required_argument, NULL, 'L' },
259 { "quiet", no_argument, NULL, 'q' },
260 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
261 { "register", required_argument, NULL, ARG_REGISTER },
262 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
263 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
264 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
265 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
266 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
267 { "personality", required_argument, NULL, ARG_PERSONALITY },
268 { "image", required_argument, NULL, 'i' },
269 {}
270 };
271
272 int c, r;
273 uint64_t plus = 0, minus = 0;
274
275 assert(argc >= 0);
276 assert(argv);
277
278 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
279
280 switch (c) {
281
282 case 'h':
283 return help();
284
285 case ARG_VERSION:
286 puts(PACKAGE_STRING);
287 puts(SYSTEMD_FEATURES);
288 return 0;
289
290 case 'D':
291 free(arg_directory);
292 arg_directory = canonicalize_file_name(optarg);
293 if (!arg_directory) {
294 log_error("Invalid root directory: %m");
295 return -ENOMEM;
296 }
297
298 break;
299
300 case 'i':
301 arg_image = optarg;
302 break;
303
304 case 'u':
305 free(arg_user);
306 arg_user = strdup(optarg);
307 if (!arg_user)
308 return log_oom();
309
310 break;
311
312 case ARG_NETWORK_BRIDGE:
313 arg_network_bridge = optarg;
314
315 /* fall through */
316
317 case ARG_NETWORK_VETH:
318 arg_network_veth = true;
319 arg_private_network = true;
320 break;
321
322 case ARG_NETWORK_INTERFACE:
323 if (strv_extend(&arg_network_interfaces, optarg) < 0)
324 return log_oom();
325
326 arg_private_network = true;
327 break;
328
329 case ARG_NETWORK_MACVLAN:
330 if (strv_extend(&arg_network_macvlan, optarg) < 0)
331 return log_oom();
332
333 /* fall through */
334
335 case ARG_PRIVATE_NETWORK:
336 arg_private_network = true;
337 break;
338
339 case 'b':
340 arg_boot = true;
341 break;
342
343 case ARG_UUID:
344 r = sd_id128_from_string(optarg, &arg_uuid);
345 if (r < 0) {
346 log_error("Invalid UUID: %s", optarg);
347 return r;
348 }
349 break;
350
351 case 'S':
352 arg_slice = optarg;
353 break;
354
355 case 'M':
356 if (isempty(optarg)) {
357 free(arg_machine);
358 arg_machine = NULL;
359 } else {
360
361 if (!hostname_is_valid(optarg)) {
362 log_error("Invalid machine name: %s", optarg);
363 return -EINVAL;
364 }
365
366 free(arg_machine);
367 arg_machine = strdup(optarg);
368 if (!arg_machine)
369 return log_oom();
370
371 break;
372 }
373
374 case 'Z':
375 arg_selinux_context = optarg;
376 break;
377
378 case 'L':
379 arg_selinux_apifs_context = optarg;
380 break;
381
382 case ARG_READ_ONLY:
383 arg_read_only = true;
384 break;
385
386 case ARG_CAPABILITY:
387 case ARG_DROP_CAPABILITY: {
388 char *state, *word;
389 size_t length;
390
391 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
392 _cleanup_free_ char *t;
393 cap_value_t cap;
394
395 t = strndup(word, length);
396 if (!t)
397 return log_oom();
398
399 if (streq(t, "all")) {
400 if (c == ARG_CAPABILITY)
401 plus = (uint64_t) -1;
402 else
403 minus = (uint64_t) -1;
404 } else {
405 if (cap_from_name(t, &cap) < 0) {
406 log_error("Failed to parse capability %s.", t);
407 return -EINVAL;
408 }
409
410 if (c == ARG_CAPABILITY)
411 plus |= 1ULL << (uint64_t) cap;
412 else
413 minus |= 1ULL << (uint64_t) cap;
414 }
415 }
416
417 break;
418 }
419
420 case 'j':
421 arg_link_journal = LINK_GUEST;
422 break;
423
424 case ARG_LINK_JOURNAL:
425 if (streq(optarg, "auto"))
426 arg_link_journal = LINK_AUTO;
427 else if (streq(optarg, "no"))
428 arg_link_journal = LINK_NO;
429 else if (streq(optarg, "guest"))
430 arg_link_journal = LINK_GUEST;
431 else if (streq(optarg, "host"))
432 arg_link_journal = LINK_HOST;
433 else {
434 log_error("Failed to parse link journal mode %s", optarg);
435 return -EINVAL;
436 }
437
438 break;
439
440 case ARG_BIND:
441 case ARG_BIND_RO: {
442 _cleanup_free_ char *a = NULL, *b = NULL;
443 char *e;
444 char ***x;
445
446 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
447
448 e = strchr(optarg, ':');
449 if (e) {
450 a = strndup(optarg, e - optarg);
451 b = strdup(e + 1);
452 } else {
453 a = strdup(optarg);
454 b = strdup(optarg);
455 }
456
457 if (!a || !b)
458 return log_oom();
459
460 if (!path_is_absolute(a) || !path_is_absolute(b)) {
461 log_error("Invalid bind mount specification: %s", optarg);
462 return -EINVAL;
463 }
464
465 r = strv_extend(x, a);
466 if (r < 0)
467 return log_oom();
468
469 r = strv_extend(x, b);
470 if (r < 0)
471 return log_oom();
472
473 break;
474 }
475
476 case ARG_TMPFS: {
477 _cleanup_free_ char *a = NULL, *b = NULL;
478 char *e;
479
480 e = strchr(optarg, ':');
481 if (e) {
482 a = strndup(optarg, e - optarg);
483 b = strdup(e + 1);
484 } else {
485 a = strdup(optarg);
486 b = strdup("mode=0755");
487 }
488
489 if (!a || !b)
490 return log_oom();
491
492 if (!path_is_absolute(a)) {
493 log_error("Invalid tmpfs specification: %s", optarg);
494 return -EINVAL;
495 }
496
497 r = strv_push(&arg_tmpfs, a);
498 if (r < 0)
499 return log_oom();
500
501 a = NULL;
502
503 r = strv_push(&arg_tmpfs, b);
504 if (r < 0)
505 return log_oom();
506
507 b = NULL;
508
509 break;
510 }
511
512 case ARG_SETENV: {
513 char **n;
514
515 if (!env_assignment_is_valid(optarg)) {
516 log_error("Environment variable assignment '%s' is not valid.", optarg);
517 return -EINVAL;
518 }
519
520 n = strv_env_set(arg_setenv, optarg);
521 if (!n)
522 return log_oom();
523
524 strv_free(arg_setenv);
525 arg_setenv = n;
526 break;
527 }
528
529 case 'q':
530 arg_quiet = true;
531 break;
532
533 case ARG_SHARE_SYSTEM:
534 arg_share_system = true;
535 break;
536
537 case ARG_REGISTER:
538 r = parse_boolean(optarg);
539 if (r < 0) {
540 log_error("Failed to parse --register= argument: %s", optarg);
541 return r;
542 }
543
544 arg_register = r;
545 break;
546
547 case ARG_KEEP_UNIT:
548 arg_keep_unit = true;
549 break;
550
551 case ARG_PERSONALITY:
552
553 arg_personality = personality_from_string(optarg);
554 if (arg_personality == 0xffffffffLU) {
555 log_error("Unknown or unsupported personality '%s'.", optarg);
556 return -EINVAL;
557 }
558
559 break;
560
561 case '?':
562 return -EINVAL;
563
564 default:
565 assert_not_reached("Unhandled option");
566 }
567 }
568
569 if (arg_share_system)
570 arg_register = false;
571
572 if (arg_boot && arg_share_system) {
573 log_error("--boot and --share-system may not be combined.");
574 return -EINVAL;
575 }
576
577 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
578 log_error("--keep-unit may not be used when invoked from a user session.");
579 return -EINVAL;
580 }
581
582 if (arg_directory && arg_image) {
583 log_error("--directory= and --image= may not be combined.");
584 return -EINVAL;
585 }
586
587 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
588
589 return 1;
590}
591
592static int mount_all(const char *dest) {
593
594 typedef struct MountPoint {
595 const char *what;
596 const char *where;
597 const char *type;
598 const char *options;
599 unsigned long flags;
600 bool fatal;
601 } MountPoint;
602
603 static const MountPoint mount_table[] = {
604 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
605 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
606 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
607 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
608 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
609 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
610 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
611 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
612#ifdef HAVE_SELINUX
613 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
614 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
615#endif
616 };
617
618 unsigned k;
619 int r = 0;
620
621 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
622 _cleanup_free_ char *where = NULL;
623#ifdef HAVE_SELINUX
624 _cleanup_free_ char *options = NULL;
625#endif
626 const char *o;
627 int t;
628
629 where = strjoin(dest, "/", mount_table[k].where, NULL);
630 if (!where)
631 return log_oom();
632
633 t = path_is_mount_point(where, true);
634 if (t < 0) {
635 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
636
637 if (r == 0)
638 r = t;
639
640 continue;
641 }
642
643 /* Skip this entry if it is not a remount. */
644 if (mount_table[k].what && t > 0)
645 continue;
646
647 mkdir_p(where, 0755);
648
649#ifdef HAVE_SELINUX
650 if (arg_selinux_apifs_context &&
651 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
652 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
653 if (!options)
654 return log_oom();
655
656 o = options;
657 } else
658#endif
659 o = mount_table[k].options;
660
661
662 if (mount(mount_table[k].what,
663 where,
664 mount_table[k].type,
665 mount_table[k].flags,
666 o) < 0 &&
667 mount_table[k].fatal) {
668
669 log_error("mount(%s) failed: %m", where);
670
671 if (r == 0)
672 r = -errno;
673 }
674 }
675
676 return r;
677}
678
679static int mount_binds(const char *dest, char **l, bool ro) {
680 char **x, **y;
681
682 STRV_FOREACH_PAIR(x, y, l) {
683 _cleanup_free_ char *where = NULL;
684 struct stat source_st, dest_st;
685 int r;
686
687 if (stat(*x, &source_st) < 0) {
688 log_error("Failed to stat %s: %m", *x);
689 return -errno;
690 }
691
692 where = strappend(dest, *y);
693 if (!where)
694 return log_oom();
695
696 r = stat(where, &dest_st);
697 if (r == 0) {
698 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
699 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
700 return -EINVAL;
701 }
702 } else if (errno == ENOENT) {
703 r = mkdir_parents_label(where, 0755);
704 if (r < 0) {
705 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
706 return r;
707 }
708 } else {
709 log_error("Failed to bind mount %s: %m", *x);
710 return -errno;
711 }
712
713 /* Create the mount point, but be conservative -- refuse to create block
714 * and char devices. */
715 if (S_ISDIR(source_st.st_mode))
716 mkdir_label(where, 0755);
717 else if (S_ISFIFO(source_st.st_mode))
718 mkfifo(where, 0644);
719 else if (S_ISSOCK(source_st.st_mode))
720 mknod(where, 0644 | S_IFSOCK, 0);
721 else if (S_ISREG(source_st.st_mode))
722 touch(where);
723 else {
724 log_error("Refusing to create mountpoint for file: %s", *x);
725 return -ENOTSUP;
726 }
727
728 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
729 log_error("mount(%s) failed: %m", where);
730 return -errno;
731 }
732
733 if (ro) {
734 r = bind_remount_recursive(where, true);
735 if (r < 0) {
736 log_error("Read-Only bind mount failed: %s", strerror(-r));
737 return r;
738 }
739 }
740 }
741
742 return 0;
743}
744
745static int mount_tmpfs(const char *dest) {
746 char **i, **o;
747
748 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
749 _cleanup_free_ char *where = NULL;
750
751 where = strappend(dest, *i);
752 if (!where)
753 return log_oom();
754
755 mkdir_label(where, 0755);
756
757 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
758 log_error("tmpfs mount to %s failed: %m", where);
759 return -errno;
760 }
761 }
762
763 return 0;
764}
765
766static int setup_timezone(const char *dest) {
767 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
768 char *z, *y;
769 int r;
770
771 assert(dest);
772
773 /* Fix the timezone, if possible */
774 r = readlink_malloc("/etc/localtime", &p);
775 if (r < 0) {
776 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
777 return 0;
778 }
779
780 z = path_startswith(p, "../usr/share/zoneinfo/");
781 if (!z)
782 z = path_startswith(p, "/usr/share/zoneinfo/");
783 if (!z) {
784 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
785 return 0;
786 }
787
788 where = strappend(dest, "/etc/localtime");
789 if (!where)
790 return log_oom();
791
792 r = readlink_malloc(where, &q);
793 if (r >= 0) {
794 y = path_startswith(q, "../usr/share/zoneinfo/");
795 if (!y)
796 y = path_startswith(q, "/usr/share/zoneinfo/");
797
798
799 /* Already pointing to the right place? Then do nothing .. */
800 if (y && streq(y, z))
801 return 0;
802 }
803
804 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
805 if (!check)
806 return log_oom();
807
808 if (access(check, F_OK) < 0) {
809 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
810 return 0;
811 }
812
813 what = strappend("../usr/share/zoneinfo/", z);
814 if (!what)
815 return log_oom();
816
817 unlink(where);
818 if (symlink(what, where) < 0) {
819 log_error("Failed to correct timezone of container: %m");
820 return 0;
821 }
822
823 return 0;
824}
825
826static int setup_resolv_conf(const char *dest) {
827 char _cleanup_free_ *where = NULL;
828
829 assert(dest);
830
831 if (arg_private_network)
832 return 0;
833
834 /* Fix resolv.conf, if possible */
835 where = strappend(dest, "/etc/resolv.conf");
836 if (!where)
837 return log_oom();
838
839 /* We don't really care for the results of this really. If it
840 * fails, it fails, but meh... */
841 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
842
843 return 0;
844}
845
846static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
847
848 snprintf(s, 37,
849 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
850 SD_ID128_FORMAT_VAL(id));
851
852 return s;
853}
854
855static int setup_boot_id(const char *dest) {
856 _cleanup_free_ char *from = NULL, *to = NULL;
857 sd_id128_t rnd = {};
858 char as_uuid[37];
859 int r;
860
861 assert(dest);
862
863 if (arg_share_system)
864 return 0;
865
866 /* Generate a new randomized boot ID, so that each boot-up of
867 * the container gets a new one */
868
869 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
870 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
871 if (!from || !to)
872 return log_oom();
873
874 r = sd_id128_randomize(&rnd);
875 if (r < 0) {
876 log_error("Failed to generate random boot id: %s", strerror(-r));
877 return r;
878 }
879
880 id128_format_as_uuid(rnd, as_uuid);
881
882 r = write_string_file(from, as_uuid);
883 if (r < 0) {
884 log_error("Failed to write boot id: %s", strerror(-r));
885 return r;
886 }
887
888 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
889 log_error("Failed to bind mount boot id: %m");
890 r = -errno;
891 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
892 log_warning("Failed to make boot id read-only: %m");
893
894 unlink(from);
895 return r;
896}
897
898static int copy_devnodes(const char *dest) {
899
900 static const char devnodes[] =
901 "null\0"
902 "zero\0"
903 "full\0"
904 "random\0"
905 "urandom\0"
906 "tty\0";
907
908 const char *d;
909 int r = 0;
910 _cleanup_umask_ mode_t u;
911
912 assert(dest);
913
914 u = umask(0000);
915
916 NULSTR_FOREACH(d, devnodes) {
917 _cleanup_free_ char *from = NULL, *to = NULL;
918 struct stat st;
919
920 from = strappend("/dev/", d);
921 to = strjoin(dest, "/dev/", d, NULL);
922 if (!from || !to)
923 return log_oom();
924
925 if (stat(from, &st) < 0) {
926
927 if (errno != ENOENT) {
928 log_error("Failed to stat %s: %m", from);
929 return -errno;
930 }
931
932 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
933
934 log_error("%s is not a char or block device, cannot copy", from);
935 return -EIO;
936
937 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
938
939 log_error("mknod(%s) failed: %m", dest);
940 return -errno;
941 }
942 }
943
944 return r;
945}
946
947static int setup_ptmx(const char *dest) {
948 _cleanup_free_ char *p = NULL;
949
950 p = strappend(dest, "/dev/ptmx");
951 if (!p)
952 return log_oom();
953
954 if (symlink("pts/ptmx", p) < 0) {
955 log_error("Failed to create /dev/ptmx symlink: %m");
956 return -errno;
957 }
958
959 return 0;
960}
961
962static int setup_dev_console(const char *dest, const char *console) {
963 _cleanup_umask_ mode_t u;
964 const char *to;
965 struct stat st;
966 int r;
967
968 assert(dest);
969 assert(console);
970
971 u = umask(0000);
972
973 if (stat("/dev/null", &st) < 0) {
974 log_error("Failed to stat /dev/null: %m");
975 return -errno;
976 }
977
978 r = chmod_and_chown(console, 0600, 0, 0);
979 if (r < 0) {
980 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
981 return r;
982 }
983
984 /* We need to bind mount the right tty to /dev/console since
985 * ptys can only exist on pts file systems. To have something
986 * to bind mount things on we create a device node first, and
987 * use /dev/null for that since we the cgroups device policy
988 * allows us to create that freely, while we cannot create
989 * /dev/console. (Note that the major minor doesn't actually
990 * matter here, since we mount it over anyway). */
991
992 to = strappenda(dest, "/dev/console");
993 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
994 log_error("mknod() for /dev/console failed: %m");
995 return -errno;
996 }
997
998 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
999 log_error("Bind mount for /dev/console failed: %m");
1000 return -errno;
1001 }
1002
1003 return 0;
1004}
1005
1006static int setup_kmsg(const char *dest, int kmsg_socket) {
1007 _cleanup_free_ char *from = NULL, *to = NULL;
1008 int r, fd, k;
1009 _cleanup_umask_ mode_t u;
1010 union {
1011 struct cmsghdr cmsghdr;
1012 uint8_t buf[CMSG_SPACE(sizeof(int))];
1013 } control = {};
1014 struct msghdr mh = {
1015 .msg_control = &control,
1016 .msg_controllen = sizeof(control),
1017 };
1018 struct cmsghdr *cmsg;
1019
1020 assert(dest);
1021 assert(kmsg_socket >= 0);
1022
1023 u = umask(0000);
1024
1025 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1026 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1027 * on the reading side behave very similar to /proc/kmsg,
1028 * their writing side behaves differently from /dev/kmsg in
1029 * that writing blocks when nothing is reading. In order to
1030 * avoid any problems with containers deadlocking due to this
1031 * we simply make /dev/kmsg unavailable to the container. */
1032 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1033 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1034 return log_oom();
1035
1036 if (mkfifo(from, 0600) < 0) {
1037 log_error("mkfifo() for /dev/kmsg failed: %m");
1038 return -errno;
1039 }
1040
1041 r = chmod_and_chown(from, 0600, 0, 0);
1042 if (r < 0) {
1043 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1044 return r;
1045 }
1046
1047 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1048 log_error("Bind mount for /proc/kmsg failed: %m");
1049 return -errno;
1050 }
1051
1052 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1053 if (fd < 0) {
1054 log_error("Failed to open fifo: %m");
1055 return -errno;
1056 }
1057
1058 cmsg = CMSG_FIRSTHDR(&mh);
1059 cmsg->cmsg_level = SOL_SOCKET;
1060 cmsg->cmsg_type = SCM_RIGHTS;
1061 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1062 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1063
1064 mh.msg_controllen = cmsg->cmsg_len;
1065
1066 /* Store away the fd in the socket, so that it stays open as
1067 * long as we run the child */
1068 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1069 safe_close(fd);
1070
1071 if (k < 0) {
1072 log_error("Failed to send FIFO fd: %m");
1073 return -errno;
1074 }
1075
1076 /* And now make the FIFO unavailable as /dev/kmsg... */
1077 unlink(from);
1078 return 0;
1079}
1080
1081static int setup_hostname(void) {
1082
1083 if (arg_share_system)
1084 return 0;
1085
1086 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1087 return -errno;
1088
1089 return 0;
1090}
1091
1092static int setup_journal(const char *directory) {
1093 sd_id128_t machine_id, this_id;
1094 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1095 char *id;
1096 int r;
1097
1098 p = strappend(directory, "/etc/machine-id");
1099 if (!p)
1100 return log_oom();
1101
1102 r = read_one_line_file(p, &b);
1103 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1104 return 0;
1105 else if (r < 0) {
1106 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1107 return r;
1108 }
1109
1110 id = strstrip(b);
1111 if (isempty(id) && arg_link_journal == LINK_AUTO)
1112 return 0;
1113
1114 /* Verify validity */
1115 r = sd_id128_from_string(id, &machine_id);
1116 if (r < 0) {
1117 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1118 return r;
1119 }
1120
1121 r = sd_id128_get_machine(&this_id);
1122 if (r < 0) {
1123 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1124 return r;
1125 }
1126
1127 if (sd_id128_equal(machine_id, this_id)) {
1128 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1129 "Host and machine ids are equal (%s): refusing to link journals", id);
1130 if (arg_link_journal == LINK_AUTO)
1131 return 0;
1132 return
1133 -EEXIST;
1134 }
1135
1136 if (arg_link_journal == LINK_NO)
1137 return 0;
1138
1139 free(p);
1140 p = strappend("/var/log/journal/", id);
1141 q = strjoin(directory, "/var/log/journal/", id, NULL);
1142 if (!p || !q)
1143 return log_oom();
1144
1145 if (path_is_mount_point(p, false) > 0) {
1146 if (arg_link_journal != LINK_AUTO) {
1147 log_error("%s: already a mount point, refusing to use for journal", p);
1148 return -EEXIST;
1149 }
1150
1151 return 0;
1152 }
1153
1154 if (path_is_mount_point(q, false) > 0) {
1155 if (arg_link_journal != LINK_AUTO) {
1156 log_error("%s: already a mount point, refusing to use for journal", q);
1157 return -EEXIST;
1158 }
1159
1160 return 0;
1161 }
1162
1163 r = readlink_and_make_absolute(p, &d);
1164 if (r >= 0) {
1165 if ((arg_link_journal == LINK_GUEST ||
1166 arg_link_journal == LINK_AUTO) &&
1167 path_equal(d, q)) {
1168
1169 r = mkdir_p(q, 0755);
1170 if (r < 0)
1171 log_warning("failed to create directory %s: %m", q);
1172 return 0;
1173 }
1174
1175 if (unlink(p) < 0) {
1176 log_error("Failed to remove symlink %s: %m", p);
1177 return -errno;
1178 }
1179 } else if (r == -EINVAL) {
1180
1181 if (arg_link_journal == LINK_GUEST &&
1182 rmdir(p) < 0) {
1183
1184 if (errno == ENOTDIR) {
1185 log_error("%s already exists and is neither a symlink nor a directory", p);
1186 return r;
1187 } else {
1188 log_error("Failed to remove %s: %m", p);
1189 return -errno;
1190 }
1191 }
1192 } else if (r != -ENOENT) {
1193 log_error("readlink(%s) failed: %m", p);
1194 return r;
1195 }
1196
1197 if (arg_link_journal == LINK_GUEST) {
1198
1199 if (symlink(q, p) < 0) {
1200 log_error("Failed to symlink %s to %s: %m", q, p);
1201 return -errno;
1202 }
1203
1204 r = mkdir_p(q, 0755);
1205 if (r < 0)
1206 log_warning("failed to create directory %s: %m", q);
1207 return 0;
1208 }
1209
1210 if (arg_link_journal == LINK_HOST) {
1211 r = mkdir_p(p, 0755);
1212 if (r < 0) {
1213 log_error("Failed to create %s: %m", p);
1214 return r;
1215 }
1216
1217 } else if (access(p, F_OK) < 0)
1218 return 0;
1219
1220 if (dir_is_empty(q) == 0)
1221 log_warning("%s is not empty, proceeding anyway.", q);
1222
1223 r = mkdir_p(q, 0755);
1224 if (r < 0) {
1225 log_error("Failed to create %s: %m", q);
1226 return r;
1227 }
1228
1229 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1230 log_error("Failed to bind mount journal from host into guest: %m");
1231 return -errno;
1232 }
1233
1234 return 0;
1235}
1236
1237static int setup_kdbus(const char *dest, const char *path) {
1238 const char *p;
1239
1240 if (!path)
1241 return 0;
1242
1243 p = strappenda(dest, "/dev/kdbus");
1244 if (mkdir(p, 0755) < 0) {
1245 log_error("Failed to create kdbus path: %m");
1246 return -errno;
1247 }
1248
1249 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1250 log_error("Failed to mount kdbus domain path: %m");
1251 return -errno;
1252 }
1253
1254 return 0;
1255}
1256
1257static int drop_capabilities(void) {
1258 return capability_bounding_set_drop(~arg_retain, false);
1259}
1260
1261static int register_machine(pid_t pid) {
1262 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1263 _cleanup_bus_unref_ sd_bus *bus = NULL;
1264 int r;
1265
1266 if (!arg_register)
1267 return 0;
1268
1269 r = sd_bus_default_system(&bus);
1270 if (r < 0) {
1271 log_error("Failed to open system bus: %s", strerror(-r));
1272 return r;
1273 }
1274
1275 if (arg_keep_unit) {
1276 r = sd_bus_call_method(
1277 bus,
1278 "org.freedesktop.machine1",
1279 "/org/freedesktop/machine1",
1280 "org.freedesktop.machine1.Manager",
1281 "RegisterMachine",
1282 &error,
1283 NULL,
1284 "sayssus",
1285 arg_machine,
1286 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1287 "nspawn",
1288 "container",
1289 (uint32_t) pid,
1290 strempty(arg_directory));
1291 } else {
1292 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1293
1294 r = sd_bus_message_new_method_call(
1295 bus,
1296 &m,
1297 "org.freedesktop.machine1",
1298 "/org/freedesktop/machine1",
1299 "org.freedesktop.machine1.Manager",
1300 "CreateMachine");
1301 if (r < 0) {
1302 log_error("Failed to create message: %s", strerror(-r));
1303 return r;
1304 }
1305
1306 r = sd_bus_message_append(
1307 m,
1308 "sayssus",
1309 arg_machine,
1310 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1311 "nspawn",
1312 "container",
1313 (uint32_t) pid,
1314 strempty(arg_directory));
1315 if (r < 0) {
1316 log_error("Failed to append message arguments: %s", strerror(-r));
1317 return r;
1318 }
1319
1320 r = sd_bus_message_open_container(m, 'a', "(sv)");
1321 if (r < 0) {
1322 log_error("Failed to open container: %s", strerror(-r));
1323 return r;
1324 }
1325
1326 if (!isempty(arg_slice)) {
1327 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1328 if (r < 0) {
1329 log_error("Failed to append slice: %s", strerror(-r));
1330 return r;
1331 }
1332 }
1333
1334 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1335 if (r < 0) {
1336 log_error("Failed to add device policy: %s", strerror(-r));
1337 return r;
1338 }
1339
1340 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1341 /* Allow the container to
1342 * access and create the API
1343 * device nodes, so that
1344 * PrivateDevices= in the
1345 * container can work
1346 * fine */
1347 "/dev/null", "rwm",
1348 "/dev/zero", "rwm",
1349 "/dev/full", "rwm",
1350 "/dev/random", "rwm",
1351 "/dev/urandom", "rwm",
1352 "/dev/tty", "rwm",
1353 /* Allow the container
1354 * access to ptys. However,
1355 * do not permit the
1356 * container to ever create
1357 * these device nodes. */
1358 "/dev/pts/ptmx", "rw",
1359 "char-pts", "rw",
1360 /* Allow the container
1361 * access to all kdbus
1362 * devices. Again, the
1363 * container cannot create
1364 * these nodes, only use
1365 * them. We use a pretty
1366 * open match here, so that
1367 * the kernel API can still
1368 * change. */
1369 "char-kdbus", "rw",
1370 "char-kdbus/*", "rw");
1371 if (r < 0) {
1372 log_error("Failed to add device whitelist: %s", strerror(-r));
1373 return r;
1374 }
1375
1376 r = sd_bus_message_close_container(m);
1377 if (r < 0) {
1378 log_error("Failed to close container: %s", strerror(-r));
1379 return r;
1380 }
1381
1382 r = sd_bus_call(bus, m, 0, &error, NULL);
1383 }
1384
1385 if (r < 0) {
1386 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1387 return r;
1388 }
1389
1390 return 0;
1391}
1392
1393static int terminate_machine(pid_t pid) {
1394 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1395 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1396 _cleanup_bus_unref_ sd_bus *bus = NULL;
1397 const char *path;
1398 int r;
1399
1400 if (!arg_register)
1401 return 0;
1402
1403 r = sd_bus_default_system(&bus);
1404 if (r < 0) {
1405 log_error("Failed to open system bus: %s", strerror(-r));
1406 return r;
1407 }
1408
1409 r = sd_bus_call_method(
1410 bus,
1411 "org.freedesktop.machine1",
1412 "/org/freedesktop/machine1",
1413 "org.freedesktop.machine1.Manager",
1414 "GetMachineByPID",
1415 &error,
1416 &reply,
1417 "u",
1418 (uint32_t) pid);
1419 if (r < 0) {
1420 /* Note that the machine might already have been
1421 * cleaned up automatically, hence don't consider it a
1422 * failure if we cannot get the machine object. */
1423 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1424 return 0;
1425 }
1426
1427 r = sd_bus_message_read(reply, "o", &path);
1428 if (r < 0)
1429 return bus_log_parse_error(r);
1430
1431 r = sd_bus_call_method(
1432 bus,
1433 "org.freedesktop.machine1",
1434 path,
1435 "org.freedesktop.machine1.Machine",
1436 "Terminate",
1437 &error,
1438 NULL,
1439 NULL);
1440 if (r < 0) {
1441 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1442 return 0;
1443 }
1444
1445 return 0;
1446}
1447
1448static int reset_audit_loginuid(void) {
1449 _cleanup_free_ char *p = NULL;
1450 int r;
1451
1452 if (arg_share_system)
1453 return 0;
1454
1455 r = read_one_line_file("/proc/self/loginuid", &p);
1456 if (r == -ENOENT)
1457 return 0;
1458 if (r < 0) {
1459 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1460 return r;
1461 }
1462
1463 /* Already reset? */
1464 if (streq(p, "4294967295"))
1465 return 0;
1466
1467 r = write_string_file("/proc/self/loginuid", "4294967295");
1468 if (r < 0) {
1469 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1470 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1471 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1472 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1473 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1474
1475 sleep(5);
1476 }
1477
1478 return 0;
1479}
1480
1481#define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1482
1483static int get_mac(struct ether_addr *mac) {
1484 int r;
1485
1486 uint8_t result[8];
1487 size_t l, sz;
1488 uint8_t *v;
1489
1490 l = strlen(arg_machine);
1491 sz = sizeof(sd_id128_t) + l;
1492 v = alloca(sz);
1493
1494 /* fetch some persistent data unique to the host */
1495 r = sd_id128_get_machine((sd_id128_t*) v);
1496 if (r < 0)
1497 return r;
1498
1499 /* combine with some data unique (on this host) to this
1500 * container instance */
1501 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1502
1503 /* Let's hash the host machine ID plus the container name. We
1504 * use a fixed, but originally randomly created hash key here. */
1505 siphash24(result, v, sz, HASH_KEY.bytes);
1506
1507 assert_cc(ETH_ALEN <= sizeof(result));
1508 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1509
1510 /* see eth_random_addr in the kernel */
1511 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1512 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1513
1514 return 0;
1515}
1516
1517static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1518 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1519 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1520 struct ether_addr mac;
1521 int r;
1522
1523 if (!arg_private_network)
1524 return 0;
1525
1526 if (!arg_network_veth)
1527 return 0;
1528
1529 /* Use two different interface name prefixes depending whether
1530 * we are in bridge mode or not. */
1531 if (arg_network_bridge)
1532 memcpy(iface_name, "vb-", 3);
1533 else
1534 memcpy(iface_name, "ve-", 3);
1535 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1536
1537 r = get_mac(&mac);
1538 if (r < 0) {
1539 log_error("Failed to generate predictable MAC address for host0");
1540 return r;
1541 }
1542
1543 r = sd_rtnl_open(&rtnl, 0);
1544 if (r < 0) {
1545 log_error("Failed to connect to netlink: %s", strerror(-r));
1546 return r;
1547 }
1548
1549 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1550 if (r < 0) {
1551 log_error("Failed to allocate netlink message: %s", strerror(-r));
1552 return r;
1553 }
1554
1555 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1556 if (r < 0) {
1557 log_error("Failed to add netlink interface name: %s", strerror(-r));
1558 return r;
1559 }
1560
1561 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1562 if (r < 0) {
1563 log_error("Failed to open netlink container: %s", strerror(-r));
1564 return r;
1565 }
1566
1567 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1568 if (r < 0) {
1569 log_error("Failed to open netlink container: %s", strerror(-r));
1570 return r;
1571 }
1572
1573 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1574 if (r < 0) {
1575 log_error("Failed to open netlink container: %s", strerror(-r));
1576 return r;
1577 }
1578
1579 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1580 if (r < 0) {
1581 log_error("Failed to add netlink interface name: %s", strerror(-r));
1582 return r;
1583 }
1584
1585 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1586 if (r < 0) {
1587 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1588 return r;
1589 }
1590
1591 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1592 if (r < 0) {
1593 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1594 return r;
1595 }
1596
1597 r = sd_rtnl_message_close_container(m);
1598 if (r < 0) {
1599 log_error("Failed to close netlink container: %s", strerror(-r));
1600 return r;
1601 }
1602
1603 r = sd_rtnl_message_close_container(m);
1604 if (r < 0) {
1605 log_error("Failed to close netlink container: %s", strerror(-r));
1606 return r;
1607 }
1608
1609 r = sd_rtnl_message_close_container(m);
1610 if (r < 0) {
1611 log_error("Failed to close netlink container: %s", strerror(-r));
1612 return r;
1613 }
1614
1615 r = sd_rtnl_call(rtnl, m, 0, NULL);
1616 if (r < 0) {
1617 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1618 return r;
1619 }
1620
1621 return 0;
1622}
1623
1624static int setup_bridge(const char veth_name[]) {
1625 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1626 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1627 int r, bridge;
1628
1629 if (!arg_private_network)
1630 return 0;
1631
1632 if (!arg_network_veth)
1633 return 0;
1634
1635 if (!arg_network_bridge)
1636 return 0;
1637
1638 bridge = (int) if_nametoindex(arg_network_bridge);
1639 if (bridge <= 0) {
1640 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1641 return -errno;
1642 }
1643
1644 r = sd_rtnl_open(&rtnl, 0);
1645 if (r < 0) {
1646 log_error("Failed to connect to netlink: %s", strerror(-r));
1647 return r;
1648 }
1649
1650 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1651 if (r < 0) {
1652 log_error("Failed to allocate netlink message: %s", strerror(-r));
1653 return r;
1654 }
1655
1656 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1657 if (r < 0) {
1658 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1659 return r;
1660 }
1661
1662 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1663 if (r < 0) {
1664 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1665 return r;
1666 }
1667
1668 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1669 if (r < 0) {
1670 log_error("Failed to add netlink master field: %s", strerror(-r));
1671 return r;
1672 }
1673
1674 r = sd_rtnl_call(rtnl, m, 0, NULL);
1675 if (r < 0) {
1676 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1677 return r;
1678 }
1679
1680 return 0;
1681}
1682
1683static int parse_interface(struct udev *udev, const char *name) {
1684 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1685 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1686 int ifi;
1687
1688 ifi = (int) if_nametoindex(name);
1689 if (ifi <= 0) {
1690 log_error("Failed to resolve interface %s: %m", name);
1691 return -errno;
1692 }
1693
1694 sprintf(ifi_str, "n%i", ifi);
1695 d = udev_device_new_from_device_id(udev, ifi_str);
1696 if (!d) {
1697 log_error("Failed to get udev device for interface %s: %m", name);
1698 return -errno;
1699 }
1700
1701 if (udev_device_get_is_initialized(d) <= 0) {
1702 log_error("Network interface %s is not initialized yet.", name);
1703 return -EBUSY;
1704 }
1705
1706 return ifi;
1707}
1708
1709static int move_network_interfaces(pid_t pid) {
1710 _cleanup_udev_unref_ struct udev *udev = NULL;
1711 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1712 char **i;
1713 int r;
1714
1715 if (!arg_private_network)
1716 return 0;
1717
1718 if (strv_isempty(arg_network_interfaces))
1719 return 0;
1720
1721 r = sd_rtnl_open(&rtnl, 0);
1722 if (r < 0) {
1723 log_error("Failed to connect to netlink: %s", strerror(-r));
1724 return r;
1725 }
1726
1727 udev = udev_new();
1728 if (!udev) {
1729 log_error("Failed to connect to udev.");
1730 return -ENOMEM;
1731 }
1732
1733 STRV_FOREACH(i, arg_network_interfaces) {
1734 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1735 int ifi;
1736
1737 ifi = parse_interface(udev, *i);
1738 if (ifi < 0)
1739 return ifi;
1740
1741 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1742 if (r < 0) {
1743 log_error("Failed to allocate netlink message: %s", strerror(-r));
1744 return r;
1745 }
1746
1747 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1748 if (r < 0) {
1749 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1750 return r;
1751 }
1752
1753 r = sd_rtnl_call(rtnl, m, 0, NULL);
1754 if (r < 0) {
1755 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1756 return r;
1757 }
1758 }
1759
1760 return 0;
1761}
1762
1763static int setup_macvlan(pid_t pid) {
1764 _cleanup_udev_unref_ struct udev *udev = NULL;
1765 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1766 char **i;
1767 int r;
1768
1769 if (!arg_private_network)
1770 return 0;
1771
1772 if (strv_isempty(arg_network_macvlan))
1773 return 0;
1774
1775 r = sd_rtnl_open(&rtnl, 0);
1776 if (r < 0) {
1777 log_error("Failed to connect to netlink: %s", strerror(-r));
1778 return r;
1779 }
1780
1781 udev = udev_new();
1782 if (!udev) {
1783 log_error("Failed to connect to udev.");
1784 return -ENOMEM;
1785 }
1786
1787 STRV_FOREACH(i, arg_network_macvlan) {
1788 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1789 _cleanup_free_ char *n = NULL;
1790 int ifi;
1791
1792 ifi = parse_interface(udev, *i);
1793 if (ifi < 0)
1794 return ifi;
1795
1796 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1797 if (r < 0) {
1798 log_error("Failed to allocate netlink message: %s", strerror(-r));
1799 return r;
1800 }
1801
1802 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1803 if (r < 0) {
1804 log_error("Failed to add netlink interface index: %s", strerror(-r));
1805 return r;
1806 }
1807
1808 n = strappend("mv-", *i);
1809 if (!n)
1810 return log_oom();
1811
1812 strshorten(n, IFNAMSIZ-1);
1813
1814 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1815 if (r < 0) {
1816 log_error("Failed to add netlink interface name: %s", strerror(-r));
1817 return r;
1818 }
1819
1820 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1821 if (r < 0) {
1822 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1823 return r;
1824 }
1825
1826 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1827 if (r < 0) {
1828 log_error("Failed to open netlink container: %s", strerror(-r));
1829 return r;
1830 }
1831
1832 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1833 if (r < 0) {
1834 log_error("Failed to open netlink container: %s", strerror(-r));
1835 return r;
1836 }
1837
1838 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1839 if (r < 0) {
1840 log_error("Failed to append macvlan mode: %s", strerror(-r));
1841 return r;
1842 }
1843
1844 r = sd_rtnl_message_close_container(m);
1845 if (r < 0) {
1846 log_error("Failed to close netlink container: %s", strerror(-r));
1847 return r;
1848 }
1849
1850 r = sd_rtnl_message_close_container(m);
1851 if (r < 0) {
1852 log_error("Failed to close netlink container: %s", strerror(-r));
1853 return r;
1854 }
1855
1856 r = sd_rtnl_call(rtnl, m, 0, NULL);
1857 if (r < 0) {
1858 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1859 return r;
1860 }
1861 }
1862
1863 return 0;
1864}
1865
/* Installs a seccomp filter that allows everything except
 * socket(AF_NETLINK, *, NETLINK_AUDIT), which is made to fail with
 * EAFNOSUPPORT. The filter is loaded with the NO_NEW_PRIVS attribute
 * switched off.
 *
 * Returns 0 on success (or when built without seccomp support), a
 * negative errno-style value on failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int r;

        /*
          Audit does not work properly in containers, and a good part
          of the userspace audit hookup fails when run inside of one.
          We do not care and simply prevent audit sockets from being
          created.

          With this filter socket(AF_NETLINK, *, NETLINK_AUDIT) fails
          with EAFNOSUPPORT, which audit userspace takes as indication
          that audit is disabled in the kernel.
        */

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto out;
        }

        /* Match on argument 0 (domain) and argument 2 (protocol) */
        r = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto out;
        }

        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto out;
        }

        r = seccomp_load(filter);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

out:
        /* The context is released on success and failure alike */
        seccomp_release(filter);
        return r;
#else
        return 0;
#endif

}
1922
1923static int setup_image(char **device_path, int *loop_nr) {
1924 struct loop_info64 info = {
1925 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1926 };
1927 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1928 _cleanup_free_ char* loopdev = NULL;
1929 struct stat st;
1930 int r, nr;
1931
1932 assert(device_path);
1933 assert(loop_nr);
1934
1935 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1936 if (fd < 0) {
1937 log_error("Failed to open %s: %m", arg_image);
1938 return -errno;
1939 }
1940
1941 if (fstat(fd, &st) < 0) {
1942 log_error("Failed to stat %s: %m", arg_image);
1943 return -errno;
1944 }
1945
1946 if (S_ISBLK(st.st_mode)) {
1947 char *p;
1948
1949 p = strdup(arg_image);
1950 if (!p)
1951 return log_oom();
1952
1953 *device_path = p;
1954
1955 *loop_nr = -1;
1956
1957 r = fd;
1958 fd = -1;
1959
1960 return r;
1961 }
1962
1963 if (!S_ISREG(st.st_mode)) {
1964 log_error("%s is not a regular file or block device: %m", arg_image);
1965 return -EINVAL;
1966 }
1967
1968 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1969 if (control < 0) {
1970 log_error("Failed to open /dev/loop-control: %m");
1971 return -errno;
1972 }
1973
1974 nr = ioctl(control, LOOP_CTL_GET_FREE);
1975 if (nr < 0) {
1976 log_error("Failed to allocate loop device: %m");
1977 return -errno;
1978 }
1979
1980 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1981 return log_oom();
1982
1983 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1984 if (loop < 0) {
1985 log_error("Failed to open loop device %s: %m", loopdev);
1986 return -errno;
1987 }
1988
1989 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1990 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1991 return -errno;
1992 }
1993
1994 if (arg_read_only)
1995 info.lo_flags |= LO_FLAGS_READ_ONLY;
1996
1997 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1998 log_error("Failed to set loopback settings on %s: %m", loopdev);
1999 return -errno;
2000 }
2001
2002 *device_path = loopdev;
2003 loopdev = NULL;
2004
2005 *loop_nr = nr;
2006
2007 r = loop;
2008 loop = -1;
2009
2010 return r;
2011}
2012
/* Inspects the GPT partition table of the block device open as 'fd'
 * and picks the device nodes of the partitions to use for the root
 * file system, /home and /srv, identified by their GPT type IDs.
 *
 * Partitions flagged GPT_FLAG_NO_AUTO are ignored. If more than one
 * partition of a type exists, the one with the lowest partition
 * number wins. If there is no native root partition but a secondary
 * one, the latter is used and *secondary is set to true.
 *
 * On success 0 is returned, *root_device and *root_device_rw are
 * set, and the home/srv outputs are set only if such partitions were
 * found. The device strings are newly allocated and owned by the
 * caller; the *_rw outputs are false if the partition carries
 * GPT_FLAG_READ_ONLY. On failure a negative errno-style value is
 * returned; -EINVAL if there is no usable GPT or no root partition,
 * -ENOTSUP if built without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        /* Step 1: probe the device for a partition table */
        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                /* Failure without errno set is treated as OOM */
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* Only GPT is supported */
        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        /* Step 2: enumerate the child devices of the disk via udev */
        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, disk);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        /* Step 3: match each enumerated device against the blkid
         * partition list and sort it into one of the slots */
        first = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *part_dev = NULL;
                const char *type_string, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition part;
                dev_t devno;
                int nr;

                errno = 0;
                part_dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!part_dev) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, and the
                 * whole disk itself */
                devno = udev_device_get_devnum(part_dev);
                if (major(devno) == 0)
                        continue;

                if (st.st_rdev == devno)
                        continue;

                node = udev_device_get_devnode(part_dev);
                if (!node)
                        continue;

                part = blkid_partlist_devno_to_partition(partitions, devno);
                if (!part)
                        continue;

                flags = blkid_partition_get_flags(part);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(part);
                if (nr < 0)
                        continue;

                type_string = blkid_partition_get_type_string(part);
                if (!type_string)
                        continue;

                if (sd_id128_from_string(type_string, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME)) {

                        /* Keep the candidate with the lowest number */
                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        /* Step 4: hand the results over to the caller */
        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        if (root) {
                /* The native root partition is preferred */
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                /* secondary_root is known to be set here, due to the
                 * check above */
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2262
/* Mounts the block device 'what' on 'where', or on 'where' with
 * 'directory' appended if 'directory' is not NULL. The file system
 * type is detected with libblkid.
 *
 * The mount is done with MS_NODEV, and additionally with MS_RDONLY
 * if 'rw' is false or arg_read_only is set. LUKS volumes are
 * refused.
 *
 * Returns 0 on success and a negative errno-style value on failure:
 * -EINVAL if the file system type cannot be determined (nothing
 * detected, or ambivalent probe result), -ENOTSUP for LUKS or when
 * built without blkid support, otherwise the error of the failing
 * call. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A read-only container overrides the per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* Failure without errno set is treated as OOM */
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was
                 * detected and -2 if the result is ambivalent. -1 is
                 * a genuine error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2328
/* Mounts the given partitions below 'where': the root device on
 * 'where' itself, the home device on <where>/home and the srv device
 * on <where>/srv, in this order. Devices passed as NULL are skipped.
 * The *_rw flags are passed on to mount_device().
 *
 * Returns 0 on success, or the negative errno-style value of the
 * first mount_device() call that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Note that 'where' is used as mount root here, rather than
         * a global, so that the function honours its parameter. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2364
2365static void loop_remove(int nr, int *image_fd) {
2366 _cleanup_close_ int control = -1;
2367
2368 if (nr < 0)
2369 return;
2370
2371 if (image_fd && *image_fd >= 0) {
2372 ioctl(*image_fd, LOOP_CLR_FD);
2373 *image_fd = safe_close(*image_fd);
2374 }
2375
2376 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2377 if (control < 0)
2378 return;
2379
2380 ioctl(control, LOOP_CTL_REMOVE, nr);
2381}
2382
2383static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2384 int pipe_fds[2];
2385 pid_t pid;
2386
2387 assert(database);
2388 assert(key);
2389 assert(rpid);
2390
2391 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2392 log_error("Failed to allocate pipe: %m");
2393 return -errno;
2394 }
2395
2396 pid = fork();
2397 if (pid < 0) {
2398 log_error("Failed to fork getent child: %m");
2399 return -errno;
2400 } else if (pid == 0) {
2401 int nullfd;
2402 char *empty_env = NULL;
2403
2404 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2405 _exit(EXIT_FAILURE);
2406
2407 if (pipe_fds[0] > 2)
2408 safe_close(pipe_fds[0]);
2409 if (pipe_fds[1] > 2)
2410 safe_close(pipe_fds[1]);
2411
2412 nullfd = open("/dev/null", O_RDWR);
2413 if (nullfd < 0)
2414 _exit(EXIT_FAILURE);
2415
2416 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2417 _exit(EXIT_FAILURE);
2418
2419 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2420 _exit(EXIT_FAILURE);
2421
2422 if (nullfd > 2)
2423 safe_close(nullfd);
2424
2425 reset_all_signal_handlers();
2426 close_all_fds(NULL, 0);
2427
2428 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2429 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2430 _exit(EXIT_FAILURE);
2431 }
2432
2433 pipe_fds[1] = safe_close(pipe_fds[1]);
2434
2435 *rpid = pid;
2436
2437 return pipe_fds[0];
2438}
2439
2440static int change_uid_gid(char **_home) {
2441 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2442 _cleanup_free_ uid_t *uids = NULL;
2443 _cleanup_free_ char *home = NULL;
2444 _cleanup_fclose_ FILE *f = NULL;
2445 _cleanup_close_ int fd = -1;
2446 unsigned n_uids = 0;
2447 size_t sz = 0, l;
2448 uid_t uid;
2449 gid_t gid;
2450 pid_t pid;
2451 int r;
2452
2453 assert(_home);
2454
2455 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2456 /* Reset everything fully to 0, just in case */
2457
2458 if (setgroups(0, NULL) < 0) {
2459 log_error("setgroups() failed: %m");
2460 return -errno;
2461 }
2462
2463 if (setresgid(0, 0, 0) < 0) {
2464 log_error("setregid() failed: %m");
2465 return -errno;
2466 }
2467
2468 if (setresuid(0, 0, 0) < 0) {
2469 log_error("setreuid() failed: %m");
2470 return -errno;
2471 }
2472
2473 *_home = NULL;
2474 return 0;
2475 }
2476
2477 /* First, get user credentials */
2478 fd = spawn_getent("passwd", arg_user, &pid);
2479 if (fd < 0)
2480 return fd;
2481
2482 f = fdopen(fd, "r");
2483 if (!f)
2484 return log_oom();
2485 fd = -1;
2486
2487 if (!fgets(line, sizeof(line), f)) {
2488
2489 if (!ferror(f)) {
2490 log_error("Failed to resolve user %s.", arg_user);
2491 return -ESRCH;
2492 }
2493
2494 log_error("Failed to read from getent: %m");
2495 return -errno;
2496 }
2497
2498 truncate_nl(line);
2499
2500 wait_for_terminate_and_warn("getent passwd", pid);
2501
2502 x = strchr(line, ':');
2503 if (!x) {
2504 log_error("/etc/passwd entry has invalid user field.");
2505 return -EIO;
2506 }
2507
2508 u = strchr(x+1, ':');
2509 if (!u) {
2510 log_error("/etc/passwd entry has invalid password field.");
2511 return -EIO;
2512 }
2513
2514 u++;
2515 g = strchr(u, ':');
2516 if (!g) {
2517 log_error("/etc/passwd entry has invalid UID field.");
2518 return -EIO;
2519 }
2520
2521 *g = 0;
2522 g++;
2523 x = strchr(g, ':');
2524 if (!x) {
2525 log_error("/etc/passwd entry has invalid GID field.");
2526 return -EIO;
2527 }
2528
2529 *x = 0;
2530 h = strchr(x+1, ':');
2531 if (!h) {
2532 log_error("/etc/passwd entry has invalid GECOS field.");
2533 return -EIO;
2534 }
2535
2536 h++;
2537 x = strchr(h, ':');
2538 if (!x) {
2539 log_error("/etc/passwd entry has invalid home directory field.");
2540 return -EIO;
2541 }
2542
2543 *x = 0;
2544
2545 r = parse_uid(u, &uid);
2546 if (r < 0) {
2547 log_error("Failed to parse UID of user.");
2548 return -EIO;
2549 }
2550
2551 r = parse_gid(g, &gid);
2552 if (r < 0) {
2553 log_error("Failed to parse GID of user.");
2554 return -EIO;
2555 }
2556
2557 home = strdup(h);
2558 if (!home)
2559 return log_oom();
2560
2561 /* Second, get group memberships */
2562 fd = spawn_getent("initgroups", arg_user, &pid);
2563 if (fd < 0)
2564 return fd;
2565
2566 fclose(f);
2567 f = fdopen(fd, "r");
2568 if (!f)
2569 return log_oom();
2570 fd = -1;
2571
2572 if (!fgets(line, sizeof(line), f)) {
2573 if (!ferror(f)) {
2574 log_error("Failed to resolve user %s.", arg_user);
2575 return -ESRCH;
2576 }
2577
2578 log_error("Failed to read from getent: %m");
2579 return -errno;
2580 }
2581
2582 truncate_nl(line);
2583
2584 wait_for_terminate_and_warn("getent initgroups", pid);
2585
2586 /* Skip over the username and subsequent separator whitespace */
2587 x = line;
2588 x += strcspn(x, WHITESPACE);
2589 x += strspn(x, WHITESPACE);
2590
2591 FOREACH_WORD(w, l, x, state) {
2592 char c[l+1];
2593
2594 memcpy(c, w, l);
2595 c[l] = 0;
2596
2597 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2598 return log_oom();
2599
2600 r = parse_uid(c, &uids[n_uids++]);
2601 if (r < 0) {
2602 log_error("Failed to parse group data from getent.");
2603 return -EIO;
2604 }
2605 }
2606
2607 r = mkdir_parents(home, 0775);
2608 if (r < 0) {
2609 log_error("Failed to make home root directory: %s", strerror(-r));
2610 return r;
2611 }
2612
2613 r = mkdir_safe(home, 0755, uid, gid);
2614 if (r < 0 && r != -EEXIST) {
2615 log_error("Failed to make home directory: %s", strerror(-r));
2616 return r;
2617 }
2618
2619 fchown(STDIN_FILENO, uid, gid);
2620 fchown(STDOUT_FILENO, uid, gid);
2621 fchown(STDERR_FILENO, uid, gid);
2622
2623 if (setgroups(n_uids, uids) < 0) {
2624 log_error("Failed to set auxiliary groups: %m");
2625 return -errno;
2626 }
2627
2628 if (setresgid(gid, gid, gid) < 0) {
2629 log_error("setregid() failed: %m");
2630 return -errno;
2631 }
2632
2633 if (setresuid(uid, uid, uid) < 0) {
2634 log_error("setreuid() failed: %m");
2635 return -errno;
2636 }
2637
2638 if (_home) {
2639 *_home = home;
2640 home = NULL;
2641 }
2642
2643 return 0;
2644}
2645
2646/*
2647 * Return 0 in case the container is being rebooted, has been shut
2648 * down or exited successfully. On failures a negative value is
2649 * returned.
2650 *
2651 * The status of the container "CONTAINER_TERMINATED" or
2652 * "CONTAINER_REBOOTED" will be saved in the container argument
2653 */
2654static int wait_for_container(pid_t pid, ContainerStatus *container) {
2655 int r;
2656 siginfo_t status;
2657
2658 r = wait_for_terminate(pid, &status);
2659 if (r < 0)
2660 return r;
2661
2662 switch (status.si_code) {
2663 case CLD_EXITED:
2664 r = status.si_status;
2665 if (r == 0) {
2666 if (!arg_quiet)
2667 log_debug("Container %s exited successfully.",
2668 arg_machine);
2669
2670 *container = CONTAINER_TERMINATED;
2671 } else {
2672 log_error("Container %s failed with error code %i.",
2673 arg_machine, status.si_status);
2674 r = -1;
2675 }
2676 break;
2677
2678 case CLD_KILLED:
2679 if (status.si_status == SIGINT) {
2680 if (!arg_quiet)
2681 log_info("Container %s has been shut down.",
2682 arg_machine);
2683
2684 *container = CONTAINER_TERMINATED;
2685 r = 0;
2686 break;
2687 } else if (status.si_status == SIGHUP) {
2688 if (!arg_quiet)
2689 log_info("Container %s is being rebooted.",
2690 arg_machine);
2691
2692 *container = CONTAINER_REBOOTED;
2693 r = 0;
2694 break;
2695 }
2696 /* CLD_KILLED fallthrough */
2697
2698 case CLD_DUMPED:
2699 log_error("Container %s terminated by signal %s.",
2700 arg_machine, signal_to_string(status.si_status));
2701 r = -1;
2702 break;
2703
2704 default:
2705 log_error("Container %s failed due to unknown reason.",
2706 arg_machine);
2707 r = -1;
2708 break;
2709 }
2710
2711 return r;
2712}
2713
/* Deliberately empty signal handler: installing it (rather than ignoring the
 * signal) lets the signal interrupt blocking calls without doing anything. */
static void nop_handler(int sig) {
}
2715
2716int main(int argc, char *argv[]) {
2717
2718 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2719 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2720 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2721 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2722 _cleanup_fdset_free_ FDSet *fds = NULL;
2723 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2724 const char *console = NULL;
2725 char veth_name[IFNAMSIZ];
2726 bool secondary = false;
2727 sigset_t mask, mask_chld;
2728 pid_t pid = 0;
2729
2730 log_parse_environment();
2731 log_open();
2732
2733 k = parse_argv(argc, argv);
2734 if (k < 0)
2735 goto finish;
2736 else if (k == 0) {
2737 r = EXIT_SUCCESS;
2738 goto finish;
2739 }
2740
2741 if (!arg_image) {
2742 if (arg_directory) {
2743 char *p;
2744
2745 p = path_make_absolute_cwd(arg_directory);
2746 free(arg_directory);
2747 arg_directory = p;
2748 } else
2749 arg_directory = get_current_dir_name();
2750
2751 if (!arg_directory) {
2752 log_error("Failed to determine path, please use -D.");
2753 goto finish;
2754 }
2755 path_kill_slashes(arg_directory);
2756 }
2757
2758 if (!arg_machine) {
2759 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2760 if (!arg_machine) {
2761 log_oom();
2762 goto finish;
2763 }
2764
2765 hostname_cleanup(arg_machine, false);
2766 if (isempty(arg_machine)) {
2767 log_error("Failed to determine machine name automatically, please use -M.");
2768 goto finish;
2769 }
2770 }
2771
2772 if (geteuid() != 0) {
2773 log_error("Need to be root.");
2774 goto finish;
2775 }
2776
2777 if (sd_booted() <= 0) {
2778 log_error("Not running on a systemd system.");
2779 goto finish;
2780 }
2781
2782 log_close();
2783 n_fd_passed = sd_listen_fds(false);
2784 if (n_fd_passed > 0) {
2785 k = fdset_new_listen_fds(&fds, false);
2786 if (k < 0) {
2787 log_error("Failed to collect file descriptors: %s", strerror(-k));
2788 goto finish;
2789 }
2790 }
2791 fdset_close_others(fds);
2792 log_open();
2793
2794 if (arg_directory) {
2795 if (path_equal(arg_directory, "/")) {
2796 log_error("Spawning container on root directory not supported.");
2797 goto finish;
2798 }
2799
2800 if (arg_boot) {
2801 if (path_is_os_tree(arg_directory) <= 0) {
2802 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2803 goto finish;
2804 }
2805 } else {
2806 const char *p;
2807
2808 p = strappenda(arg_directory,
2809 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2810 if (access(p, F_OK) < 0) {
2811 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2812 goto finish;
2813
2814 }
2815 }
2816 } else {
2817 char template[] = "/tmp/nspawn-root-XXXXXX";
2818
2819 if (!mkdtemp(template)) {
2820 log_error("Failed to create temporary directory: %m");
2821 r = -errno;
2822 goto finish;
2823 }
2824
2825 arg_directory = strdup(template);
2826 if (!arg_directory) {
2827 r = log_oom();
2828 goto finish;
2829 }
2830
2831 image_fd = setup_image(&device_path, &loop_nr);
2832 if (image_fd < 0) {
2833 r = image_fd;
2834 goto finish;
2835 }
2836
2837 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2838 if (r < 0)
2839 goto finish;
2840 }
2841
2842 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2843 if (master < 0) {
2844 log_error("Failed to acquire pseudo tty: %m");
2845 goto finish;
2846 }
2847
2848 console = ptsname(master);
2849 if (!console) {
2850 log_error("Failed to determine tty name: %m");
2851 goto finish;
2852 }
2853
2854 if (!arg_quiet)
2855 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2856 arg_machine, arg_image ? arg_image : arg_directory);
2857
2858 if (unlockpt(master) < 0) {
2859 log_error("Failed to unlock tty: %m");
2860 goto finish;
2861 }
2862
2863 if (access("/dev/kdbus/control", F_OK) >= 0) {
2864
2865 if (arg_share_system) {
2866 kdbus_domain = strdup("/dev/kdbus");
2867 if (!kdbus_domain) {
2868 log_oom();
2869 goto finish;
2870 }
2871 } else {
2872 const char *ns;
2873
2874 ns = strappenda("machine-", arg_machine);
2875 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2876 if (r < 0)
2877 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2878 else
2879 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2880 }
2881 }
2882
2883 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2884 log_error("Failed to create kmsg socket pair: %m");
2885 goto finish;
2886 }
2887
2888 sd_notify(0, "READY=1");
2889
2890 assert_se(sigemptyset(&mask) == 0);
2891 assert_se(sigemptyset(&mask_chld) == 0);
2892 sigaddset(&mask_chld, SIGCHLD);
2893 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2894 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2895
2896 for (;;) {
2897 ContainerStatus container_status;
2898 int eventfds[2] = { -1, -1 };
2899 struct sigaction sa = {
2900 .sa_handler = nop_handler,
2901 .sa_flags = SA_NOCLDSTOP,
2902 };
2903
2904 /* Child can be killed before execv(), so handle SIGCHLD
2905 * in order to interrupt parent's blocking calls and
2906 * give it a chance to call wait() and terminate. */
2907 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2908 if (r < 0) {
2909 log_error("Failed to change the signal mask: %m");
2910 goto finish;
2911 }
2912
2913 r = sigaction(SIGCHLD, &sa, NULL);
2914 if (r < 0) {
2915 log_error("Failed to install SIGCHLD handler: %m");
2916 goto finish;
2917 }
2918
2919 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2920 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2921 (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2922 if (pid < 0) {
2923 if (errno == EINVAL)
2924 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2925 else
2926 log_error("clone() failed: %m");
2927
2928 r = pid;
2929 goto finish;
2930 }
2931
2932 if (pid == 0) {
2933 /* child */
2934 _cleanup_free_ char *home = NULL;
2935 unsigned n_env = 2;
2936 const char *envp[] = {
2937 "PATH=" DEFAULT_PATH_SPLIT_USR,
2938 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2939 NULL, /* TERM */
2940 NULL, /* HOME */
2941 NULL, /* USER */
2942 NULL, /* LOGNAME */
2943 NULL, /* container_uuid */
2944 NULL, /* LISTEN_FDS */
2945 NULL, /* LISTEN_PID */
2946 NULL
2947 };
2948 char **env_use;
2949
2950 envp[n_env] = strv_find_prefix(environ, "TERM=");
2951 if (envp[n_env])
2952 n_env ++;
2953
2954 master = safe_close(master);
2955
2956 close_nointr(STDIN_FILENO);
2957 close_nointr(STDOUT_FILENO);
2958 close_nointr(STDERR_FILENO);
2959
2960 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2961
2962 reset_all_signal_handlers();
2963
2964 assert_se(sigemptyset(&mask) == 0);
2965 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2966
2967 k = open_terminal(console, O_RDWR);
2968 if (k != STDIN_FILENO) {
2969 if (k >= 0) {
2970 safe_close(k);
2971 k = -EINVAL;
2972 }
2973
2974 log_error("Failed to open console: %s", strerror(-k));
2975 goto child_fail;
2976 }
2977
2978 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2979 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2980 log_error("Failed to duplicate console: %m");
2981 goto child_fail;
2982 }
2983
2984 if (setsid() < 0) {
2985 log_error("setsid() failed: %m");
2986 goto child_fail;
2987 }
2988
2989 if (reset_audit_loginuid() < 0)
2990 goto child_fail;
2991
2992 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2993 log_error("PR_SET_PDEATHSIG failed: %m");
2994 goto child_fail;
2995 }
2996
2997 /* Mark everything as slave, so that we still
2998 * receive mounts from the real root, but don't
2999 * propagate mounts to the real root. */
3000 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3001 log_error("MS_SLAVE|MS_REC failed: %m");
3002 goto child_fail;
3003 }
3004
3005 if (mount_devices(arg_directory,
3006 root_device, root_device_rw,
3007 home_device, home_device_rw,
3008 srv_device, srv_device_rw) < 0)
3009 goto child_fail;
3010
3011 /* Turn directory into bind mount */
3012 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3013 log_error("Failed to make bind mount: %m");
3014 goto child_fail;
3015 }
3016
3017 if (arg_read_only) {
3018 k = bind_remount_recursive(arg_directory, true);
3019 if (k < 0) {
3020 log_error("Failed to make tree read-only: %s", strerror(-k));
3021 goto child_fail;
3022 }
3023 }
3024
3025 if (mount_all(arg_directory) < 0)
3026 goto child_fail;
3027
3028 if (copy_devnodes(arg_directory) < 0)
3029 goto child_fail;
3030
3031 if (setup_ptmx(arg_directory) < 0)
3032 goto child_fail;
3033
3034 dev_setup(arg_directory);
3035
3036 if (audit_still_doesnt_work_in_containers() < 0)
3037 goto child_fail;
3038
3039 if (setup_dev_console(arg_directory, console) < 0)
3040 goto child_fail;
3041
3042 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3043 goto child_fail;
3044
3045 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3046
3047 if (setup_boot_id(arg_directory) < 0)
3048 goto child_fail;
3049
3050 if (setup_timezone(arg_directory) < 0)
3051 goto child_fail;
3052
3053 if (setup_resolv_conf(arg_directory) < 0)
3054 goto child_fail;
3055
3056 if (setup_journal(arg_directory) < 0)
3057 goto child_fail;
3058
3059 if (mount_binds(arg_directory, arg_bind, false) < 0)
3060 goto child_fail;
3061
3062 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3063 goto child_fail;
3064
3065 if (mount_tmpfs(arg_directory) < 0)
3066 goto child_fail;
3067
3068 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3069 goto child_fail;
3070
3071 /* Tell the parent that we are ready, and that
3072 * it can cgroupify us to that we lack access
3073 * to certain devices and resources. */
3074 r = eventfd_send_state(eventfds[1],
3075 EVENTFD_CHILD_SUCCEEDED);
3076 eventfds[1] = safe_close(eventfds[1]);
3077 if (r < 0)
3078 goto child_fail;
3079
3080 if (chdir(arg_directory) < 0) {
3081 log_error("chdir(%s) failed: %m", arg_directory);
3082 goto child_fail;
3083 }
3084
3085 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3086 log_error("mount(MS_MOVE) failed: %m");
3087 goto child_fail;
3088 }
3089
3090 if (chroot(".") < 0) {
3091 log_error("chroot() failed: %m");
3092 goto child_fail;
3093 }
3094
3095 if (chdir("/") < 0) {
3096 log_error("chdir() failed: %m");
3097 goto child_fail;
3098 }
3099
3100 umask(0022);
3101
3102 if (arg_private_network)
3103 loopback_setup();
3104
3105 if (drop_capabilities() < 0) {
3106 log_error("drop_capabilities() failed: %m");
3107 goto child_fail;
3108 }
3109
3110 r = change_uid_gid(&home);
3111 if (r < 0)
3112 goto child_fail;
3113
3114 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3115 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3116 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3117 log_oom();
3118 goto child_fail;
3119 }
3120
3121 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3122 char as_uuid[37];
3123
3124 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3125 log_oom();
3126 goto child_fail;
3127 }
3128 }
3129
3130 if (fdset_size(fds) > 0) {
3131 k = fdset_cloexec(fds, false);
3132 if (k < 0) {
3133 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3134 goto child_fail;
3135 }
3136
3137 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3138 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3139 log_oom();
3140 goto child_fail;
3141 }
3142 }
3143
3144 setup_hostname();
3145
3146 if (arg_personality != 0xffffffffLU) {
3147 if (personality(arg_personality) < 0) {
3148 log_error("personality() failed: %m");
3149 goto child_fail;
3150 }
3151 } else if (secondary) {
3152 if (personality(PER_LINUX32) < 0) {
3153 log_error("personality() failed: %m");
3154 goto child_fail;
3155 }
3156 }
3157
3158#ifdef HAVE_SELINUX
3159 if (arg_selinux_context)
3160 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3161 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3162 goto child_fail;
3163 }
3164#endif
3165
3166 if (!strv_isempty(arg_setenv)) {
3167 char **n;
3168
3169 n = strv_env_merge(2, envp, arg_setenv);
3170 if (!n) {
3171 log_oom();
3172 goto child_fail;
3173 }
3174
3175 env_use = n;
3176 } else
3177 env_use = (char**) envp;
3178
3179 /* Wait until the parent is ready with the setup, too... */
3180 r = eventfd_parent_succeeded(eventfds[0]);
3181 eventfds[0] = safe_close(eventfds[0]);
3182 if (r < 0)
3183 goto child_fail;
3184
3185 if (arg_boot) {
3186 char **a;
3187 size_t l;
3188
3189 /* Automatically search for the init system */
3190
3191 l = 1 + argc - optind;
3192 a = newa(char*, l + 1);
3193 memcpy(a + 1, argv + optind, l * sizeof(char*));
3194
3195 a[0] = (char*) "/usr/lib/systemd/systemd";
3196 execve(a[0], a, env_use);
3197
3198 a[0] = (char*) "/lib/systemd/systemd";
3199 execve(a[0], a, env_use);
3200
3201 a[0] = (char*) "/sbin/init";
3202 execve(a[0], a, env_use);
3203 } else if (argc > optind)
3204 execvpe(argv[optind], argv + optind, env_use);
3205 else {
3206 chdir(home ? home : "/root");
3207 execle("/bin/bash", "-bash", NULL, env_use);
3208 execle("/bin/sh", "-sh", NULL, env_use);
3209 }
3210
3211 log_error("execv() failed: %m");
3212
3213 child_fail:
3214 /* Tell the parent that the setup failed, so he
3215 * can clean up resources and terminate. */
3216 if (eventfds[1] != -1)
3217 eventfd_send_state(eventfds[1],
3218 EVENTFD_CHILD_FAILED);
3219 _exit(EXIT_FAILURE);
3220 }
3221
3222 fdset_free(fds);
3223 fds = NULL;
3224
3225 /* Wait for the child event:
3226 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3227 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3228 * it is ready with all it needs to do with priviliges.
3229 * After we got the notification we can make the process
3230 * join its cgroup which might limit what it can do */
3231 r = eventfd_child_succeeded(eventfds[1]);
3232 eventfds[1] = safe_close(eventfds[1]);
3233 if (r < 0)
3234 goto check_container_status;
3235
3236 r = register_machine(pid);
3237 if (r < 0)
3238 goto finish;
3239
3240 r = move_network_interfaces(pid);
3241 if (r < 0)
3242 goto finish;
3243
3244 r = setup_veth(pid, veth_name);
3245 if (r < 0)
3246 goto finish;
3247
3248 r = setup_bridge(veth_name);
3249 if (r < 0)
3250 goto finish;
3251
3252 r = setup_macvlan(pid);
3253 if (r < 0)
3254 goto finish;
3255
3256 /* Block SIGCHLD here, before notifying child.
3257 * process_pty() will handle it with the other signals. */
3258 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3259 if (r < 0)
3260 goto finish;
3261
3262 /* Reset signal to default */
3263 r = default_signals(SIGCHLD, -1);
3264 if (r < 0)
3265 goto finish;
3266
3267 /* Notify the child that the parent is ready with all
3268 * its setup, and that the child can now hand over
3269 * control to the code to run inside the container. */
3270 r = eventfd_send_state(eventfds[0],
3271 EVENTFD_PARENT_SUCCEEDED);
3272 eventfds[0] = safe_close(eventfds[0]);
3273 if (r < 0)
3274 goto finish;
3275
3276 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3277 if (k < 0) {
3278 r = EXIT_FAILURE;
3279 break;
3280 }
3281
3282 if (!arg_quiet)
3283 putc('\n', stdout);
3284
3285 /* Kill if it is not dead yet anyway */
3286 terminate_machine(pid);
3287
3288check_container_status:
3289 /* Redundant, but better safe than sorry */
3290 kill(pid, SIGKILL);
3291
3292 r = wait_for_container(pid, &container_status);
3293 pid = 0;
3294
3295 if (r < 0) {
3296 r = EXIT_FAILURE;
3297 break;
3298 } else if (container_status == CONTAINER_TERMINATED)
3299 break;
3300
3301 /* CONTAINER_REBOOTED, loop again */
3302 }
3303
3304finish:
3305 loop_remove(loop_nr, &image_fd);
3306
3307 if (pid > 0)
3308 kill(pid, SIGKILL);
3309
3310 free(arg_directory);
3311 free(arg_machine);
3312 free(arg_user);
3313 strv_free(arg_setenv);
3314 strv_free(arg_network_interfaces);
3315 strv_free(arg_network_macvlan);
3316 strv_free(arg_bind);
3317 strv_free(arg_bind_ro);
3318 strv_free(arg_tmpfs);
3319
3320 return r;
3321}