]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
treewide: auto-convert the simple cases to log_*_errno()
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* How a container run ended.
 * NOTE(review): the consumers of this enum are outside this excerpt;
 * presumably REBOOTED makes the caller start the container again -- confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
102
/* Modes selectable through --link-journal= ("no", "auto", "host", "guest";
 * the "try-" variants additionally set arg_link_journal_try). */
typedef enum LinkJournal {
        LINK_NO = 0,
        LINK_AUTO = 1,
        LINK_HOST = 2,
        LINK_GUEST = 3
} LinkJournal;
109
/* Modes selectable through --volatile[=MODE]:
 *   VOLATILE_NO    - nothing special is set up
 *   VOLATILE_YES   - tmpfs as root directory, original /usr bind mounted
 *                    into it read-only (see setup_volatile())
 *   VOLATILE_STATE - tree remounted read-only, tmpfs mounted over /var
 *                    (see setup_volatile_state()) */
typedef enum Volatile {
        VOLATILE_NO = 0,
        VOLATILE_YES = 1,
        VOLATILE_STATE = 2,
} Volatile;
115
116 static char *arg_directory = NULL;
117 static char *arg_user = NULL;
118 static sd_id128_t arg_uuid = {};
119 static char *arg_machine = NULL;
120 static const char *arg_selinux_context = NULL;
121 static const char *arg_selinux_apifs_context = NULL;
122 static const char *arg_slice = NULL;
123 static bool arg_private_network = false;
124 static bool arg_read_only = false;
125 static bool arg_boot = false;
126 static LinkJournal arg_link_journal = LINK_AUTO;
127 static bool arg_link_journal_try = false;
128 static uint64_t arg_retain =
129 (1ULL << CAP_CHOWN) |
130 (1ULL << CAP_DAC_OVERRIDE) |
131 (1ULL << CAP_DAC_READ_SEARCH) |
132 (1ULL << CAP_FOWNER) |
133 (1ULL << CAP_FSETID) |
134 (1ULL << CAP_IPC_OWNER) |
135 (1ULL << CAP_KILL) |
136 (1ULL << CAP_LEASE) |
137 (1ULL << CAP_LINUX_IMMUTABLE) |
138 (1ULL << CAP_NET_BIND_SERVICE) |
139 (1ULL << CAP_NET_BROADCAST) |
140 (1ULL << CAP_NET_RAW) |
141 (1ULL << CAP_SETGID) |
142 (1ULL << CAP_SETFCAP) |
143 (1ULL << CAP_SETPCAP) |
144 (1ULL << CAP_SETUID) |
145 (1ULL << CAP_SYS_ADMIN) |
146 (1ULL << CAP_SYS_CHROOT) |
147 (1ULL << CAP_SYS_NICE) |
148 (1ULL << CAP_SYS_PTRACE) |
149 (1ULL << CAP_SYS_TTY_CONFIG) |
150 (1ULL << CAP_SYS_RESOURCE) |
151 (1ULL << CAP_SYS_BOOT) |
152 (1ULL << CAP_AUDIT_WRITE) |
153 (1ULL << CAP_AUDIT_CONTROL) |
154 (1ULL << CAP_MKNOD);
155 static char **arg_bind = NULL;
156 static char **arg_bind_ro = NULL;
157 static char **arg_tmpfs = NULL;
158 static char **arg_setenv = NULL;
159 static bool arg_quiet = false;
160 static bool arg_share_system = false;
161 static bool arg_register = true;
162 static bool arg_keep_unit = false;
163 static char **arg_network_interfaces = NULL;
164 static char **arg_network_macvlan = NULL;
165 static bool arg_network_veth = false;
166 static const char *arg_network_bridge = NULL;
167 static unsigned long arg_personality = 0xffffffffLU;
168 static const char *arg_image = NULL;
169 static Volatile arg_volatile = VOLATILE_NO;
170
171 static void help(void) {
172 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
173 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
174 " -h --help Show this help\n"
175 " --version Print version string\n"
176 " -q --quiet Do not show status information\n"
177 " -D --directory=PATH Root directory for the container\n"
178 " -i --image=PATH File system device or image for the container\n"
179 " -b --boot Boot up full system (i.e. invoke init)\n"
180 " -u --user=USER Run the command under specified user or uid\n"
181 " -M --machine=NAME Set the machine name for the container\n"
182 " --uuid=UUID Set a specific machine UUID for the container\n"
183 " -S --slice=SLICE Place the container in the specified slice\n"
184 " --private-network Disable network in container\n"
185 " --network-interface=INTERFACE\n"
186 " Assign an existing network interface to the\n"
187 " container\n"
188 " --network-macvlan=INTERFACE\n"
189 " Create a macvlan network interface based on an\n"
190 " existing network interface to the container\n"
191 " --network-veth Add a virtual ethernet connection between host\n"
192 " and container\n"
193 " --network-bridge=INTERFACE\n"
194 " Add a virtual ethernet connection between host\n"
195 " and container and add it to an existing bridge on\n"
196 " the host\n"
197 " -Z --selinux-context=SECLABEL\n"
198 " Set the SELinux security context to be used by\n"
199 " processes in the container\n"
200 " -L --selinux-apifs-context=SECLABEL\n"
201 " Set the SELinux security context to be used by\n"
202 " API/tmpfs file systems in the container\n"
203 " --capability=CAP In addition to the default, retain specified\n"
204 " capability\n"
205 " --drop-capability=CAP Drop the specified capability from the default set\n"
206 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
207 " try-guest, try-host\n"
208 " -j Equivalent to --link-journal=try-guest\n"
209 " --read-only Mount the root directory read-only\n"
210 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
211 " the container\n"
212 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
213 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
214 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
215 " --share-system Share system namespaces with host\n"
216 " --register=BOOLEAN Register container as machine\n"
217 " --keep-unit Do not register a scope for the machine, reuse\n"
218 " the service unit nspawn is running in\n"
219 " --volatile[=MODE] Run the system in volatile mode\n",
220 program_invocation_short_name);
221 }
222
223 static int parse_argv(int argc, char *argv[]) {
224
225 enum {
226 ARG_VERSION = 0x100,
227 ARG_PRIVATE_NETWORK,
228 ARG_UUID,
229 ARG_READ_ONLY,
230 ARG_CAPABILITY,
231 ARG_DROP_CAPABILITY,
232 ARG_LINK_JOURNAL,
233 ARG_BIND,
234 ARG_BIND_RO,
235 ARG_TMPFS,
236 ARG_SETENV,
237 ARG_SHARE_SYSTEM,
238 ARG_REGISTER,
239 ARG_KEEP_UNIT,
240 ARG_NETWORK_INTERFACE,
241 ARG_NETWORK_MACVLAN,
242 ARG_NETWORK_VETH,
243 ARG_NETWORK_BRIDGE,
244 ARG_PERSONALITY,
245 ARG_VOLATILE,
246 };
247
248 static const struct option options[] = {
249 { "help", no_argument, NULL, 'h' },
250 { "version", no_argument, NULL, ARG_VERSION },
251 { "directory", required_argument, NULL, 'D' },
252 { "user", required_argument, NULL, 'u' },
253 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
254 { "boot", no_argument, NULL, 'b' },
255 { "uuid", required_argument, NULL, ARG_UUID },
256 { "read-only", no_argument, NULL, ARG_READ_ONLY },
257 { "capability", required_argument, NULL, ARG_CAPABILITY },
258 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
259 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
260 { "bind", required_argument, NULL, ARG_BIND },
261 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
262 { "tmpfs", required_argument, NULL, ARG_TMPFS },
263 { "machine", required_argument, NULL, 'M' },
264 { "slice", required_argument, NULL, 'S' },
265 { "setenv", required_argument, NULL, ARG_SETENV },
266 { "selinux-context", required_argument, NULL, 'Z' },
267 { "selinux-apifs-context", required_argument, NULL, 'L' },
268 { "quiet", no_argument, NULL, 'q' },
269 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
270 { "register", required_argument, NULL, ARG_REGISTER },
271 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
272 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
273 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
274 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
275 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
276 { "personality", required_argument, NULL, ARG_PERSONALITY },
277 { "image", required_argument, NULL, 'i' },
278 { "volatile", optional_argument, NULL, ARG_VOLATILE },
279 {}
280 };
281
282 int c, r;
283 uint64_t plus = 0, minus = 0;
284
285 assert(argc >= 0);
286 assert(argv);
287
288 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
289
290 switch (c) {
291
292 case 'h':
293 help();
294 return 0;
295
296 case ARG_VERSION:
297 puts(PACKAGE_STRING);
298 puts(SYSTEMD_FEATURES);
299 return 0;
300
301 case 'D':
302 free(arg_directory);
303 arg_directory = canonicalize_file_name(optarg);
304 if (!arg_directory) {
305 log_error("Invalid root directory: %m");
306 return -ENOMEM;
307 }
308
309 break;
310
311 case 'i':
312 arg_image = optarg;
313 break;
314
315 case 'u':
316 free(arg_user);
317 arg_user = strdup(optarg);
318 if (!arg_user)
319 return log_oom();
320
321 break;
322
323 case ARG_NETWORK_BRIDGE:
324 arg_network_bridge = optarg;
325
326 /* fall through */
327
328 case ARG_NETWORK_VETH:
329 arg_network_veth = true;
330 arg_private_network = true;
331 break;
332
333 case ARG_NETWORK_INTERFACE:
334 if (strv_extend(&arg_network_interfaces, optarg) < 0)
335 return log_oom();
336
337 arg_private_network = true;
338 break;
339
340 case ARG_NETWORK_MACVLAN:
341 if (strv_extend(&arg_network_macvlan, optarg) < 0)
342 return log_oom();
343
344 /* fall through */
345
346 case ARG_PRIVATE_NETWORK:
347 arg_private_network = true;
348 break;
349
350 case 'b':
351 arg_boot = true;
352 break;
353
354 case ARG_UUID:
355 r = sd_id128_from_string(optarg, &arg_uuid);
356 if (r < 0) {
357 log_error("Invalid UUID: %s", optarg);
358 return r;
359 }
360 break;
361
362 case 'S':
363 arg_slice = optarg;
364 break;
365
366 case 'M':
367 if (isempty(optarg)) {
368 free(arg_machine);
369 arg_machine = NULL;
370 } else {
371
372 if (!hostname_is_valid(optarg)) {
373 log_error("Invalid machine name: %s", optarg);
374 return -EINVAL;
375 }
376
377 free(arg_machine);
378 arg_machine = strdup(optarg);
379 if (!arg_machine)
380 return log_oom();
381
382 break;
383 }
384
385 case 'Z':
386 arg_selinux_context = optarg;
387 break;
388
389 case 'L':
390 arg_selinux_apifs_context = optarg;
391 break;
392
393 case ARG_READ_ONLY:
394 arg_read_only = true;
395 break;
396
397 case ARG_CAPABILITY:
398 case ARG_DROP_CAPABILITY: {
399 const char *state, *word;
400 size_t length;
401
402 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
403 _cleanup_free_ char *t;
404 cap_value_t cap;
405
406 t = strndup(word, length);
407 if (!t)
408 return log_oom();
409
410 if (streq(t, "all")) {
411 if (c == ARG_CAPABILITY)
412 plus = (uint64_t) -1;
413 else
414 minus = (uint64_t) -1;
415 } else {
416 if (cap_from_name(t, &cap) < 0) {
417 log_error("Failed to parse capability %s.", t);
418 return -EINVAL;
419 }
420
421 if (c == ARG_CAPABILITY)
422 plus |= 1ULL << (uint64_t) cap;
423 else
424 minus |= 1ULL << (uint64_t) cap;
425 }
426 }
427
428 break;
429 }
430
431 case 'j':
432 arg_link_journal = LINK_GUEST;
433 arg_link_journal_try = true;
434 break;
435
436 case ARG_LINK_JOURNAL:
437 if (streq(optarg, "auto"))
438 arg_link_journal = LINK_AUTO;
439 else if (streq(optarg, "no"))
440 arg_link_journal = LINK_NO;
441 else if (streq(optarg, "guest"))
442 arg_link_journal = LINK_GUEST;
443 else if (streq(optarg, "host"))
444 arg_link_journal = LINK_HOST;
445 else if (streq(optarg, "try-guest")) {
446 arg_link_journal = LINK_GUEST;
447 arg_link_journal_try = true;
448 } else if (streq(optarg, "try-host")) {
449 arg_link_journal = LINK_HOST;
450 arg_link_journal_try = true;
451 } else {
452 log_error("Failed to parse link journal mode %s", optarg);
453 return -EINVAL;
454 }
455
456 break;
457
458 case ARG_BIND:
459 case ARG_BIND_RO: {
460 _cleanup_free_ char *a = NULL, *b = NULL;
461 char *e;
462 char ***x;
463
464 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
465
466 e = strchr(optarg, ':');
467 if (e) {
468 a = strndup(optarg, e - optarg);
469 b = strdup(e + 1);
470 } else {
471 a = strdup(optarg);
472 b = strdup(optarg);
473 }
474
475 if (!a || !b)
476 return log_oom();
477
478 if (!path_is_absolute(a) || !path_is_absolute(b)) {
479 log_error("Invalid bind mount specification: %s", optarg);
480 return -EINVAL;
481 }
482
483 r = strv_extend(x, a);
484 if (r < 0)
485 return log_oom();
486
487 r = strv_extend(x, b);
488 if (r < 0)
489 return log_oom();
490
491 break;
492 }
493
494 case ARG_TMPFS: {
495 _cleanup_free_ char *a = NULL, *b = NULL;
496 char *e;
497
498 e = strchr(optarg, ':');
499 if (e) {
500 a = strndup(optarg, e - optarg);
501 b = strdup(e + 1);
502 } else {
503 a = strdup(optarg);
504 b = strdup("mode=0755");
505 }
506
507 if (!a || !b)
508 return log_oom();
509
510 if (!path_is_absolute(a)) {
511 log_error("Invalid tmpfs specification: %s", optarg);
512 return -EINVAL;
513 }
514
515 r = strv_push(&arg_tmpfs, a);
516 if (r < 0)
517 return log_oom();
518
519 a = NULL;
520
521 r = strv_push(&arg_tmpfs, b);
522 if (r < 0)
523 return log_oom();
524
525 b = NULL;
526
527 break;
528 }
529
530 case ARG_SETENV: {
531 char **n;
532
533 if (!env_assignment_is_valid(optarg)) {
534 log_error("Environment variable assignment '%s' is not valid.", optarg);
535 return -EINVAL;
536 }
537
538 n = strv_env_set(arg_setenv, optarg);
539 if (!n)
540 return log_oom();
541
542 strv_free(arg_setenv);
543 arg_setenv = n;
544 break;
545 }
546
547 case 'q':
548 arg_quiet = true;
549 break;
550
551 case ARG_SHARE_SYSTEM:
552 arg_share_system = true;
553 break;
554
555 case ARG_REGISTER:
556 r = parse_boolean(optarg);
557 if (r < 0) {
558 log_error("Failed to parse --register= argument: %s", optarg);
559 return r;
560 }
561
562 arg_register = r;
563 break;
564
565 case ARG_KEEP_UNIT:
566 arg_keep_unit = true;
567 break;
568
569 case ARG_PERSONALITY:
570
571 arg_personality = personality_from_string(optarg);
572 if (arg_personality == 0xffffffffLU) {
573 log_error("Unknown or unsupported personality '%s'.", optarg);
574 return -EINVAL;
575 }
576
577 break;
578
579 case ARG_VOLATILE:
580
581 if (!optarg)
582 arg_volatile = VOLATILE_YES;
583 else {
584 r = parse_boolean(optarg);
585 if (r < 0) {
586 if (streq(optarg, "state"))
587 arg_volatile = VOLATILE_STATE;
588 else {
589 log_error("Failed to parse --volatile= argument: %s", optarg);
590 return r;
591 }
592 } else
593 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
594 }
595
596 break;
597
598 case '?':
599 return -EINVAL;
600
601 default:
602 assert_not_reached("Unhandled option");
603 }
604
605 if (arg_share_system)
606 arg_register = false;
607
608 if (arg_boot && arg_share_system) {
609 log_error("--boot and --share-system may not be combined.");
610 return -EINVAL;
611 }
612
613 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
614 log_error("--keep-unit may not be used when invoked from a user session.");
615 return -EINVAL;
616 }
617
618 if (arg_directory && arg_image) {
619 log_error("--directory= and --image= may not be combined.");
620 return -EINVAL;
621 }
622
623 if (arg_volatile != VOLATILE_NO && arg_read_only) {
624 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
625 return -EINVAL;
626 }
627
628 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
629
630 return 1;
631 }
632
633 static int mount_all(const char *dest) {
634
635 typedef struct MountPoint {
636 const char *what;
637 const char *where;
638 const char *type;
639 const char *options;
640 unsigned long flags;
641 bool fatal;
642 } MountPoint;
643
644 static const MountPoint mount_table[] = {
645 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
646 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
647 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
648 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
649 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
650 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
651 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
652 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
653 #ifdef HAVE_SELINUX
654 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
655 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
656 #endif
657 };
658
659 unsigned k;
660 int r = 0;
661
662 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
663 _cleanup_free_ char *where = NULL;
664 #ifdef HAVE_SELINUX
665 _cleanup_free_ char *options = NULL;
666 #endif
667 const char *o;
668 int t;
669
670 where = strjoin(dest, "/", mount_table[k].where, NULL);
671 if (!where)
672 return log_oom();
673
674 t = path_is_mount_point(where, true);
675 if (t < 0) {
676 log_error_errno(-t, "Failed to detect whether %s is a mount point: %m", where);
677
678 if (r == 0)
679 r = t;
680
681 continue;
682 }
683
684 /* Skip this entry if it is not a remount. */
685 if (mount_table[k].what && t > 0)
686 continue;
687
688 t = mkdir_p(where, 0755);
689 if (t < 0) {
690 if (mount_table[k].fatal) {
691 log_error_errno(-t, "Failed to create directory %s: %m", where);
692
693 if (r == 0)
694 r = t;
695 } else
696 log_warning_errno(-t, "Failed to create directory %s: %m", where);
697
698 continue;
699 }
700
701 #ifdef HAVE_SELINUX
702 if (arg_selinux_apifs_context &&
703 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
704 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
705 if (!options)
706 return log_oom();
707
708 o = options;
709 } else
710 #endif
711 o = mount_table[k].options;
712
713
714 if (mount(mount_table[k].what,
715 where,
716 mount_table[k].type,
717 mount_table[k].flags,
718 o) < 0) {
719
720 if (mount_table[k].fatal) {
721 log_error("mount(%s) failed: %m", where);
722
723 if (r == 0)
724 r = -errno;
725 } else
726 log_warning("mount(%s) failed: %m", where);
727 }
728 }
729
730 return r;
731 }
732
733 static int mount_binds(const char *dest, char **l, bool ro) {
734 char **x, **y;
735
736 STRV_FOREACH_PAIR(x, y, l) {
737 _cleanup_free_ char *where = NULL;
738 struct stat source_st, dest_st;
739 int r;
740
741 if (stat(*x, &source_st) < 0) {
742 log_error("Failed to stat %s: %m", *x);
743 return -errno;
744 }
745
746 where = strappend(dest, *y);
747 if (!where)
748 return log_oom();
749
750 r = stat(where, &dest_st);
751 if (r == 0) {
752 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
753 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
754 return -EINVAL;
755 }
756 } else if (errno == ENOENT) {
757 r = mkdir_parents_label(where, 0755);
758 if (r < 0) {
759 log_error_errno(-r, "Failed to bind mount %s: %m", *x);
760 return r;
761 }
762 } else {
763 log_error("Failed to bind mount %s: %m", *x);
764 return -errno;
765 }
766
767 /* Create the mount point, but be conservative -- refuse to create block
768 * and char devices. */
769 if (S_ISDIR(source_st.st_mode)) {
770 r = mkdir_label(where, 0755);
771 if (r < 0 && errno != EEXIST) {
772 log_error_errno(-r, "Failed to create mount point %s: %m", where);
773
774 return r;
775 }
776 } else if (S_ISFIFO(source_st.st_mode)) {
777 r = mkfifo(where, 0644);
778 if (r < 0 && errno != EEXIST) {
779 log_error("Failed to create mount point %s: %m", where);
780
781 return -errno;
782 }
783 } else if (S_ISSOCK(source_st.st_mode)) {
784 r = mknod(where, 0644 | S_IFSOCK, 0);
785 if (r < 0 && errno != EEXIST) {
786 log_error("Failed to create mount point %s: %m", where);
787
788 return -errno;
789 }
790 } else if (S_ISREG(source_st.st_mode)) {
791 r = touch(where);
792 if (r < 0) {
793 log_error_errno(-r, "Failed to create mount point %s: %m", where);
794
795 return r;
796 }
797 } else {
798 log_error("Refusing to create mountpoint for file: %s", *x);
799 return -ENOTSUP;
800 }
801
802 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
803 log_error("mount(%s) failed: %m", where);
804 return -errno;
805 }
806
807 if (ro) {
808 r = bind_remount_recursive(where, true);
809 if (r < 0) {
810 log_error_errno(-r, "Read-Only bind mount failed: %m");
811 return r;
812 }
813 }
814 }
815
816 return 0;
817 }
818
819 static int mount_tmpfs(const char *dest) {
820 char **i, **o;
821
822 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
823 _cleanup_free_ char *where = NULL;
824 int r;
825
826 where = strappend(dest, *i);
827 if (!where)
828 return log_oom();
829
830 r = mkdir_label(where, 0755);
831 if (r < 0 && errno != EEXIST) {
832 log_error_errno(-r, "creating mount point for tmpfs %s failed: %m", where);
833
834 return r;
835 }
836
837 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
838 log_error("tmpfs mount to %s failed: %m", where);
839 return -errno;
840 }
841 }
842
843 return 0;
844 }
845
846 static int setup_timezone(const char *dest) {
847 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
848 char *z, *y;
849 int r;
850
851 assert(dest);
852
853 /* Fix the timezone, if possible */
854 r = readlink_malloc("/etc/localtime", &p);
855 if (r < 0) {
856 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
857 return 0;
858 }
859
860 z = path_startswith(p, "../usr/share/zoneinfo/");
861 if (!z)
862 z = path_startswith(p, "/usr/share/zoneinfo/");
863 if (!z) {
864 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
865 return 0;
866 }
867
868 where = strappend(dest, "/etc/localtime");
869 if (!where)
870 return log_oom();
871
872 r = readlink_malloc(where, &q);
873 if (r >= 0) {
874 y = path_startswith(q, "../usr/share/zoneinfo/");
875 if (!y)
876 y = path_startswith(q, "/usr/share/zoneinfo/");
877
878 /* Already pointing to the right place? Then do nothing .. */
879 if (y && streq(y, z))
880 return 0;
881 }
882
883 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
884 if (!check)
885 return log_oom();
886
887 if (access(check, F_OK) < 0) {
888 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
889 return 0;
890 }
891
892 what = strappend("../usr/share/zoneinfo/", z);
893 if (!what)
894 return log_oom();
895
896 r = mkdir_parents(where, 0755);
897 if (r < 0) {
898 log_error_errno(-r, "Failed to create directory for timezone info %s in container: %m", where);
899
900 return 0;
901 }
902
903 r = unlink(where);
904 if (r < 0 && errno != ENOENT) {
905 log_error("Failed to remove existing timezone info %s in container: %m", where);
906
907 return 0;
908 }
909
910 if (symlink(what, where) < 0) {
911 log_error("Failed to correct timezone of container: %m");
912 return 0;
913 }
914
915 return 0;
916 }
917
918 static int setup_resolv_conf(const char *dest) {
919 _cleanup_free_ char *where = NULL;
920 int r;
921
922 assert(dest);
923
924 if (arg_private_network)
925 return 0;
926
927 /* Fix resolv.conf, if possible */
928 where = strappend(dest, "/etc/resolv.conf");
929 if (!where)
930 return log_oom();
931
932 /* We don't really care for the results of this really. If it
933 * fails, it fails, but meh... */
934 r = mkdir_parents(where, 0755);
935 if (r < 0) {
936 log_warning_errno(-r, "Failed to create parent directory for resolv.conf %s: %m", where);
937
938 return 0;
939 }
940
941 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
942 if (r < 0) {
943 log_warning_errno(-r, "Failed to copy /etc/resolv.conf to %s: %m", where);
944
945 return 0;
946 }
947
948 return 0;
949 }
950
951 static int setup_volatile_state(const char *directory) {
952 const char *p;
953 int r;
954
955 assert(directory);
956
957 if (arg_volatile != VOLATILE_STATE)
958 return 0;
959
960 /* --volatile=state means we simply overmount /var
961 with a tmpfs, and the rest read-only. */
962
963 r = bind_remount_recursive(directory, true);
964 if (r < 0) {
965 log_error_errno(-r, "Failed to remount %s read-only: %m", directory);
966 return r;
967 }
968
969 p = strappenda(directory, "/var");
970 r = mkdir(p, 0755);
971 if (r < 0 && errno != EEXIST) {
972 log_error("Failed to create %s: %m", directory);
973 return -errno;
974 }
975
976 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
977 log_error("Failed to mount tmpfs to /var: %m");
978 return -errno;
979 }
980
981 return 0;
982 }
983
984 static int setup_volatile(const char *directory) {
985 bool tmpfs_mounted = false, bind_mounted = false;
986 char template[] = "/tmp/nspawn-volatile-XXXXXX";
987 const char *f, *t;
988 int r;
989
990 assert(directory);
991
992 if (arg_volatile != VOLATILE_YES)
993 return 0;
994
995 /* --volatile=yes means we mount a tmpfs to the root dir, and
996 the original /usr to use inside it, and that read-only. */
997
998 if (!mkdtemp(template)) {
999 log_error("Failed to create temporary directory: %m");
1000 return -errno;
1001 }
1002
1003 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1004 log_error("Failed to mount tmpfs for root directory: %m");
1005 r = -errno;
1006 goto fail;
1007 }
1008
1009 tmpfs_mounted = true;
1010
1011 f = strappenda(directory, "/usr");
1012 t = strappenda(template, "/usr");
1013
1014 r = mkdir(t, 0755);
1015 if (r < 0 && errno != EEXIST) {
1016 log_error("Failed to create %s: %m", t);
1017 r = -errno;
1018 goto fail;
1019 }
1020
1021 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1022 log_error("Failed to create /usr bind mount: %m");
1023 r = -errno;
1024 goto fail;
1025 }
1026
1027 bind_mounted = true;
1028
1029 r = bind_remount_recursive(t, true);
1030 if (r < 0) {
1031 log_error_errno(-r, "Failed to remount %s read-only: %m", t);
1032 goto fail;
1033 }
1034
1035 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1036 log_error("Failed to move root mount: %m");
1037 r = -errno;
1038 goto fail;
1039 }
1040
1041 rmdir(template);
1042
1043 return 0;
1044
1045 fail:
1046 if (bind_mounted)
1047 umount(t);
1048 if (tmpfs_mounted)
1049 umount(template);
1050 rmdir(template);
1051 return r;
1052 }
1053
1054 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1055
1056 snprintf(s, 37,
1057 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1058 SD_ID128_FORMAT_VAL(id));
1059
1060 return s;
1061 }
1062
1063 static int setup_boot_id(const char *dest) {
1064 _cleanup_free_ char *from = NULL, *to = NULL;
1065 sd_id128_t rnd = {};
1066 char as_uuid[37];
1067 int r;
1068
1069 assert(dest);
1070
1071 if (arg_share_system)
1072 return 0;
1073
1074 /* Generate a new randomized boot ID, so that each boot-up of
1075 * the container gets a new one */
1076
1077 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1078 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1079 if (!from || !to)
1080 return log_oom();
1081
1082 r = sd_id128_randomize(&rnd);
1083 if (r < 0) {
1084 log_error_errno(-r, "Failed to generate random boot id: %m");
1085 return r;
1086 }
1087
1088 id128_format_as_uuid(rnd, as_uuid);
1089
1090 r = write_string_file(from, as_uuid);
1091 if (r < 0) {
1092 log_error_errno(-r, "Failed to write boot id: %m");
1093 return r;
1094 }
1095
1096 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1097 log_error("Failed to bind mount boot id: %m");
1098 r = -errno;
1099 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1100 log_warning("Failed to make boot id read-only: %m");
1101
1102 unlink(from);
1103 return r;
1104 }
1105
1106 static int copy_devnodes(const char *dest) {
1107
1108 static const char devnodes[] =
1109 "null\0"
1110 "zero\0"
1111 "full\0"
1112 "random\0"
1113 "urandom\0"
1114 "tty\0"
1115 "net/tun\0";
1116
1117 const char *d;
1118 int r = 0;
1119 _cleanup_umask_ mode_t u;
1120
1121 assert(dest);
1122
1123 u = umask(0000);
1124
1125 NULSTR_FOREACH(d, devnodes) {
1126 _cleanup_free_ char *from = NULL, *to = NULL;
1127 struct stat st;
1128
1129 from = strappend("/dev/", d);
1130 to = strjoin(dest, "/dev/", d, NULL);
1131 if (!from || !to)
1132 return log_oom();
1133
1134 if (stat(from, &st) < 0) {
1135
1136 if (errno != ENOENT) {
1137 log_error("Failed to stat %s: %m", from);
1138 return -errno;
1139 }
1140
1141 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1142
1143 log_error("%s is not a char or block device, cannot copy", from);
1144 return -EIO;
1145
1146 } else {
1147 r = mkdir_parents(to, 0775);
1148 if (r < 0) {
1149 log_error_errno(-r, "Failed to create parent directory of %s: %m", to);
1150 return -r;
1151 }
1152
1153 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1154 log_error("mknod(%s) failed: %m", dest);
1155 return -errno;
1156 }
1157 }
1158 }
1159
1160 return r;
1161 }
1162
1163 static int setup_ptmx(const char *dest) {
1164 _cleanup_free_ char *p = NULL;
1165
1166 p = strappend(dest, "/dev/ptmx");
1167 if (!p)
1168 return log_oom();
1169
1170 if (symlink("pts/ptmx", p) < 0) {
1171 log_error("Failed to create /dev/ptmx symlink: %m");
1172 return -errno;
1173 }
1174
1175 return 0;
1176 }
1177
1178 static int setup_dev_console(const char *dest, const char *console) {
1179 _cleanup_umask_ mode_t u;
1180 const char *to;
1181 struct stat st;
1182 int r;
1183
1184 assert(dest);
1185 assert(console);
1186
1187 u = umask(0000);
1188
1189 if (stat("/dev/null", &st) < 0) {
1190 log_error("Failed to stat /dev/null: %m");
1191 return -errno;
1192 }
1193
1194 r = chmod_and_chown(console, 0600, 0, 0);
1195 if (r < 0) {
1196 log_error_errno(-r, "Failed to correct access mode for TTY: %m");
1197 return r;
1198 }
1199
1200 /* We need to bind mount the right tty to /dev/console since
1201 * ptys can only exist on pts file systems. To have something
1202 * to bind mount things on we create a device node first, and
1203 * use /dev/null for that since we the cgroups device policy
1204 * allows us to create that freely, while we cannot create
1205 * /dev/console. (Note that the major minor doesn't actually
1206 * matter here, since we mount it over anyway). */
1207
1208 to = strappenda(dest, "/dev/console");
1209 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1210 log_error("mknod() for /dev/console failed: %m");
1211 return -errno;
1212 }
1213
1214 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1215 log_error("Bind mount for /dev/console failed: %m");
1216 return -errno;
1217 }
1218
1219 return 0;
1220 }
1221
1222 static int setup_kmsg(const char *dest, int kmsg_socket) {
1223 _cleanup_free_ char *from = NULL, *to = NULL;
1224 int r, fd, k;
1225 _cleanup_umask_ mode_t u;
1226 union {
1227 struct cmsghdr cmsghdr;
1228 uint8_t buf[CMSG_SPACE(sizeof(int))];
1229 } control = {};
1230 struct msghdr mh = {
1231 .msg_control = &control,
1232 .msg_controllen = sizeof(control),
1233 };
1234 struct cmsghdr *cmsg;
1235
1236 assert(dest);
1237 assert(kmsg_socket >= 0);
1238
1239 u = umask(0000);
1240
1241 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1242 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1243 * on the reading side behave very similar to /proc/kmsg,
1244 * their writing side behaves differently from /dev/kmsg in
1245 * that writing blocks when nothing is reading. In order to
1246 * avoid any problems with containers deadlocking due to this
1247 * we simply make /dev/kmsg unavailable to the container. */
1248 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1249 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1250 return log_oom();
1251
1252 if (mkfifo(from, 0600) < 0) {
1253 log_error("mkfifo() for /dev/kmsg failed: %m");
1254 return -errno;
1255 }
1256
1257 r = chmod_and_chown(from, 0600, 0, 0);
1258 if (r < 0) {
1259 log_error_errno(-r, "Failed to correct access mode for /dev/kmsg: %m");
1260 return r;
1261 }
1262
1263 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1264 log_error("Bind mount for /proc/kmsg failed: %m");
1265 return -errno;
1266 }
1267
1268 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1269 if (fd < 0) {
1270 log_error("Failed to open fifo: %m");
1271 return -errno;
1272 }
1273
1274 cmsg = CMSG_FIRSTHDR(&mh);
1275 cmsg->cmsg_level = SOL_SOCKET;
1276 cmsg->cmsg_type = SCM_RIGHTS;
1277 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1278 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1279
1280 mh.msg_controllen = cmsg->cmsg_len;
1281
1282 /* Store away the fd in the socket, so that it stays open as
1283 * long as we run the child */
1284 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1285 safe_close(fd);
1286
1287 if (k < 0) {
1288 log_error("Failed to send FIFO fd: %m");
1289 return -errno;
1290 }
1291
1292 /* And now make the FIFO unavailable as /dev/kmsg... */
1293 unlink(from);
1294 return 0;
1295 }
1296
1297 static int setup_hostname(void) {
1298
1299 if (arg_share_system)
1300 return 0;
1301
1302 if (sethostname_idempotent(arg_machine) < 0)
1303 return -errno;
1304
1305 return 0;
1306 }
1307
1308 static int setup_journal(const char *directory) {
1309 sd_id128_t machine_id, this_id;
1310 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1311 char *id;
1312 int r;
1313
1314 p = strappend(directory, "/etc/machine-id");
1315 if (!p)
1316 return log_oom();
1317
1318 r = read_one_line_file(p, &b);
1319 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1320 return 0;
1321 else if (r < 0) {
1322 log_error_errno(-r, "Failed to read machine ID from %s: %m", p);
1323 return r;
1324 }
1325
1326 id = strstrip(b);
1327 if (isempty(id) && arg_link_journal == LINK_AUTO)
1328 return 0;
1329
1330 /* Verify validity */
1331 r = sd_id128_from_string(id, &machine_id);
1332 if (r < 0) {
1333 log_error_errno(-r, "Failed to parse machine ID from %s: %m", p);
1334 return r;
1335 }
1336
1337 r = sd_id128_get_machine(&this_id);
1338 if (r < 0) {
1339 log_error_errno(-r, "Failed to retrieve machine ID: %m");
1340 return r;
1341 }
1342
1343 if (sd_id128_equal(machine_id, this_id)) {
1344 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1345 "Host and machine ids are equal (%s): refusing to link journals", id);
1346 if (arg_link_journal == LINK_AUTO)
1347 return 0;
1348 return
1349 -EEXIST;
1350 }
1351
1352 if (arg_link_journal == LINK_NO)
1353 return 0;
1354
1355 free(p);
1356 p = strappend("/var/log/journal/", id);
1357 q = strjoin(directory, "/var/log/journal/", id, NULL);
1358 if (!p || !q)
1359 return log_oom();
1360
1361 if (path_is_mount_point(p, false) > 0) {
1362 if (arg_link_journal != LINK_AUTO) {
1363 log_error("%s: already a mount point, refusing to use for journal", p);
1364 return -EEXIST;
1365 }
1366
1367 return 0;
1368 }
1369
1370 if (path_is_mount_point(q, false) > 0) {
1371 if (arg_link_journal != LINK_AUTO) {
1372 log_error("%s: already a mount point, refusing to use for journal", q);
1373 return -EEXIST;
1374 }
1375
1376 return 0;
1377 }
1378
1379 r = readlink_and_make_absolute(p, &d);
1380 if (r >= 0) {
1381 if ((arg_link_journal == LINK_GUEST ||
1382 arg_link_journal == LINK_AUTO) &&
1383 path_equal(d, q)) {
1384
1385 r = mkdir_p(q, 0755);
1386 if (r < 0)
1387 log_warning("Failed to create directory %s: %m", q);
1388 return 0;
1389 }
1390
1391 if (unlink(p) < 0) {
1392 log_error("Failed to remove symlink %s: %m", p);
1393 return -errno;
1394 }
1395 } else if (r == -EINVAL) {
1396
1397 if (arg_link_journal == LINK_GUEST &&
1398 rmdir(p) < 0) {
1399
1400 if (errno == ENOTDIR) {
1401 log_error("%s already exists and is neither a symlink nor a directory", p);
1402 return r;
1403 } else {
1404 log_error("Failed to remove %s: %m", p);
1405 return -errno;
1406 }
1407 }
1408 } else if (r != -ENOENT) {
1409 log_error("readlink(%s) failed: %m", p);
1410 return r;
1411 }
1412
1413 if (arg_link_journal == LINK_GUEST) {
1414
1415 if (symlink(q, p) < 0) {
1416 if (arg_link_journal_try) {
1417 log_debug("Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1418 return 0;
1419 } else {
1420 log_error("Failed to symlink %s to %s: %m", q, p);
1421 return -errno;
1422 }
1423 }
1424
1425 r = mkdir_p(q, 0755);
1426 if (r < 0)
1427 log_warning("Failed to create directory %s: %m", q);
1428 return 0;
1429 }
1430
1431 if (arg_link_journal == LINK_HOST) {
1432 /* don't create parents here -- if the host doesn't have
1433 * permanent journal set up, don't force it here */
1434 r = mkdir(p, 0755);
1435 if (r < 0) {
1436 if (arg_link_journal_try) {
1437 log_debug("Failed to create %s, skipping journal setup: %m", p);
1438 return 0;
1439 } else {
1440 log_error("Failed to create %s: %m", p);
1441 return r;
1442 }
1443 }
1444
1445 } else if (access(p, F_OK) < 0)
1446 return 0;
1447
1448 if (dir_is_empty(q) == 0)
1449 log_warning("%s is not empty, proceeding anyway.", q);
1450
1451 r = mkdir_p(q, 0755);
1452 if (r < 0) {
1453 log_error("Failed to create %s: %m", q);
1454 return r;
1455 }
1456
1457 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1458 log_error("Failed to bind mount journal from host into guest: %m");
1459 return -errno;
1460 }
1461
1462 return 0;
1463 }
1464
/* Drops from the capability bounding set every capability that is not in
 * arg_retain, by passing the bitwise complement of arg_retain to
 * capability_bounding_set_drop(). Returns that function's result.
 * NOTE(review): the meaning of the second argument (false) is defined by
 * capability_bounding_set_drop(), which is not visible here -- confirm. */
static int drop_capabilities(void) {
        return capability_bounding_set_drop(~arg_retain, false);
}
1468
1469 static int register_machine(pid_t pid, int local_ifindex) {
1470 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1471 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1472 int r;
1473
1474 if (!arg_register)
1475 return 0;
1476
1477 r = sd_bus_default_system(&bus);
1478 if (r < 0) {
1479 log_error_errno(-r, "Failed to open system bus: %m");
1480 return r;
1481 }
1482
1483 if (arg_keep_unit) {
1484 r = sd_bus_call_method(
1485 bus,
1486 "org.freedesktop.machine1",
1487 "/org/freedesktop/machine1",
1488 "org.freedesktop.machine1.Manager",
1489 "RegisterMachineWithNetwork",
1490 &error,
1491 NULL,
1492 "sayssusai",
1493 arg_machine,
1494 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1495 "nspawn",
1496 "container",
1497 (uint32_t) pid,
1498 strempty(arg_directory),
1499 local_ifindex > 0 ? 1 : 0, local_ifindex);
1500 } else {
1501 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1502
1503 r = sd_bus_message_new_method_call(
1504 bus,
1505 &m,
1506 "org.freedesktop.machine1",
1507 "/org/freedesktop/machine1",
1508 "org.freedesktop.machine1.Manager",
1509 "CreateMachineWithNetwork");
1510 if (r < 0) {
1511 log_error_errno(-r, "Failed to create message: %m");
1512 return r;
1513 }
1514
1515 r = sd_bus_message_append(
1516 m,
1517 "sayssusai",
1518 arg_machine,
1519 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1520 "nspawn",
1521 "container",
1522 (uint32_t) pid,
1523 strempty(arg_directory),
1524 local_ifindex > 0 ? 1 : 0, local_ifindex);
1525 if (r < 0) {
1526 log_error_errno(-r, "Failed to append message arguments: %m");
1527 return r;
1528 }
1529
1530 r = sd_bus_message_open_container(m, 'a', "(sv)");
1531 if (r < 0) {
1532 log_error_errno(-r, "Failed to open container: %m");
1533 return r;
1534 }
1535
1536 if (!isempty(arg_slice)) {
1537 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1538 if (r < 0) {
1539 log_error_errno(-r, "Failed to append slice: %m");
1540 return r;
1541 }
1542 }
1543
1544 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1545 if (r < 0) {
1546 log_error_errno(-r, "Failed to add device policy: %m");
1547 return r;
1548 }
1549
1550 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1551 /* Allow the container to
1552 * access and create the API
1553 * device nodes, so that
1554 * PrivateDevices= in the
1555 * container can work
1556 * fine */
1557 "/dev/null", "rwm",
1558 "/dev/zero", "rwm",
1559 "/dev/full", "rwm",
1560 "/dev/random", "rwm",
1561 "/dev/urandom", "rwm",
1562 "/dev/tty", "rwm",
1563 "/dev/net/tun", "rwm",
1564 /* Allow the container
1565 * access to ptys. However,
1566 * do not permit the
1567 * container to ever create
1568 * these device nodes. */
1569 "/dev/pts/ptmx", "rw",
1570 "char-pts", "rw");
1571 if (r < 0) {
1572 log_error_errno(-r, "Failed to add device whitelist: %m");
1573 return r;
1574 }
1575
1576 r = sd_bus_message_close_container(m);
1577 if (r < 0) {
1578 log_error_errno(-r, "Failed to close container: %m");
1579 return r;
1580 }
1581
1582 r = sd_bus_call(bus, m, 0, &error, NULL);
1583 }
1584
1585 if (r < 0) {
1586 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1587 return r;
1588 }
1589
1590 return 0;
1591 }
1592
1593 static int terminate_machine(pid_t pid) {
1594 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1595 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1596 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1597 const char *path;
1598 int r;
1599
1600 if (!arg_register)
1601 return 0;
1602
1603 r = sd_bus_default_system(&bus);
1604 if (r < 0) {
1605 log_error_errno(-r, "Failed to open system bus: %m");
1606 return r;
1607 }
1608
1609 r = sd_bus_call_method(
1610 bus,
1611 "org.freedesktop.machine1",
1612 "/org/freedesktop/machine1",
1613 "org.freedesktop.machine1.Manager",
1614 "GetMachineByPID",
1615 &error,
1616 &reply,
1617 "u",
1618 (uint32_t) pid);
1619 if (r < 0) {
1620 /* Note that the machine might already have been
1621 * cleaned up automatically, hence don't consider it a
1622 * failure if we cannot get the machine object. */
1623 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1624 return 0;
1625 }
1626
1627 r = sd_bus_message_read(reply, "o", &path);
1628 if (r < 0)
1629 return bus_log_parse_error(r);
1630
1631 r = sd_bus_call_method(
1632 bus,
1633 "org.freedesktop.machine1",
1634 path,
1635 "org.freedesktop.machine1.Machine",
1636 "Terminate",
1637 &error,
1638 NULL,
1639 NULL);
1640 if (r < 0) {
1641 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1642 return 0;
1643 }
1644
1645 return 0;
1646 }
1647
1648 static int reset_audit_loginuid(void) {
1649 _cleanup_free_ char *p = NULL;
1650 int r;
1651
1652 if (arg_share_system)
1653 return 0;
1654
1655 r = read_one_line_file("/proc/self/loginuid", &p);
1656 if (r == -ENOENT)
1657 return 0;
1658 if (r < 0) {
1659 log_error_errno(-r, "Failed to read /proc/self/loginuid: %m");
1660 return r;
1661 }
1662
1663 /* Already reset? */
1664 if (streq(p, "4294967295"))
1665 return 0;
1666
1667 r = write_string_file("/proc/self/loginuid", "4294967295");
1668 if (r < 0) {
1669 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1670 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1671 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1672 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1673 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1674
1675 sleep(5);
1676 }
1677
1678 return 0;
1679 }
1680
/* Fixed 128-bit keys handed to generate_mac() as the siphash24 key:
 * HOST_HASH_KEY is used for the host side MAC address of the veth link,
 * CONTAINER_HASH_KEY for the container side. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1683
1684 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1685 int r;
1686
1687 uint8_t result[8];
1688 size_t l, sz;
1689 uint8_t *v;
1690
1691 l = strlen(arg_machine);
1692 sz = sizeof(sd_id128_t) + l;
1693 v = alloca(sz);
1694
1695 /* fetch some persistent data unique to the host */
1696 r = sd_id128_get_machine((sd_id128_t*) v);
1697 if (r < 0)
1698 return r;
1699
1700 /* combine with some data unique (on this host) to this
1701 * container instance */
1702 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1703
1704 /* Let's hash the host machine ID plus the container name. We
1705 * use a fixed, but originally randomly created hash key here. */
1706 siphash24(result, v, sz, hash_key.bytes);
1707
1708 assert_cc(ETH_ALEN <= sizeof(result));
1709 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1710
1711 /* see eth_random_addr in the kernel */
1712 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1713 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1714
1715 return 0;
1716 }
1717
1718 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1719 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1720 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1721 struct ether_addr mac_host, mac_container;
1722 int r, i;
1723
1724 if (!arg_private_network)
1725 return 0;
1726
1727 if (!arg_network_veth)
1728 return 0;
1729
1730 /* Use two different interface name prefixes depending whether
1731 * we are in bridge mode or not. */
1732 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1733 arg_network_bridge ? "vb" : "ve", arg_machine);
1734
1735 r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1736 if (r < 0) {
1737 log_error("Failed to generate predictable MAC address for container side");
1738 return r;
1739 }
1740
1741 r = generate_mac(&mac_host, HOST_HASH_KEY);
1742 if (r < 0) {
1743 log_error("Failed to generate predictable MAC address for host side");
1744 return r;
1745 }
1746
1747 r = sd_rtnl_open(&rtnl, 0);
1748 if (r < 0) {
1749 log_error_errno(-r, "Failed to connect to netlink: %m");
1750 return r;
1751 }
1752
1753 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1754 if (r < 0) {
1755 log_error_errno(-r, "Failed to allocate netlink message: %m");
1756 return r;
1757 }
1758
1759 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1760 if (r < 0) {
1761 log_error_errno(-r, "Failed to add netlink interface name: %m");
1762 return r;
1763 }
1764
1765 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1766 if (r < 0) {
1767 log_error_errno(-r, "Failed to add netlink MAC address: %m");
1768 return r;
1769 }
1770
1771 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1772 if (r < 0) {
1773 log_error_errno(-r, "Failed to open netlink container: %m");
1774 return r;
1775 }
1776
1777 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1778 if (r < 0) {
1779 log_error_errno(-r, "Failed to open netlink container: %m");
1780 return r;
1781 }
1782
1783 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1784 if (r < 0) {
1785 log_error_errno(-r, "Failed to open netlink container: %m");
1786 return r;
1787 }
1788
1789 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1790 if (r < 0) {
1791 log_error_errno(-r, "Failed to add netlink interface name: %m");
1792 return r;
1793 }
1794
1795 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1796 if (r < 0) {
1797 log_error_errno(-r, "Failed to add netlink MAC address: %m");
1798 return r;
1799 }
1800
1801 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1802 if (r < 0) {
1803 log_error_errno(-r, "Failed to add netlink namespace field: %m");
1804 return r;
1805 }
1806
1807 r = sd_rtnl_message_close_container(m);
1808 if (r < 0) {
1809 log_error_errno(-r, "Failed to close netlink container: %m");
1810 return r;
1811 }
1812
1813 r = sd_rtnl_message_close_container(m);
1814 if (r < 0) {
1815 log_error_errno(-r, "Failed to close netlink container: %m");
1816 return r;
1817 }
1818
1819 r = sd_rtnl_message_close_container(m);
1820 if (r < 0) {
1821 log_error_errno(-r, "Failed to close netlink container: %m");
1822 return r;
1823 }
1824
1825 r = sd_rtnl_call(rtnl, m, 0, NULL);
1826 if (r < 0) {
1827 log_error_errno(-r, "Failed to add new veth interfaces: %m");
1828 return r;
1829 }
1830
1831 i = (int) if_nametoindex(iface_name);
1832 if (i <= 0) {
1833 log_error("Failed to resolve interface %s: %m", iface_name);
1834 return -errno;
1835 }
1836
1837 *ifi = i;
1838
1839 return 0;
1840 }
1841
1842 static int setup_bridge(const char veth_name[], int *ifi) {
1843 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1844 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1845 int r, bridge;
1846
1847 if (!arg_private_network)
1848 return 0;
1849
1850 if (!arg_network_veth)
1851 return 0;
1852
1853 if (!arg_network_bridge)
1854 return 0;
1855
1856 bridge = (int) if_nametoindex(arg_network_bridge);
1857 if (bridge <= 0) {
1858 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1859 return -errno;
1860 }
1861
1862 *ifi = bridge;
1863
1864 r = sd_rtnl_open(&rtnl, 0);
1865 if (r < 0) {
1866 log_error_errno(-r, "Failed to connect to netlink: %m");
1867 return r;
1868 }
1869
1870 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1871 if (r < 0) {
1872 log_error_errno(-r, "Failed to allocate netlink message: %m");
1873 return r;
1874 }
1875
1876 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1877 if (r < 0) {
1878 log_error_errno(-r, "Failed to set IFF_UP flag: %m");
1879 return r;
1880 }
1881
1882 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1883 if (r < 0) {
1884 log_error_errno(-r, "Failed to add netlink interface name field: %m");
1885 return r;
1886 }
1887
1888 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1889 if (r < 0) {
1890 log_error_errno(-r, "Failed to add netlink master field: %m");
1891 return r;
1892 }
1893
1894 r = sd_rtnl_call(rtnl, m, 0, NULL);
1895 if (r < 0) {
1896 log_error_errno(-r, "Failed to add veth interface to bridge: %m");
1897 return r;
1898 }
1899
1900 return 0;
1901 }
1902
1903 static int parse_interface(struct udev *udev, const char *name) {
1904 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1905 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1906 int ifi;
1907
1908 ifi = (int) if_nametoindex(name);
1909 if (ifi <= 0) {
1910 log_error("Failed to resolve interface %s: %m", name);
1911 return -errno;
1912 }
1913
1914 sprintf(ifi_str, "n%i", ifi);
1915 d = udev_device_new_from_device_id(udev, ifi_str);
1916 if (!d) {
1917 log_error("Failed to get udev device for interface %s: %m", name);
1918 return -errno;
1919 }
1920
1921 if (udev_device_get_is_initialized(d) <= 0) {
1922 log_error("Network interface %s is not initialized yet.", name);
1923 return -EBUSY;
1924 }
1925
1926 return ifi;
1927 }
1928
1929 static int move_network_interfaces(pid_t pid) {
1930 _cleanup_udev_unref_ struct udev *udev = NULL;
1931 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1932 char **i;
1933 int r;
1934
1935 if (!arg_private_network)
1936 return 0;
1937
1938 if (strv_isempty(arg_network_interfaces))
1939 return 0;
1940
1941 r = sd_rtnl_open(&rtnl, 0);
1942 if (r < 0) {
1943 log_error_errno(-r, "Failed to connect to netlink: %m");
1944 return r;
1945 }
1946
1947 udev = udev_new();
1948 if (!udev) {
1949 log_error("Failed to connect to udev.");
1950 return -ENOMEM;
1951 }
1952
1953 STRV_FOREACH(i, arg_network_interfaces) {
1954 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1955 int ifi;
1956
1957 ifi = parse_interface(udev, *i);
1958 if (ifi < 0)
1959 return ifi;
1960
1961 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1962 if (r < 0) {
1963 log_error_errno(-r, "Failed to allocate netlink message: %m");
1964 return r;
1965 }
1966
1967 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1968 if (r < 0) {
1969 log_error_errno(-r, "Failed to append namespace PID to netlink message: %m");
1970 return r;
1971 }
1972
1973 r = sd_rtnl_call(rtnl, m, 0, NULL);
1974 if (r < 0) {
1975 log_error_errno(-r, "Failed to move interface %s to namespace: %m", *i);
1976 return r;
1977 }
1978 }
1979
1980 return 0;
1981 }
1982
1983 static int setup_macvlan(pid_t pid) {
1984 _cleanup_udev_unref_ struct udev *udev = NULL;
1985 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1986 char **i;
1987 int r;
1988
1989 if (!arg_private_network)
1990 return 0;
1991
1992 if (strv_isempty(arg_network_macvlan))
1993 return 0;
1994
1995 r = sd_rtnl_open(&rtnl, 0);
1996 if (r < 0) {
1997 log_error_errno(-r, "Failed to connect to netlink: %m");
1998 return r;
1999 }
2000
2001 udev = udev_new();
2002 if (!udev) {
2003 log_error("Failed to connect to udev.");
2004 return -ENOMEM;
2005 }
2006
2007 STRV_FOREACH(i, arg_network_macvlan) {
2008 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2009 _cleanup_free_ char *n = NULL;
2010 int ifi;
2011
2012 ifi = parse_interface(udev, *i);
2013 if (ifi < 0)
2014 return ifi;
2015
2016 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2017 if (r < 0) {
2018 log_error_errno(-r, "Failed to allocate netlink message: %m");
2019 return r;
2020 }
2021
2022 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2023 if (r < 0) {
2024 log_error_errno(-r, "Failed to add netlink interface index: %m");
2025 return r;
2026 }
2027
2028 n = strappend("mv-", *i);
2029 if (!n)
2030 return log_oom();
2031
2032 strshorten(n, IFNAMSIZ-1);
2033
2034 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2035 if (r < 0) {
2036 log_error_errno(-r, "Failed to add netlink interface name: %m");
2037 return r;
2038 }
2039
2040 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2041 if (r < 0) {
2042 log_error_errno(-r, "Failed to add netlink namespace field: %m");
2043 return r;
2044 }
2045
2046 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2047 if (r < 0) {
2048 log_error_errno(-r, "Failed to open netlink container: %m");
2049 return r;
2050 }
2051
2052 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2053 if (r < 0) {
2054 log_error_errno(-r, "Failed to open netlink container: %m");
2055 return r;
2056 }
2057
2058 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2059 if (r < 0) {
2060 log_error_errno(-r, "Failed to append macvlan mode: %m");
2061 return r;
2062 }
2063
2064 r = sd_rtnl_message_close_container(m);
2065 if (r < 0) {
2066 log_error_errno(-r, "Failed to close netlink container: %m");
2067 return r;
2068 }
2069
2070 r = sd_rtnl_message_close_container(m);
2071 if (r < 0) {
2072 log_error_errno(-r, "Failed to close netlink container: %m");
2073 return r;
2074 }
2075
2076 r = sd_rtnl_call(rtnl, m, 0, NULL);
2077 if (r < 0) {
2078 log_error_errno(-r, "Failed to add new macvlan interfaces: %m");
2079 return r;
2080 }
2081 }
2082
2083 return 0;
2084 }
2085
/* Installs the seccomp filter for the container.
 *
 * With HAVE_SECCOMP the filter allows everything by default, makes a fixed
 * list of system calls fail with EPERM, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT. The
 * SCMP_FLTATR_CTL_NNP filter attribute is set to 0 before loading.
 * Without HAVE_SECCOMP this is a no-op.
 *
 * Returns 0 on success (or when compiled out) and a negative errno-style
 * value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int ret;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        ret = seccomp_add_secondary_archs(ctx);
        if (ret < 0) {
                log_error_errno(-ret, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blocked_syscalls); n++) {
                ret = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[n], 0);
                if (ret == -EFAULT)
                        continue; /* unknown syscall */
                if (ret < 0) {
                        log_error_errno(-ret, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers, and much of the userspace
         * audit hookup fails when run inside of one. We do not care and
         * simply prevent audit sockets from being created:
         * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with
         * EAFNOSUPPORT, which audit userspace takes as the sign that audit
         * is disabled in the kernel. */
        ret = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0) {
                log_error_errno(-ret, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        ret = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (ret < 0) {
                log_error_errno(-ret, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        ret = seccomp_load(ctx);
        if (ret < 0)
                log_error_errno(-ret, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path once it exists. */
        seccomp_release(ctx);
        return ret;
#else
        return 0;
#endif

}
2165
2166 static int setup_image(char **device_path, int *loop_nr) {
2167 struct loop_info64 info = {
2168 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2169 };
2170 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2171 _cleanup_free_ char* loopdev = NULL;
2172 struct stat st;
2173 int r, nr;
2174
2175 assert(device_path);
2176 assert(loop_nr);
2177
2178 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2179 if (fd < 0) {
2180 log_error("Failed to open %s: %m", arg_image);
2181 return -errno;
2182 }
2183
2184 if (fstat(fd, &st) < 0) {
2185 log_error("Failed to stat %s: %m", arg_image);
2186 return -errno;
2187 }
2188
2189 if (S_ISBLK(st.st_mode)) {
2190 char *p;
2191
2192 p = strdup(arg_image);
2193 if (!p)
2194 return log_oom();
2195
2196 *device_path = p;
2197
2198 *loop_nr = -1;
2199
2200 r = fd;
2201 fd = -1;
2202
2203 return r;
2204 }
2205
2206 if (!S_ISREG(st.st_mode)) {
2207 log_error("%s is not a regular file or block device: %m", arg_image);
2208 return -EINVAL;
2209 }
2210
2211 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2212 if (control < 0) {
2213 log_error("Failed to open /dev/loop-control: %m");
2214 return -errno;
2215 }
2216
2217 nr = ioctl(control, LOOP_CTL_GET_FREE);
2218 if (nr < 0) {
2219 log_error("Failed to allocate loop device: %m");
2220 return -errno;
2221 }
2222
2223 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2224 return log_oom();
2225
2226 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2227 if (loop < 0) {
2228 log_error("Failed to open loop device %s: %m", loopdev);
2229 return -errno;
2230 }
2231
2232 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2233 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2234 return -errno;
2235 }
2236
2237 if (arg_read_only)
2238 info.lo_flags |= LO_FLAGS_READ_ONLY;
2239
2240 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2241 log_error("Failed to set loopback settings on %s: %m", loopdev);
2242 return -errno;
2243 }
2244
2245 *device_path = loopdev;
2246 loopdev = NULL;
2247
2248 *loop_nr = nr;
2249
2250 r = loop;
2251 loop = -1;
2252
2253 return r;
2254 }
2255
2256 static int dissect_image(
2257 int fd,
2258 char **root_device, bool *root_device_rw,
2259 char **home_device, bool *home_device_rw,
2260 char **srv_device, bool *srv_device_rw,
2261 bool *secondary) {
2262
2263 #ifdef HAVE_BLKID
2264 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2265 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2266 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2267 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2268 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2269 _cleanup_udev_unref_ struct udev *udev = NULL;
2270 struct udev_list_entry *first, *item;
2271 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2272 const char *pttype = NULL;
2273 blkid_partlist pl;
2274 struct stat st;
2275 int r;
2276
2277 assert(fd >= 0);
2278 assert(root_device);
2279 assert(home_device);
2280 assert(srv_device);
2281 assert(secondary);
2282
2283 b = blkid_new_probe();
2284 if (!b)
2285 return log_oom();
2286
2287 errno = 0;
2288 r = blkid_probe_set_device(b, fd, 0, 0);
2289 if (r != 0) {
2290 if (errno == 0)
2291 return log_oom();
2292
2293 log_error("Failed to set device on blkid probe: %m");
2294 return -errno;
2295 }
2296
2297 blkid_probe_enable_partitions(b, 1);
2298 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2299
2300 errno = 0;
2301 r = blkid_do_safeprobe(b);
2302 if (r == -2 || r == 1) {
2303 log_error("Failed to identify any partition table on %s.\n"
2304 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2305 return -EINVAL;
2306 } else if (r != 0) {
2307 if (errno == 0)
2308 errno = EIO;
2309 log_error("Failed to probe: %m");
2310 return -errno;
2311 }
2312
2313 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2314 if (!streq_ptr(pttype, "gpt")) {
2315 log_error("Image %s does not carry a GUID Partition Table.\n"
2316 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2317 return -EINVAL;
2318 }
2319
2320 errno = 0;
2321 pl = blkid_probe_get_partitions(b);
2322 if (!pl) {
2323 if (errno == 0)
2324 return log_oom();
2325
2326 log_error("Failed to list partitions of %s", arg_image);
2327 return -errno;
2328 }
2329
2330 udev = udev_new();
2331 if (!udev)
2332 return log_oom();
2333
2334 if (fstat(fd, &st) < 0) {
2335 log_error("Failed to stat block device: %m");
2336 return -errno;
2337 }
2338
2339 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2340 if (!d)
2341 return log_oom();
2342
2343 e = udev_enumerate_new(udev);
2344 if (!e)
2345 return log_oom();
2346
2347 r = udev_enumerate_add_match_parent(e, d);
2348 if (r < 0)
2349 return log_oom();
2350
2351 r = udev_enumerate_scan_devices(e);
2352 if (r < 0) {
2353 log_error_errno(-r, "Failed to scan for partition devices of %s: %m", arg_image);
2354 return r;
2355 }
2356
2357 first = udev_enumerate_get_list_entry(e);
2358 udev_list_entry_foreach(item, first) {
2359 _cleanup_udev_device_unref_ struct udev_device *q;
2360 const char *stype, *node;
2361 unsigned long long flags;
2362 sd_id128_t type_id;
2363 blkid_partition pp;
2364 dev_t qn;
2365 int nr;
2366
2367 errno = 0;
2368 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2369 if (!q) {
2370 if (!errno)
2371 errno = ENOMEM;
2372
2373 log_error("Failed to get partition device of %s: %m", arg_image);
2374 return -errno;
2375 }
2376
2377 qn = udev_device_get_devnum(q);
2378 if (major(qn) == 0)
2379 continue;
2380
2381 if (st.st_rdev == qn)
2382 continue;
2383
2384 node = udev_device_get_devnode(q);
2385 if (!node)
2386 continue;
2387
2388 pp = blkid_partlist_devno_to_partition(pl, qn);
2389 if (!pp)
2390 continue;
2391
2392 flags = blkid_partition_get_flags(pp);
2393 if (flags & GPT_FLAG_NO_AUTO)
2394 continue;
2395
2396 nr = blkid_partition_get_partno(pp);
2397 if (nr < 0)
2398 continue;
2399
2400 stype = blkid_partition_get_type_string(pp);
2401 if (!stype)
2402 continue;
2403
2404 if (sd_id128_from_string(stype, &type_id) < 0)
2405 continue;
2406
2407 if (sd_id128_equal(type_id, GPT_HOME)) {
2408
2409 if (home && nr >= home_nr)
2410 continue;
2411
2412 home_nr = nr;
2413 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2414
2415 free(home);
2416 home = strdup(node);
2417 if (!home)
2418 return log_oom();
2419 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2420
2421 if (srv && nr >= srv_nr)
2422 continue;
2423
2424 srv_nr = nr;
2425 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2426
2427 free(srv);
2428 srv = strdup(node);
2429 if (!srv)
2430 return log_oom();
2431 }
2432 #ifdef GPT_ROOT_NATIVE
2433 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2434
2435 if (root && nr >= root_nr)
2436 continue;
2437
2438 root_nr = nr;
2439 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2440
2441 free(root);
2442 root = strdup(node);
2443 if (!root)
2444 return log_oom();
2445 }
2446 #endif
2447 #ifdef GPT_ROOT_SECONDARY
2448 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2449
2450 if (secondary_root && nr >= secondary_root_nr)
2451 continue;
2452
2453 secondary_root_nr = nr;
2454 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2455
2456
2457 free(secondary_root);
2458 secondary_root = strdup(node);
2459 if (!secondary_root)
2460 return log_oom();
2461 }
2462 #endif
2463 }
2464
2465 if (!root && !secondary_root) {
2466 log_error("Failed to identify root partition in disk image %s.\n"
2467 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2468 return -EINVAL;
2469 }
2470
2471 if (root) {
2472 *root_device = root;
2473 root = NULL;
2474
2475 *root_device_rw = root_rw;
2476 *secondary = false;
2477 } else if (secondary_root) {
2478 *root_device = secondary_root;
2479 secondary_root = NULL;
2480
2481 *root_device_rw = secondary_root_rw;
2482 *secondary = true;
2483 }
2484
2485 if (home) {
2486 *home_device = home;
2487 home = NULL;
2488
2489 *home_device_rw = home_rw;
2490 }
2491
2492 if (srv) {
2493 *srv_device = srv;
2494 srv = NULL;
2495
2496 *srv_device_rw = srv_rw;
2497 }
2498
2499 return 0;
2500 #else
2501 log_error("--image= is not supported, compiled without blkid support.");
2502 return -ENOTSUP;
2503 #endif
2504 }
2505
/*
 * Probe the file system type of the block device "what" with libblkid and
 * mount it on "where", or on "where" with "directory" appended when
 * "directory" is non-NULL.
 *
 * The mount is read-only when "rw" is false or when arg_read_only is set.
 *
 * Returns 0 on success and a negative errno-style value on failure. Devices
 * whose detected type is "crypto_LUKS" are refused with -ENOTSUP. When built
 * without HAVE_BLKID the function always fails with -ENOTSUP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* Clear errno first so that a NULL return with errno still 0 can be
         * told apart from a failure that did set errno */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing was detected, -2: the result is ambivalent. In
                 * both cases there is no usable file system type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* -1 (or anything unexpected): a genuine probing error */
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2571
/*
 * Mount the partitions of a dissected disk image below "where".
 *
 * The root device is mounted on "where" itself, the home device on
 * "where"/home and the server data device on "where"/srv. A NULL device is
 * skipped. Each *_rw flag is passed through to mount_device() for the
 * corresponding device.
 *
 * Returns 0 on success, or the negative value returned by the first failing
 * mount_device() call.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error_errno(-r, "Failed to mount root directory: %m");
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error_errno(-r, "Failed to mount home directory: %m");
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error_errno(-r, "Failed to mount server data directory: %m");
                        return r;
                }
        }

        return 0;
}
2607
2608 static void loop_remove(int nr, int *image_fd) {
2609 _cleanup_close_ int control = -1;
2610 int r;
2611
2612 if (nr < 0)
2613 return;
2614
2615 if (image_fd && *image_fd >= 0) {
2616 r = ioctl(*image_fd, LOOP_CLR_FD);
2617 if (r < 0)
2618 log_warning("Failed to close loop image: %m");
2619 *image_fd = safe_close(*image_fd);
2620 }
2621
2622 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2623 if (control < 0) {
2624 log_warning("Failed to open /dev/loop-control: %m");
2625 return;
2626 }
2627
2628 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2629 if (r < 0)
2630 log_warning("Failed to remove loop %d: %m", nr);
2631 }
2632
2633 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2634 int pipe_fds[2];
2635 pid_t pid;
2636
2637 assert(database);
2638 assert(key);
2639 assert(rpid);
2640
2641 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2642 log_error("Failed to allocate pipe: %m");
2643 return -errno;
2644 }
2645
2646 pid = fork();
2647 if (pid < 0) {
2648 log_error("Failed to fork getent child: %m");
2649 return -errno;
2650 } else if (pid == 0) {
2651 int nullfd;
2652 char *empty_env = NULL;
2653
2654 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2655 _exit(EXIT_FAILURE);
2656
2657 if (pipe_fds[0] > 2)
2658 safe_close(pipe_fds[0]);
2659 if (pipe_fds[1] > 2)
2660 safe_close(pipe_fds[1]);
2661
2662 nullfd = open("/dev/null", O_RDWR);
2663 if (nullfd < 0)
2664 _exit(EXIT_FAILURE);
2665
2666 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2667 _exit(EXIT_FAILURE);
2668
2669 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2670 _exit(EXIT_FAILURE);
2671
2672 if (nullfd > 2)
2673 safe_close(nullfd);
2674
2675 reset_all_signal_handlers();
2676 close_all_fds(NULL, 0);
2677
2678 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2679 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2680 _exit(EXIT_FAILURE);
2681 }
2682
2683 pipe_fds[1] = safe_close(pipe_fds[1]);
2684
2685 *rpid = pid;
2686
2687 return pipe_fds[0];
2688 }
2689
2690 static int change_uid_gid(char **_home) {
2691 char line[LINE_MAX], *x, *u, *g, *h;
2692 const char *word, *state;
2693 _cleanup_free_ uid_t *uids = NULL;
2694 _cleanup_free_ char *home = NULL;
2695 _cleanup_fclose_ FILE *f = NULL;
2696 _cleanup_close_ int fd = -1;
2697 unsigned n_uids = 0;
2698 size_t sz = 0, l;
2699 uid_t uid;
2700 gid_t gid;
2701 pid_t pid;
2702 int r;
2703
2704 assert(_home);
2705
2706 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2707 /* Reset everything fully to 0, just in case */
2708
2709 if (setgroups(0, NULL) < 0) {
2710 log_error("setgroups() failed: %m");
2711 return -errno;
2712 }
2713
2714 if (setresgid(0, 0, 0) < 0) {
2715 log_error("setregid() failed: %m");
2716 return -errno;
2717 }
2718
2719 if (setresuid(0, 0, 0) < 0) {
2720 log_error("setreuid() failed: %m");
2721 return -errno;
2722 }
2723
2724 *_home = NULL;
2725 return 0;
2726 }
2727
2728 /* First, get user credentials */
2729 fd = spawn_getent("passwd", arg_user, &pid);
2730 if (fd < 0)
2731 return fd;
2732
2733 f = fdopen(fd, "r");
2734 if (!f)
2735 return log_oom();
2736 fd = -1;
2737
2738 if (!fgets(line, sizeof(line), f)) {
2739
2740 if (!ferror(f)) {
2741 log_error("Failed to resolve user %s.", arg_user);
2742 return -ESRCH;
2743 }
2744
2745 log_error("Failed to read from getent: %m");
2746 return -errno;
2747 }
2748
2749 truncate_nl(line);
2750
2751 wait_for_terminate_and_warn("getent passwd", pid);
2752
2753 x = strchr(line, ':');
2754 if (!x) {
2755 log_error("/etc/passwd entry has invalid user field.");
2756 return -EIO;
2757 }
2758
2759 u = strchr(x+1, ':');
2760 if (!u) {
2761 log_error("/etc/passwd entry has invalid password field.");
2762 return -EIO;
2763 }
2764
2765 u++;
2766 g = strchr(u, ':');
2767 if (!g) {
2768 log_error("/etc/passwd entry has invalid UID field.");
2769 return -EIO;
2770 }
2771
2772 *g = 0;
2773 g++;
2774 x = strchr(g, ':');
2775 if (!x) {
2776 log_error("/etc/passwd entry has invalid GID field.");
2777 return -EIO;
2778 }
2779
2780 *x = 0;
2781 h = strchr(x+1, ':');
2782 if (!h) {
2783 log_error("/etc/passwd entry has invalid GECOS field.");
2784 return -EIO;
2785 }
2786
2787 h++;
2788 x = strchr(h, ':');
2789 if (!x) {
2790 log_error("/etc/passwd entry has invalid home directory field.");
2791 return -EIO;
2792 }
2793
2794 *x = 0;
2795
2796 r = parse_uid(u, &uid);
2797 if (r < 0) {
2798 log_error("Failed to parse UID of user.");
2799 return -EIO;
2800 }
2801
2802 r = parse_gid(g, &gid);
2803 if (r < 0) {
2804 log_error("Failed to parse GID of user.");
2805 return -EIO;
2806 }
2807
2808 home = strdup(h);
2809 if (!home)
2810 return log_oom();
2811
2812 /* Second, get group memberships */
2813 fd = spawn_getent("initgroups", arg_user, &pid);
2814 if (fd < 0)
2815 return fd;
2816
2817 fclose(f);
2818 f = fdopen(fd, "r");
2819 if (!f)
2820 return log_oom();
2821 fd = -1;
2822
2823 if (!fgets(line, sizeof(line), f)) {
2824 if (!ferror(f)) {
2825 log_error("Failed to resolve user %s.", arg_user);
2826 return -ESRCH;
2827 }
2828
2829 log_error("Failed to read from getent: %m");
2830 return -errno;
2831 }
2832
2833 truncate_nl(line);
2834
2835 wait_for_terminate_and_warn("getent initgroups", pid);
2836
2837 /* Skip over the username and subsequent separator whitespace */
2838 x = line;
2839 x += strcspn(x, WHITESPACE);
2840 x += strspn(x, WHITESPACE);
2841
2842 FOREACH_WORD(word, l, x, state) {
2843 char c[l+1];
2844
2845 memcpy(c, word, l);
2846 c[l] = 0;
2847
2848 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2849 return log_oom();
2850
2851 r = parse_uid(c, &uids[n_uids++]);
2852 if (r < 0) {
2853 log_error("Failed to parse group data from getent.");
2854 return -EIO;
2855 }
2856 }
2857
2858 r = mkdir_parents(home, 0775);
2859 if (r < 0) {
2860 log_error_errno(-r, "Failed to make home root directory: %m");
2861 return r;
2862 }
2863
2864 r = mkdir_safe(home, 0755, uid, gid);
2865 if (r < 0 && r != -EEXIST) {
2866 log_error_errno(-r, "Failed to make home directory: %m");
2867 return r;
2868 }
2869
2870 fchown(STDIN_FILENO, uid, gid);
2871 fchown(STDOUT_FILENO, uid, gid);
2872 fchown(STDERR_FILENO, uid, gid);
2873
2874 if (setgroups(n_uids, uids) < 0) {
2875 log_error("Failed to set auxiliary groups: %m");
2876 return -errno;
2877 }
2878
2879 if (setresgid(gid, gid, gid) < 0) {
2880 log_error("setregid() failed: %m");
2881 return -errno;
2882 }
2883
2884 if (setresuid(uid, uid, uid) < 0) {
2885 log_error("setreuid() failed: %m");
2886 return -errno;
2887 }
2888
2889 if (_home) {
2890 *_home = home;
2891 home = NULL;
2892 }
2893
2894 return 0;
2895 }
2896
2897 /*
2898 * Return values:
2899 * < 0 : wait_for_terminate() failed to get the state of the
2900 * container, the container was terminated by a signal, or
2901 * failed for an unknown reason. No change is made to the
2902 * container argument.
2903 * > 0 : The program executed in the container terminated with an
2904 * error. The exit code of the program executed in the
2905 * container is returned. The container argument has been set
2906 * to CONTAINER_TERMINATED.
2907 * 0 : The container is being rebooted, has been shut down or exited
2908 * successfully. The container argument has been set to either
2909 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2910 *
2911 * That is, success is indicated by a return value of zero, and an
2912 * error is indicated by a non-zero value.
2913 */
2914 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2915 siginfo_t status;
2916 int r;
2917
2918 r = wait_for_terminate(pid, &status);
2919 if (r < 0) {
2920 log_warning_errno(-r, "Failed to wait for container: %m");
2921 return r;
2922 }
2923
2924 switch (status.si_code) {
2925
2926 case CLD_EXITED:
2927 if (status.si_status == 0) {
2928 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2929
2930 } else
2931 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2932
2933 *container = CONTAINER_TERMINATED;
2934 return status.si_status;
2935
2936 case CLD_KILLED:
2937 if (status.si_status == SIGINT) {
2938
2939 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2940 *container = CONTAINER_TERMINATED;
2941 return 0;
2942
2943 } else if (status.si_status == SIGHUP) {
2944
2945 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2946 *container = CONTAINER_REBOOTED;
2947 return 0;
2948 }
2949
2950 /* CLD_KILLED fallthrough */
2951
2952 case CLD_DUMPED:
2953 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2954 return -EIO;
2955
2956 default:
2957 log_error("Container %s failed due to unknown reason.", arg_machine);
2958 return -EIO;
2959 }
2960
2961 return r;
2962 }
2963
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
}
2965
2966 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2967 pid_t pid;
2968
2969 pid = PTR_TO_UINT32(userdata);
2970 if (pid > 0) {
2971 if (kill(pid, SIGRTMIN+3) >= 0) {
2972 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2973 sd_event_source_set_userdata(s, NULL);
2974 return 0;
2975 }
2976 }
2977
2978 sd_event_exit(sd_event_source_get_event(s), 0);
2979 return 0;
2980 }
2981
2982 int main(int argc, char *argv[]) {
2983
2984 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2985 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2986 _cleanup_close_ int master = -1, image_fd = -1;
2987 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2988 _cleanup_fdset_free_ FDSet *fds = NULL;
2989 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2990 const char *console = NULL;
2991 char veth_name[IFNAMSIZ];
2992 bool secondary = false;
2993 sigset_t mask, mask_chld;
2994 pid_t pid = 0;
2995
2996 log_parse_environment();
2997 log_open();
2998
2999 k = parse_argv(argc, argv);
3000 if (k < 0)
3001 goto finish;
3002 else if (k == 0) {
3003 r = EXIT_SUCCESS;
3004 goto finish;
3005 }
3006
3007 if (!arg_image) {
3008 if (arg_directory) {
3009 char *p;
3010
3011 p = path_make_absolute_cwd(arg_directory);
3012 free(arg_directory);
3013 arg_directory = p;
3014 } else
3015 arg_directory = get_current_dir_name();
3016
3017 if (!arg_directory) {
3018 log_error("Failed to determine path, please use -D.");
3019 goto finish;
3020 }
3021 path_kill_slashes(arg_directory);
3022 }
3023
3024 if (!arg_machine) {
3025 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
3026 if (!arg_machine) {
3027 log_oom();
3028 goto finish;
3029 }
3030
3031 hostname_cleanup(arg_machine, false);
3032 if (isempty(arg_machine)) {
3033 log_error("Failed to determine machine name automatically, please use -M.");
3034 goto finish;
3035 }
3036 }
3037
3038 if (geteuid() != 0) {
3039 log_error("Need to be root.");
3040 goto finish;
3041 }
3042
3043 if (sd_booted() <= 0) {
3044 log_error("Not running on a systemd system.");
3045 goto finish;
3046 }
3047
3048 log_close();
3049 n_fd_passed = sd_listen_fds(false);
3050 if (n_fd_passed > 0) {
3051 k = fdset_new_listen_fds(&fds, false);
3052 if (k < 0) {
3053 log_error_errno(-k, "Failed to collect file descriptors: %m");
3054 goto finish;
3055 }
3056 }
3057 fdset_close_others(fds);
3058 log_open();
3059
3060 if (arg_directory) {
3061 if (path_equal(arg_directory, "/")) {
3062 log_error("Spawning container on root directory not supported.");
3063 goto finish;
3064 }
3065
3066 if (arg_boot) {
3067 if (path_is_os_tree(arg_directory) <= 0) {
3068 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3069 goto finish;
3070 }
3071 } else {
3072 const char *p;
3073
3074 p = strappenda(arg_directory,
3075 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3076 if (access(p, F_OK) < 0) {
3077 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3078 goto finish;
3079
3080 }
3081 }
3082 } else {
3083 char template[] = "/tmp/nspawn-root-XXXXXX";
3084
3085 if (!mkdtemp(template)) {
3086 log_error("Failed to create temporary directory: %m");
3087 r = -errno;
3088 goto finish;
3089 }
3090
3091 arg_directory = strdup(template);
3092 if (!arg_directory) {
3093 r = log_oom();
3094 goto finish;
3095 }
3096
3097 image_fd = setup_image(&device_path, &loop_nr);
3098 if (image_fd < 0) {
3099 r = image_fd;
3100 goto finish;
3101 }
3102
3103 r = dissect_image(image_fd,
3104 &root_device, &root_device_rw,
3105 &home_device, &home_device_rw,
3106 &srv_device, &srv_device_rw,
3107 &secondary);
3108 if (r < 0)
3109 goto finish;
3110 }
3111
3112 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3113 if (master < 0) {
3114 log_error("Failed to acquire pseudo tty: %m");
3115 goto finish;
3116 }
3117
3118 console = ptsname(master);
3119 if (!console) {
3120 log_error("Failed to determine tty name: %m");
3121 goto finish;
3122 }
3123
3124 if (!arg_quiet)
3125 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3126 arg_machine, arg_image ? arg_image : arg_directory);
3127
3128 if (unlockpt(master) < 0) {
3129 log_error("Failed to unlock tty: %m");
3130 goto finish;
3131 }
3132
3133 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3134 log_error("Failed to create kmsg socket pair: %m");
3135 goto finish;
3136 }
3137
3138 sd_notify(false,
3139 "READY=1\n"
3140 "STATUS=Container running.");
3141
3142 assert_se(sigemptyset(&mask) == 0);
3143 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3144 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3145
3146 assert_se(sigemptyset(&mask_chld) == 0);
3147 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3148
3149 for (;;) {
3150 ContainerStatus container_status;
3151 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3152 struct sigaction sa = {
3153 .sa_handler = nop_handler,
3154 .sa_flags = SA_NOCLDSTOP,
3155 };
3156
3157 r = barrier_create(&barrier);
3158 if (r < 0) {
3159 log_error_errno(-r, "Cannot initialize IPC barrier: %m");
3160 goto finish;
3161 }
3162
3163 /* Child can be killed before execv(), so handle SIGCHLD
3164 * in order to interrupt parent's blocking calls and
3165 * give it a chance to call wait() and terminate. */
3166 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3167 if (r < 0) {
3168 log_error("Failed to change the signal mask: %m");
3169 goto finish;
3170 }
3171
3172 r = sigaction(SIGCHLD, &sa, NULL);
3173 if (r < 0) {
3174 log_error("Failed to install SIGCHLD handler: %m");
3175 goto finish;
3176 }
3177
3178 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3179 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3180 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3181 if (pid < 0) {
3182 if (errno == EINVAL)
3183 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3184 else
3185 log_error("clone() failed: %m");
3186
3187 r = pid;
3188 goto finish;
3189 }
3190
3191 if (pid == 0) {
3192 /* child */
3193 _cleanup_free_ char *home = NULL;
3194 unsigned n_env = 2;
3195 const char *envp[] = {
3196 "PATH=" DEFAULT_PATH_SPLIT_USR,
3197 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3198 NULL, /* TERM */
3199 NULL, /* HOME */
3200 NULL, /* USER */
3201 NULL, /* LOGNAME */
3202 NULL, /* container_uuid */
3203 NULL, /* LISTEN_FDS */
3204 NULL, /* LISTEN_PID */
3205 NULL
3206 };
3207 char **env_use;
3208
3209 barrier_set_role(&barrier, BARRIER_CHILD);
3210
3211 envp[n_env] = strv_find_prefix(environ, "TERM=");
3212 if (envp[n_env])
3213 n_env ++;
3214
3215 master = safe_close(master);
3216
3217 close_nointr(STDIN_FILENO);
3218 close_nointr(STDOUT_FILENO);
3219 close_nointr(STDERR_FILENO);
3220
3221 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3222
3223 reset_all_signal_handlers();
3224 reset_signal_mask();
3225
3226 k = open_terminal(console, O_RDWR);
3227 if (k != STDIN_FILENO) {
3228 if (k >= 0) {
3229 safe_close(k);
3230 k = -EINVAL;
3231 }
3232
3233 log_error_errno(-k, "Failed to open console: %m");
3234 _exit(EXIT_FAILURE);
3235 }
3236
3237 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3238 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3239 log_error("Failed to duplicate console: %m");
3240 _exit(EXIT_FAILURE);
3241 }
3242
3243 if (setsid() < 0) {
3244 log_error("setsid() failed: %m");
3245 _exit(EXIT_FAILURE);
3246 }
3247
3248 if (reset_audit_loginuid() < 0)
3249 _exit(EXIT_FAILURE);
3250
3251 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3252 log_error("PR_SET_PDEATHSIG failed: %m");
3253 _exit(EXIT_FAILURE);
3254 }
3255
3256 /* Mark everything as slave, so that we still
3257 * receive mounts from the real root, but don't
3258 * propagate mounts to the real root. */
3259 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3260 log_error("MS_SLAVE|MS_REC failed: %m");
3261 _exit(EXIT_FAILURE);
3262 }
3263
3264 if (mount_devices(arg_directory,
3265 root_device, root_device_rw,
3266 home_device, home_device_rw,
3267 srv_device, srv_device_rw) < 0)
3268 _exit(EXIT_FAILURE);
3269
3270 /* Turn directory into bind mount */
3271 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3272 log_error("Failed to make bind mount: %m");
3273 _exit(EXIT_FAILURE);
3274 }
3275
3276 r = setup_volatile(arg_directory);
3277 if (r < 0)
3278 _exit(EXIT_FAILURE);
3279
3280 if (setup_volatile_state(arg_directory) < 0)
3281 _exit(EXIT_FAILURE);
3282
3283 r = base_filesystem_create(arg_directory);
3284 if (r < 0)
3285 _exit(EXIT_FAILURE);
3286
3287 if (arg_read_only) {
3288 k = bind_remount_recursive(arg_directory, true);
3289 if (k < 0) {
3290 log_error_errno(-k, "Failed to make tree read-only: %m");
3291 _exit(EXIT_FAILURE);
3292 }
3293 }
3294
3295 if (mount_all(arg_directory) < 0)
3296 _exit(EXIT_FAILURE);
3297
3298 if (copy_devnodes(arg_directory) < 0)
3299 _exit(EXIT_FAILURE);
3300
3301 if (setup_ptmx(arg_directory) < 0)
3302 _exit(EXIT_FAILURE);
3303
3304 dev_setup(arg_directory);
3305
3306 if (setup_seccomp() < 0)
3307 _exit(EXIT_FAILURE);
3308
3309 if (setup_dev_console(arg_directory, console) < 0)
3310 _exit(EXIT_FAILURE);
3311
3312 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3313 _exit(EXIT_FAILURE);
3314
3315 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3316
3317 if (setup_boot_id(arg_directory) < 0)
3318 _exit(EXIT_FAILURE);
3319
3320 if (setup_timezone(arg_directory) < 0)
3321 _exit(EXIT_FAILURE);
3322
3323 if (setup_resolv_conf(arg_directory) < 0)
3324 _exit(EXIT_FAILURE);
3325
3326 if (setup_journal(arg_directory) < 0)
3327 _exit(EXIT_FAILURE);
3328
3329 if (mount_binds(arg_directory, arg_bind, false) < 0)
3330 _exit(EXIT_FAILURE);
3331
3332 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3333 _exit(EXIT_FAILURE);
3334
3335 if (mount_tmpfs(arg_directory) < 0)
3336 _exit(EXIT_FAILURE);
3337
3338 /* Tell the parent that we are ready, and that
3339 * it can cgroupify us to that we lack access
3340 * to certain devices and resources. */
3341 (void)barrier_place(&barrier);
3342
3343 if (chdir(arg_directory) < 0) {
3344 log_error("chdir(%s) failed: %m", arg_directory);
3345 _exit(EXIT_FAILURE);
3346 }
3347
3348 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3349 log_error("mount(MS_MOVE) failed: %m");
3350 _exit(EXIT_FAILURE);
3351 }
3352
3353 if (chroot(".") < 0) {
3354 log_error("chroot() failed: %m");
3355 _exit(EXIT_FAILURE);
3356 }
3357
3358 if (chdir("/") < 0) {
3359 log_error("chdir() failed: %m");
3360 _exit(EXIT_FAILURE);
3361 }
3362
3363 umask(0022);
3364
3365 if (arg_private_network)
3366 loopback_setup();
3367
3368 if (drop_capabilities() < 0) {
3369 log_error("drop_capabilities() failed: %m");
3370 _exit(EXIT_FAILURE);
3371 }
3372
3373 r = change_uid_gid(&home);
3374 if (r < 0)
3375 _exit(EXIT_FAILURE);
3376
3377 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3378 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3379 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3380 log_oom();
3381 _exit(EXIT_FAILURE);
3382 }
3383
3384 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3385 char as_uuid[37];
3386
3387 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3388 log_oom();
3389 _exit(EXIT_FAILURE);
3390 }
3391 }
3392
3393 if (fdset_size(fds) > 0) {
3394 k = fdset_cloexec(fds, false);
3395 if (k < 0) {
3396 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3397 _exit(EXIT_FAILURE);
3398 }
3399
3400 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3401 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3402 log_oom();
3403 _exit(EXIT_FAILURE);
3404 }
3405 }
3406
3407 setup_hostname();
3408
3409 if (arg_personality != 0xffffffffLU) {
3410 if (personality(arg_personality) < 0) {
3411 log_error("personality() failed: %m");
3412 _exit(EXIT_FAILURE);
3413 }
3414 } else if (secondary) {
3415 if (personality(PER_LINUX32) < 0) {
3416 log_error("personality() failed: %m");
3417 _exit(EXIT_FAILURE);
3418 }
3419 }
3420
3421 #ifdef HAVE_SELINUX
3422 if (arg_selinux_context)
3423 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3424 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3425 _exit(EXIT_FAILURE);
3426 }
3427 #endif
3428
3429 if (!strv_isempty(arg_setenv)) {
3430 char **n;
3431
3432 n = strv_env_merge(2, envp, arg_setenv);
3433 if (!n) {
3434 log_oom();
3435 _exit(EXIT_FAILURE);
3436 }
3437
3438 env_use = n;
3439 } else
3440 env_use = (char**) envp;
3441
3442 /* Wait until the parent is ready with the setup, too... */
3443 if (!barrier_place_and_sync(&barrier))
3444 _exit(EXIT_FAILURE);
3445
3446 if (arg_boot) {
3447 char **a;
3448 size_t l;
3449
3450 /* Automatically search for the init system */
3451
3452 l = 1 + argc - optind;
3453 a = newa(char*, l + 1);
3454 memcpy(a + 1, argv + optind, l * sizeof(char*));
3455
3456 a[0] = (char*) "/usr/lib/systemd/systemd";
3457 execve(a[0], a, env_use);
3458
3459 a[0] = (char*) "/lib/systemd/systemd";
3460 execve(a[0], a, env_use);
3461
3462 a[0] = (char*) "/sbin/init";
3463 execve(a[0], a, env_use);
3464 } else if (argc > optind)
3465 execvpe(argv[optind], argv + optind, env_use);
3466 else {
3467 chdir(home ? home : "/root");
3468 execle("/bin/bash", "-bash", NULL, env_use);
3469 execle("/bin/sh", "-sh", NULL, env_use);
3470 }
3471
3472 log_error("execv() failed: %m");
3473 _exit(EXIT_FAILURE);
3474 }
3475
3476 barrier_set_role(&barrier, BARRIER_PARENT);
3477 fdset_free(fds);
3478 fds = NULL;
3479
3480 /* wait for child-setup to be done */
3481 if (barrier_place_and_sync(&barrier)) {
3482 _cleanup_event_unref_ sd_event *event = NULL;
3483 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3484 int ifi = 0;
3485
3486 r = move_network_interfaces(pid);
3487 if (r < 0)
3488 goto finish;
3489
3490 r = setup_veth(pid, veth_name, &ifi);
3491 if (r < 0)
3492 goto finish;
3493
3494 r = setup_bridge(veth_name, &ifi);
3495 if (r < 0)
3496 goto finish;
3497
3498 r = setup_macvlan(pid);
3499 if (r < 0)
3500 goto finish;
3501
3502 r = register_machine(pid, ifi);
3503 if (r < 0)
3504 goto finish;
3505
3506 /* Block SIGCHLD here, before notifying child.
3507 * process_pty() will handle it with the other signals. */
3508 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3509 if (r < 0)
3510 goto finish;
3511
3512 /* Reset signal to default */
3513 r = default_signals(SIGCHLD, -1);
3514 if (r < 0)
3515 goto finish;
3516
3517 /* Notify the child that the parent is ready with all
3518 * its setup, and that the child can now hand over
3519 * control to the code to run inside the container. */
3520 (void)barrier_place(&barrier);
3521
3522 r = sd_event_new(&event);
3523 if (r < 0) {
3524 log_error_errno(-r, "Failed to get default event source: %m");
3525 goto finish;
3526 }
3527
3528 if (arg_boot) {
3529 /* Try to kill the init system on SIGINT or SIGTERM */
3530 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3531 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3532 } else {
3533 /* Immediately exit */
3534 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3535 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3536 }
3537
3538 /* simply exit on sigchld */
3539 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3540
3541 r = pty_forward_new(event, master, &forward);
3542 if (r < 0) {
3543 log_error_errno(-r, "Failed to create PTY forwarder: %m");
3544 goto finish;
3545 }
3546
3547 r = sd_event_loop(event);
3548 if (r < 0) {
3549 log_error_errno(-r, "Failed to run event loop: %m");
3550 return r;
3551 }
3552
3553 forward = pty_forward_free(forward);
3554
3555 if (!arg_quiet)
3556 putc('\n', stdout);
3557
3558 /* Kill if it is not dead yet anyway */
3559 terminate_machine(pid);
3560 }
3561
3562 /* Normally redundant, but better safe than sorry */
3563 kill(pid, SIGKILL);
3564
3565 r = wait_for_container(pid, &container_status);
3566 pid = 0;
3567
3568 if (r < 0) {
3569 /* We failed to wait for the container, or the
3570 * container exited abnormally */
3571 r = EXIT_FAILURE;
3572 break;
3573 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3574 /* The container exited with a non-zero
3575 * status, or with zero status and no reboot
3576 * was requested. */
3577 break;
3578
3579 /* CONTAINER_REBOOTED, loop again */
3580
3581 if (arg_keep_unit) {
3582 /* Special handling if we are running as a
3583 * service: instead of simply restarting the
3584 * machine we want to restart the entire
3585 * service, so let's inform systemd about this
3586 * with the special exit code 133. The service
3587 * file uses RestartForceExitStatus=133 so
3588 * that this results in a full nspawn
3589 * restart. This is necessary since we might
3590 * have cgroup parameters set we want to have
3591 * flushed out. */
3592 r = 133;
3593 break;
3594 }
3595 }
3596
3597 finish:
3598 sd_notify(false,
3599 "STOPPING=1\n"
3600 "STATUS=Terminating...");
3601
3602 loop_remove(loop_nr, &image_fd);
3603
3604 if (pid > 0)
3605 kill(pid, SIGKILL);
3606
3607 free(arg_directory);
3608 free(arg_machine);
3609 free(arg_user);
3610 strv_free(arg_setenv);
3611 strv_free(arg_network_interfaces);
3612 strv_free(arg_network_macvlan);
3613 strv_free(arg_bind);
3614 strv_free(arg_bind_ro);
3615 strv_free(arg_tmpfs);
3616
3617 return r;
3618 }