]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: Allow module loading if CAP_SYS_MODULE is requested
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
/* One --port= mapping, i.e. one port exposed on the host for the container.
 * Entries are filled in by the 'p' case of parse_argv() and chained into
 * arg_expose_ports. */
typedef struct ExposePort {
        int protocol;            /* IPPROTO_TCP or IPPROTO_UDP, as chosen while parsing --port= */
        uint16_t host_port;      /* port number on the host side */
        uint16_t container_port; /* port number inside the container */
        LIST_FIELDS(struct ExposePort, ports); /* list linkage under the name "ports" */
} ExposePort;
113
/* How a container run ended.
 * NOTE(review): the users of this enum are outside the visible part of the
 * file; presumably it distinguishes a final exit from a reboot request —
 * confirm against the caller. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
118
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of the option map to the same values and additionally set
 * arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,    /* --link-journal=no */
        LINK_AUTO,  /* --link-journal=auto (the default value of arg_link_journal) */
        LINK_HOST,  /* --link-journal=host / try-host */
        LINK_GUEST  /* --link-journal=guest / try-guest */
} LinkJournal;
125
/* Mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,    /* option absent, or given with a false boolean */
        VOLATILE_YES,   /* option given without argument, or with a true boolean */
        VOLATILE_STATE, /* --volatile=state */
} Volatile;
131
/* Command line state. Everything below is filled in by parse_argv(); the
 * initializers are the defaults that apply when the respective option is
 * not given. */
static char *arg_directory = NULL;                    /* -D/--directory= (sanitized path) */
static char *arg_template = NULL;                     /* --template= (sanitized path) */
static char *arg_user = NULL;                         /* -u/--user= */
static sd_id128_t arg_uuid = {};                      /* --uuid= */
static char *arg_machine = NULL;                      /* -M/--machine= */
static const char *arg_selinux_context = NULL;        /* -Z/--selinux-context=, points into argv */
static const char *arg_selinux_apifs_context = NULL;  /* -L/--selinux-apifs-context=, points into argv */
static const char *arg_slice = NULL;                  /* -S/--slice=, points into argv */
static bool arg_private_network = false;              /* --private-network, also implied by the --network-* options */
static bool arg_read_only = false;                    /* --read-only */
static bool arg_boot = false;                         /* -b/--boot */
static bool arg_ephemeral = false;                    /* -x/--ephemeral */
static LinkJournal arg_link_journal = LINK_AUTO;      /* --link-journal= / -j */
static bool arg_link_journal_try = false;             /* set by the "try-" variants of --link-journal= and by -j */
/* Capabilities retained by default. parse_argv() adds those named by
 * --capability=, adds CAP_NET_ADMIN when private networking is enabled, and
 * finally removes those named by --drop-capability=. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
static char **arg_bind = NULL;                        /* --bind=: flat list of (source, destination) pairs */
static char **arg_bind_ro = NULL;                     /* --bind-ro=: same layout as arg_bind */
static char **arg_tmpfs = NULL;                       /* --tmpfs=: flat list of (path, mount options) pairs */
static char **arg_setenv = NULL;                      /* --setenv= assignments */
static bool arg_quiet = false;                        /* -q/--quiet */
static bool arg_share_system = false;                 /* --share-system */
static bool arg_register = true;                      /* --register=; forced to false by --share-system */
static bool arg_keep_unit = false;                    /* --keep-unit */
static char **arg_network_interfaces = NULL;          /* --network-interface= */
static char **arg_network_macvlan = NULL;             /* --network-macvlan= */
static char **arg_network_ipvlan = NULL;              /* --network-ipvlan= */
static bool arg_network_veth = false;                 /* -n/--network-veth, also set by --network-bridge= */
static const char *arg_network_bridge = NULL;         /* --network-bridge=, points into argv */
/* --personality=; parse_argv() treats 0xffffffffLU coming back from
 * personality_from_string() as "unknown personality", so the default here is
 * presumably the "not set" marker — TODO confirm against the code applying it. */
static unsigned long arg_personality = 0xffffffffLU;
static char *arg_image = NULL;                        /* -i/--image= (sanitized path) */
static Volatile arg_volatile = VOLATILE_NO;           /* --volatile[=MODE] */
static ExposePort *arg_expose_ports = NULL;           /* -p/--port= entries, newest first (LIST_PREPEND) */
190
191 static void help(void) {
192 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
193 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
194 " -h --help Show this help\n"
195 " --version Print version string\n"
196 " -q --quiet Do not show status information\n"
197 " -D --directory=PATH Root directory for the container\n"
198 " --template=PATH Initialize root directory from template directory,\n"
199 " if missing\n"
200 " -x --ephemeral Run container with snapshot of root directory, and\n"
201 " remove it after exit\n"
202 " -i --image=PATH File system device or disk image for the container\n"
203 " -b --boot Boot up full system (i.e. invoke init)\n"
204 " -u --user=USER Run the command under specified user or uid\n"
205 " -M --machine=NAME Set the machine name for the container\n"
206 " --uuid=UUID Set a specific machine UUID for the container\n"
207 " -S --slice=SLICE Place the container in the specified slice\n"
208 " --private-network Disable network in container\n"
209 " --network-interface=INTERFACE\n"
210 " Assign an existing network interface to the\n"
211 " container\n"
212 " --network-macvlan=INTERFACE\n"
213 " Create a macvlan network interface based on an\n"
214 " existing network interface to the container\n"
215 " --network-ipvlan=INTERFACE\n"
216 " Create a ipvlan network interface based on an\n"
217 " existing network interface to the container\n"
218 " -n --network-veth Add a virtual ethernet connection between host\n"
219 " and container\n"
220 " --network-bridge=INTERFACE\n"
221 " Add a virtual ethernet connection between host\n"
222 " and container and add it to an existing bridge on\n"
223 " the host\n"
224 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225 " Expose a container IP port on the host\n"
226 " -Z --selinux-context=SECLABEL\n"
227 " Set the SELinux security context to be used by\n"
228 " processes in the container\n"
229 " -L --selinux-apifs-context=SECLABEL\n"
230 " Set the SELinux security context to be used by\n"
231 " API/tmpfs file systems in the container\n"
232 " --capability=CAP In addition to the default, retain specified\n"
233 " capability\n"
234 " --drop-capability=CAP Drop the specified capability from the default set\n"
235 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
236 " try-guest, try-host\n"
237 " -j Equivalent to --link-journal=try-guest\n"
238 " --read-only Mount the root directory read-only\n"
239 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
240 " the container\n"
241 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
242 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
243 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
244 " --share-system Share system namespaces with host\n"
245 " --register=BOOLEAN Register container as machine\n"
246 " --keep-unit Do not register a scope for the machine, reuse\n"
247 " the service unit nspawn is running in\n"
248 " --volatile[=MODE] Run the system in volatile mode\n"
249 , program_invocation_short_name);
250 }
251
/* Replaces *b with a cleaned-up, newly allocated version of path.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT) it is
 * merely made absolute relative to the current working directory. The result
 * is passed through path_kill_slashes() before being stored, and the previous
 * value of *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization failed for any reason
 * other than ENOENT, and -ENOMEM if the fallback allocation failed. On
 * failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Nothing there (yet): settle for an absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
273 static int parse_argv(int argc, char *argv[]) {
274
275 enum {
276 ARG_VERSION = 0x100,
277 ARG_PRIVATE_NETWORK,
278 ARG_UUID,
279 ARG_READ_ONLY,
280 ARG_CAPABILITY,
281 ARG_DROP_CAPABILITY,
282 ARG_LINK_JOURNAL,
283 ARG_BIND,
284 ARG_BIND_RO,
285 ARG_TMPFS,
286 ARG_SETENV,
287 ARG_SHARE_SYSTEM,
288 ARG_REGISTER,
289 ARG_KEEP_UNIT,
290 ARG_NETWORK_INTERFACE,
291 ARG_NETWORK_MACVLAN,
292 ARG_NETWORK_IPVLAN,
293 ARG_NETWORK_BRIDGE,
294 ARG_PERSONALITY,
295 ARG_VOLATILE,
296 ARG_TEMPLATE,
297 };
298
299 static const struct option options[] = {
300 { "help", no_argument, NULL, 'h' },
301 { "version", no_argument, NULL, ARG_VERSION },
302 { "directory", required_argument, NULL, 'D' },
303 { "template", required_argument, NULL, ARG_TEMPLATE },
304 { "ephemeral", no_argument, NULL, 'x' },
305 { "user", required_argument, NULL, 'u' },
306 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
307 { "boot", no_argument, NULL, 'b' },
308 { "uuid", required_argument, NULL, ARG_UUID },
309 { "read-only", no_argument, NULL, ARG_READ_ONLY },
310 { "capability", required_argument, NULL, ARG_CAPABILITY },
311 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
312 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
313 { "bind", required_argument, NULL, ARG_BIND },
314 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
315 { "tmpfs", required_argument, NULL, ARG_TMPFS },
316 { "machine", required_argument, NULL, 'M' },
317 { "slice", required_argument, NULL, 'S' },
318 { "setenv", required_argument, NULL, ARG_SETENV },
319 { "selinux-context", required_argument, NULL, 'Z' },
320 { "selinux-apifs-context", required_argument, NULL, 'L' },
321 { "quiet", no_argument, NULL, 'q' },
322 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
323 { "register", required_argument, NULL, ARG_REGISTER },
324 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
325 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
326 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
327 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
328 { "network-veth", no_argument, NULL, 'n' },
329 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
330 { "personality", required_argument, NULL, ARG_PERSONALITY },
331 { "image", required_argument, NULL, 'i' },
332 { "volatile", optional_argument, NULL, ARG_VOLATILE },
333 { "port", required_argument, NULL, 'p' },
334 {}
335 };
336
337 int c, r;
338 uint64_t plus = 0, minus = 0;
339
340 assert(argc >= 0);
341 assert(argv);
342
343 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
344
345 switch (c) {
346
347 case 'h':
348 help();
349 return 0;
350
351 case ARG_VERSION:
352 puts(PACKAGE_STRING);
353 puts(SYSTEMD_FEATURES);
354 return 0;
355
356 case 'D':
357 r = set_sanitized_path(&arg_directory, optarg);
358 if (r < 0)
359 return log_error_errno(r, "Invalid root directory: %m");
360
361 break;
362
363 case ARG_TEMPLATE:
364 r = set_sanitized_path(&arg_template, optarg);
365 if (r < 0)
366 return log_error_errno(r, "Invalid template directory: %m");
367
368 break;
369
370 case 'i':
371 r = set_sanitized_path(&arg_image, optarg);
372 if (r < 0)
373 return log_error_errno(r, "Invalid image path: %m");
374
375 break;
376
377 case 'x':
378 arg_ephemeral = true;
379 break;
380
381 case 'u':
382 free(arg_user);
383 arg_user = strdup(optarg);
384 if (!arg_user)
385 return log_oom();
386
387 break;
388
389 case ARG_NETWORK_BRIDGE:
390 arg_network_bridge = optarg;
391
392 /* fall through */
393
394 case 'n':
395 arg_network_veth = true;
396 arg_private_network = true;
397 break;
398
399 case ARG_NETWORK_INTERFACE:
400 if (strv_extend(&arg_network_interfaces, optarg) < 0)
401 return log_oom();
402
403 arg_private_network = true;
404 break;
405
406 case ARG_NETWORK_MACVLAN:
407 if (strv_extend(&arg_network_macvlan, optarg) < 0)
408 return log_oom();
409
410 arg_private_network = true;
411 break;
412
413 case ARG_NETWORK_IPVLAN:
414 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
415 return log_oom();
416
417 /* fall through */
418
419 case ARG_PRIVATE_NETWORK:
420 arg_private_network = true;
421 break;
422
423 case 'b':
424 arg_boot = true;
425 break;
426
427 case ARG_UUID:
428 r = sd_id128_from_string(optarg, &arg_uuid);
429 if (r < 0) {
430 log_error("Invalid UUID: %s", optarg);
431 return r;
432 }
433 break;
434
435 case 'S':
436 arg_slice = optarg;
437 break;
438
439 case 'M':
440 if (isempty(optarg)) {
441 free(arg_machine);
442 arg_machine = NULL;
443 } else {
444 if (!machine_name_is_valid(optarg)) {
445 log_error("Invalid machine name: %s", optarg);
446 return -EINVAL;
447 }
448
449 r = free_and_strdup(&arg_machine, optarg);
450 if (r < 0)
451 return log_oom();
452
453 break;
454 }
455
456 case 'Z':
457 arg_selinux_context = optarg;
458 break;
459
460 case 'L':
461 arg_selinux_apifs_context = optarg;
462 break;
463
464 case ARG_READ_ONLY:
465 arg_read_only = true;
466 break;
467
468 case ARG_CAPABILITY:
469 case ARG_DROP_CAPABILITY: {
470 const char *state, *word;
471 size_t length;
472
473 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
474 _cleanup_free_ char *t;
475
476 t = strndup(word, length);
477 if (!t)
478 return log_oom();
479
480 if (streq(t, "all")) {
481 if (c == ARG_CAPABILITY)
482 plus = (uint64_t) -1;
483 else
484 minus = (uint64_t) -1;
485 } else {
486 int cap;
487
488 cap = capability_from_name(t);
489 if (cap < 0) {
490 log_error("Failed to parse capability %s.", t);
491 return -EINVAL;
492 }
493
494 if (c == ARG_CAPABILITY)
495 plus |= 1ULL << (uint64_t) cap;
496 else
497 minus |= 1ULL << (uint64_t) cap;
498 }
499 }
500
501 break;
502 }
503
504 case 'j':
505 arg_link_journal = LINK_GUEST;
506 arg_link_journal_try = true;
507 break;
508
509 case ARG_LINK_JOURNAL:
510 if (streq(optarg, "auto")) {
511 arg_link_journal = LINK_AUTO;
512 arg_link_journal_try = false;
513 } else if (streq(optarg, "no")) {
514 arg_link_journal = LINK_NO;
515 arg_link_journal_try = false;
516 } else if (streq(optarg, "guest")) {
517 arg_link_journal = LINK_GUEST;
518 arg_link_journal_try = false;
519 } else if (streq(optarg, "host")) {
520 arg_link_journal = LINK_HOST;
521 arg_link_journal_try = false;
522 } else if (streq(optarg, "try-guest")) {
523 arg_link_journal = LINK_GUEST;
524 arg_link_journal_try = true;
525 } else if (streq(optarg, "try-host")) {
526 arg_link_journal = LINK_HOST;
527 arg_link_journal_try = true;
528 } else {
529 log_error("Failed to parse link journal mode %s", optarg);
530 return -EINVAL;
531 }
532
533 break;
534
535 case ARG_BIND:
536 case ARG_BIND_RO: {
537 _cleanup_free_ char *a = NULL, *b = NULL;
538 char *e;
539 char ***x;
540
541 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
542
543 e = strchr(optarg, ':');
544 if (e) {
545 a = strndup(optarg, e - optarg);
546 b = strdup(e + 1);
547 } else {
548 a = strdup(optarg);
549 b = strdup(optarg);
550 }
551
552 if (!a || !b)
553 return log_oom();
554
555 if (!path_is_absolute(a) || !path_is_absolute(b)) {
556 log_error("Invalid bind mount specification: %s", optarg);
557 return -EINVAL;
558 }
559
560 r = strv_extend(x, a);
561 if (r < 0)
562 return log_oom();
563
564 r = strv_extend(x, b);
565 if (r < 0)
566 return log_oom();
567
568 break;
569 }
570
571 case ARG_TMPFS: {
572 _cleanup_free_ char *a = NULL, *b = NULL;
573 char *e;
574
575 e = strchr(optarg, ':');
576 if (e) {
577 a = strndup(optarg, e - optarg);
578 b = strdup(e + 1);
579 } else {
580 a = strdup(optarg);
581 b = strdup("mode=0755");
582 }
583
584 if (!a || !b)
585 return log_oom();
586
587 if (!path_is_absolute(a)) {
588 log_error("Invalid tmpfs specification: %s", optarg);
589 return -EINVAL;
590 }
591
592 r = strv_push(&arg_tmpfs, a);
593 if (r < 0)
594 return log_oom();
595
596 a = NULL;
597
598 r = strv_push(&arg_tmpfs, b);
599 if (r < 0)
600 return log_oom();
601
602 b = NULL;
603
604 break;
605 }
606
607 case ARG_SETENV: {
608 char **n;
609
610 if (!env_assignment_is_valid(optarg)) {
611 log_error("Environment variable assignment '%s' is not valid.", optarg);
612 return -EINVAL;
613 }
614
615 n = strv_env_set(arg_setenv, optarg);
616 if (!n)
617 return log_oom();
618
619 strv_free(arg_setenv);
620 arg_setenv = n;
621 break;
622 }
623
624 case 'q':
625 arg_quiet = true;
626 break;
627
628 case ARG_SHARE_SYSTEM:
629 arg_share_system = true;
630 break;
631
632 case ARG_REGISTER:
633 r = parse_boolean(optarg);
634 if (r < 0) {
635 log_error("Failed to parse --register= argument: %s", optarg);
636 return r;
637 }
638
639 arg_register = r;
640 break;
641
642 case ARG_KEEP_UNIT:
643 arg_keep_unit = true;
644 break;
645
646 case ARG_PERSONALITY:
647
648 arg_personality = personality_from_string(optarg);
649 if (arg_personality == 0xffffffffLU) {
650 log_error("Unknown or unsupported personality '%s'.", optarg);
651 return -EINVAL;
652 }
653
654 break;
655
656 case ARG_VOLATILE:
657
658 if (!optarg)
659 arg_volatile = VOLATILE_YES;
660 else {
661 r = parse_boolean(optarg);
662 if (r < 0) {
663 if (streq(optarg, "state"))
664 arg_volatile = VOLATILE_STATE;
665 else {
666 log_error("Failed to parse --volatile= argument: %s", optarg);
667 return r;
668 }
669 } else
670 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
671 }
672
673 break;
674
675 case 'p': {
676 const char *split, *e;
677 uint16_t container_port, host_port;
678 int protocol;
679 ExposePort *p;
680
681 if ((e = startswith(optarg, "tcp:")))
682 protocol = IPPROTO_TCP;
683 else if ((e = startswith(optarg, "udp:")))
684 protocol = IPPROTO_UDP;
685 else {
686 e = optarg;
687 protocol = IPPROTO_TCP;
688 }
689
690 split = strchr(e, ':');
691 if (split) {
692 char v[split - e + 1];
693
694 memcpy(v, e, split - e);
695 v[split - e] = 0;
696
697 r = safe_atou16(v, &host_port);
698 if (r < 0 || host_port <= 0) {
699 log_error("Failed to parse host port: %s", optarg);
700 return -EINVAL;
701 }
702
703 r = safe_atou16(split + 1, &container_port);
704 } else {
705 r = safe_atou16(e, &container_port);
706 host_port = container_port;
707 }
708
709 if (r < 0 || container_port <= 0) {
710 log_error("Failed to parse host port: %s", optarg);
711 return -EINVAL;
712 }
713
714 LIST_FOREACH(ports, p, arg_expose_ports) {
715 if (p->protocol == protocol && p->host_port == host_port) {
716 log_error("Duplicate port specification: %s", optarg);
717 return -EINVAL;
718 }
719 }
720
721 p = new(ExposePort, 1);
722 if (!p)
723 return log_oom();
724
725 p->protocol = protocol;
726 p->host_port = host_port;
727 p->container_port = container_port;
728
729 LIST_PREPEND(ports, arg_expose_ports, p);
730
731 break;
732 }
733
734 case '?':
735 return -EINVAL;
736
737 default:
738 assert_not_reached("Unhandled option");
739 }
740
741 if (arg_share_system)
742 arg_register = false;
743
744 if (arg_boot && arg_share_system) {
745 log_error("--boot and --share-system may not be combined.");
746 return -EINVAL;
747 }
748
749 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750 log_error("--keep-unit may not be used when invoked from a user session.");
751 return -EINVAL;
752 }
753
754 if (arg_directory && arg_image) {
755 log_error("--directory= and --image= may not be combined.");
756 return -EINVAL;
757 }
758
759 if (arg_template && arg_image) {
760 log_error("--template= and --image= may not be combined.");
761 return -EINVAL;
762 }
763
764 if (arg_template && !(arg_directory || arg_machine)) {
765 log_error("--template= needs --directory= or --machine=.");
766 return -EINVAL;
767 }
768
769 if (arg_ephemeral && arg_template) {
770 log_error("--ephemeral and --template= may not be combined.");
771 return -EINVAL;
772 }
773
774 if (arg_ephemeral && arg_image) {
775 log_error("--ephemeral and --image= may not be combined.");
776 return -EINVAL;
777 }
778
779 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780 log_error("--ephemeral and --link-journal= may not be combined.");
781 return -EINVAL;
782 }
783
784 if (arg_volatile != VOLATILE_NO && arg_read_only) {
785 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
786 return -EINVAL;
787 }
788
789 if (arg_expose_ports && !arg_private_network) {
790 log_error("Cannot use --port= without private networking.");
791 return -EINVAL;
792 }
793
794 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
795
796 return 1;
797 }
798
799 static int mount_all(const char *dest) {
800
801 typedef struct MountPoint {
802 const char *what;
803 const char *where;
804 const char *type;
805 const char *options;
806 unsigned long flags;
807 bool fatal;
808 } MountPoint;
809
810 static const MountPoint mount_table[] = {
811 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
812 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
813 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
814 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
815 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
816 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
817 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
818 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
819 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
820 #ifdef HAVE_SELINUX
821 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
822 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
823 #endif
824 };
825
826 unsigned k;
827 int r = 0;
828
829 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
830 _cleanup_free_ char *where = NULL;
831 #ifdef HAVE_SELINUX
832 _cleanup_free_ char *options = NULL;
833 #endif
834 const char *o;
835 int t;
836
837 where = strjoin(dest, "/", mount_table[k].where, NULL);
838 if (!where)
839 return log_oom();
840
841 t = path_is_mount_point(where, true);
842 if (t < 0) {
843 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
844
845 if (r == 0)
846 r = t;
847
848 continue;
849 }
850
851 /* Skip this entry if it is not a remount. */
852 if (mount_table[k].what && t > 0)
853 continue;
854
855 t = mkdir_p(where, 0755);
856 if (t < 0) {
857 if (mount_table[k].fatal) {
858 log_error_errno(t, "Failed to create directory %s: %m", where);
859
860 if (r == 0)
861 r = t;
862 } else
863 log_warning_errno(t, "Failed to create directory %s: %m", where);
864
865 continue;
866 }
867
868 #ifdef HAVE_SELINUX
869 if (arg_selinux_apifs_context &&
870 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
871 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
872 if (!options)
873 return log_oom();
874
875 o = options;
876 } else
877 #endif
878 o = mount_table[k].options;
879
880
881 if (mount(mount_table[k].what,
882 where,
883 mount_table[k].type,
884 mount_table[k].flags,
885 o) < 0) {
886
887 if (mount_table[k].fatal) {
888 log_error_errno(errno, "mount(%s) failed: %m", where);
889
890 if (r == 0)
891 r = -errno;
892 } else
893 log_warning_errno(errno, "mount(%s) failed: %m", where);
894 }
895 }
896
897 return r;
898 }
899
900 static int mount_binds(const char *dest, char **l, bool ro) {
901 char **x, **y;
902
903 STRV_FOREACH_PAIR(x, y, l) {
904 _cleanup_free_ char *where = NULL;
905 struct stat source_st, dest_st;
906 int r;
907
908 if (stat(*x, &source_st) < 0)
909 return log_error_errno(errno, "Failed to stat %s: %m", *x);
910
911 where = strappend(dest, *y);
912 if (!where)
913 return log_oom();
914
915 r = stat(where, &dest_st);
916 if (r == 0) {
917 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
918 log_error("Cannot bind mount directory %s on file %s.", *x, where);
919 return -EINVAL;
920 }
921 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
922 log_error("Cannot bind mount file %s on directory %s.", *x, where);
923 return -EINVAL;
924 }
925 } else if (errno == ENOENT) {
926 r = mkdir_parents_label(where, 0755);
927 if (r < 0)
928 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
929 } else {
930 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
931 return -errno;
932 }
933
934 /* Create the mount point. Any non-directory file can be
935 * mounted on any non-directory file (regular, fifo, socket,
936 * char, block).
937 */
938 if (S_ISDIR(source_st.st_mode)) {
939 r = mkdir_label(where, 0755);
940 if (r < 0 && errno != EEXIST)
941 return log_error_errno(r, "Failed to create mount point %s: %m", where);
942 } else {
943 r = touch(where);
944 if (r < 0)
945 return log_error_errno(r, "Failed to create mount point %s: %m", where);
946 }
947
948 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
949 return log_error_errno(errno, "mount(%s) failed: %m", where);
950
951 if (ro) {
952 r = bind_remount_recursive(where, true);
953 if (r < 0)
954 return log_error_errno(r, "Read-Only bind mount failed: %m");
955 }
956 }
957
958 return 0;
959 }
960
961 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
962 char *to;
963 int r;
964
965 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
966
967 r = path_is_mount_point(to, false);
968 if (r < 0)
969 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
970 if (r > 0)
971 return 0;
972
973 mkdir_p(to, 0755);
974
975 /* The superblock mount options of the mount point need to be
976 * identical to the hosts', and hence writable... */
977 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
978 return log_error_errno(errno, "Failed to mount to %s: %m", to);
979
980 /* ... hence let's only make the bind mount read-only, not the
981 * superblock. */
982 if (read_only) {
983 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
984 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
985 }
986 return 1;
987 }
988
989 static int mount_cgroup(const char *dest) {
990 _cleanup_set_free_free_ Set *controllers = NULL;
991 _cleanup_free_ char *own_cgroup_path = NULL;
992 const char *cgroup_root, *systemd_root, *systemd_own;
993 int r;
994
995 controllers = set_new(&string_hash_ops);
996 if (!controllers)
997 return log_oom();
998
999 r = cg_kernel_controllers(controllers);
1000 if (r < 0)
1001 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1002
1003 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1004 if (r < 0)
1005 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1006
1007 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1008 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1009 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1010
1011 for (;;) {
1012 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1013
1014 controller = set_steal_first(controllers);
1015 if (!controller)
1016 break;
1017
1018 origin = strappend("/sys/fs/cgroup/", controller);
1019 if (!origin)
1020 return log_oom();
1021
1022 r = readlink_malloc(origin, &combined);
1023 if (r == -EINVAL) {
1024 /* Not a symbolic link, but directly a single cgroup hierarchy */
1025
1026 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1027 if (r < 0)
1028 return r;
1029
1030 } else if (r < 0)
1031 return log_error_errno(r, "Failed to read link %s: %m", origin);
1032 else {
1033 _cleanup_free_ char *target = NULL;
1034
1035 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1036 if (!target)
1037 return log_oom();
1038
1039 /* A symbolic link, a combination of controllers in one hierarchy */
1040
1041 if (!filename_is_valid(combined)) {
1042 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1043 continue;
1044 }
1045
1046 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1047 if (r < 0)
1048 return r;
1049
1050 if (symlink(combined, target) < 0)
1051 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1052 }
1053 }
1054
1055 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1056 if (r < 0)
1057 return r;
1058
1059 /* Make our own cgroup a (writable) bind mount */
1060 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1061 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1062 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1063
1064 /* And then remount the systemd cgroup root read-only */
1065 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1066 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1067 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1068
1069 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1070 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1071
1072 return 0;
1073 }
1074
1075 static int mount_tmpfs(const char *dest) {
1076 char **i, **o;
1077
1078 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1079 _cleanup_free_ char *where = NULL;
1080 int r;
1081
1082 where = strappend(dest, *i);
1083 if (!where)
1084 return log_oom();
1085
1086 r = mkdir_label(where, 0755);
1087 if (r < 0 && r != -EEXIST)
1088 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1089
1090 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1091 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1092 }
1093
1094 return 0;
1095 }
1096
1097 static int setup_timezone(const char *dest) {
1098 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1099 char *z, *y;
1100 int r;
1101
1102 assert(dest);
1103
1104 /* Fix the timezone, if possible */
1105 r = readlink_malloc("/etc/localtime", &p);
1106 if (r < 0) {
1107 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1108 return 0;
1109 }
1110
1111 z = path_startswith(p, "../usr/share/zoneinfo/");
1112 if (!z)
1113 z = path_startswith(p, "/usr/share/zoneinfo/");
1114 if (!z) {
1115 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1116 return 0;
1117 }
1118
1119 where = strappend(dest, "/etc/localtime");
1120 if (!where)
1121 return log_oom();
1122
1123 r = readlink_malloc(where, &q);
1124 if (r >= 0) {
1125 y = path_startswith(q, "../usr/share/zoneinfo/");
1126 if (!y)
1127 y = path_startswith(q, "/usr/share/zoneinfo/");
1128
1129 /* Already pointing to the right place? Then do nothing .. */
1130 if (y && streq(y, z))
1131 return 0;
1132 }
1133
1134 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1135 if (!check)
1136 return log_oom();
1137
1138 if (access(check, F_OK) < 0) {
1139 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1140 return 0;
1141 }
1142
1143 what = strappend("../usr/share/zoneinfo/", z);
1144 if (!what)
1145 return log_oom();
1146
1147 r = mkdir_parents(where, 0755);
1148 if (r < 0) {
1149 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1150
1151 return 0;
1152 }
1153
1154 r = unlink(where);
1155 if (r < 0 && errno != ENOENT) {
1156 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1157
1158 return 0;
1159 }
1160
1161 if (symlink(what, where) < 0) {
1162 log_error_errno(errno, "Failed to correct timezone of container: %m");
1163 return 0;
1164 }
1165
1166 return 0;
1167 }
1168
1169 static int setup_resolv_conf(const char *dest) {
1170 _cleanup_free_ char *where = NULL;
1171 int r;
1172
1173 assert(dest);
1174
1175 if (arg_private_network)
1176 return 0;
1177
1178 /* Fix resolv.conf, if possible */
1179 where = strappend(dest, "/etc/resolv.conf");
1180 if (!where)
1181 return log_oom();
1182
1183 /* We don't really care for the results of this really. If it
1184 * fails, it fails, but meh... */
1185 r = mkdir_parents(where, 0755);
1186 if (r < 0) {
1187 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1188
1189 return 0;
1190 }
1191
1192 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1193 if (r < 0) {
1194 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1195
1196 return 0;
1197 }
1198
1199 return 0;
1200 }
1201
1202 static int setup_volatile_state(const char *directory) {
1203 const char *p;
1204 int r;
1205
1206 assert(directory);
1207
1208 if (arg_volatile != VOLATILE_STATE)
1209 return 0;
1210
1211 /* --volatile=state means we simply overmount /var
1212 with a tmpfs, and the rest read-only. */
1213
1214 r = bind_remount_recursive(directory, true);
1215 if (r < 0)
1216 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1217
1218 p = strjoina(directory, "/var");
1219 r = mkdir(p, 0755);
1220 if (r < 0 && errno != EEXIST)
1221 return log_error_errno(errno, "Failed to create %s: %m", directory);
1222
1223 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1224 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1225
1226 return 0;
1227 }
1228
1229 static int setup_volatile(const char *directory) {
1230 bool tmpfs_mounted = false, bind_mounted = false;
1231 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1232 const char *f, *t;
1233 int r;
1234
1235 assert(directory);
1236
1237 if (arg_volatile != VOLATILE_YES)
1238 return 0;
1239
1240 /* --volatile=yes means we mount a tmpfs to the root dir, and
1241 the original /usr to use inside it, and that read-only. */
1242
1243 if (!mkdtemp(template))
1244 return log_error_errno(errno, "Failed to create temporary directory: %m");
1245
1246 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1247 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1248 r = -errno;
1249 goto fail;
1250 }
1251
1252 tmpfs_mounted = true;
1253
1254 f = strjoina(directory, "/usr");
1255 t = strjoina(template, "/usr");
1256
1257 r = mkdir(t, 0755);
1258 if (r < 0 && errno != EEXIST) {
1259 log_error_errno(errno, "Failed to create %s: %m", t);
1260 r = -errno;
1261 goto fail;
1262 }
1263
1264 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1265 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1266 r = -errno;
1267 goto fail;
1268 }
1269
1270 bind_mounted = true;
1271
1272 r = bind_remount_recursive(t, true);
1273 if (r < 0) {
1274 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1275 goto fail;
1276 }
1277
1278 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1279 log_error_errno(errno, "Failed to move root mount: %m");
1280 r = -errno;
1281 goto fail;
1282 }
1283
1284 rmdir(template);
1285
1286 return 0;
1287
1288 fail:
1289 if (bind_mounted)
1290 umount(t);
1291 if (tmpfs_mounted)
1292 umount(template);
1293 rmdir(template);
1294 return r;
1295 }
1296
1297 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1298
1299 snprintf(s, 37,
1300 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1301 SD_ID128_FORMAT_VAL(id));
1302
1303 return s;
1304 }
1305
1306 static int setup_boot_id(const char *dest) {
1307 _cleanup_free_ char *from = NULL, *to = NULL;
1308 sd_id128_t rnd = {};
1309 char as_uuid[37];
1310 int r;
1311
1312 assert(dest);
1313
1314 if (arg_share_system)
1315 return 0;
1316
1317 /* Generate a new randomized boot ID, so that each boot-up of
1318 * the container gets a new one */
1319
1320 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1321 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1322 if (!from || !to)
1323 return log_oom();
1324
1325 r = sd_id128_randomize(&rnd);
1326 if (r < 0)
1327 return log_error_errno(r, "Failed to generate random boot id: %m");
1328
1329 id128_format_as_uuid(rnd, as_uuid);
1330
1331 r = write_string_file(from, as_uuid);
1332 if (r < 0)
1333 return log_error_errno(r, "Failed to write boot id: %m");
1334
1335 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1336 log_error_errno(errno, "Failed to bind mount boot id: %m");
1337 r = -errno;
1338 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1339 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1340
1341 unlink(from);
1342 return r;
1343 }
1344
1345 static int copy_devnodes(const char *dest) {
1346
1347 static const char devnodes[] =
1348 "null\0"
1349 "zero\0"
1350 "full\0"
1351 "random\0"
1352 "urandom\0"
1353 "tty\0"
1354 "net/tun\0";
1355
1356 const char *d;
1357 int r = 0;
1358 _cleanup_umask_ mode_t u;
1359
1360 assert(dest);
1361
1362 u = umask(0000);
1363
1364 NULSTR_FOREACH(d, devnodes) {
1365 _cleanup_free_ char *from = NULL, *to = NULL;
1366 struct stat st;
1367
1368 from = strappend("/dev/", d);
1369 to = strjoin(dest, "/dev/", d, NULL);
1370 if (!from || !to)
1371 return log_oom();
1372
1373 if (stat(from, &st) < 0) {
1374
1375 if (errno != ENOENT)
1376 return log_error_errno(errno, "Failed to stat %s: %m", from);
1377
1378 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1379
1380 log_error("%s is not a char or block device, cannot copy", from);
1381 return -EIO;
1382
1383 } else {
1384 r = mkdir_parents(to, 0775);
1385 if (r < 0) {
1386 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1387 return -r;
1388 }
1389
1390 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1391 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1392 }
1393 }
1394
1395 return r;
1396 }
1397
1398 static int setup_ptmx(const char *dest) {
1399 _cleanup_free_ char *p = NULL;
1400
1401 p = strappend(dest, "/dev/ptmx");
1402 if (!p)
1403 return log_oom();
1404
1405 if (symlink("pts/ptmx", p) < 0)
1406 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1407
1408 return 0;
1409 }
1410
1411 static int setup_dev_console(const char *dest, const char *console) {
1412 _cleanup_umask_ mode_t u;
1413 const char *to;
1414 struct stat st;
1415 int r;
1416
1417 assert(dest);
1418 assert(console);
1419
1420 u = umask(0000);
1421
1422 if (stat("/dev/null", &st) < 0)
1423 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1424
1425 r = chmod_and_chown(console, 0600, 0, 0);
1426 if (r < 0)
1427 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1428
1429 /* We need to bind mount the right tty to /dev/console since
1430 * ptys can only exist on pts file systems. To have something
1431 * to bind mount things on we create a device node first, and
1432 * use /dev/null for that since we the cgroups device policy
1433 * allows us to create that freely, while we cannot create
1434 * /dev/console. (Note that the major minor doesn't actually
1435 * matter here, since we mount it over anyway). */
1436
1437 to = strjoina(dest, "/dev/console");
1438 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1439 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1440
1441 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1442 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1443
1444 return 0;
1445 }
1446
1447 static int setup_kmsg(const char *dest, int kmsg_socket) {
1448 _cleanup_free_ char *from = NULL, *to = NULL;
1449 _cleanup_umask_ mode_t u;
1450 int r, fd, k;
1451 union {
1452 struct cmsghdr cmsghdr;
1453 uint8_t buf[CMSG_SPACE(sizeof(int))];
1454 } control = {};
1455 struct msghdr mh = {
1456 .msg_control = &control,
1457 .msg_controllen = sizeof(control),
1458 };
1459 struct cmsghdr *cmsg;
1460
1461 assert(dest);
1462 assert(kmsg_socket >= 0);
1463
1464 u = umask(0000);
1465
1466 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1467 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1468 * on the reading side behave very similar to /proc/kmsg,
1469 * their writing side behaves differently from /dev/kmsg in
1470 * that writing blocks when nothing is reading. In order to
1471 * avoid any problems with containers deadlocking due to this
1472 * we simply make /dev/kmsg unavailable to the container. */
1473 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1474 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1475 return log_oom();
1476
1477 if (mkfifo(from, 0600) < 0)
1478 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1479
1480 r = chmod_and_chown(from, 0600, 0, 0);
1481 if (r < 0)
1482 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1483
1484 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1485 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1486
1487 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1488 if (fd < 0)
1489 return log_error_errno(errno, "Failed to open fifo: %m");
1490
1491 cmsg = CMSG_FIRSTHDR(&mh);
1492 cmsg->cmsg_level = SOL_SOCKET;
1493 cmsg->cmsg_type = SCM_RIGHTS;
1494 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1495 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1496
1497 mh.msg_controllen = cmsg->cmsg_len;
1498
1499 /* Store away the fd in the socket, so that it stays open as
1500 * long as we run the child */
1501 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1502 safe_close(fd);
1503
1504 if (k < 0)
1505 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1506
1507 /* And now make the FIFO unavailable as /dev/kmsg... */
1508 unlink(from);
1509 return 0;
1510 }
1511
1512 static int send_rtnl(int send_fd) {
1513 union {
1514 struct cmsghdr cmsghdr;
1515 uint8_t buf[CMSG_SPACE(sizeof(int))];
1516 } control = {};
1517 struct msghdr mh = {
1518 .msg_control = &control,
1519 .msg_controllen = sizeof(control),
1520 };
1521 struct cmsghdr *cmsg;
1522 _cleanup_close_ int fd = -1;
1523 ssize_t k;
1524
1525 assert(send_fd >= 0);
1526
1527 if (!arg_expose_ports)
1528 return 0;
1529
1530 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1531 if (fd < 0)
1532 return log_error_errno(errno, "failed to allocate container netlink: %m");
1533
1534 cmsg = CMSG_FIRSTHDR(&mh);
1535 cmsg->cmsg_level = SOL_SOCKET;
1536 cmsg->cmsg_type = SCM_RIGHTS;
1537 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1538 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1539
1540 mh.msg_controllen = cmsg->cmsg_len;
1541
1542 /* Store away the fd in the socket, so that it stays open as
1543 * long as we run the child */
1544 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1545 if (k < 0)
1546 return log_error_errno(errno, "Failed to send netlink fd: %m");
1547
1548 return 0;
1549 }
1550
1551 static int flush_ports(union in_addr_union *exposed) {
1552 ExposePort *p;
1553 int r, af = AF_INET;
1554
1555 assert(exposed);
1556
1557 if (!arg_expose_ports)
1558 return 0;
1559
1560 if (in_addr_is_null(af, exposed))
1561 return 0;
1562
1563 log_debug("Lost IP address.");
1564
1565 LIST_FOREACH(ports, p, arg_expose_ports) {
1566 r = fw_add_local_dnat(false,
1567 af,
1568 p->protocol,
1569 NULL,
1570 NULL, 0,
1571 NULL, 0,
1572 p->host_port,
1573 exposed,
1574 p->container_port,
1575 NULL);
1576 if (r < 0)
1577 log_warning_errno(r, "Failed to modify firewall: %m");
1578 }
1579
1580 *exposed = IN_ADDR_NULL;
1581 return 0;
1582 }
1583
1584 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1585 _cleanup_free_ struct local_address *addresses = NULL;
1586 _cleanup_free_ char *pretty = NULL;
1587 union in_addr_union new_exposed;
1588 ExposePort *p;
1589 bool add;
1590 int af = AF_INET, r;
1591
1592 assert(exposed);
1593
1594 /* Invoked each time an address is added or removed inside the
1595 * container */
1596
1597 if (!arg_expose_ports)
1598 return 0;
1599
1600 r = local_addresses(rtnl, 0, af, &addresses);
1601 if (r < 0)
1602 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1603
1604 add = r > 0 &&
1605 addresses[0].family == af &&
1606 addresses[0].scope < RT_SCOPE_LINK;
1607
1608 if (!add)
1609 return flush_ports(exposed);
1610
1611 new_exposed = addresses[0].address;
1612 if (in_addr_equal(af, exposed, &new_exposed))
1613 return 0;
1614
1615 in_addr_to_string(af, &new_exposed, &pretty);
1616 log_debug("New container IP is %s.", strna(pretty));
1617
1618 LIST_FOREACH(ports, p, arg_expose_ports) {
1619
1620 r = fw_add_local_dnat(true,
1621 af,
1622 p->protocol,
1623 NULL,
1624 NULL, 0,
1625 NULL, 0,
1626 p->host_port,
1627 &new_exposed,
1628 p->container_port,
1629 in_addr_is_null(af, exposed) ? NULL : exposed);
1630 if (r < 0)
1631 log_warning_errno(r, "Failed to modify firewall: %m");
1632 }
1633
1634 *exposed = new_exposed;
1635 return 0;
1636 }
1637
1638 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1639 union in_addr_union *exposed = userdata;
1640
1641 assert(rtnl);
1642 assert(m);
1643 assert(exposed);
1644
1645 expose_ports(rtnl, exposed);
1646 return 0;
1647 }
1648
1649 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1650 union {
1651 struct cmsghdr cmsghdr;
1652 uint8_t buf[CMSG_SPACE(sizeof(int))];
1653 } control = {};
1654 struct msghdr mh = {
1655 .msg_control = &control,
1656 .msg_controllen = sizeof(control),
1657 };
1658 struct cmsghdr *cmsg;
1659 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1660 int fd, r;
1661 ssize_t k;
1662
1663 assert(event);
1664 assert(recv_fd >= 0);
1665 assert(ret);
1666
1667 if (!arg_expose_ports)
1668 return 0;
1669
1670 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1671 if (k < 0)
1672 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1673
1674 cmsg = CMSG_FIRSTHDR(&mh);
1675 assert(cmsg->cmsg_level == SOL_SOCKET);
1676 assert(cmsg->cmsg_type == SCM_RIGHTS);
1677 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1678 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1679
1680 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1681 if (r < 0) {
1682 safe_close(fd);
1683 return log_error_errno(r, "Failed to create rtnl object: %m");
1684 }
1685
1686 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1687 if (r < 0)
1688 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1689
1690 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1691 if (r < 0)
1692 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1693
1694 r = sd_rtnl_attach_event(rtnl, event, 0);
1695 if (r < 0)
1696 return log_error_errno(r, "Failed to add to even loop: %m");
1697
1698 *ret = rtnl;
1699 rtnl = NULL;
1700
1701 return 0;
1702 }
1703
1704 static int setup_hostname(void) {
1705
1706 if (arg_share_system)
1707 return 0;
1708
1709 if (sethostname_idempotent(arg_machine) < 0)
1710 return -errno;
1711
1712 return 0;
1713 }
1714
1715 static int setup_journal(const char *directory) {
1716 sd_id128_t machine_id, this_id;
1717 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1718 char *id;
1719 int r;
1720
1721 /* Don't link journals in ephemeral mode */
1722 if (arg_ephemeral)
1723 return 0;
1724
1725 p = strappend(directory, "/etc/machine-id");
1726 if (!p)
1727 return log_oom();
1728
1729 r = read_one_line_file(p, &b);
1730 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1731 return 0;
1732 else if (r < 0)
1733 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1734
1735 id = strstrip(b);
1736 if (isempty(id) && arg_link_journal == LINK_AUTO)
1737 return 0;
1738
1739 /* Verify validity */
1740 r = sd_id128_from_string(id, &machine_id);
1741 if (r < 0)
1742 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1743
1744 r = sd_id128_get_machine(&this_id);
1745 if (r < 0)
1746 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1747
1748 if (sd_id128_equal(machine_id, this_id)) {
1749 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1750 "Host and machine ids are equal (%s): refusing to link journals", id);
1751 if (arg_link_journal == LINK_AUTO)
1752 return 0;
1753 return -EEXIST;
1754 }
1755
1756 if (arg_link_journal == LINK_NO)
1757 return 0;
1758
1759 free(p);
1760 p = strappend("/var/log/journal/", id);
1761 q = strjoin(directory, "/var/log/journal/", id, NULL);
1762 if (!p || !q)
1763 return log_oom();
1764
1765 if (path_is_mount_point(p, false) > 0) {
1766 if (arg_link_journal != LINK_AUTO) {
1767 log_error("%s: already a mount point, refusing to use for journal", p);
1768 return -EEXIST;
1769 }
1770
1771 return 0;
1772 }
1773
1774 if (path_is_mount_point(q, false) > 0) {
1775 if (arg_link_journal != LINK_AUTO) {
1776 log_error("%s: already a mount point, refusing to use for journal", q);
1777 return -EEXIST;
1778 }
1779
1780 return 0;
1781 }
1782
1783 r = readlink_and_make_absolute(p, &d);
1784 if (r >= 0) {
1785 if ((arg_link_journal == LINK_GUEST ||
1786 arg_link_journal == LINK_AUTO) &&
1787 path_equal(d, q)) {
1788
1789 r = mkdir_p(q, 0755);
1790 if (r < 0)
1791 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1792 return 0;
1793 }
1794
1795 if (unlink(p) < 0)
1796 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1797 } else if (r == -EINVAL) {
1798
1799 if (arg_link_journal == LINK_GUEST &&
1800 rmdir(p) < 0) {
1801
1802 if (errno == ENOTDIR) {
1803 log_error("%s already exists and is neither a symlink nor a directory", p);
1804 return r;
1805 } else {
1806 log_error_errno(errno, "Failed to remove %s: %m", p);
1807 return -errno;
1808 }
1809 }
1810 } else if (r != -ENOENT) {
1811 log_error_errno(errno, "readlink(%s) failed: %m", p);
1812 return r;
1813 }
1814
1815 if (arg_link_journal == LINK_GUEST) {
1816
1817 if (symlink(q, p) < 0) {
1818 if (arg_link_journal_try) {
1819 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1820 return 0;
1821 } else {
1822 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1823 return -errno;
1824 }
1825 }
1826
1827 r = mkdir_p(q, 0755);
1828 if (r < 0)
1829 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1830 return 0;
1831 }
1832
1833 if (arg_link_journal == LINK_HOST) {
1834 /* don't create parents here -- if the host doesn't have
1835 * permanent journal set up, don't force it here */
1836 r = mkdir(p, 0755);
1837 if (r < 0) {
1838 if (arg_link_journal_try) {
1839 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1840 return 0;
1841 } else {
1842 log_error_errno(errno, "Failed to create %s: %m", p);
1843 return r;
1844 }
1845 }
1846
1847 } else if (access(p, F_OK) < 0)
1848 return 0;
1849
1850 if (dir_is_empty(q) == 0)
1851 log_warning("%s is not empty, proceeding anyway.", q);
1852
1853 r = mkdir_p(q, 0755);
1854 if (r < 0) {
1855 log_error_errno(errno, "Failed to create %s: %m", q);
1856 return r;
1857 }
1858
1859 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1860 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1861
1862 return 0;
1863 }
1864
1865 static int drop_capabilities(void) {
1866 return capability_bounding_set_drop(~arg_retain, false);
1867 }
1868
1869 static int register_machine(pid_t pid, int local_ifindex) {
1870 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1871 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1872 int r;
1873
1874 if (!arg_register)
1875 return 0;
1876
1877 r = sd_bus_default_system(&bus);
1878 if (r < 0)
1879 return log_error_errno(r, "Failed to open system bus: %m");
1880
1881 if (arg_keep_unit) {
1882 r = sd_bus_call_method(
1883 bus,
1884 "org.freedesktop.machine1",
1885 "/org/freedesktop/machine1",
1886 "org.freedesktop.machine1.Manager",
1887 "RegisterMachineWithNetwork",
1888 &error,
1889 NULL,
1890 "sayssusai",
1891 arg_machine,
1892 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1893 "nspawn",
1894 "container",
1895 (uint32_t) pid,
1896 strempty(arg_directory),
1897 local_ifindex > 0 ? 1 : 0, local_ifindex);
1898 } else {
1899 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1900
1901 r = sd_bus_message_new_method_call(
1902 bus,
1903 &m,
1904 "org.freedesktop.machine1",
1905 "/org/freedesktop/machine1",
1906 "org.freedesktop.machine1.Manager",
1907 "CreateMachineWithNetwork");
1908 if (r < 0)
1909 return log_error_errno(r, "Failed to create message: %m");
1910
1911 r = sd_bus_message_append(
1912 m,
1913 "sayssusai",
1914 arg_machine,
1915 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1916 "nspawn",
1917 "container",
1918 (uint32_t) pid,
1919 strempty(arg_directory),
1920 local_ifindex > 0 ? 1 : 0, local_ifindex);
1921 if (r < 0)
1922 return log_error_errno(r, "Failed to append message arguments: %m");
1923
1924 r = sd_bus_message_open_container(m, 'a', "(sv)");
1925 if (r < 0)
1926 return log_error_errno(r, "Failed to open container: %m");
1927
1928 if (!isempty(arg_slice)) {
1929 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1930 if (r < 0)
1931 return log_error_errno(r, "Failed to append slice: %m");
1932 }
1933
1934 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1935 if (r < 0)
1936 return log_error_errno(r, "Failed to add device policy: %m");
1937
1938 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1939 /* Allow the container to
1940 * access and create the API
1941 * device nodes, so that
1942 * PrivateDevices= in the
1943 * container can work
1944 * fine */
1945 "/dev/null", "rwm",
1946 "/dev/zero", "rwm",
1947 "/dev/full", "rwm",
1948 "/dev/random", "rwm",
1949 "/dev/urandom", "rwm",
1950 "/dev/tty", "rwm",
1951 "/dev/net/tun", "rwm",
1952 /* Allow the container
1953 * access to ptys. However,
1954 * do not permit the
1955 * container to ever create
1956 * these device nodes. */
1957 "/dev/pts/ptmx", "rw",
1958 "char-pts", "rw");
1959 if (r < 0)
1960 return log_error_errno(r, "Failed to add device whitelist: %m");
1961
1962 r = sd_bus_message_close_container(m);
1963 if (r < 0)
1964 return log_error_errno(r, "Failed to close container: %m");
1965
1966 r = sd_bus_call(bus, m, 0, &error, NULL);
1967 }
1968
1969 if (r < 0) {
1970 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1971 return r;
1972 }
1973
1974 return 0;
1975 }
1976
1977 static int terminate_machine(pid_t pid) {
1978 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1979 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1980 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1981 const char *path;
1982 int r;
1983
1984 if (!arg_register)
1985 return 0;
1986
1987 r = sd_bus_default_system(&bus);
1988 if (r < 0)
1989 return log_error_errno(r, "Failed to open system bus: %m");
1990
1991 r = sd_bus_call_method(
1992 bus,
1993 "org.freedesktop.machine1",
1994 "/org/freedesktop/machine1",
1995 "org.freedesktop.machine1.Manager",
1996 "GetMachineByPID",
1997 &error,
1998 &reply,
1999 "u",
2000 (uint32_t) pid);
2001 if (r < 0) {
2002 /* Note that the machine might already have been
2003 * cleaned up automatically, hence don't consider it a
2004 * failure if we cannot get the machine object. */
2005 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2006 return 0;
2007 }
2008
2009 r = sd_bus_message_read(reply, "o", &path);
2010 if (r < 0)
2011 return bus_log_parse_error(r);
2012
2013 r = sd_bus_call_method(
2014 bus,
2015 "org.freedesktop.machine1",
2016 path,
2017 "org.freedesktop.machine1.Machine",
2018 "Terminate",
2019 &error,
2020 NULL,
2021 NULL);
2022 if (r < 0) {
2023 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2024 return 0;
2025 }
2026
2027 return 0;
2028 }
2029
2030 static int reset_audit_loginuid(void) {
2031 _cleanup_free_ char *p = NULL;
2032 int r;
2033
2034 if (arg_share_system)
2035 return 0;
2036
2037 r = read_one_line_file("/proc/self/loginuid", &p);
2038 if (r == -ENOENT)
2039 return 0;
2040 if (r < 0)
2041 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2042
2043 /* Already reset? */
2044 if (streq(p, "4294967295"))
2045 return 0;
2046
2047 r = write_string_file("/proc/self/loginuid", "4294967295");
2048 if (r < 0) {
2049 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2050 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2051 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2052 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2053 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2054
2055 sleep(5);
2056 }
2057
2058 return 0;
2059 }
2060
2061 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2062 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2063 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2064
2065 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2066 uint8_t result[8];
2067 size_t l, sz;
2068 uint8_t *v, *i;
2069 int r;
2070
2071 l = strlen(arg_machine);
2072 sz = sizeof(sd_id128_t) + l;
2073 if (idx > 0)
2074 sz += sizeof(idx);
2075
2076 v = alloca(sz);
2077
2078 /* fetch some persistent data unique to the host */
2079 r = sd_id128_get_machine((sd_id128_t*) v);
2080 if (r < 0)
2081 return r;
2082
2083 /* combine with some data unique (on this host) to this
2084 * container instance */
2085 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2086 if (idx > 0) {
2087 idx = htole64(idx);
2088 memcpy(i, &idx, sizeof(idx));
2089 }
2090
2091 /* Let's hash the host machine ID plus the container name. We
2092 * use a fixed, but originally randomly created hash key here. */
2093 siphash24(result, v, sz, hash_key.bytes);
2094
2095 assert_cc(ETH_ALEN <= sizeof(result));
2096 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2097
2098 /* see eth_random_addr in the kernel */
2099 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2100 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2101
2102 return 0;
2103 }
2104
2105 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2106 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2107 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2108 struct ether_addr mac_host, mac_container;
2109 int r, i;
2110
2111 if (!arg_private_network)
2112 return 0;
2113
2114 if (!arg_network_veth)
2115 return 0;
2116
2117 /* Use two different interface name prefixes depending whether
2118 * we are in bridge mode or not. */
2119 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2120 arg_network_bridge ? "vb" : "ve", arg_machine);
2121
2122 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2123 if (r < 0)
2124 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2125
2126 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2127 if (r < 0)
2128 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2129
2130 r = sd_rtnl_open(&rtnl, 0);
2131 if (r < 0)
2132 return log_error_errno(r, "Failed to connect to netlink: %m");
2133
2134 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2135 if (r < 0)
2136 return log_error_errno(r, "Failed to allocate netlink message: %m");
2137
2138 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2139 if (r < 0)
2140 return log_error_errno(r, "Failed to add netlink interface name: %m");
2141
2142 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2143 if (r < 0)
2144 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2145
2146 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2147 if (r < 0)
2148 return log_error_errno(r, "Failed to open netlink container: %m");
2149
2150 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2151 if (r < 0)
2152 return log_error_errno(r, "Failed to open netlink container: %m");
2153
2154 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2155 if (r < 0)
2156 return log_error_errno(r, "Failed to open netlink container: %m");
2157
2158 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2159 if (r < 0)
2160 return log_error_errno(r, "Failed to add netlink interface name: %m");
2161
2162 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2163 if (r < 0)
2164 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2165
2166 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2167 if (r < 0)
2168 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2169
2170 r = sd_rtnl_message_close_container(m);
2171 if (r < 0)
2172 return log_error_errno(r, "Failed to close netlink container: %m");
2173
2174 r = sd_rtnl_message_close_container(m);
2175 if (r < 0)
2176 return log_error_errno(r, "Failed to close netlink container: %m");
2177
2178 r = sd_rtnl_message_close_container(m);
2179 if (r < 0)
2180 return log_error_errno(r, "Failed to close netlink container: %m");
2181
2182 r = sd_rtnl_call(rtnl, m, 0, NULL);
2183 if (r < 0)
2184 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2185
2186 i = (int) if_nametoindex(iface_name);
2187 if (i <= 0)
2188 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2189
2190 *ifi = i;
2191
2192 return 0;
2193 }
2194
2195 static int setup_bridge(const char veth_name[], int *ifi) {
2196 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2197 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2198 int r, bridge;
2199
2200 if (!arg_private_network)
2201 return 0;
2202
2203 if (!arg_network_veth)
2204 return 0;
2205
2206 if (!arg_network_bridge)
2207 return 0;
2208
2209 bridge = (int) if_nametoindex(arg_network_bridge);
2210 if (bridge <= 0)
2211 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2212
2213 *ifi = bridge;
2214
2215 r = sd_rtnl_open(&rtnl, 0);
2216 if (r < 0)
2217 return log_error_errno(r, "Failed to connect to netlink: %m");
2218
2219 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2220 if (r < 0)
2221 return log_error_errno(r, "Failed to allocate netlink message: %m");
2222
2223 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2226
2227 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2228 if (r < 0)
2229 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2230
2231 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2232 if (r < 0)
2233 return log_error_errno(r, "Failed to add netlink master field: %m");
2234
2235 r = sd_rtnl_call(rtnl, m, 0, NULL);
2236 if (r < 0)
2237 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2238
2239 return 0;
2240 }
2241
2242 static int parse_interface(struct udev *udev, const char *name) {
2243 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2244 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2245 int ifi;
2246
2247 ifi = (int) if_nametoindex(name);
2248 if (ifi <= 0)
2249 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2250
2251 sprintf(ifi_str, "n%i", ifi);
2252 d = udev_device_new_from_device_id(udev, ifi_str);
2253 if (!d)
2254 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2255
2256 if (udev_device_get_is_initialized(d) <= 0) {
2257 log_error("Network interface %s is not initialized yet.", name);
2258 return -EBUSY;
2259 }
2260
2261 return ifi;
2262 }
2263
2264 static int move_network_interfaces(pid_t pid) {
2265 _cleanup_udev_unref_ struct udev *udev = NULL;
2266 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2267 char **i;
2268 int r;
2269
2270 if (!arg_private_network)
2271 return 0;
2272
2273 if (strv_isempty(arg_network_interfaces))
2274 return 0;
2275
2276 r = sd_rtnl_open(&rtnl, 0);
2277 if (r < 0)
2278 return log_error_errno(r, "Failed to connect to netlink: %m");
2279
2280 udev = udev_new();
2281 if (!udev) {
2282 log_error("Failed to connect to udev.");
2283 return -ENOMEM;
2284 }
2285
2286 STRV_FOREACH(i, arg_network_interfaces) {
2287 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2288 int ifi;
2289
2290 ifi = parse_interface(udev, *i);
2291 if (ifi < 0)
2292 return ifi;
2293
2294 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2295 if (r < 0)
2296 return log_error_errno(r, "Failed to allocate netlink message: %m");
2297
2298 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2299 if (r < 0)
2300 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2301
2302 r = sd_rtnl_call(rtnl, m, 0, NULL);
2303 if (r < 0)
2304 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2305 }
2306
2307 return 0;
2308 }
2309
2310 static int setup_macvlan(pid_t pid) {
2311 _cleanup_udev_unref_ struct udev *udev = NULL;
2312 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2313 unsigned idx = 0;
2314 char **i;
2315 int r;
2316
2317 if (!arg_private_network)
2318 return 0;
2319
2320 if (strv_isempty(arg_network_macvlan))
2321 return 0;
2322
2323 r = sd_rtnl_open(&rtnl, 0);
2324 if (r < 0)
2325 return log_error_errno(r, "Failed to connect to netlink: %m");
2326
2327 udev = udev_new();
2328 if (!udev) {
2329 log_error("Failed to connect to udev.");
2330 return -ENOMEM;
2331 }
2332
2333 STRV_FOREACH(i, arg_network_macvlan) {
2334 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2335 _cleanup_free_ char *n = NULL;
2336 struct ether_addr mac;
2337 int ifi;
2338
2339 ifi = parse_interface(udev, *i);
2340 if (ifi < 0)
2341 return ifi;
2342
2343 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2344 if (r < 0)
2345 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2346
2347 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2348 if (r < 0)
2349 return log_error_errno(r, "Failed to allocate netlink message: %m");
2350
2351 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2352 if (r < 0)
2353 return log_error_errno(r, "Failed to add netlink interface index: %m");
2354
2355 n = strappend("mv-", *i);
2356 if (!n)
2357 return log_oom();
2358
2359 strshorten(n, IFNAMSIZ-1);
2360
2361 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2362 if (r < 0)
2363 return log_error_errno(r, "Failed to add netlink interface name: %m");
2364
2365 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2366 if (r < 0)
2367 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2368
2369 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2370 if (r < 0)
2371 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2372
2373 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2374 if (r < 0)
2375 return log_error_errno(r, "Failed to open netlink container: %m");
2376
2377 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2378 if (r < 0)
2379 return log_error_errno(r, "Failed to open netlink container: %m");
2380
2381 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2382 if (r < 0)
2383 return log_error_errno(r, "Failed to append macvlan mode: %m");
2384
2385 r = sd_rtnl_message_close_container(m);
2386 if (r < 0)
2387 return log_error_errno(r, "Failed to close netlink container: %m");
2388
2389 r = sd_rtnl_message_close_container(m);
2390 if (r < 0)
2391 return log_error_errno(r, "Failed to close netlink container: %m");
2392
2393 r = sd_rtnl_call(rtnl, m, 0, NULL);
2394 if (r < 0)
2395 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2396 }
2397
2398 return 0;
2399 }
2400
2401 static int setup_ipvlan(pid_t pid) {
2402 _cleanup_udev_unref_ struct udev *udev = NULL;
2403 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2404 char **i;
2405 int r;
2406
2407 if (!arg_private_network)
2408 return 0;
2409
2410 if (strv_isempty(arg_network_ipvlan))
2411 return 0;
2412
2413 r = sd_rtnl_open(&rtnl, 0);
2414 if (r < 0)
2415 return log_error_errno(r, "Failed to connect to netlink: %m");
2416
2417 udev = udev_new();
2418 if (!udev) {
2419 log_error("Failed to connect to udev.");
2420 return -ENOMEM;
2421 }
2422
2423 STRV_FOREACH(i, arg_network_ipvlan) {
2424 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2425 _cleanup_free_ char *n = NULL;
2426 int ifi;
2427
2428 ifi = parse_interface(udev, *i);
2429 if (ifi < 0)
2430 return ifi;
2431
2432 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2433 if (r < 0)
2434 return log_error_errno(r, "Failed to allocate netlink message: %m");
2435
2436 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2437 if (r < 0)
2438 return log_error_errno(r, "Failed to add netlink interface index: %m");
2439
2440 n = strappend("iv-", *i);
2441 if (!n)
2442 return log_oom();
2443
2444 strshorten(n, IFNAMSIZ-1);
2445
2446 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2447 if (r < 0)
2448 return log_error_errno(r, "Failed to add netlink interface name: %m");
2449
2450 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2451 if (r < 0)
2452 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2453
2454 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2455 if (r < 0)
2456 return log_error_errno(r, "Failed to open netlink container: %m");
2457
2458 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2459 if (r < 0)
2460 return log_error_errno(r, "Failed to open netlink container: %m");
2461
2462 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2465
2466 r = sd_rtnl_message_close_container(m);
2467 if (r < 0)
2468 return log_error_errno(r, "Failed to close netlink container: %m");
2469
2470 r = sd_rtnl_message_close_container(m);
2471 if (r < 0)
2472 return log_error_errno(r, "Failed to close netlink container: %m");
2473
2474 r = sd_rtnl_call(rtnl, m, 0, NULL);
2475 if (r < 0)
2476 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2477 }
2478
2479 return 0;
2480 }
2481
/* Builds and loads the seccomp filter for the container: a default-allow
 * filter that makes a fixed set of system calls fail with EPERM, also
 * refuses the kernel module system calls unless CAP_SYS_MODULE is in
 * arg_retain, and makes creation of NETLINK_AUDIT sockets fail with
 * EAFNOSUPPORT. Returns 0 when compiled without seccomp support,
 * otherwise the result of the last libseccomp call (negative on
 * failure). */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* System calls that are always refused */
        static const int always_denied[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        /* Kernel module system calls, refused unless CAP_SYS_MODULE is retained */
        static const int kmod_denied[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(always_denied); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), always_denied[k], 0);

                /* -EFAULT indicates an unknown syscall, which is simply skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        if ((arg_retain & (1ULL << CAP_SYS_MODULE)) == 0) {
                for (k = 0; k < ELEMENTSOF(kmod_denied); k++) {
                        r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), kmod_denied[k], 0);

                        /* -EFAULT indicates an unknown syscall, which is simply skipped */
                        if (r < 0 && r != -EFAULT) {
                                log_error_errno(r, "Failed to block syscall: %m");
                                goto finish;
                        }
                }
        }

        /* Audit does not work inside containers, and much of the audit
         * userspace fails when run there. Creation of audit sockets is
         * therefore turned off: socket(AF_NETLINK, *, NETLINK_AUDIT)
         * fails with EAFNOSUPPORT, which audit userspace takes as the
         * indication that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Loading the filter must not switch on NO_NEW_PRIVS */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2578
2579 static int setup_propagate(const char *root) {
2580 const char *p, *q;
2581
2582 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2583 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2584 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2585 (void) mkdir_p(p, 0600);
2586
2587 q = strjoina(root, "/run/systemd/nspawn/incoming");
2588 mkdir_parents(q, 0755);
2589 mkdir_p(q, 0600);
2590
2591 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2592 return log_error_errno(errno, "Failed to install propagation bind mount.");
2593
2594 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2595 return log_error_errno(errno, "Failed to make propagation mount read-only");
2596
2597 return 0;
2598 }
2599
2600 static int setup_image(char **device_path, int *loop_nr) {
2601 struct loop_info64 info = {
2602 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2603 };
2604 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2605 _cleanup_free_ char* loopdev = NULL;
2606 struct stat st;
2607 int r, nr;
2608
2609 assert(device_path);
2610 assert(loop_nr);
2611 assert(arg_image);
2612
2613 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2614 if (fd < 0)
2615 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2616
2617 if (fstat(fd, &st) < 0)
2618 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2619
2620 if (S_ISBLK(st.st_mode)) {
2621 char *p;
2622
2623 p = strdup(arg_image);
2624 if (!p)
2625 return log_oom();
2626
2627 *device_path = p;
2628
2629 *loop_nr = -1;
2630
2631 r = fd;
2632 fd = -1;
2633
2634 return r;
2635 }
2636
2637 if (!S_ISREG(st.st_mode)) {
2638 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2639 return -EINVAL;
2640 }
2641
2642 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2643 if (control < 0)
2644 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2645
2646 nr = ioctl(control, LOOP_CTL_GET_FREE);
2647 if (nr < 0)
2648 return log_error_errno(errno, "Failed to allocate loop device: %m");
2649
2650 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2651 return log_oom();
2652
2653 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2654 if (loop < 0)
2655 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2656
2657 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2658 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2659
2660 if (arg_read_only)
2661 info.lo_flags |= LO_FLAGS_READ_ONLY;
2662
2663 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2664 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2665
2666 *device_path = loopdev;
2667 loopdev = NULL;
2668
2669 *loop_nr = nr;
2670
2671 r = loop;
2672 loop = -1;
2673
2674 return r;
2675 }
2676
/* Explanatory text appended to the error messages about disk images
 * whose partition table cannot be used. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2683
/* Probes the partition table of the block device open on fd and picks
 * the partitions to mount.
 *
 * On GPT disks, partitions are classified by type UUID (home, srv,
 * native root, secondary root, generic Linux); partitions carrying
 * GPT_FLAG_NO_AUTO are ignored and, for each specific type, the one
 * with the lowest partition number wins. On MBR disks, only partitions
 * of type 0x83 whose flags equal 0x80 (bootable) are considered, and
 * they count as generic.
 *
 * The root device is chosen in the order native root, secondary root,
 * single generic partition. More than one generic candidate is an
 * error.
 *
 * On success, *root_device/*root_device_rw/*secondary are always set;
 * *home_device/*home_device_rw and *srv_device/*srv_device_rw are only
 * written if such a partition was found. The returned strings are owned
 * by the caller. Returns 0 on success, a negative errno-style value on
 * failure, and -ENOTSUP when compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* A failure that leaves errno untouched is treated as OOM */
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has registered as many partition devices
         * as blkid found in the table, giving up after 10 attempts. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev
                 * list is expected to hold one entry more than that. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the partition with the lowest number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the node as the generic
                                 * candidate, so that a second bootable
                                 * Linux partition is detected above
                                 * instead of silently replacing the
                                 * first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3056
/* Probes the file system type of the block device `what` with blkid and
 * mounts it on `where` (with `directory` appended, if given) using
 * MS_NODEV. The mount is read-only if `rw` is false or arg_read_only is
 * set. LUKS containers are refused. Returns 0 on success, a negative
 * errno-style value on failure, and -ENOTSUP when compiled without
 * blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected. Both mean
                 * the type is unknown; a plain -1 is a real probing
                 * error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3120
3121 static int mount_devices(
3122 const char *where,
3123 const char *root_device, bool root_device_rw,
3124 const char *home_device, bool home_device_rw,
3125 const char *srv_device, bool srv_device_rw) {
3126 int r;
3127
3128 assert(where);
3129
3130 if (root_device) {
3131 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3132 if (r < 0)
3133 return log_error_errno(r, "Failed to mount root directory: %m");
3134 }
3135
3136 if (home_device) {
3137 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3138 if (r < 0)
3139 return log_error_errno(r, "Failed to mount home directory: %m");
3140 }
3141
3142 if (srv_device) {
3143 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3144 if (r < 0)
3145 return log_error_errno(r, "Failed to mount server data directory: %m");
3146 }
3147
3148 return 0;
3149 }
3150
3151 static void loop_remove(int nr, int *image_fd) {
3152 _cleanup_close_ int control = -1;
3153 int r;
3154
3155 if (nr < 0)
3156 return;
3157
3158 if (image_fd && *image_fd >= 0) {
3159 r = ioctl(*image_fd, LOOP_CLR_FD);
3160 if (r < 0)
3161 log_debug_errno(errno, "Failed to close loop image: %m");
3162 *image_fd = safe_close(*image_fd);
3163 }
3164
3165 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3166 if (control < 0) {
3167 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3168 return;
3169 }
3170
3171 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3172 if (r < 0)
3173 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3174 }
3175
3176 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3177 int pipe_fds[2];
3178 pid_t pid;
3179
3180 assert(database);
3181 assert(key);
3182 assert(rpid);
3183
3184 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3185 return log_error_errno(errno, "Failed to allocate pipe: %m");
3186
3187 pid = fork();
3188 if (pid < 0)
3189 return log_error_errno(errno, "Failed to fork getent child: %m");
3190 else if (pid == 0) {
3191 int nullfd;
3192 char *empty_env = NULL;
3193
3194 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3195 _exit(EXIT_FAILURE);
3196
3197 if (pipe_fds[0] > 2)
3198 safe_close(pipe_fds[0]);
3199 if (pipe_fds[1] > 2)
3200 safe_close(pipe_fds[1]);
3201
3202 nullfd = open("/dev/null", O_RDWR);
3203 if (nullfd < 0)
3204 _exit(EXIT_FAILURE);
3205
3206 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3207 _exit(EXIT_FAILURE);
3208
3209 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3210 _exit(EXIT_FAILURE);
3211
3212 if (nullfd > 2)
3213 safe_close(nullfd);
3214
3215 reset_all_signal_handlers();
3216 close_all_fds(NULL, 0);
3217
3218 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3219 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3220 _exit(EXIT_FAILURE);
3221 }
3222
3223 pipe_fds[1] = safe_close(pipe_fds[1]);
3224
3225 *rpid = pid;
3226
3227 return pipe_fds[0];
3228 }
3229
3230 static int change_uid_gid(char **_home) {
3231 char line[LINE_MAX], *x, *u, *g, *h;
3232 const char *word, *state;
3233 _cleanup_free_ uid_t *uids = NULL;
3234 _cleanup_free_ char *home = NULL;
3235 _cleanup_fclose_ FILE *f = NULL;
3236 _cleanup_close_ int fd = -1;
3237 unsigned n_uids = 0;
3238 size_t sz = 0, l;
3239 uid_t uid;
3240 gid_t gid;
3241 pid_t pid;
3242 int r;
3243
3244 assert(_home);
3245
3246 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3247 /* Reset everything fully to 0, just in case */
3248
3249 if (setgroups(0, NULL) < 0)
3250 return log_error_errno(errno, "setgroups() failed: %m");
3251
3252 if (setresgid(0, 0, 0) < 0)
3253 return log_error_errno(errno, "setregid() failed: %m");
3254
3255 if (setresuid(0, 0, 0) < 0)
3256 return log_error_errno(errno, "setreuid() failed: %m");
3257
3258 *_home = NULL;
3259 return 0;
3260 }
3261
3262 /* First, get user credentials */
3263 fd = spawn_getent("passwd", arg_user, &pid);
3264 if (fd < 0)
3265 return fd;
3266
3267 f = fdopen(fd, "r");
3268 if (!f)
3269 return log_oom();
3270 fd = -1;
3271
3272 if (!fgets(line, sizeof(line), f)) {
3273
3274 if (!ferror(f)) {
3275 log_error("Failed to resolve user %s.", arg_user);
3276 return -ESRCH;
3277 }
3278
3279 log_error_errno(errno, "Failed to read from getent: %m");
3280 return -errno;
3281 }
3282
3283 truncate_nl(line);
3284
3285 wait_for_terminate_and_warn("getent passwd", pid, true);
3286
3287 x = strchr(line, ':');
3288 if (!x) {
3289 log_error("/etc/passwd entry has invalid user field.");
3290 return -EIO;
3291 }
3292
3293 u = strchr(x+1, ':');
3294 if (!u) {
3295 log_error("/etc/passwd entry has invalid password field.");
3296 return -EIO;
3297 }
3298
3299 u++;
3300 g = strchr(u, ':');
3301 if (!g) {
3302 log_error("/etc/passwd entry has invalid UID field.");
3303 return -EIO;
3304 }
3305
3306 *g = 0;
3307 g++;
3308 x = strchr(g, ':');
3309 if (!x) {
3310 log_error("/etc/passwd entry has invalid GID field.");
3311 return -EIO;
3312 }
3313
3314 *x = 0;
3315 h = strchr(x+1, ':');
3316 if (!h) {
3317 log_error("/etc/passwd entry has invalid GECOS field.");
3318 return -EIO;
3319 }
3320
3321 h++;
3322 x = strchr(h, ':');
3323 if (!x) {
3324 log_error("/etc/passwd entry has invalid home directory field.");
3325 return -EIO;
3326 }
3327
3328 *x = 0;
3329
3330 r = parse_uid(u, &uid);
3331 if (r < 0) {
3332 log_error("Failed to parse UID of user.");
3333 return -EIO;
3334 }
3335
3336 r = parse_gid(g, &gid);
3337 if (r < 0) {
3338 log_error("Failed to parse GID of user.");
3339 return -EIO;
3340 }
3341
3342 home = strdup(h);
3343 if (!home)
3344 return log_oom();
3345
3346 /* Second, get group memberships */
3347 fd = spawn_getent("initgroups", arg_user, &pid);
3348 if (fd < 0)
3349 return fd;
3350
3351 fclose(f);
3352 f = fdopen(fd, "r");
3353 if (!f)
3354 return log_oom();
3355 fd = -1;
3356
3357 if (!fgets(line, sizeof(line), f)) {
3358 if (!ferror(f)) {
3359 log_error("Failed to resolve user %s.", arg_user);
3360 return -ESRCH;
3361 }
3362
3363 log_error_errno(errno, "Failed to read from getent: %m");
3364 return -errno;
3365 }
3366
3367 truncate_nl(line);
3368
3369 wait_for_terminate_and_warn("getent initgroups", pid, true);
3370
3371 /* Skip over the username and subsequent separator whitespace */
3372 x = line;
3373 x += strcspn(x, WHITESPACE);
3374 x += strspn(x, WHITESPACE);
3375
3376 FOREACH_WORD(word, l, x, state) {
3377 char c[l+1];
3378
3379 memcpy(c, word, l);
3380 c[l] = 0;
3381
3382 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3383 return log_oom();
3384
3385 r = parse_uid(c, &uids[n_uids++]);
3386 if (r < 0) {
3387 log_error("Failed to parse group data from getent.");
3388 return -EIO;
3389 }
3390 }
3391
3392 r = mkdir_parents(home, 0775);
3393 if (r < 0)
3394 return log_error_errno(r, "Failed to make home root directory: %m");
3395
3396 r = mkdir_safe(home, 0755, uid, gid);
3397 if (r < 0 && r != -EEXIST)
3398 return log_error_errno(r, "Failed to make home directory: %m");
3399
3400 fchown(STDIN_FILENO, uid, gid);
3401 fchown(STDOUT_FILENO, uid, gid);
3402 fchown(STDERR_FILENO, uid, gid);
3403
3404 if (setgroups(n_uids, uids) < 0)
3405 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3406
3407 if (setresgid(gid, gid, gid) < 0)
3408 return log_error_errno(errno, "setregid() failed: %m");
3409
3410 if (setresuid(uid, uid, uid) < 0)
3411 return log_error_errno(errno, "setreuid() failed: %m");
3412
3413 if (_home) {
3414 *_home = home;
3415 home = NULL;
3416 }
3417
3418 return 0;
3419 }
3420
3421 /*
3422 * Return values:
3423 * < 0 : wait_for_terminate() failed to get the state of the
3424 * container, the container was terminated by a signal, or
3425 * failed for an unknown reason. No change is made to the
3426 * container argument.
3427 * > 0 : The program executed in the container terminated with an
3428 * error. The exit code of the program executed in the
3429 * container is returned. The container argument has been set
3430 * to CONTAINER_TERMINATED.
3431 * 0 : The container is being rebooted, has been shut down or exited
3432 * successfully. The container argument has been set to either
3433 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3434 *
3435 * That is, success is indicated by a return value of zero, and an
3436 * error is indicated by a non-zero value.
3437 */
3438 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3439 siginfo_t status;
3440 int r;
3441
3442 r = wait_for_terminate(pid, &status);
3443 if (r < 0)
3444 return log_warning_errno(r, "Failed to wait for container: %m");
3445
3446 switch (status.si_code) {
3447
3448 case CLD_EXITED:
3449 if (status.si_status == 0) {
3450 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3451
3452 } else
3453 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3454
3455 *container = CONTAINER_TERMINATED;
3456 return status.si_status;
3457
3458 case CLD_KILLED:
3459 if (status.si_status == SIGINT) {
3460
3461 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3462 *container = CONTAINER_TERMINATED;
3463 return 0;
3464
3465 } else if (status.si_status == SIGHUP) {
3466
3467 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3468 *container = CONTAINER_REBOOTED;
3469 return 0;
3470 }
3471
3472 /* CLD_KILLED fallthrough */
3473
3474 case CLD_DUMPED:
3475 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3476 return -EIO;
3477
3478 default:
3479 log_error("Container %s failed due to unknown reason.", arg_machine);
3480 return -EIO;
3481 }
3482
3483 return r;
3484 }
3485
/* Signal handler that intentionally does nothing; 'sig' is ignored. */
static void nop_handler(int sig) {
}
3487
3488 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3489 pid_t pid;
3490
3491 pid = PTR_TO_UINT32(userdata);
3492 if (pid > 0) {
3493 if (kill(pid, SIGRTMIN+3) >= 0) {
3494 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3495 sd_event_source_set_userdata(s, NULL);
3496 return 0;
3497 }
3498 }
3499
3500 sd_event_exit(sd_event_source_get_event(s), 0);
3501 return 0;
3502 }
3503
3504 static int determine_names(void) {
3505 int r;
3506
3507 if (!arg_image && !arg_directory) {
3508 if (arg_machine) {
3509 _cleanup_(image_unrefp) Image *i = NULL;
3510
3511 r = image_find(arg_machine, &i);
3512 if (r < 0)
3513 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3514 else if (r == 0) {
3515 log_error("No image for machine '%s': %m", arg_machine);
3516 return -ENOENT;
3517 }
3518
3519 if (i->type == IMAGE_RAW)
3520 r = set_sanitized_path(&arg_image, i->path);
3521 else
3522 r = set_sanitized_path(&arg_directory, i->path);
3523 if (r < 0)
3524 return log_error_errno(r, "Invalid image directory: %m");
3525
3526 arg_read_only = arg_read_only || i->read_only;
3527 } else
3528 arg_directory = get_current_dir_name();
3529
3530 if (!arg_directory && !arg_machine) {
3531 log_error("Failed to determine path, please use -D or -i.");
3532 return -EINVAL;
3533 }
3534 }
3535
3536 if (!arg_machine) {
3537 if (arg_directory && path_equal(arg_directory, "/"))
3538 arg_machine = gethostname_malloc();
3539 else
3540 arg_machine = strdup(basename(arg_image ?: arg_directory));
3541
3542 if (!arg_machine)
3543 return log_oom();
3544
3545 hostname_cleanup(arg_machine, false);
3546 if (!machine_name_is_valid(arg_machine)) {
3547 log_error("Failed to determine machine name automatically, please use -M.");
3548 return -EINVAL;
3549 }
3550
3551 if (arg_ephemeral) {
3552 char *b;
3553
3554 /* Add a random suffix when this is an
3555 * ephemeral machine, so that we can run many
3556 * instances at once without manually having
3557 * to specify -M each time. */
3558
3559 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3560 return log_oom();
3561
3562 free(arg_machine);
3563 arg_machine = b;
3564 }
3565 }
3566
3567 return 0;
3568 }
3569
3570 int main(int argc, char *argv[]) {
3571
3572 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3573 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3574 _cleanup_close_ int master = -1, image_fd = -1;
3575 _cleanup_fdset_free_ FDSet *fds = NULL;
3576 int r, n_fd_passed, loop_nr = -1;
3577 char veth_name[IFNAMSIZ];
3578 bool secondary = false, remove_subvol = false;
3579 sigset_t mask, mask_chld;
3580 pid_t pid = 0;
3581 int ret = EXIT_SUCCESS;
3582 union in_addr_union exposed = {};
3583 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3584
3585 log_parse_environment();
3586 log_open();
3587
3588 r = parse_argv(argc, argv);
3589 if (r <= 0)
3590 goto finish;
3591
3592 r = determine_names();
3593 if (r < 0)
3594 goto finish;
3595
3596 if (geteuid() != 0) {
3597 log_error("Need to be root.");
3598 r = -EPERM;
3599 goto finish;
3600 }
3601
3602 if (sd_booted() <= 0) {
3603 log_error("Not running on a systemd system.");
3604 r = -EINVAL;
3605 goto finish;
3606 }
3607
3608 log_close();
3609 n_fd_passed = sd_listen_fds(false);
3610 if (n_fd_passed > 0) {
3611 r = fdset_new_listen_fds(&fds, false);
3612 if (r < 0) {
3613 log_error_errno(r, "Failed to collect file descriptors: %m");
3614 goto finish;
3615 }
3616 }
3617 fdset_close_others(fds);
3618 log_open();
3619
3620 if (arg_directory) {
3621 assert(!arg_image);
3622
3623 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3624 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3625 r = -EINVAL;
3626 goto finish;
3627 }
3628
3629 if (arg_ephemeral) {
3630 char *np;
3631
3632 /* If the specified path is a mount point we
3633 * generate the new snapshot immediately
3634 * inside it under a random name. However if
3635 * the specified is not a mount point we
3636 * create the new snapshot in the parent
3637 * directory, just next to it. */
3638 r = path_is_mount_point(arg_directory, false);
3639 if (r < 0) {
3640 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3641 goto finish;
3642 }
3643 if (r > 0)
3644 r = tempfn_random_child(arg_directory, &np);
3645 else
3646 r = tempfn_random(arg_directory, &np);
3647 if (r < 0) {
3648 log_error_errno(r, "Failed to generate name for snapshot: %m");
3649 goto finish;
3650 }
3651
3652 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3653 if (r < 0) {
3654 log_error_errno(r, "Failed to lock %s: %m", np);
3655 goto finish;
3656 }
3657
3658 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3659 if (r < 0) {
3660 free(np);
3661 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3662 goto finish;
3663 }
3664
3665 free(arg_directory);
3666 arg_directory = np;
3667
3668 remove_subvol = true;
3669
3670 } else {
3671 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3672 if (r == -EBUSY) {
3673 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3674 goto finish;
3675 }
3676 if (r < 0) {
3677 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3678 return r;
3679 }
3680
3681 if (arg_template) {
3682 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3683 if (r == -EEXIST) {
3684 if (!arg_quiet)
3685 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3686 } else if (r < 0) {
3687 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3688 goto finish;
3689 } else {
3690 if (!arg_quiet)
3691 log_info("Populated %s from template %s.", arg_directory, arg_template);
3692 }
3693 }
3694 }
3695
3696 if (arg_boot) {
3697 if (path_is_os_tree(arg_directory) <= 0) {
3698 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3699 r = -EINVAL;
3700 goto finish;
3701 }
3702 } else {
3703 const char *p;
3704
3705 p = strjoina(arg_directory,
3706 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3707 if (access(p, F_OK) < 0) {
3708 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3709 r = -EINVAL;
3710 goto finish;
3711 }
3712 }
3713
3714 } else {
3715 char template[] = "/tmp/nspawn-root-XXXXXX";
3716
3717 assert(arg_image);
3718 assert(!arg_template);
3719
3720 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3721 if (r == -EBUSY) {
3722 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3723 goto finish;
3724 }
3725 if (r < 0) {
3726 r = log_error_errno(r, "Failed to create image lock: %m");
3727 goto finish;
3728 }
3729
3730 if (!mkdtemp(template)) {
3731 log_error_errno(errno, "Failed to create temporary directory: %m");
3732 r = -errno;
3733 goto finish;
3734 }
3735
3736 arg_directory = strdup(template);
3737 if (!arg_directory) {
3738 r = log_oom();
3739 goto finish;
3740 }
3741
3742 image_fd = setup_image(&device_path, &loop_nr);
3743 if (image_fd < 0) {
3744 r = image_fd;
3745 goto finish;
3746 }
3747
3748 r = dissect_image(image_fd,
3749 &root_device, &root_device_rw,
3750 &home_device, &home_device_rw,
3751 &srv_device, &srv_device_rw,
3752 &secondary);
3753 if (r < 0)
3754 goto finish;
3755 }
3756
3757 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3758 if (master < 0) {
3759 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3760 goto finish;
3761 }
3762
3763 r = ptsname_malloc(master, &console);
3764 if (r < 0) {
3765 r = log_error_errno(r, "Failed to determine tty name: %m");
3766 goto finish;
3767 }
3768
3769 if (!arg_quiet)
3770 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3771 arg_machine, arg_image ?: arg_directory);
3772
3773 if (unlockpt(master) < 0) {
3774 r = log_error_errno(errno, "Failed to unlock tty: %m");
3775 goto finish;
3776 }
3777
3778 assert_se(sigemptyset(&mask) == 0);
3779 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3780 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3781
3782 assert_se(sigemptyset(&mask_chld) == 0);
3783 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3784
3785 for (;;) {
3786 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3787 ContainerStatus container_status;
3788 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3789 struct sigaction sa = {
3790 .sa_handler = nop_handler,
3791 .sa_flags = SA_NOCLDSTOP,
3792 };
3793
3794 r = barrier_create(&barrier);
3795 if (r < 0) {
3796 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3797 goto finish;
3798 }
3799
3800 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3801 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3802 goto finish;
3803 }
3804
3805 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3806 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3807 goto finish;
3808 }
3809
3810 /* Child can be killed before execv(), so handle SIGCHLD
3811 * in order to interrupt parent's blocking calls and
3812 * give it a chance to call wait() and terminate. */
3813 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3814 if (r < 0) {
3815 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3816 goto finish;
3817 }
3818
3819 r = sigaction(SIGCHLD, &sa, NULL);
3820 if (r < 0) {
3821 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3822 goto finish;
3823 }
3824
3825 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3826 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3827 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3828 if (pid < 0) {
3829 if (errno == EINVAL)
3830 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3831 else
3832 r = log_error_errno(errno, "clone() failed: %m");
3833
3834 goto finish;
3835 }
3836
3837 if (pid == 0) {
3838 /* child */
3839 _cleanup_free_ char *home = NULL;
3840 unsigned n_env = 2;
3841 const char *envp[] = {
3842 "PATH=" DEFAULT_PATH_SPLIT_USR,
3843 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3844 NULL, /* TERM */
3845 NULL, /* HOME */
3846 NULL, /* USER */
3847 NULL, /* LOGNAME */
3848 NULL, /* container_uuid */
3849 NULL, /* LISTEN_FDS */
3850 NULL, /* LISTEN_PID */
3851 NULL
3852 };
3853 char **env_use;
3854
3855 barrier_set_role(&barrier, BARRIER_CHILD);
3856
3857 envp[n_env] = strv_find_prefix(environ, "TERM=");
3858 if (envp[n_env])
3859 n_env ++;
3860
3861 master = safe_close(master);
3862
3863 close_nointr(STDIN_FILENO);
3864 close_nointr(STDOUT_FILENO);
3865 close_nointr(STDERR_FILENO);
3866
3867 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3868 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3869
3870 reset_all_signal_handlers();
3871 reset_signal_mask();
3872
3873 r = open_terminal(console, O_RDWR);
3874 if (r != STDIN_FILENO) {
3875 if (r >= 0) {
3876 safe_close(r);
3877 r = -EINVAL;
3878 }
3879
3880 log_error_errno(r, "Failed to open console: %m");
3881 _exit(EXIT_FAILURE);
3882 }
3883
3884 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3885 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3886 log_error_errno(errno, "Failed to duplicate console: %m");
3887 _exit(EXIT_FAILURE);
3888 }
3889
3890 if (setsid() < 0) {
3891 log_error_errno(errno, "setsid() failed: %m");
3892 _exit(EXIT_FAILURE);
3893 }
3894
3895 if (reset_audit_loginuid() < 0)
3896 _exit(EXIT_FAILURE);
3897
3898 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3899 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3900 _exit(EXIT_FAILURE);
3901 }
3902
3903 /* Mark everything as slave, so that we still
3904 * receive mounts from the real root, but don't
3905 * propagate mounts to the real root. */
3906 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3907 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3908 _exit(EXIT_FAILURE);
3909 }
3910
3911 if (mount_devices(arg_directory,
3912 root_device, root_device_rw,
3913 home_device, home_device_rw,
3914 srv_device, srv_device_rw) < 0)
3915 _exit(EXIT_FAILURE);
3916
3917 /* Turn directory into bind mount */
3918 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3919 log_error_errno(errno, "Failed to make bind mount: %m");
3920 _exit(EXIT_FAILURE);
3921 }
3922
3923 r = setup_volatile(arg_directory);
3924 if (r < 0)
3925 _exit(EXIT_FAILURE);
3926
3927 if (setup_volatile_state(arg_directory) < 0)
3928 _exit(EXIT_FAILURE);
3929
3930 r = base_filesystem_create(arg_directory);
3931 if (r < 0)
3932 _exit(EXIT_FAILURE);
3933
3934 if (arg_read_only) {
3935 r = bind_remount_recursive(arg_directory, true);
3936 if (r < 0) {
3937 log_error_errno(r, "Failed to make tree read-only: %m");
3938 _exit(EXIT_FAILURE);
3939 }
3940 }
3941
3942 if (mount_all(arg_directory) < 0)
3943 _exit(EXIT_FAILURE);
3944
3945 if (copy_devnodes(arg_directory) < 0)
3946 _exit(EXIT_FAILURE);
3947
3948 if (setup_ptmx(arg_directory) < 0)
3949 _exit(EXIT_FAILURE);
3950
3951 dev_setup(arg_directory);
3952
3953 if (setup_propagate(arg_directory) < 0)
3954 _exit(EXIT_FAILURE);
3955
3956 if (setup_seccomp() < 0)
3957 _exit(EXIT_FAILURE);
3958
3959 if (setup_dev_console(arg_directory, console) < 0)
3960 _exit(EXIT_FAILURE);
3961
3962 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3963 _exit(EXIT_FAILURE);
3964 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3965
3966 if (send_rtnl(rtnl_socket_pair[1]) < 0)
3967 _exit(EXIT_FAILURE);
3968 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3969
3970 /* Tell the parent that we are ready, and that
3971 * it can cgroupify us to that we lack access
3972 * to certain devices and resources. */
3973 (void) barrier_place(&barrier);
3974
3975 if (setup_boot_id(arg_directory) < 0)
3976 _exit(EXIT_FAILURE);
3977
3978 if (setup_timezone(arg_directory) < 0)
3979 _exit(EXIT_FAILURE);
3980
3981 if (setup_resolv_conf(arg_directory) < 0)
3982 _exit(EXIT_FAILURE);
3983
3984 if (setup_journal(arg_directory) < 0)
3985 _exit(EXIT_FAILURE);
3986
3987 if (mount_binds(arg_directory, arg_bind, false) < 0)
3988 _exit(EXIT_FAILURE);
3989
3990 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3991 _exit(EXIT_FAILURE);
3992
3993 if (mount_tmpfs(arg_directory) < 0)
3994 _exit(EXIT_FAILURE);
3995
3996 /* Wait until we are cgroup-ified, so that we
3997 * can mount the right cgroup path writable */
3998 (void) barrier_sync_next(&barrier);
3999
4000 if (mount_cgroup(arg_directory) < 0)
4001 _exit(EXIT_FAILURE);
4002
4003 if (chdir(arg_directory) < 0) {
4004 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4005 _exit(EXIT_FAILURE);
4006 }
4007
4008 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4009 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4010 _exit(EXIT_FAILURE);
4011 }
4012
4013 if (chroot(".") < 0) {
4014 log_error_errno(errno, "chroot() failed: %m");
4015 _exit(EXIT_FAILURE);
4016 }
4017
4018 if (chdir("/") < 0) {
4019 log_error_errno(errno, "chdir() failed: %m");
4020 _exit(EXIT_FAILURE);
4021 }
4022
4023 umask(0022);
4024
4025 if (arg_private_network)
4026 loopback_setup();
4027
4028 if (drop_capabilities() < 0) {
4029 log_error_errno(errno, "drop_capabilities() failed: %m");
4030 _exit(EXIT_FAILURE);
4031 }
4032
4033 r = change_uid_gid(&home);
4034 if (r < 0)
4035 _exit(EXIT_FAILURE);
4036
4037 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4038 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4039 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4040 log_oom();
4041 _exit(EXIT_FAILURE);
4042 }
4043
4044 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4045 char as_uuid[37];
4046
4047 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4048 log_oom();
4049 _exit(EXIT_FAILURE);
4050 }
4051 }
4052
4053 if (fdset_size(fds) > 0) {
4054 r = fdset_cloexec(fds, false);
4055 if (r < 0) {
4056 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4057 _exit(EXIT_FAILURE);
4058 }
4059
4060 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4061 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4062 log_oom();
4063 _exit(EXIT_FAILURE);
4064 }
4065 }
4066
4067 setup_hostname();
4068
4069 if (arg_personality != 0xffffffffLU) {
4070 if (personality(arg_personality) < 0) {
4071 log_error_errno(errno, "personality() failed: %m");
4072 _exit(EXIT_FAILURE);
4073 }
4074 } else if (secondary) {
4075 if (personality(PER_LINUX32) < 0) {
4076 log_error_errno(errno, "personality() failed: %m");
4077 _exit(EXIT_FAILURE);
4078 }
4079 }
4080
4081 #ifdef HAVE_SELINUX
4082 if (arg_selinux_context)
4083 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4084 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4085 _exit(EXIT_FAILURE);
4086 }
4087 #endif
4088
4089 if (!strv_isempty(arg_setenv)) {
4090 char **n;
4091
4092 n = strv_env_merge(2, envp, arg_setenv);
4093 if (!n) {
4094 log_oom();
4095 _exit(EXIT_FAILURE);
4096 }
4097
4098 env_use = n;
4099 } else
4100 env_use = (char**) envp;
4101
4102 /* Wait until the parent is ready with the setup, too... */
4103 if (!barrier_place_and_sync(&barrier))
4104 _exit(EXIT_FAILURE);
4105
4106 if (arg_boot) {
4107 char **a;
4108 size_t l;
4109
4110 /* Automatically search for the init system */
4111
4112 l = 1 + argc - optind;
4113 a = newa(char*, l + 1);
4114 memcpy(a + 1, argv + optind, l * sizeof(char*));
4115
4116 a[0] = (char*) "/usr/lib/systemd/systemd";
4117 execve(a[0], a, env_use);
4118
4119 a[0] = (char*) "/lib/systemd/systemd";
4120 execve(a[0], a, env_use);
4121
4122 a[0] = (char*) "/sbin/init";
4123 execve(a[0], a, env_use);
4124 } else if (argc > optind)
4125 execvpe(argv[optind], argv + optind, env_use);
4126 else {
4127 chdir(home ? home : "/root");
4128 execle("/bin/bash", "-bash", NULL, env_use);
4129 execle("/bin/sh", "-sh", NULL, env_use);
4130 }
4131
4132 log_error_errno(errno, "execv() failed: %m");
4133 _exit(EXIT_FAILURE);
4134 }
4135
4136 barrier_set_role(&barrier, BARRIER_PARENT);
4137 fdset_free(fds);
4138 fds = NULL;
4139
4140 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4141 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4142
4143 /* Wait for the most basic Child-setup to be done,
4144 * before we add hardware to it, and place it in a
4145 * cgroup. */
4146 if (barrier_sync_next(&barrier)) {
4147 int ifi = 0;
4148
4149 r = move_network_interfaces(pid);
4150 if (r < 0)
4151 goto finish;
4152
4153 r = setup_veth(pid, veth_name, &ifi);
4154 if (r < 0)
4155 goto finish;
4156
4157 r = setup_bridge(veth_name, &ifi);
4158 if (r < 0)
4159 goto finish;
4160
4161 r = setup_macvlan(pid);
4162 if (r < 0)
4163 goto finish;
4164
4165 r = setup_ipvlan(pid);
4166 if (r < 0)
4167 goto finish;
4168
4169 r = register_machine(pid, ifi);
4170 if (r < 0)
4171 goto finish;
4172
4173 /* Block SIGCHLD here, before notifying child.
4174 * process_pty() will handle it with the other signals. */
4175 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4176 if (r < 0)
4177 goto finish;
4178
4179 /* Reset signal to default */
4180 r = default_signals(SIGCHLD, -1);
4181 if (r < 0)
4182 goto finish;
4183
4184 /* Notify the child that the parent is ready with all
4185 * its setup, and that the child can now hand over
4186 * control to the code to run inside the container. */
4187 (void) barrier_place(&barrier);
4188
4189 /* And wait that the child is completely ready now. */
4190 if (barrier_place_and_sync(&barrier)) {
4191 _cleanup_event_unref_ sd_event *event = NULL;
4192 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4193 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4194 char last_char = 0;
4195
4196 sd_notifyf(false,
4197 "READY=1\n"
4198 "STATUS=Container running.\n"
4199 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4200
4201 r = sd_event_new(&event);
4202 if (r < 0) {
4203 log_error_errno(r, "Failed to get default event source: %m");
4204 goto finish;
4205 }
4206
4207 if (arg_boot) {
4208 /* Try to kill the init system on SIGINT or SIGTERM */
4209 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4210 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4211 } else {
4212 /* Immediately exit */
4213 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4214 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4215 }
4216
4217 /* simply exit on sigchld */
4218 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4219
4220 if (arg_expose_ports) {
4221 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4222 if (r < 0)
4223 goto finish;
4224
4225 (void) expose_ports(rtnl, &exposed);
4226 }
4227
4228 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4229
4230 r = pty_forward_new(event, master, true, &forward);
4231 if (r < 0) {
4232 log_error_errno(r, "Failed to create PTY forwarder: %m");
4233 goto finish;
4234 }
4235
4236 r = sd_event_loop(event);
4237 if (r < 0) {
4238 log_error_errno(r, "Failed to run event loop: %m");
4239 goto finish;
4240 }
4241
4242 pty_forward_get_last_char(forward, &last_char);
4243
4244 forward = pty_forward_free(forward);
4245
4246 if (!arg_quiet && last_char != '\n')
4247 putc('\n', stdout);
4248
4249 /* Kill if it is not dead yet anyway */
4250 terminate_machine(pid);
4251 }
4252 }
4253
4254 /* Normally redundant, but better safe than sorry */
4255 kill(pid, SIGKILL);
4256
4257 r = wait_for_container(pid, &container_status);
4258 pid = 0;
4259
4260 if (r < 0)
4261 /* We failed to wait for the container, or the
4262 * container exited abnormally */
4263 goto finish;
4264 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4265 /* The container exited with a non-zero
4266 * status, or with zero status and no reboot
4267 * was requested. */
4268 ret = r;
4269 break;
4270 }
4271
4272 /* CONTAINER_REBOOTED, loop again */
4273
4274 if (arg_keep_unit) {
4275 /* Special handling if we are running as a
4276 * service: instead of simply restarting the
4277 * machine we want to restart the entire
4278 * service, so let's inform systemd about this
4279 * with the special exit code 133. The service
4280 * file uses RestartForceExitStatus=133 so
4281 * that this results in a full nspawn
4282 * restart. This is necessary since we might
4283 * have cgroup parameters set we want to have
4284 * flushed out. */
4285 ret = 133;
4286 r = 0;
4287 break;
4288 }
4289
4290 flush_ports(&exposed);
4291 }
4292
4293 finish:
4294 sd_notify(false,
4295 "STOPPING=1\n"
4296 "STATUS=Terminating...");
4297
4298 loop_remove(loop_nr, &image_fd);
4299
4300 if (pid > 0)
4301 kill(pid, SIGKILL);
4302
4303 if (remove_subvol && arg_directory) {
4304 int k;
4305
4306 k = btrfs_subvol_remove(arg_directory);
4307 if (k < 0)
4308 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4309 }
4310
4311 if (arg_machine) {
4312 const char *p;
4313
4314 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4315 (void) rm_rf(p, false, true, false);
4316 }
4317
4318 free(arg_directory);
4319 free(arg_template);
4320 free(arg_image);
4321 free(arg_machine);
4322 free(arg_user);
4323 strv_free(arg_setenv);
4324 strv_free(arg_network_interfaces);
4325 strv_free(arg_network_macvlan);
4326 strv_free(arg_network_ipvlan);
4327 strv_free(arg_bind);
4328 strv_free(arg_bind_ro);
4329 strv_free(arg_tmpfs);
4330
4331 flush_ports(&exposed);
4332
4333 while (arg_expose_ports) {
4334 ExposePort *p = arg_expose_ports;
4335 LIST_REMOVE(ports, arg_expose_ports, p);
4336 free(p);
4337 }
4338
4339 return r < 0 ? EXIT_FAILURE : ret;
4340 }