]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: mount /tmp in the container, don't leave this to the container's init
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108 int protocol;
109 uint16_t host_port;
110 uint16_t container_port;
111 LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Outcome of a container run; presumably tells the caller whether to
 * restart the container or to exit (the users are outside this chunk). */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
118
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants map to the same values and are tracked separately in
 * arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
125
/* Mode selected with --volatile[=MODE]: off, fully volatile, or only the
 * state directory volatile ("state"). */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147 (1ULL << CAP_CHOWN) |
148 (1ULL << CAP_DAC_OVERRIDE) |
149 (1ULL << CAP_DAC_READ_SEARCH) |
150 (1ULL << CAP_FOWNER) |
151 (1ULL << CAP_FSETID) |
152 (1ULL << CAP_IPC_OWNER) |
153 (1ULL << CAP_KILL) |
154 (1ULL << CAP_LEASE) |
155 (1ULL << CAP_LINUX_IMMUTABLE) |
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
159 (1ULL << CAP_SETGID) |
160 (1ULL << CAP_SETFCAP) |
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
164 (1ULL << CAP_SYS_CHROOT) |
165 (1ULL << CAP_SYS_NICE) |
166 (1ULL << CAP_SYS_PTRACE) |
167 (1ULL << CAP_SYS_TTY_CONFIG) |
168 (1ULL << CAP_SYS_RESOURCE) |
169 (1ULL << CAP_SYS_BOOT) |
170 (1ULL << CAP_AUDIT_WRITE) |
171 (1ULL << CAP_AUDIT_CONTROL) |
172 (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190
191 static void help(void) {
192 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
193 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
194 " -h --help Show this help\n"
195 " --version Print version string\n"
196 " -q --quiet Do not show status information\n"
197 " -D --directory=PATH Root directory for the container\n"
198 " --template=PATH Initialize root directory from template directory,\n"
199 " if missing\n"
200 " -x --ephemeral Run container with snapshot of root directory, and\n"
201 " remove it after exit\n"
202 " -i --image=PATH File system device or disk image for the container\n"
203 " -b --boot Boot up full system (i.e. invoke init)\n"
204 " -u --user=USER Run the command under specified user or uid\n"
205 " -M --machine=NAME Set the machine name for the container\n"
206 " --uuid=UUID Set a specific machine UUID for the container\n"
207 " -S --slice=SLICE Place the container in the specified slice\n"
208 " --private-network Disable network in container\n"
209 " --network-interface=INTERFACE\n"
210 " Assign an existing network interface to the\n"
211 " container\n"
212 " --network-macvlan=INTERFACE\n"
213 " Create a macvlan network interface based on an\n"
214 " existing network interface to the container\n"
215 " --network-ipvlan=INTERFACE\n"
216 " Create a ipvlan network interface based on an\n"
217 " existing network interface to the container\n"
218 " -n --network-veth Add a virtual ethernet connection between host\n"
219 " and container\n"
220 " --network-bridge=INTERFACE\n"
221 " Add a virtual ethernet connection between host\n"
222 " and container and add it to an existing bridge on\n"
223 " the host\n"
224 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225 " Expose a container IP port on the host\n"
226 " -Z --selinux-context=SECLABEL\n"
227 " Set the SELinux security context to be used by\n"
228 " processes in the container\n"
229 " -L --selinux-apifs-context=SECLABEL\n"
230 " Set the SELinux security context to be used by\n"
231 " API/tmpfs file systems in the container\n"
232 " --capability=CAP In addition to the default, retain specified\n"
233 " capability\n"
234 " --drop-capability=CAP Drop the specified capability from the default set\n"
235 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
236 " try-guest, try-host\n"
237 " -j Equivalent to --link-journal=try-guest\n"
238 " --read-only Mount the root directory read-only\n"
239 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
240 " the container\n"
241 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
242 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
243 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
244 " --share-system Share system namespaces with host\n"
245 " --register=BOOLEAN Register container as machine\n"
246 " --keep-unit Do not register a scope for the machine, reuse\n"
247 " the service unit nspawn is running in\n"
248 " --volatile[=MODE] Run the system in volatile mode\n"
249 , program_invocation_short_name);
250 }
251
/* Replaces *b with a cleaned-up copy of 'path'.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT) it
 * is merely made absolute relative to the current working directory. The
 * previous value of *b is freed. Returns 0 on success, a negative errno
 * value otherwise (in which case *b is left untouched). */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Path does not exist (yet), settle for an absolute one */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
273 static int parse_argv(int argc, char *argv[]) {
274
275 enum {
276 ARG_VERSION = 0x100,
277 ARG_PRIVATE_NETWORK,
278 ARG_UUID,
279 ARG_READ_ONLY,
280 ARG_CAPABILITY,
281 ARG_DROP_CAPABILITY,
282 ARG_LINK_JOURNAL,
283 ARG_BIND,
284 ARG_BIND_RO,
285 ARG_TMPFS,
286 ARG_SETENV,
287 ARG_SHARE_SYSTEM,
288 ARG_REGISTER,
289 ARG_KEEP_UNIT,
290 ARG_NETWORK_INTERFACE,
291 ARG_NETWORK_MACVLAN,
292 ARG_NETWORK_IPVLAN,
293 ARG_NETWORK_BRIDGE,
294 ARG_PERSONALITY,
295 ARG_VOLATILE,
296 ARG_TEMPLATE,
297 };
298
299 static const struct option options[] = {
300 { "help", no_argument, NULL, 'h' },
301 { "version", no_argument, NULL, ARG_VERSION },
302 { "directory", required_argument, NULL, 'D' },
303 { "template", required_argument, NULL, ARG_TEMPLATE },
304 { "ephemeral", no_argument, NULL, 'x' },
305 { "user", required_argument, NULL, 'u' },
306 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
307 { "boot", no_argument, NULL, 'b' },
308 { "uuid", required_argument, NULL, ARG_UUID },
309 { "read-only", no_argument, NULL, ARG_READ_ONLY },
310 { "capability", required_argument, NULL, ARG_CAPABILITY },
311 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
312 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
313 { "bind", required_argument, NULL, ARG_BIND },
314 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
315 { "tmpfs", required_argument, NULL, ARG_TMPFS },
316 { "machine", required_argument, NULL, 'M' },
317 { "slice", required_argument, NULL, 'S' },
318 { "setenv", required_argument, NULL, ARG_SETENV },
319 { "selinux-context", required_argument, NULL, 'Z' },
320 { "selinux-apifs-context", required_argument, NULL, 'L' },
321 { "quiet", no_argument, NULL, 'q' },
322 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
323 { "register", required_argument, NULL, ARG_REGISTER },
324 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
325 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
326 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
327 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
328 { "network-veth", no_argument, NULL, 'n' },
329 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
330 { "personality", required_argument, NULL, ARG_PERSONALITY },
331 { "image", required_argument, NULL, 'i' },
332 { "volatile", optional_argument, NULL, ARG_VOLATILE },
333 { "port", required_argument, NULL, 'p' },
334 {}
335 };
336
337 int c, r;
338 uint64_t plus = 0, minus = 0;
339
340 assert(argc >= 0);
341 assert(argv);
342
343 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
344
345 switch (c) {
346
347 case 'h':
348 help();
349 return 0;
350
351 case ARG_VERSION:
352 puts(PACKAGE_STRING);
353 puts(SYSTEMD_FEATURES);
354 return 0;
355
356 case 'D':
357 r = set_sanitized_path(&arg_directory, optarg);
358 if (r < 0)
359 return log_error_errno(r, "Invalid root directory: %m");
360
361 break;
362
363 case ARG_TEMPLATE:
364 r = set_sanitized_path(&arg_template, optarg);
365 if (r < 0)
366 return log_error_errno(r, "Invalid template directory: %m");
367
368 break;
369
370 case 'i':
371 r = set_sanitized_path(&arg_image, optarg);
372 if (r < 0)
373 return log_error_errno(r, "Invalid image path: %m");
374
375 break;
376
377 case 'x':
378 arg_ephemeral = true;
379 break;
380
381 case 'u':
382 free(arg_user);
383 arg_user = strdup(optarg);
384 if (!arg_user)
385 return log_oom();
386
387 break;
388
389 case ARG_NETWORK_BRIDGE:
390 arg_network_bridge = optarg;
391
392 /* fall through */
393
394 case 'n':
395 arg_network_veth = true;
396 arg_private_network = true;
397 break;
398
399 case ARG_NETWORK_INTERFACE:
400 if (strv_extend(&arg_network_interfaces, optarg) < 0)
401 return log_oom();
402
403 arg_private_network = true;
404 break;
405
406 case ARG_NETWORK_MACVLAN:
407 if (strv_extend(&arg_network_macvlan, optarg) < 0)
408 return log_oom();
409
410 arg_private_network = true;
411 break;
412
413 case ARG_NETWORK_IPVLAN:
414 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
415 return log_oom();
416
417 /* fall through */
418
419 case ARG_PRIVATE_NETWORK:
420 arg_private_network = true;
421 break;
422
423 case 'b':
424 arg_boot = true;
425 break;
426
427 case ARG_UUID:
428 r = sd_id128_from_string(optarg, &arg_uuid);
429 if (r < 0) {
430 log_error("Invalid UUID: %s", optarg);
431 return r;
432 }
433 break;
434
435 case 'S':
436 arg_slice = optarg;
437 break;
438
439 case 'M':
440 if (isempty(optarg)) {
441 free(arg_machine);
442 arg_machine = NULL;
443 } else {
444 if (!machine_name_is_valid(optarg)) {
445 log_error("Invalid machine name: %s", optarg);
446 return -EINVAL;
447 }
448
449 r = free_and_strdup(&arg_machine, optarg);
450 if (r < 0)
451 return log_oom();
452
453 break;
454 }
455
456 case 'Z':
457 arg_selinux_context = optarg;
458 break;
459
460 case 'L':
461 arg_selinux_apifs_context = optarg;
462 break;
463
464 case ARG_READ_ONLY:
465 arg_read_only = true;
466 break;
467
468 case ARG_CAPABILITY:
469 case ARG_DROP_CAPABILITY: {
470 const char *state, *word;
471 size_t length;
472
473 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
474 _cleanup_free_ char *t;
475
476 t = strndup(word, length);
477 if (!t)
478 return log_oom();
479
480 if (streq(t, "all")) {
481 if (c == ARG_CAPABILITY)
482 plus = (uint64_t) -1;
483 else
484 minus = (uint64_t) -1;
485 } else {
486 int cap;
487
488 cap = capability_from_name(t);
489 if (cap < 0) {
490 log_error("Failed to parse capability %s.", t);
491 return -EINVAL;
492 }
493
494 if (c == ARG_CAPABILITY)
495 plus |= 1ULL << (uint64_t) cap;
496 else
497 minus |= 1ULL << (uint64_t) cap;
498 }
499 }
500
501 break;
502 }
503
504 case 'j':
505 arg_link_journal = LINK_GUEST;
506 arg_link_journal_try = true;
507 break;
508
509 case ARG_LINK_JOURNAL:
510 if (streq(optarg, "auto")) {
511 arg_link_journal = LINK_AUTO;
512 arg_link_journal_try = false;
513 } else if (streq(optarg, "no")) {
514 arg_link_journal = LINK_NO;
515 arg_link_journal_try = false;
516 } else if (streq(optarg, "guest")) {
517 arg_link_journal = LINK_GUEST;
518 arg_link_journal_try = false;
519 } else if (streq(optarg, "host")) {
520 arg_link_journal = LINK_HOST;
521 arg_link_journal_try = false;
522 } else if (streq(optarg, "try-guest")) {
523 arg_link_journal = LINK_GUEST;
524 arg_link_journal_try = true;
525 } else if (streq(optarg, "try-host")) {
526 arg_link_journal = LINK_HOST;
527 arg_link_journal_try = true;
528 } else {
529 log_error("Failed to parse link journal mode %s", optarg);
530 return -EINVAL;
531 }
532
533 break;
534
535 case ARG_BIND:
536 case ARG_BIND_RO: {
537 _cleanup_free_ char *a = NULL, *b = NULL;
538 char *e;
539 char ***x;
540
541 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
542
543 e = strchr(optarg, ':');
544 if (e) {
545 a = strndup(optarg, e - optarg);
546 b = strdup(e + 1);
547 } else {
548 a = strdup(optarg);
549 b = strdup(optarg);
550 }
551
552 if (!a || !b)
553 return log_oom();
554
555 if (!path_is_absolute(a) || !path_is_absolute(b)) {
556 log_error("Invalid bind mount specification: %s", optarg);
557 return -EINVAL;
558 }
559
560 r = strv_extend(x, a);
561 if (r < 0)
562 return log_oom();
563
564 r = strv_extend(x, b);
565 if (r < 0)
566 return log_oom();
567
568 break;
569 }
570
571 case ARG_TMPFS: {
572 _cleanup_free_ char *a = NULL, *b = NULL;
573 char *e;
574
575 e = strchr(optarg, ':');
576 if (e) {
577 a = strndup(optarg, e - optarg);
578 b = strdup(e + 1);
579 } else {
580 a = strdup(optarg);
581 b = strdup("mode=0755");
582 }
583
584 if (!a || !b)
585 return log_oom();
586
587 if (!path_is_absolute(a)) {
588 log_error("Invalid tmpfs specification: %s", optarg);
589 return -EINVAL;
590 }
591
592 r = strv_push(&arg_tmpfs, a);
593 if (r < 0)
594 return log_oom();
595
596 a = NULL;
597
598 r = strv_push(&arg_tmpfs, b);
599 if (r < 0)
600 return log_oom();
601
602 b = NULL;
603
604 break;
605 }
606
607 case ARG_SETENV: {
608 char **n;
609
610 if (!env_assignment_is_valid(optarg)) {
611 log_error("Environment variable assignment '%s' is not valid.", optarg);
612 return -EINVAL;
613 }
614
615 n = strv_env_set(arg_setenv, optarg);
616 if (!n)
617 return log_oom();
618
619 strv_free(arg_setenv);
620 arg_setenv = n;
621 break;
622 }
623
624 case 'q':
625 arg_quiet = true;
626 break;
627
628 case ARG_SHARE_SYSTEM:
629 arg_share_system = true;
630 break;
631
632 case ARG_REGISTER:
633 r = parse_boolean(optarg);
634 if (r < 0) {
635 log_error("Failed to parse --register= argument: %s", optarg);
636 return r;
637 }
638
639 arg_register = r;
640 break;
641
642 case ARG_KEEP_UNIT:
643 arg_keep_unit = true;
644 break;
645
646 case ARG_PERSONALITY:
647
648 arg_personality = personality_from_string(optarg);
649 if (arg_personality == 0xffffffffLU) {
650 log_error("Unknown or unsupported personality '%s'.", optarg);
651 return -EINVAL;
652 }
653
654 break;
655
656 case ARG_VOLATILE:
657
658 if (!optarg)
659 arg_volatile = VOLATILE_YES;
660 else {
661 r = parse_boolean(optarg);
662 if (r < 0) {
663 if (streq(optarg, "state"))
664 arg_volatile = VOLATILE_STATE;
665 else {
666 log_error("Failed to parse --volatile= argument: %s", optarg);
667 return r;
668 }
669 } else
670 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
671 }
672
673 break;
674
675 case 'p': {
676 const char *split, *e;
677 uint16_t container_port, host_port;
678 int protocol;
679 ExposePort *p;
680
681 if ((e = startswith(optarg, "tcp:")))
682 protocol = IPPROTO_TCP;
683 else if ((e = startswith(optarg, "udp:")))
684 protocol = IPPROTO_UDP;
685 else {
686 e = optarg;
687 protocol = IPPROTO_TCP;
688 }
689
690 split = strchr(e, ':');
691 if (split) {
692 char v[split - e + 1];
693
694 memcpy(v, e, split - e);
695 v[split - e] = 0;
696
697 r = safe_atou16(v, &host_port);
698 if (r < 0 || host_port <= 0) {
699 log_error("Failed to parse host port: %s", optarg);
700 return -EINVAL;
701 }
702
703 r = safe_atou16(split + 1, &container_port);
704 } else {
705 r = safe_atou16(e, &container_port);
706 host_port = container_port;
707 }
708
709 if (r < 0 || container_port <= 0) {
710 log_error("Failed to parse host port: %s", optarg);
711 return -EINVAL;
712 }
713
714 LIST_FOREACH(ports, p, arg_expose_ports) {
715 if (p->protocol == protocol && p->host_port == host_port) {
716 log_error("Duplicate port specification: %s", optarg);
717 return -EINVAL;
718 }
719 }
720
721 p = new(ExposePort, 1);
722 if (!p)
723 return log_oom();
724
725 p->protocol = protocol;
726 p->host_port = host_port;
727 p->container_port = container_port;
728
729 LIST_PREPEND(ports, arg_expose_ports, p);
730
731 break;
732 }
733
734 case '?':
735 return -EINVAL;
736
737 default:
738 assert_not_reached("Unhandled option");
739 }
740
741 if (arg_share_system)
742 arg_register = false;
743
744 if (arg_boot && arg_share_system) {
745 log_error("--boot and --share-system may not be combined.");
746 return -EINVAL;
747 }
748
749 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750 log_error("--keep-unit may not be used when invoked from a user session.");
751 return -EINVAL;
752 }
753
754 if (arg_directory && arg_image) {
755 log_error("--directory= and --image= may not be combined.");
756 return -EINVAL;
757 }
758
759 if (arg_template && arg_image) {
760 log_error("--template= and --image= may not be combined.");
761 return -EINVAL;
762 }
763
764 if (arg_template && !(arg_directory || arg_machine)) {
765 log_error("--template= needs --directory= or --machine=.");
766 return -EINVAL;
767 }
768
769 if (arg_ephemeral && arg_template) {
770 log_error("--ephemeral and --template= may not be combined.");
771 return -EINVAL;
772 }
773
774 if (arg_ephemeral && arg_image) {
775 log_error("--ephemeral and --image= may not be combined.");
776 return -EINVAL;
777 }
778
779 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780 log_error("--ephemeral and --link-journal= may not be combined.");
781 return -EINVAL;
782 }
783
784 if (arg_volatile != VOLATILE_NO && arg_read_only) {
785 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
786 return -EINVAL;
787 }
788
789 if (arg_expose_ports && !arg_private_network) {
790 log_error("Cannot use --port= without private networking.");
791 return -EINVAL;
792 }
793
794 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
795
796 return 1;
797 }
798
799 static int mount_all(const char *dest) {
800
801 typedef struct MountPoint {
802 const char *what;
803 const char *where;
804 const char *type;
805 const char *options;
806 unsigned long flags;
807 bool fatal;
808 } MountPoint;
809
810 static const MountPoint mount_table[] = {
811 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
812 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
813 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
814 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
815 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
816 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
817 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
818 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
819 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
820 #ifdef HAVE_SELINUX
821 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
822 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
823 #endif
824 };
825
826 unsigned k;
827 int r = 0;
828
829 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
830 _cleanup_free_ char *where = NULL;
831 #ifdef HAVE_SELINUX
832 _cleanup_free_ char *options = NULL;
833 #endif
834 const char *o;
835 int t;
836
837 where = strjoin(dest, "/", mount_table[k].where, NULL);
838 if (!where)
839 return log_oom();
840
841 t = path_is_mount_point(where, true);
842 if (t < 0) {
843 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
844
845 if (r == 0)
846 r = t;
847
848 continue;
849 }
850
851 /* Skip this entry if it is not a remount. */
852 if (mount_table[k].what && t > 0)
853 continue;
854
855 t = mkdir_p(where, 0755);
856 if (t < 0) {
857 if (mount_table[k].fatal) {
858 log_error_errno(t, "Failed to create directory %s: %m", where);
859
860 if (r == 0)
861 r = t;
862 } else
863 log_warning_errno(t, "Failed to create directory %s: %m", where);
864
865 continue;
866 }
867
868 #ifdef HAVE_SELINUX
869 if (arg_selinux_apifs_context &&
870 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
871 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
872 if (!options)
873 return log_oom();
874
875 o = options;
876 } else
877 #endif
878 o = mount_table[k].options;
879
880
881 if (mount(mount_table[k].what,
882 where,
883 mount_table[k].type,
884 mount_table[k].flags,
885 o) < 0) {
886
887 if (mount_table[k].fatal) {
888 log_error_errno(errno, "mount(%s) failed: %m", where);
889
890 if (r == 0)
891 r = -errno;
892 } else
893 log_warning_errno(errno, "mount(%s) failed: %m", where);
894 }
895 }
896
897 return r;
898 }
899
900 static int mount_binds(const char *dest, char **l, bool ro) {
901 char **x, **y;
902
903 STRV_FOREACH_PAIR(x, y, l) {
904 _cleanup_free_ char *where = NULL;
905 struct stat source_st, dest_st;
906 int r;
907
908 if (stat(*x, &source_st) < 0)
909 return log_error_errno(errno, "Failed to stat %s: %m", *x);
910
911 where = strappend(dest, *y);
912 if (!where)
913 return log_oom();
914
915 r = stat(where, &dest_st);
916 if (r == 0) {
917 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
918 log_error("Cannot bind mount directory %s on file %s.", *x, where);
919 return -EINVAL;
920 }
921 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
922 log_error("Cannot bind mount file %s on directory %s.", *x, where);
923 return -EINVAL;
924 }
925 } else if (errno == ENOENT) {
926 r = mkdir_parents_label(where, 0755);
927 if (r < 0)
928 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
929 } else {
930 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
931 return -errno;
932 }
933
934 /* Create the mount point. Any non-directory file can be
935 * mounted on any non-directory file (regular, fifo, socket,
936 * char, block).
937 */
938 if (S_ISDIR(source_st.st_mode)) {
939 r = mkdir_label(where, 0755);
940 if (r < 0 && errno != EEXIST)
941 return log_error_errno(r, "Failed to create mount point %s: %m", where);
942 } else {
943 r = touch(where);
944 if (r < 0)
945 return log_error_errno(r, "Failed to create mount point %s: %m", where);
946 }
947
948 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
949 return log_error_errno(errno, "mount(%s) failed: %m", where);
950
951 if (ro) {
952 r = bind_remount_recursive(where, true);
953 if (r < 0)
954 return log_error_errno(r, "Read-Only bind mount failed: %m");
955 }
956 }
957
958 return 0;
959 }
960
/* Mounts a single cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>.
 *
 * 'controller' is passed as the mount options string. Returns 0 if the
 * directory already is a mount point, 1 if the hierarchy was mounted and
 * a negative errno-style value on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *mount_point;
        int mounted;

        mount_point = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        mounted = path_is_mount_point(mount_point, false);
        if (mounted < 0)
                return log_error_errno(mounted, "Failed to determine if %s is mounted already: %m", mount_point);
        if (mounted > 0)
                return 0;

        /* Result ignored on purpose: mount() below reports the problem */
        mkdir_p(mount_point, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", mount_point, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", mount_point);

        return 1;
}
980
981 static int mount_cgroup(const char *dest) {
982 _cleanup_set_free_free_ Set *controllers = NULL;
983 _cleanup_free_ char *own_cgroup_path = NULL;
984 const char *cgroup_root, *systemd_root, *systemd_own;
985 int r;
986
987 controllers = set_new(&string_hash_ops);
988 if (!controllers)
989 return log_oom();
990
991 r = cg_kernel_controllers(controllers);
992 if (r < 0)
993 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
994
995 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
996 if (r < 0)
997 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
998
999 cgroup_root = strappenda(dest, "/sys/fs/cgroup");
1000 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1001 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1002
1003 for (;;) {
1004 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1005
1006 controller = set_steal_first(controllers);
1007 if (!controller)
1008 break;
1009
1010 origin = strappend("/sys/fs/cgroup/", controller);
1011 if (!origin)
1012 return log_oom();
1013
1014 r = readlink_malloc(origin, &combined);
1015 if (r == -EINVAL) {
1016 /* Not a symbolic link, but directly a single cgroup hierarchy */
1017
1018 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1019 if (r < 0)
1020 return r;
1021
1022 } else if (r < 0)
1023 return log_error_errno(r, "Failed to read link %s: %m", origin);
1024 else {
1025 _cleanup_free_ char *target = NULL;
1026
1027 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1028 if (!target)
1029 return log_oom();
1030
1031 /* A symbolic link, a combination of controllers in one hierarchy */
1032
1033 if (!filename_is_valid(combined)) {
1034 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1035 continue;
1036 }
1037
1038 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1039 if (r < 0)
1040 return r;
1041
1042 if (symlink(combined, target) < 0)
1043 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1044 }
1045 }
1046
1047 r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1048 if (r < 0)
1049 return r;
1050
1051 /* Make our own cgroup a (writable) bind mount */
1052 systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1053 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1054 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1055
1056 /* And then remount the systemd cgroup root read-only */
1057 systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1058 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1059 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1060
1061 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1062 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1063
1064 return 0;
1065 }
1066
1067 static int mount_tmpfs(const char *dest) {
1068 char **i, **o;
1069
1070 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1071 _cleanup_free_ char *where = NULL;
1072 int r;
1073
1074 where = strappend(dest, *i);
1075 if (!where)
1076 return log_oom();
1077
1078 r = mkdir_label(where, 0755);
1079 if (r < 0 && r != -EEXIST)
1080 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1081
1082 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1083 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1084 }
1085
1086 return 0;
1087 }
1088
1089 static int setup_timezone(const char *dest) {
1090 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1091 char *z, *y;
1092 int r;
1093
1094 assert(dest);
1095
1096 /* Fix the timezone, if possible */
1097 r = readlink_malloc("/etc/localtime", &p);
1098 if (r < 0) {
1099 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1100 return 0;
1101 }
1102
1103 z = path_startswith(p, "../usr/share/zoneinfo/");
1104 if (!z)
1105 z = path_startswith(p, "/usr/share/zoneinfo/");
1106 if (!z) {
1107 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1108 return 0;
1109 }
1110
1111 where = strappend(dest, "/etc/localtime");
1112 if (!where)
1113 return log_oom();
1114
1115 r = readlink_malloc(where, &q);
1116 if (r >= 0) {
1117 y = path_startswith(q, "../usr/share/zoneinfo/");
1118 if (!y)
1119 y = path_startswith(q, "/usr/share/zoneinfo/");
1120
1121 /* Already pointing to the right place? Then do nothing .. */
1122 if (y && streq(y, z))
1123 return 0;
1124 }
1125
1126 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1127 if (!check)
1128 return log_oom();
1129
1130 if (access(check, F_OK) < 0) {
1131 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1132 return 0;
1133 }
1134
1135 what = strappend("../usr/share/zoneinfo/", z);
1136 if (!what)
1137 return log_oom();
1138
1139 r = mkdir_parents(where, 0755);
1140 if (r < 0) {
1141 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1142
1143 return 0;
1144 }
1145
1146 r = unlink(where);
1147 if (r < 0 && errno != ENOENT) {
1148 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1149
1150 return 0;
1151 }
1152
1153 if (symlink(what, where) < 0) {
1154 log_error_errno(errno, "Failed to correct timezone of container: %m");
1155 return 0;
1156 }
1157
1158 return 0;
1159 }
1160
1161 static int setup_resolv_conf(const char *dest) {
1162 _cleanup_free_ char *where = NULL;
1163 int r;
1164
1165 assert(dest);
1166
1167 if (arg_private_network)
1168 return 0;
1169
1170 /* Fix resolv.conf, if possible */
1171 where = strappend(dest, "/etc/resolv.conf");
1172 if (!where)
1173 return log_oom();
1174
1175 /* We don't really care for the results of this really. If it
1176 * fails, it fails, but meh... */
1177 r = mkdir_parents(where, 0755);
1178 if (r < 0) {
1179 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1180
1181 return 0;
1182 }
1183
1184 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1185 if (r < 0) {
1186 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1187
1188 return 0;
1189 }
1190
1191 return 0;
1192 }
1193
1194 static int setup_volatile_state(const char *directory) {
1195 const char *p;
1196 int r;
1197
1198 assert(directory);
1199
1200 if (arg_volatile != VOLATILE_STATE)
1201 return 0;
1202
1203 /* --volatile=state means we simply overmount /var
1204 with a tmpfs, and the rest read-only. */
1205
1206 r = bind_remount_recursive(directory, true);
1207 if (r < 0)
1208 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1209
1210 p = strappenda(directory, "/var");
1211 r = mkdir(p, 0755);
1212 if (r < 0 && errno != EEXIST)
1213 return log_error_errno(errno, "Failed to create %s: %m", directory);
1214
1215 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1216 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1217
1218 return 0;
1219 }
1220
1221 static int setup_volatile(const char *directory) {
1222 bool tmpfs_mounted = false, bind_mounted = false;
1223 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1224 const char *f, *t;
1225 int r;
1226
1227 assert(directory);
1228
1229 if (arg_volatile != VOLATILE_YES)
1230 return 0;
1231
1232 /* --volatile=yes means we mount a tmpfs to the root dir, and
1233 the original /usr to use inside it, and that read-only. */
1234
1235 if (!mkdtemp(template))
1236 return log_error_errno(errno, "Failed to create temporary directory: %m");
1237
1238 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1239 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1240 r = -errno;
1241 goto fail;
1242 }
1243
1244 tmpfs_mounted = true;
1245
1246 f = strappenda(directory, "/usr");
1247 t = strappenda(template, "/usr");
1248
1249 r = mkdir(t, 0755);
1250 if (r < 0 && errno != EEXIST) {
1251 log_error_errno(errno, "Failed to create %s: %m", t);
1252 r = -errno;
1253 goto fail;
1254 }
1255
1256 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1257 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1258 r = -errno;
1259 goto fail;
1260 }
1261
1262 bind_mounted = true;
1263
1264 r = bind_remount_recursive(t, true);
1265 if (r < 0) {
1266 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1267 goto fail;
1268 }
1269
1270 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1271 log_error_errno(errno, "Failed to move root mount: %m");
1272 r = -errno;
1273 goto fail;
1274 }
1275
1276 rmdir(template);
1277
1278 return 0;
1279
1280 fail:
1281 if (bind_mounted)
1282 umount(t);
1283 if (tmpfs_mounted)
1284 umount(template);
1285 rmdir(template);
1286 return r;
1287 }
1288
1289 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1290
1291 snprintf(s, 37,
1292 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1293 SD_ID128_FORMAT_VAL(id));
1294
1295 return s;
1296 }
1297
1298 static int setup_boot_id(const char *dest) {
1299 _cleanup_free_ char *from = NULL, *to = NULL;
1300 sd_id128_t rnd = {};
1301 char as_uuid[37];
1302 int r;
1303
1304 assert(dest);
1305
1306 if (arg_share_system)
1307 return 0;
1308
1309 /* Generate a new randomized boot ID, so that each boot-up of
1310 * the container gets a new one */
1311
1312 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1313 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1314 if (!from || !to)
1315 return log_oom();
1316
1317 r = sd_id128_randomize(&rnd);
1318 if (r < 0)
1319 return log_error_errno(r, "Failed to generate random boot id: %m");
1320
1321 id128_format_as_uuid(rnd, as_uuid);
1322
1323 r = write_string_file(from, as_uuid);
1324 if (r < 0)
1325 return log_error_errno(r, "Failed to write boot id: %m");
1326
1327 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1328 log_error_errno(errno, "Failed to bind mount boot id: %m");
1329 r = -errno;
1330 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1331 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1332
1333 unlink(from);
1334 return r;
1335 }
1336
1337 static int copy_devnodes(const char *dest) {
1338
1339 static const char devnodes[] =
1340 "null\0"
1341 "zero\0"
1342 "full\0"
1343 "random\0"
1344 "urandom\0"
1345 "tty\0"
1346 "net/tun\0";
1347
1348 const char *d;
1349 int r = 0;
1350 _cleanup_umask_ mode_t u;
1351
1352 assert(dest);
1353
1354 u = umask(0000);
1355
1356 NULSTR_FOREACH(d, devnodes) {
1357 _cleanup_free_ char *from = NULL, *to = NULL;
1358 struct stat st;
1359
1360 from = strappend("/dev/", d);
1361 to = strjoin(dest, "/dev/", d, NULL);
1362 if (!from || !to)
1363 return log_oom();
1364
1365 if (stat(from, &st) < 0) {
1366
1367 if (errno != ENOENT)
1368 return log_error_errno(errno, "Failed to stat %s: %m", from);
1369
1370 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1371
1372 log_error("%s is not a char or block device, cannot copy", from);
1373 return -EIO;
1374
1375 } else {
1376 r = mkdir_parents(to, 0775);
1377 if (r < 0) {
1378 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1379 return -r;
1380 }
1381
1382 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1383 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1384 }
1385 }
1386
1387 return r;
1388 }
1389
1390 static int setup_ptmx(const char *dest) {
1391 _cleanup_free_ char *p = NULL;
1392
1393 p = strappend(dest, "/dev/ptmx");
1394 if (!p)
1395 return log_oom();
1396
1397 if (symlink("pts/ptmx", p) < 0)
1398 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1399
1400 return 0;
1401 }
1402
1403 static int setup_dev_console(const char *dest, const char *console) {
1404 _cleanup_umask_ mode_t u;
1405 const char *to;
1406 struct stat st;
1407 int r;
1408
1409 assert(dest);
1410 assert(console);
1411
1412 u = umask(0000);
1413
1414 if (stat("/dev/null", &st) < 0)
1415 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1416
1417 r = chmod_and_chown(console, 0600, 0, 0);
1418 if (r < 0)
1419 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1420
1421 /* We need to bind mount the right tty to /dev/console since
1422 * ptys can only exist on pts file systems. To have something
1423 * to bind mount things on we create a device node first, and
1424 * use /dev/null for that since we the cgroups device policy
1425 * allows us to create that freely, while we cannot create
1426 * /dev/console. (Note that the major minor doesn't actually
1427 * matter here, since we mount it over anyway). */
1428
1429 to = strappenda(dest, "/dev/console");
1430 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1431 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1432
1433 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1434 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1435
1436 return 0;
1437 }
1438
1439 static int setup_kmsg(const char *dest, int kmsg_socket) {
1440 _cleanup_free_ char *from = NULL, *to = NULL;
1441 _cleanup_umask_ mode_t u;
1442 int r, fd, k;
1443 union {
1444 struct cmsghdr cmsghdr;
1445 uint8_t buf[CMSG_SPACE(sizeof(int))];
1446 } control = {};
1447 struct msghdr mh = {
1448 .msg_control = &control,
1449 .msg_controllen = sizeof(control),
1450 };
1451 struct cmsghdr *cmsg;
1452
1453 assert(dest);
1454 assert(kmsg_socket >= 0);
1455
1456 u = umask(0000);
1457
1458 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1459 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1460 * on the reading side behave very similar to /proc/kmsg,
1461 * their writing side behaves differently from /dev/kmsg in
1462 * that writing blocks when nothing is reading. In order to
1463 * avoid any problems with containers deadlocking due to this
1464 * we simply make /dev/kmsg unavailable to the container. */
1465 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1466 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1467 return log_oom();
1468
1469 if (mkfifo(from, 0600) < 0)
1470 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1471
1472 r = chmod_and_chown(from, 0600, 0, 0);
1473 if (r < 0)
1474 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1475
1476 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1477 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1478
1479 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1480 if (fd < 0)
1481 return log_error_errno(errno, "Failed to open fifo: %m");
1482
1483 cmsg = CMSG_FIRSTHDR(&mh);
1484 cmsg->cmsg_level = SOL_SOCKET;
1485 cmsg->cmsg_type = SCM_RIGHTS;
1486 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1487 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1488
1489 mh.msg_controllen = cmsg->cmsg_len;
1490
1491 /* Store away the fd in the socket, so that it stays open as
1492 * long as we run the child */
1493 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1494 safe_close(fd);
1495
1496 if (k < 0)
1497 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1498
1499 /* And now make the FIFO unavailable as /dev/kmsg... */
1500 unlink(from);
1501 return 0;
1502 }
1503
1504 static int send_rtnl(int send_fd) {
1505 union {
1506 struct cmsghdr cmsghdr;
1507 uint8_t buf[CMSG_SPACE(sizeof(int))];
1508 } control = {};
1509 struct msghdr mh = {
1510 .msg_control = &control,
1511 .msg_controllen = sizeof(control),
1512 };
1513 struct cmsghdr *cmsg;
1514 _cleanup_close_ int fd = -1;
1515 ssize_t k;
1516
1517 assert(send_fd >= 0);
1518
1519 if (!arg_expose_ports)
1520 return 0;
1521
1522 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1523 if (fd < 0)
1524 return log_error_errno(errno, "failed to allocate container netlink: %m");
1525
1526 cmsg = CMSG_FIRSTHDR(&mh);
1527 cmsg->cmsg_level = SOL_SOCKET;
1528 cmsg->cmsg_type = SCM_RIGHTS;
1529 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1530 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1531
1532 mh.msg_controllen = cmsg->cmsg_len;
1533
1534 /* Store away the fd in the socket, so that it stays open as
1535 * long as we run the child */
1536 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1537 if (k < 0)
1538 return log_error_errno(errno, "Failed to send netlink fd: %m");
1539
1540 return 0;
1541 }
1542
1543 static int flush_ports(union in_addr_union *exposed) {
1544 ExposePort *p;
1545 int r, af = AF_INET;
1546
1547 assert(exposed);
1548
1549 if (!arg_expose_ports)
1550 return 0;
1551
1552 if (in_addr_is_null(af, exposed))
1553 return 0;
1554
1555 log_debug("Lost IP address.");
1556
1557 LIST_FOREACH(ports, p, arg_expose_ports) {
1558 r = fw_add_local_dnat(false,
1559 af,
1560 p->protocol,
1561 NULL,
1562 NULL, 0,
1563 NULL, 0,
1564 p->host_port,
1565 exposed,
1566 p->container_port,
1567 NULL);
1568 if (r < 0)
1569 log_warning_errno(r, "Failed to modify firewall: %m");
1570 }
1571
1572 *exposed = IN_ADDR_NULL;
1573 return 0;
1574 }
1575
1576 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1577 _cleanup_free_ struct local_address *addresses = NULL;
1578 _cleanup_free_ char *pretty = NULL;
1579 union in_addr_union new_exposed;
1580 ExposePort *p;
1581 bool add;
1582 int af = AF_INET, r;
1583
1584 assert(exposed);
1585
1586 /* Invoked each time an address is added or removed inside the
1587 * container */
1588
1589 if (!arg_expose_ports)
1590 return 0;
1591
1592 r = local_addresses(rtnl, 0, af, &addresses);
1593 if (r < 0)
1594 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1595
1596 add = r > 0 &&
1597 addresses[0].family == af &&
1598 addresses[0].scope < RT_SCOPE_LINK;
1599
1600 if (!add)
1601 return flush_ports(exposed);
1602
1603 new_exposed = addresses[0].address;
1604 if (in_addr_equal(af, exposed, &new_exposed))
1605 return 0;
1606
1607 in_addr_to_string(af, &new_exposed, &pretty);
1608 log_debug("New container IP is %s.", strna(pretty));
1609
1610 LIST_FOREACH(ports, p, arg_expose_ports) {
1611
1612 r = fw_add_local_dnat(true,
1613 af,
1614 p->protocol,
1615 NULL,
1616 NULL, 0,
1617 NULL, 0,
1618 p->host_port,
1619 &new_exposed,
1620 p->container_port,
1621 in_addr_is_null(af, exposed) ? NULL : exposed);
1622 if (r < 0)
1623 log_warning_errno(r, "Failed to modify firewall: %m");
1624 }
1625
1626 *exposed = new_exposed;
1627 return 0;
1628 }
1629
1630 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1631 union in_addr_union *exposed = userdata;
1632
1633 assert(rtnl);
1634 assert(m);
1635 assert(exposed);
1636
1637 expose_ports(rtnl, exposed);
1638 return 0;
1639 }
1640
1641 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1642 union {
1643 struct cmsghdr cmsghdr;
1644 uint8_t buf[CMSG_SPACE(sizeof(int))];
1645 } control = {};
1646 struct msghdr mh = {
1647 .msg_control = &control,
1648 .msg_controllen = sizeof(control),
1649 };
1650 struct cmsghdr *cmsg;
1651 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1652 int fd, r;
1653 ssize_t k;
1654
1655 assert(event);
1656 assert(recv_fd >= 0);
1657 assert(ret);
1658
1659 if (!arg_expose_ports)
1660 return 0;
1661
1662 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1663 if (k < 0)
1664 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1665
1666 cmsg = CMSG_FIRSTHDR(&mh);
1667 assert(cmsg->cmsg_level == SOL_SOCKET);
1668 assert(cmsg->cmsg_type == SCM_RIGHTS);
1669 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1670 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1671
1672 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1673 if (r < 0) {
1674 safe_close(fd);
1675 return log_error_errno(r, "Failed to create rtnl object: %m");
1676 }
1677
1678 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1679 if (r < 0)
1680 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1681
1682 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1683 if (r < 0)
1684 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1685
1686 r = sd_rtnl_attach_event(rtnl, event, 0);
1687 if (r < 0)
1688 return log_error_errno(r, "Failed to add to even loop: %m");
1689
1690 *ret = rtnl;
1691 rtnl = NULL;
1692
1693 return 0;
1694 }
1695
1696 static int setup_hostname(void) {
1697
1698 if (arg_share_system)
1699 return 0;
1700
1701 if (sethostname_idempotent(arg_machine) < 0)
1702 return -errno;
1703
1704 return 0;
1705 }
1706
1707 static int setup_journal(const char *directory) {
1708 sd_id128_t machine_id, this_id;
1709 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1710 char *id;
1711 int r;
1712
1713 /* Don't link journals in ephemeral mode */
1714 if (arg_ephemeral)
1715 return 0;
1716
1717 p = strappend(directory, "/etc/machine-id");
1718 if (!p)
1719 return log_oom();
1720
1721 r = read_one_line_file(p, &b);
1722 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1723 return 0;
1724 else if (r < 0)
1725 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1726
1727 id = strstrip(b);
1728 if (isempty(id) && arg_link_journal == LINK_AUTO)
1729 return 0;
1730
1731 /* Verify validity */
1732 r = sd_id128_from_string(id, &machine_id);
1733 if (r < 0)
1734 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1735
1736 r = sd_id128_get_machine(&this_id);
1737 if (r < 0)
1738 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1739
1740 if (sd_id128_equal(machine_id, this_id)) {
1741 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1742 "Host and machine ids are equal (%s): refusing to link journals", id);
1743 if (arg_link_journal == LINK_AUTO)
1744 return 0;
1745 return -EEXIST;
1746 }
1747
1748 if (arg_link_journal == LINK_NO)
1749 return 0;
1750
1751 free(p);
1752 p = strappend("/var/log/journal/", id);
1753 q = strjoin(directory, "/var/log/journal/", id, NULL);
1754 if (!p || !q)
1755 return log_oom();
1756
1757 if (path_is_mount_point(p, false) > 0) {
1758 if (arg_link_journal != LINK_AUTO) {
1759 log_error("%s: already a mount point, refusing to use for journal", p);
1760 return -EEXIST;
1761 }
1762
1763 return 0;
1764 }
1765
1766 if (path_is_mount_point(q, false) > 0) {
1767 if (arg_link_journal != LINK_AUTO) {
1768 log_error("%s: already a mount point, refusing to use for journal", q);
1769 return -EEXIST;
1770 }
1771
1772 return 0;
1773 }
1774
1775 r = readlink_and_make_absolute(p, &d);
1776 if (r >= 0) {
1777 if ((arg_link_journal == LINK_GUEST ||
1778 arg_link_journal == LINK_AUTO) &&
1779 path_equal(d, q)) {
1780
1781 r = mkdir_p(q, 0755);
1782 if (r < 0)
1783 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1784 return 0;
1785 }
1786
1787 if (unlink(p) < 0)
1788 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1789 } else if (r == -EINVAL) {
1790
1791 if (arg_link_journal == LINK_GUEST &&
1792 rmdir(p) < 0) {
1793
1794 if (errno == ENOTDIR) {
1795 log_error("%s already exists and is neither a symlink nor a directory", p);
1796 return r;
1797 } else {
1798 log_error_errno(errno, "Failed to remove %s: %m", p);
1799 return -errno;
1800 }
1801 }
1802 } else if (r != -ENOENT) {
1803 log_error_errno(errno, "readlink(%s) failed: %m", p);
1804 return r;
1805 }
1806
1807 if (arg_link_journal == LINK_GUEST) {
1808
1809 if (symlink(q, p) < 0) {
1810 if (arg_link_journal_try) {
1811 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1812 return 0;
1813 } else {
1814 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1815 return -errno;
1816 }
1817 }
1818
1819 r = mkdir_p(q, 0755);
1820 if (r < 0)
1821 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1822 return 0;
1823 }
1824
1825 if (arg_link_journal == LINK_HOST) {
1826 /* don't create parents here -- if the host doesn't have
1827 * permanent journal set up, don't force it here */
1828 r = mkdir(p, 0755);
1829 if (r < 0) {
1830 if (arg_link_journal_try) {
1831 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1832 return 0;
1833 } else {
1834 log_error_errno(errno, "Failed to create %s: %m", p);
1835 return r;
1836 }
1837 }
1838
1839 } else if (access(p, F_OK) < 0)
1840 return 0;
1841
1842 if (dir_is_empty(q) == 0)
1843 log_warning("%s is not empty, proceeding anyway.", q);
1844
1845 r = mkdir_p(q, 0755);
1846 if (r < 0) {
1847 log_error_errno(errno, "Failed to create %s: %m", q);
1848 return r;
1849 }
1850
1851 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1852 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1853
1854 return 0;
1855 }
1856
1857 static int drop_capabilities(void) {
1858 return capability_bounding_set_drop(~arg_retain, false);
1859 }
1860
1861 static int register_machine(pid_t pid, int local_ifindex) {
1862 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1863 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1864 int r;
1865
1866 if (!arg_register)
1867 return 0;
1868
1869 r = sd_bus_default_system(&bus);
1870 if (r < 0)
1871 return log_error_errno(r, "Failed to open system bus: %m");
1872
1873 if (arg_keep_unit) {
1874 r = sd_bus_call_method(
1875 bus,
1876 "org.freedesktop.machine1",
1877 "/org/freedesktop/machine1",
1878 "org.freedesktop.machine1.Manager",
1879 "RegisterMachineWithNetwork",
1880 &error,
1881 NULL,
1882 "sayssusai",
1883 arg_machine,
1884 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1885 "nspawn",
1886 "container",
1887 (uint32_t) pid,
1888 strempty(arg_directory),
1889 local_ifindex > 0 ? 1 : 0, local_ifindex);
1890 } else {
1891 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1892
1893 r = sd_bus_message_new_method_call(
1894 bus,
1895 &m,
1896 "org.freedesktop.machine1",
1897 "/org/freedesktop/machine1",
1898 "org.freedesktop.machine1.Manager",
1899 "CreateMachineWithNetwork");
1900 if (r < 0)
1901 return log_error_errno(r, "Failed to create message: %m");
1902
1903 r = sd_bus_message_append(
1904 m,
1905 "sayssusai",
1906 arg_machine,
1907 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1908 "nspawn",
1909 "container",
1910 (uint32_t) pid,
1911 strempty(arg_directory),
1912 local_ifindex > 0 ? 1 : 0, local_ifindex);
1913 if (r < 0)
1914 return log_error_errno(r, "Failed to append message arguments: %m");
1915
1916 r = sd_bus_message_open_container(m, 'a', "(sv)");
1917 if (r < 0)
1918 return log_error_errno(r, "Failed to open container: %m");
1919
1920 if (!isempty(arg_slice)) {
1921 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1922 if (r < 0)
1923 return log_error_errno(r, "Failed to append slice: %m");
1924 }
1925
1926 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1927 if (r < 0)
1928 return log_error_errno(r, "Failed to add device policy: %m");
1929
1930 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1931 /* Allow the container to
1932 * access and create the API
1933 * device nodes, so that
1934 * PrivateDevices= in the
1935 * container can work
1936 * fine */
1937 "/dev/null", "rwm",
1938 "/dev/zero", "rwm",
1939 "/dev/full", "rwm",
1940 "/dev/random", "rwm",
1941 "/dev/urandom", "rwm",
1942 "/dev/tty", "rwm",
1943 "/dev/net/tun", "rwm",
1944 /* Allow the container
1945 * access to ptys. However,
1946 * do not permit the
1947 * container to ever create
1948 * these device nodes. */
1949 "/dev/pts/ptmx", "rw",
1950 "char-pts", "rw");
1951 if (r < 0)
1952 return log_error_errno(r, "Failed to add device whitelist: %m");
1953
1954 r = sd_bus_message_close_container(m);
1955 if (r < 0)
1956 return log_error_errno(r, "Failed to close container: %m");
1957
1958 r = sd_bus_call(bus, m, 0, &error, NULL);
1959 }
1960
1961 if (r < 0) {
1962 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1963 return r;
1964 }
1965
1966 return 0;
1967 }
1968
1969 static int terminate_machine(pid_t pid) {
1970 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1971 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1972 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1973 const char *path;
1974 int r;
1975
1976 if (!arg_register)
1977 return 0;
1978
1979 r = sd_bus_default_system(&bus);
1980 if (r < 0)
1981 return log_error_errno(r, "Failed to open system bus: %m");
1982
1983 r = sd_bus_call_method(
1984 bus,
1985 "org.freedesktop.machine1",
1986 "/org/freedesktop/machine1",
1987 "org.freedesktop.machine1.Manager",
1988 "GetMachineByPID",
1989 &error,
1990 &reply,
1991 "u",
1992 (uint32_t) pid);
1993 if (r < 0) {
1994 /* Note that the machine might already have been
1995 * cleaned up automatically, hence don't consider it a
1996 * failure if we cannot get the machine object. */
1997 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1998 return 0;
1999 }
2000
2001 r = sd_bus_message_read(reply, "o", &path);
2002 if (r < 0)
2003 return bus_log_parse_error(r);
2004
2005 r = sd_bus_call_method(
2006 bus,
2007 "org.freedesktop.machine1",
2008 path,
2009 "org.freedesktop.machine1.Machine",
2010 "Terminate",
2011 &error,
2012 NULL,
2013 NULL);
2014 if (r < 0) {
2015 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2016 return 0;
2017 }
2018
2019 return 0;
2020 }
2021
2022 static int reset_audit_loginuid(void) {
2023 _cleanup_free_ char *p = NULL;
2024 int r;
2025
2026 if (arg_share_system)
2027 return 0;
2028
2029 r = read_one_line_file("/proc/self/loginuid", &p);
2030 if (r == -ENOENT)
2031 return 0;
2032 if (r < 0)
2033 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2034
2035 /* Already reset? */
2036 if (streq(p, "4294967295"))
2037 return 0;
2038
2039 r = write_string_file("/proc/self/loginuid", "4294967295");
2040 if (r < 0) {
2041 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2042 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2043 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2044 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2045 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2046
2047 sleep(5);
2048 }
2049
2050 return 0;
2051 }
2052
/* Fixed 128-bit keys passed to generate_mac() as its hash_key argument.
 * setup_veth() uses HOST_HASH_KEY for the host side and CONTAINER_HASH_KEY
 * for the container side of the veth pair. MACVLAN_HASH_KEY is presumably
 * used for macvlan interfaces -- TODO confirm against setup_macvlan(). */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2056
2057 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2058 uint8_t result[8];
2059 size_t l, sz;
2060 uint8_t *v, *i;
2061 int r;
2062
2063 l = strlen(arg_machine);
2064 sz = sizeof(sd_id128_t) + l;
2065 if (idx > 0)
2066 sz += sizeof(idx);
2067
2068 v = alloca(sz);
2069
2070 /* fetch some persistent data unique to the host */
2071 r = sd_id128_get_machine((sd_id128_t*) v);
2072 if (r < 0)
2073 return r;
2074
2075 /* combine with some data unique (on this host) to this
2076 * container instance */
2077 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2078 if (idx > 0) {
2079 idx = htole64(idx);
2080 memcpy(i, &idx, sizeof(idx));
2081 }
2082
2083 /* Let's hash the host machine ID plus the container name. We
2084 * use a fixed, but originally randomly created hash key here. */
2085 siphash24(result, v, sz, hash_key.bytes);
2086
2087 assert_cc(ETH_ALEN <= sizeof(result));
2088 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2089
2090 /* see eth_random_addr in the kernel */
2091 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2092 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2093
2094 return 0;
2095 }
2096
2097 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2098 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2099 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2100 struct ether_addr mac_host, mac_container;
2101 int r, i;
2102
2103 if (!arg_private_network)
2104 return 0;
2105
2106 if (!arg_network_veth)
2107 return 0;
2108
2109 /* Use two different interface name prefixes depending whether
2110 * we are in bridge mode or not. */
2111 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2112 arg_network_bridge ? "vb" : "ve", arg_machine);
2113
2114 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2115 if (r < 0)
2116 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2117
2118 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2119 if (r < 0)
2120 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2121
2122 r = sd_rtnl_open(&rtnl, 0);
2123 if (r < 0)
2124 return log_error_errno(r, "Failed to connect to netlink: %m");
2125
2126 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2127 if (r < 0)
2128 return log_error_errno(r, "Failed to allocate netlink message: %m");
2129
2130 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2131 if (r < 0)
2132 return log_error_errno(r, "Failed to add netlink interface name: %m");
2133
2134 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2135 if (r < 0)
2136 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2137
2138 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2139 if (r < 0)
2140 return log_error_errno(r, "Failed to open netlink container: %m");
2141
2142 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2143 if (r < 0)
2144 return log_error_errno(r, "Failed to open netlink container: %m");
2145
2146 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2147 if (r < 0)
2148 return log_error_errno(r, "Failed to open netlink container: %m");
2149
2150 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2151 if (r < 0)
2152 return log_error_errno(r, "Failed to add netlink interface name: %m");
2153
2154 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2155 if (r < 0)
2156 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2157
2158 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2159 if (r < 0)
2160 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2161
2162 r = sd_rtnl_message_close_container(m);
2163 if (r < 0)
2164 return log_error_errno(r, "Failed to close netlink container: %m");
2165
2166 r = sd_rtnl_message_close_container(m);
2167 if (r < 0)
2168 return log_error_errno(r, "Failed to close netlink container: %m");
2169
2170 r = sd_rtnl_message_close_container(m);
2171 if (r < 0)
2172 return log_error_errno(r, "Failed to close netlink container: %m");
2173
2174 r = sd_rtnl_call(rtnl, m, 0, NULL);
2175 if (r < 0)
2176 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2177
2178 i = (int) if_nametoindex(iface_name);
2179 if (i <= 0)
2180 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2181
2182 *ifi = i;
2183
2184 return 0;
2185 }
2186
2187 static int setup_bridge(const char veth_name[], int *ifi) {
2188 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2189 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2190 int r, bridge;
2191
2192 if (!arg_private_network)
2193 return 0;
2194
2195 if (!arg_network_veth)
2196 return 0;
2197
2198 if (!arg_network_bridge)
2199 return 0;
2200
2201 bridge = (int) if_nametoindex(arg_network_bridge);
2202 if (bridge <= 0)
2203 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2204
2205 *ifi = bridge;
2206
2207 r = sd_rtnl_open(&rtnl, 0);
2208 if (r < 0)
2209 return log_error_errno(r, "Failed to connect to netlink: %m");
2210
2211 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2212 if (r < 0)
2213 return log_error_errno(r, "Failed to allocate netlink message: %m");
2214
2215 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2216 if (r < 0)
2217 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2218
2219 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2220 if (r < 0)
2221 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2222
2223 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to add netlink master field: %m");
2226
2227 r = sd_rtnl_call(rtnl, m, 0, NULL);
2228 if (r < 0)
2229 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2230
2231 return 0;
2232 }
2233
2234 static int parse_interface(struct udev *udev, const char *name) {
2235 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2236 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2237 int ifi;
2238
2239 ifi = (int) if_nametoindex(name);
2240 if (ifi <= 0)
2241 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2242
2243 sprintf(ifi_str, "n%i", ifi);
2244 d = udev_device_new_from_device_id(udev, ifi_str);
2245 if (!d)
2246 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2247
2248 if (udev_device_get_is_initialized(d) <= 0) {
2249 log_error("Network interface %s is not initialized yet.", name);
2250 return -EBUSY;
2251 }
2252
2253 return ifi;
2254 }
2255
2256 static int move_network_interfaces(pid_t pid) {
2257 _cleanup_udev_unref_ struct udev *udev = NULL;
2258 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2259 char **i;
2260 int r;
2261
2262 if (!arg_private_network)
2263 return 0;
2264
2265 if (strv_isempty(arg_network_interfaces))
2266 return 0;
2267
2268 r = sd_rtnl_open(&rtnl, 0);
2269 if (r < 0)
2270 return log_error_errno(r, "Failed to connect to netlink: %m");
2271
2272 udev = udev_new();
2273 if (!udev) {
2274 log_error("Failed to connect to udev.");
2275 return -ENOMEM;
2276 }
2277
2278 STRV_FOREACH(i, arg_network_interfaces) {
2279 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2280 int ifi;
2281
2282 ifi = parse_interface(udev, *i);
2283 if (ifi < 0)
2284 return ifi;
2285
2286 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2287 if (r < 0)
2288 return log_error_errno(r, "Failed to allocate netlink message: %m");
2289
2290 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2291 if (r < 0)
2292 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2293
2294 r = sd_rtnl_call(rtnl, m, 0, NULL);
2295 if (r < 0)
2296 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2297 }
2298
2299 return 0;
2300 }
2301
2302 static int setup_macvlan(pid_t pid) {
2303 _cleanup_udev_unref_ struct udev *udev = NULL;
2304 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2305 unsigned idx = 0;
2306 char **i;
2307 int r;
2308
2309 if (!arg_private_network)
2310 return 0;
2311
2312 if (strv_isempty(arg_network_macvlan))
2313 return 0;
2314
2315 r = sd_rtnl_open(&rtnl, 0);
2316 if (r < 0)
2317 return log_error_errno(r, "Failed to connect to netlink: %m");
2318
2319 udev = udev_new();
2320 if (!udev) {
2321 log_error("Failed to connect to udev.");
2322 return -ENOMEM;
2323 }
2324
2325 STRV_FOREACH(i, arg_network_macvlan) {
2326 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2327 _cleanup_free_ char *n = NULL;
2328 struct ether_addr mac;
2329 int ifi;
2330
2331 ifi = parse_interface(udev, *i);
2332 if (ifi < 0)
2333 return ifi;
2334
2335 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2336 if (r < 0)
2337 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2338
2339 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2340 if (r < 0)
2341 return log_error_errno(r, "Failed to allocate netlink message: %m");
2342
2343 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2344 if (r < 0)
2345 return log_error_errno(r, "Failed to add netlink interface index: %m");
2346
2347 n = strappend("mv-", *i);
2348 if (!n)
2349 return log_oom();
2350
2351 strshorten(n, IFNAMSIZ-1);
2352
2353 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2354 if (r < 0)
2355 return log_error_errno(r, "Failed to add netlink interface name: %m");
2356
2357 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2358 if (r < 0)
2359 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2360
2361 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2362 if (r < 0)
2363 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2364
2365 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2366 if (r < 0)
2367 return log_error_errno(r, "Failed to open netlink container: %m");
2368
2369 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2370 if (r < 0)
2371 return log_error_errno(r, "Failed to open netlink container: %m");
2372
2373 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2374 if (r < 0)
2375 return log_error_errno(r, "Failed to append macvlan mode: %m");
2376
2377 r = sd_rtnl_message_close_container(m);
2378 if (r < 0)
2379 return log_error_errno(r, "Failed to close netlink container: %m");
2380
2381 r = sd_rtnl_message_close_container(m);
2382 if (r < 0)
2383 return log_error_errno(r, "Failed to close netlink container: %m");
2384
2385 r = sd_rtnl_call(rtnl, m, 0, NULL);
2386 if (r < 0)
2387 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2388 }
2389
2390 return 0;
2391 }
2392
2393 static int setup_ipvlan(pid_t pid) {
2394 _cleanup_udev_unref_ struct udev *udev = NULL;
2395 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2396 char **i;
2397 int r;
2398
2399 if (!arg_private_network)
2400 return 0;
2401
2402 if (strv_isempty(arg_network_ipvlan))
2403 return 0;
2404
2405 r = sd_rtnl_open(&rtnl, 0);
2406 if (r < 0)
2407 return log_error_errno(r, "Failed to connect to netlink: %m");
2408
2409 udev = udev_new();
2410 if (!udev) {
2411 log_error("Failed to connect to udev.");
2412 return -ENOMEM;
2413 }
2414
2415 STRV_FOREACH(i, arg_network_ipvlan) {
2416 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2417 _cleanup_free_ char *n = NULL;
2418 int ifi;
2419
2420 ifi = parse_interface(udev, *i);
2421 if (ifi < 0)
2422 return ifi;
2423
2424 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2425 if (r < 0)
2426 return log_error_errno(r, "Failed to allocate netlink message: %m");
2427
2428 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2429 if (r < 0)
2430 return log_error_errno(r, "Failed to add netlink interface index: %m");
2431
2432 n = strappend("iv-", *i);
2433 if (!n)
2434 return log_oom();
2435
2436 strshorten(n, IFNAMSIZ-1);
2437
2438 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2439 if (r < 0)
2440 return log_error_errno(r, "Failed to add netlink interface name: %m");
2441
2442 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2443 if (r < 0)
2444 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2445
2446 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2447 if (r < 0)
2448 return log_error_errno(r, "Failed to open netlink container: %m");
2449
2450 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2451 if (r < 0)
2452 return log_error_errno(r, "Failed to open netlink container: %m");
2453
2454 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2455 if (r < 0)
2456 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2457
2458 r = sd_rtnl_message_close_container(m);
2459 if (r < 0)
2460 return log_error_errno(r, "Failed to close netlink container: %m");
2461
2462 r = sd_rtnl_message_close_container(m);
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to close netlink container: %m");
2465
2466 r = sd_rtnl_call(rtnl, m, 0, NULL);
2467 if (r < 0)
2468 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2469 }
2470
2471 return 0;
2472 }
2473
/* Builds and loads the seccomp filter for the container: everything is
 * allowed by default, a fixed list of syscalls is made to fail with EPERM,
 * and creation of NETLINK_AUDIT sockets is refused with EAFNOSUPPORT.
 * Returns the result of the last libseccomp call (negative on failure).
 * When built without HAVE_SECCOMP this is a no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int k;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        k = seccomp_add_secondary_archs(ctx);
        if (k < 0) {
                log_error_errno(k, "Failed to add secondary archs to seccomp filter: %m");
                goto release;
        }

        for (n = 0; n < ELEMENTSOF(blocked_syscalls); n++) {
                k = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[n], 0);

                /* -EFAULT is treated as "unknown syscall" and skipped */
                if (k < 0 && k != -EFAULT) {
                        log_error_errno(k, "Failed to block syscall: %m");
                        goto release;
                }
        }

        /* Audit does not work inside containers and most of the userspace
         * audit hookup fails there. Rather than caring, simply forbid
         * creating audit sockets: socket(AF_NETLINK, *, NETLINK_AUDIT) then
         * fails with EAFNOSUPPORT, which audit userspace interprets as
         * "audit is disabled in the kernel". */
        k = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (k < 0) {
                log_error_errno(k, "Failed to add audit seccomp rule: %m");
                goto release;
        }

        k = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (k < 0) {
                log_error_errno(k, "Failed to unset NO_NEW_PRIVS: %m");
                goto release;
        }

        k = seccomp_load(ctx);
        if (k < 0)
                log_error_errno(k, "Failed to install seccomp audit filter: %m");

release:
        seccomp_release(ctx);
        return k;
#else
        return 0;
#endif

}
2553
2554 static int setup_propagate(const char *root) {
2555 const char *p, *q;
2556
2557 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2558 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2559 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2560 (void) mkdir_p(p, 0600);
2561
2562 q = strappenda(root, "/run/systemd/nspawn/incoming");
2563 mkdir_parents(q, 0755);
2564 mkdir_p(q, 0600);
2565
2566 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2567 return log_error_errno(errno, "Failed to install propagation bind mount.");
2568
2569 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2570 return log_error_errno(errno, "Failed to make propagation mount read-only");
2571
2572 return 0;
2573 }
2574
2575 static int setup_image(char **device_path, int *loop_nr) {
2576 struct loop_info64 info = {
2577 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2578 };
2579 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2580 _cleanup_free_ char* loopdev = NULL;
2581 struct stat st;
2582 int r, nr;
2583
2584 assert(device_path);
2585 assert(loop_nr);
2586 assert(arg_image);
2587
2588 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2589 if (fd < 0)
2590 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2591
2592 if (fstat(fd, &st) < 0)
2593 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2594
2595 if (S_ISBLK(st.st_mode)) {
2596 char *p;
2597
2598 p = strdup(arg_image);
2599 if (!p)
2600 return log_oom();
2601
2602 *device_path = p;
2603
2604 *loop_nr = -1;
2605
2606 r = fd;
2607 fd = -1;
2608
2609 return r;
2610 }
2611
2612 if (!S_ISREG(st.st_mode)) {
2613 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2614 return -EINVAL;
2615 }
2616
2617 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2618 if (control < 0)
2619 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2620
2621 nr = ioctl(control, LOOP_CTL_GET_FREE);
2622 if (nr < 0)
2623 return log_error_errno(errno, "Failed to allocate loop device: %m");
2624
2625 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2626 return log_oom();
2627
2628 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2629 if (loop < 0)
2630 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2631
2632 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2633 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2634
2635 if (arg_read_only)
2636 info.lo_flags |= LO_FLAGS_READ_ONLY;
2637
2638 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2639 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2640
2641 *device_path = loopdev;
2642 loopdev = NULL;
2643
2644 *loop_nr = nr;
2645
2646 r = loop;
2647 loop = -1;
2648
2649 return r;
2650 }
2651
/* Explanatory text appended to the error messages about disk images whose
 * partition table cannot be used. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2658
/* Inspects the partition table of the block device open as fd and picks the
 * partitions to use for the container.
 *
 * For GPT, partitions are matched by type UUID (GPT_HOME, GPT_SRV,
 * GPT_ROOT_NATIVE, GPT_ROOT_SECONDARY, GPT_LINUX_GENERIC); partitions
 * flagged GPT_FLAG_NO_AUTO are skipped, and among several candidates of the
 * same specific type the one with the lowest partition number wins. For
 * MBR, only partitions of type 0x83 whose flags equal 0x80 (bootable) are
 * considered, as generic candidates.
 *
 * The root device is chosen in this order: native root, secondary root,
 * the one and only generic partition. More than one generic candidate
 * without a specific root partition is an error.
 *
 * On success returns 0, stores newly allocated device node paths in
 * *root_device and, if found, *home_device / *srv_device (caller frees),
 * stores the matching writability flags, and sets *secondary to whether the
 * secondary-architecture root was picked. Returns a negative error
 * otherwise. Without HAVE_BLKID this always fails with -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* Failure without errno is treated as out-of-memory */
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result (-2) or nothing detected (1) */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until udev lists as many devices below the whole-disk device
         * as blkid found partitions (plus one for the disk itself), giving
         * up after 10 attempts. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as the generic
                                 * candidate (not as "root"), so that a
                                 * second bootable Linux partition is
                                 * detected above and rejected below
                                 * instead of silently replacing the
                                 * first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3031
/* Mounts the block device what on where (or on where with directory
 * appended, if directory is non-NULL). The file system type is detected
 * with libblkid; LUKS containers are refused. The mount is made MS_NODEV
 * and additionally MS_RDONLY if rw is false or arg_read_only is set.
 * Returns 0 on success, a negative error otherwise. Without HAVE_BLKID this
 * always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A read-only container overrides per-partition writability */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() yields -2 for an ambivalent result
                 * and 1 if nothing was detected: in both cases the type is
                 * simply unknown. A hard error (-1) is handled below,
                 * where errno gets reported. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3095
3096 static int mount_devices(
3097 const char *where,
3098 const char *root_device, bool root_device_rw,
3099 const char *home_device, bool home_device_rw,
3100 const char *srv_device, bool srv_device_rw) {
3101 int r;
3102
3103 assert(where);
3104
3105 if (root_device) {
3106 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3107 if (r < 0)
3108 return log_error_errno(r, "Failed to mount root directory: %m");
3109 }
3110
3111 if (home_device) {
3112 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3113 if (r < 0)
3114 return log_error_errno(r, "Failed to mount home directory: %m");
3115 }
3116
3117 if (srv_device) {
3118 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3119 if (r < 0)
3120 return log_error_errno(r, "Failed to mount server data directory: %m");
3121 }
3122
3123 return 0;
3124 }
3125
3126 static void loop_remove(int nr, int *image_fd) {
3127 _cleanup_close_ int control = -1;
3128 int r;
3129
3130 if (nr < 0)
3131 return;
3132
3133 if (image_fd && *image_fd >= 0) {
3134 r = ioctl(*image_fd, LOOP_CLR_FD);
3135 if (r < 0)
3136 log_debug_errno(errno, "Failed to close loop image: %m");
3137 *image_fd = safe_close(*image_fd);
3138 }
3139
3140 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3141 if (control < 0) {
3142 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3143 return;
3144 }
3145
3146 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3147 if (r < 0)
3148 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3149 }
3150
3151 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3152 int pipe_fds[2];
3153 pid_t pid;
3154
3155 assert(database);
3156 assert(key);
3157 assert(rpid);
3158
3159 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3160 return log_error_errno(errno, "Failed to allocate pipe: %m");
3161
3162 pid = fork();
3163 if (pid < 0)
3164 return log_error_errno(errno, "Failed to fork getent child: %m");
3165 else if (pid == 0) {
3166 int nullfd;
3167 char *empty_env = NULL;
3168
3169 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3170 _exit(EXIT_FAILURE);
3171
3172 if (pipe_fds[0] > 2)
3173 safe_close(pipe_fds[0]);
3174 if (pipe_fds[1] > 2)
3175 safe_close(pipe_fds[1]);
3176
3177 nullfd = open("/dev/null", O_RDWR);
3178 if (nullfd < 0)
3179 _exit(EXIT_FAILURE);
3180
3181 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3182 _exit(EXIT_FAILURE);
3183
3184 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3185 _exit(EXIT_FAILURE);
3186
3187 if (nullfd > 2)
3188 safe_close(nullfd);
3189
3190 reset_all_signal_handlers();
3191 close_all_fds(NULL, 0);
3192
3193 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3194 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3195 _exit(EXIT_FAILURE);
3196 }
3197
3198 pipe_fds[1] = safe_close(pipe_fds[1]);
3199
3200 *rpid = pid;
3201
3202 return pipe_fds[0];
3203 }
3204
3205 static int change_uid_gid(char **_home) {
3206 char line[LINE_MAX], *x, *u, *g, *h;
3207 const char *word, *state;
3208 _cleanup_free_ uid_t *uids = NULL;
3209 _cleanup_free_ char *home = NULL;
3210 _cleanup_fclose_ FILE *f = NULL;
3211 _cleanup_close_ int fd = -1;
3212 unsigned n_uids = 0;
3213 size_t sz = 0, l;
3214 uid_t uid;
3215 gid_t gid;
3216 pid_t pid;
3217 int r;
3218
3219 assert(_home);
3220
3221 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3222 /* Reset everything fully to 0, just in case */
3223
3224 if (setgroups(0, NULL) < 0)
3225 return log_error_errno(errno, "setgroups() failed: %m");
3226
3227 if (setresgid(0, 0, 0) < 0)
3228 return log_error_errno(errno, "setregid() failed: %m");
3229
3230 if (setresuid(0, 0, 0) < 0)
3231 return log_error_errno(errno, "setreuid() failed: %m");
3232
3233 *_home = NULL;
3234 return 0;
3235 }
3236
3237 /* First, get user credentials */
3238 fd = spawn_getent("passwd", arg_user, &pid);
3239 if (fd < 0)
3240 return fd;
3241
3242 f = fdopen(fd, "r");
3243 if (!f)
3244 return log_oom();
3245 fd = -1;
3246
3247 if (!fgets(line, sizeof(line), f)) {
3248
3249 if (!ferror(f)) {
3250 log_error("Failed to resolve user %s.", arg_user);
3251 return -ESRCH;
3252 }
3253
3254 log_error_errno(errno, "Failed to read from getent: %m");
3255 return -errno;
3256 }
3257
3258 truncate_nl(line);
3259
3260 wait_for_terminate_and_warn("getent passwd", pid, true);
3261
3262 x = strchr(line, ':');
3263 if (!x) {
3264 log_error("/etc/passwd entry has invalid user field.");
3265 return -EIO;
3266 }
3267
3268 u = strchr(x+1, ':');
3269 if (!u) {
3270 log_error("/etc/passwd entry has invalid password field.");
3271 return -EIO;
3272 }
3273
3274 u++;
3275 g = strchr(u, ':');
3276 if (!g) {
3277 log_error("/etc/passwd entry has invalid UID field.");
3278 return -EIO;
3279 }
3280
3281 *g = 0;
3282 g++;
3283 x = strchr(g, ':');
3284 if (!x) {
3285 log_error("/etc/passwd entry has invalid GID field.");
3286 return -EIO;
3287 }
3288
3289 *x = 0;
3290 h = strchr(x+1, ':');
3291 if (!h) {
3292 log_error("/etc/passwd entry has invalid GECOS field.");
3293 return -EIO;
3294 }
3295
3296 h++;
3297 x = strchr(h, ':');
3298 if (!x) {
3299 log_error("/etc/passwd entry has invalid home directory field.");
3300 return -EIO;
3301 }
3302
3303 *x = 0;
3304
3305 r = parse_uid(u, &uid);
3306 if (r < 0) {
3307 log_error("Failed to parse UID of user.");
3308 return -EIO;
3309 }
3310
3311 r = parse_gid(g, &gid);
3312 if (r < 0) {
3313 log_error("Failed to parse GID of user.");
3314 return -EIO;
3315 }
3316
3317 home = strdup(h);
3318 if (!home)
3319 return log_oom();
3320
3321 /* Second, get group memberships */
3322 fd = spawn_getent("initgroups", arg_user, &pid);
3323 if (fd < 0)
3324 return fd;
3325
3326 fclose(f);
3327 f = fdopen(fd, "r");
3328 if (!f)
3329 return log_oom();
3330 fd = -1;
3331
3332 if (!fgets(line, sizeof(line), f)) {
3333 if (!ferror(f)) {
3334 log_error("Failed to resolve user %s.", arg_user);
3335 return -ESRCH;
3336 }
3337
3338 log_error_errno(errno, "Failed to read from getent: %m");
3339 return -errno;
3340 }
3341
3342 truncate_nl(line);
3343
3344 wait_for_terminate_and_warn("getent initgroups", pid, true);
3345
3346 /* Skip over the username and subsequent separator whitespace */
3347 x = line;
3348 x += strcspn(x, WHITESPACE);
3349 x += strspn(x, WHITESPACE);
3350
3351 FOREACH_WORD(word, l, x, state) {
3352 char c[l+1];
3353
3354 memcpy(c, word, l);
3355 c[l] = 0;
3356
3357 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3358 return log_oom();
3359
3360 r = parse_uid(c, &uids[n_uids++]);
3361 if (r < 0) {
3362 log_error("Failed to parse group data from getent.");
3363 return -EIO;
3364 }
3365 }
3366
3367 r = mkdir_parents(home, 0775);
3368 if (r < 0)
3369 return log_error_errno(r, "Failed to make home root directory: %m");
3370
3371 r = mkdir_safe(home, 0755, uid, gid);
3372 if (r < 0 && r != -EEXIST)
3373 return log_error_errno(r, "Failed to make home directory: %m");
3374
3375 fchown(STDIN_FILENO, uid, gid);
3376 fchown(STDOUT_FILENO, uid, gid);
3377 fchown(STDERR_FILENO, uid, gid);
3378
3379 if (setgroups(n_uids, uids) < 0)
3380 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3381
3382 if (setresgid(gid, gid, gid) < 0)
3383 return log_error_errno(errno, "setregid() failed: %m");
3384
3385 if (setresuid(uid, uid, uid) < 0)
3386 return log_error_errno(errno, "setreuid() failed: %m");
3387
3388 if (_home) {
3389 *_home = home;
3390 home = NULL;
3391 }
3392
3393 return 0;
3394 }
3395
3396 /*
3397 * Return values:
3398 * < 0 : wait_for_terminate() failed to get the state of the
3399 * container, the container was terminated by a signal, or
3400 * failed for an unknown reason. No change is made to the
3401 * container argument.
3402 * > 0 : The program executed in the container terminated with an
3403 * error. The exit code of the program executed in the
3404 * container is returned. The container argument has been set
3405 * to CONTAINER_TERMINATED.
3406 * 0 : The container is being rebooted, has been shut down or exited
3407 * successfully. The container argument has been set to either
3408 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3409 *
3410 * That is, success is indicated by a return value of zero, and an
3411 * error is indicated by a non-zero value.
3412 */
3413 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3414 siginfo_t status;
3415 int r;
3416
3417 r = wait_for_terminate(pid, &status);
3418 if (r < 0)
3419 return log_warning_errno(r, "Failed to wait for container: %m");
3420
3421 switch (status.si_code) {
3422
3423 case CLD_EXITED:
3424 if (status.si_status == 0) {
3425 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3426
3427 } else
3428 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3429
3430 *container = CONTAINER_TERMINATED;
3431 return status.si_status;
3432
3433 case CLD_KILLED:
3434 if (status.si_status == SIGINT) {
3435
3436 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3437 *container = CONTAINER_TERMINATED;
3438 return 0;
3439
3440 } else if (status.si_status == SIGHUP) {
3441
3442 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3443 *container = CONTAINER_REBOOTED;
3444 return 0;
3445 }
3446
3447 /* CLD_KILLED fallthrough */
3448
3449 case CLD_DUMPED:
3450 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3451 return -EIO;
3452
3453 default:
3454 log_error("Container %s failed due to unknown reason.", arg_machine);
3455 return -EIO;
3456 }
3457
3458 return r;
3459 }
3460
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
3462
3463 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3464 pid_t pid;
3465
3466 pid = PTR_TO_UINT32(userdata);
3467 if (pid > 0) {
3468 if (kill(pid, SIGRTMIN+3) >= 0) {
3469 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3470 sd_event_source_set_userdata(s, NULL);
3471 return 0;
3472 }
3473 }
3474
3475 sd_event_exit(sd_event_source_get_event(s), 0);
3476 return 0;
3477 }
3478
3479 static int determine_names(void) {
3480 int r;
3481
3482 if (!arg_image && !arg_directory) {
3483 if (arg_machine) {
3484 _cleanup_(image_unrefp) Image *i = NULL;
3485
3486 r = image_find(arg_machine, &i);
3487 if (r < 0)
3488 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3489 else if (r == 0) {
3490 log_error("No image for machine '%s': %m", arg_machine);
3491 return -ENOENT;
3492 }
3493
3494 if (i->type == IMAGE_RAW)
3495 r = set_sanitized_path(&arg_image, i->path);
3496 else
3497 r = set_sanitized_path(&arg_directory, i->path);
3498 if (r < 0)
3499 return log_error_errno(r, "Invalid image directory: %m");
3500
3501 arg_read_only = arg_read_only || i->read_only;
3502 } else
3503 arg_directory = get_current_dir_name();
3504
3505 if (!arg_directory && !arg_machine) {
3506 log_error("Failed to determine path, please use -D or -i.");
3507 return -EINVAL;
3508 }
3509 }
3510
3511 if (!arg_machine) {
3512 if (arg_directory && path_equal(arg_directory, "/"))
3513 arg_machine = gethostname_malloc();
3514 else
3515 arg_machine = strdup(basename(arg_image ?: arg_directory));
3516
3517 if (!arg_machine)
3518 return log_oom();
3519
3520 hostname_cleanup(arg_machine, false);
3521 if (!machine_name_is_valid(arg_machine)) {
3522 log_error("Failed to determine machine name automatically, please use -M.");
3523 return -EINVAL;
3524 }
3525
3526 if (arg_ephemeral) {
3527 char *b;
3528
3529 /* Add a random suffix when this is an
3530 * ephemeral machine, so that we can run many
3531 * instances at once without manually having
3532 * to specify -M each time. */
3533
3534 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3535 return log_oom();
3536
3537 free(arg_machine);
3538 arg_machine = b;
3539 }
3540 }
3541
3542 return 0;
3543 }
3544
3545 int main(int argc, char *argv[]) {
3546
3547 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3548 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3549 _cleanup_close_ int master = -1, image_fd = -1;
3550 _cleanup_fdset_free_ FDSet *fds = NULL;
3551 int r, n_fd_passed, loop_nr = -1;
3552 char veth_name[IFNAMSIZ];
3553 bool secondary = false, remove_subvol = false;
3554 sigset_t mask, mask_chld;
3555 pid_t pid = 0;
3556 int ret = EXIT_SUCCESS;
3557 union in_addr_union exposed = {};
3558 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3559
3560 log_parse_environment();
3561 log_open();
3562
3563 r = parse_argv(argc, argv);
3564 if (r <= 0)
3565 goto finish;
3566
3567 r = determine_names();
3568 if (r < 0)
3569 goto finish;
3570
3571 if (geteuid() != 0) {
3572 log_error("Need to be root.");
3573 r = -EPERM;
3574 goto finish;
3575 }
3576
3577 if (sd_booted() <= 0) {
3578 log_error("Not running on a systemd system.");
3579 r = -EINVAL;
3580 goto finish;
3581 }
3582
3583 log_close();
3584 n_fd_passed = sd_listen_fds(false);
3585 if (n_fd_passed > 0) {
3586 r = fdset_new_listen_fds(&fds, false);
3587 if (r < 0) {
3588 log_error_errno(r, "Failed to collect file descriptors: %m");
3589 goto finish;
3590 }
3591 }
3592 fdset_close_others(fds);
3593 log_open();
3594
3595 if (arg_directory) {
3596 assert(!arg_image);
3597
3598 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3599 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3600 r = -EINVAL;
3601 goto finish;
3602 }
3603
3604 if (arg_ephemeral) {
3605 _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3606 char *np;
3607
3608 /* If the specified path is a mount point we
3609 * generate the new snapshot immediately
3610 * inside it under a random name. However if
3611 * the specified is not a mount point we
3612 * create the new snapshot in the parent
3613 * directory, just next to it. */
3614 r = path_is_mount_point(arg_directory, false);
3615 if (r < 0) {
3616 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3617 goto finish;
3618 }
3619 if (r > 0)
3620 r = tempfn_random_child(arg_directory, &np);
3621 else
3622 r = tempfn_random(arg_directory, &np);
3623 if (r < 0) {
3624 log_error_errno(r, "Failed to generate name for snapshot: %m");
3625 goto finish;
3626 }
3627
3628 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3629 if (r < 0) {
3630 log_error_errno(r, "Failed to lock %s: %m", np);
3631 goto finish;
3632 }
3633
3634 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3635 if (r < 0) {
3636 free(np);
3637 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3638 goto finish;
3639 }
3640
3641 free(arg_directory);
3642 arg_directory = np;
3643
3644 remove_subvol = true;
3645
3646 } else {
3647 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3648 if (r == -EBUSY) {
3649 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3650 goto finish;
3651 }
3652 if (r < 0) {
3653 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3654 return r;
3655 }
3656
3657 if (arg_template) {
3658 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3659 if (r == -EEXIST) {
3660 if (!arg_quiet)
3661 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3662 } else if (r < 0) {
3663 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3664 goto finish;
3665 } else {
3666 if (!arg_quiet)
3667 log_info("Populated %s from template %s.", arg_directory, arg_template);
3668 }
3669 }
3670 }
3671
3672 if (arg_boot) {
3673 if (path_is_os_tree(arg_directory) <= 0) {
3674 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3675 r = -EINVAL;
3676 goto finish;
3677 }
3678 } else {
3679 const char *p;
3680
3681 p = strappenda(arg_directory,
3682 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3683 if (access(p, F_OK) < 0) {
3684 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3685 r = -EINVAL;
3686 goto finish;
3687 }
3688 }
3689
3690 } else {
3691 char template[] = "/tmp/nspawn-root-XXXXXX";
3692
3693 assert(arg_image);
3694 assert(!arg_template);
3695
3696 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3697 if (r == -EBUSY) {
3698 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3699 goto finish;
3700 }
3701 if (r < 0) {
3702 r = log_error_errno(r, "Failed to create image lock: %m");
3703 goto finish;
3704 }
3705
3706 if (!mkdtemp(template)) {
3707 log_error_errno(errno, "Failed to create temporary directory: %m");
3708 r = -errno;
3709 goto finish;
3710 }
3711
3712 arg_directory = strdup(template);
3713 if (!arg_directory) {
3714 r = log_oom();
3715 goto finish;
3716 }
3717
3718 image_fd = setup_image(&device_path, &loop_nr);
3719 if (image_fd < 0) {
3720 r = image_fd;
3721 goto finish;
3722 }
3723
3724 r = dissect_image(image_fd,
3725 &root_device, &root_device_rw,
3726 &home_device, &home_device_rw,
3727 &srv_device, &srv_device_rw,
3728 &secondary);
3729 if (r < 0)
3730 goto finish;
3731 }
3732
3733 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3734 if (master < 0) {
3735 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3736 goto finish;
3737 }
3738
3739 r = ptsname_malloc(master, &console);
3740 if (r < 0) {
3741 r = log_error_errno(r, "Failed to determine tty name: %m");
3742 goto finish;
3743 }
3744
3745 if (!arg_quiet)
3746 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3747 arg_machine, arg_image ?: arg_directory);
3748
3749 if (unlockpt(master) < 0) {
3750 r = log_error_errno(errno, "Failed to unlock tty: %m");
3751 goto finish;
3752 }
3753
3754 assert_se(sigemptyset(&mask) == 0);
3755 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3756 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3757
3758 assert_se(sigemptyset(&mask_chld) == 0);
3759 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3760
3761 for (;;) {
3762 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3763 ContainerStatus container_status;
3764 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3765 struct sigaction sa = {
3766 .sa_handler = nop_handler,
3767 .sa_flags = SA_NOCLDSTOP,
3768 };
3769
3770 r = barrier_create(&barrier);
3771 if (r < 0) {
3772 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3773 goto finish;
3774 }
3775
3776 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3777 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3778 goto finish;
3779 }
3780
3781 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3782 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3783 goto finish;
3784 }
3785
3786 /* Child can be killed before execv(), so handle SIGCHLD
3787 * in order to interrupt parent's blocking calls and
3788 * give it a chance to call wait() and terminate. */
3789 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3790 if (r < 0) {
3791 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3792 goto finish;
3793 }
3794
3795 r = sigaction(SIGCHLD, &sa, NULL);
3796 if (r < 0) {
3797 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3798 goto finish;
3799 }
3800
3801 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3802 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3803 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3804 if (pid < 0) {
3805 if (errno == EINVAL)
3806 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3807 else
3808 r = log_error_errno(errno, "clone() failed: %m");
3809
3810 goto finish;
3811 }
3812
3813 if (pid == 0) {
3814 /* child */
3815 _cleanup_free_ char *home = NULL;
3816 unsigned n_env = 2;
3817 const char *envp[] = {
3818 "PATH=" DEFAULT_PATH_SPLIT_USR,
3819 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3820 NULL, /* TERM */
3821 NULL, /* HOME */
3822 NULL, /* USER */
3823 NULL, /* LOGNAME */
3824 NULL, /* container_uuid */
3825 NULL, /* LISTEN_FDS */
3826 NULL, /* LISTEN_PID */
3827 NULL
3828 };
3829 char **env_use;
3830
3831 barrier_set_role(&barrier, BARRIER_CHILD);
3832
3833 envp[n_env] = strv_find_prefix(environ, "TERM=");
3834 if (envp[n_env])
3835 n_env ++;
3836
3837 master = safe_close(master);
3838
3839 close_nointr(STDIN_FILENO);
3840 close_nointr(STDOUT_FILENO);
3841 close_nointr(STDERR_FILENO);
3842
3843 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3844 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3845
3846 reset_all_signal_handlers();
3847 reset_signal_mask();
3848
3849 r = open_terminal(console, O_RDWR);
3850 if (r != STDIN_FILENO) {
3851 if (r >= 0) {
3852 safe_close(r);
3853 r = -EINVAL;
3854 }
3855
3856 log_error_errno(r, "Failed to open console: %m");
3857 _exit(EXIT_FAILURE);
3858 }
3859
3860 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3861 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3862 log_error_errno(errno, "Failed to duplicate console: %m");
3863 _exit(EXIT_FAILURE);
3864 }
3865
3866 if (setsid() < 0) {
3867 log_error_errno(errno, "setsid() failed: %m");
3868 _exit(EXIT_FAILURE);
3869 }
3870
3871 if (reset_audit_loginuid() < 0)
3872 _exit(EXIT_FAILURE);
3873
3874 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3875 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3876 _exit(EXIT_FAILURE);
3877 }
3878
3879 /* Mark everything as slave, so that we still
3880 * receive mounts from the real root, but don't
3881 * propagate mounts to the real root. */
3882 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3883 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3884 _exit(EXIT_FAILURE);
3885 }
3886
3887 if (mount_devices(arg_directory,
3888 root_device, root_device_rw,
3889 home_device, home_device_rw,
3890 srv_device, srv_device_rw) < 0)
3891 _exit(EXIT_FAILURE);
3892
3893 /* Turn directory into bind mount */
3894 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3895 log_error_errno(errno, "Failed to make bind mount: %m");
3896 _exit(EXIT_FAILURE);
3897 }
3898
3899 r = setup_volatile(arg_directory);
3900 if (r < 0)
3901 _exit(EXIT_FAILURE);
3902
3903 if (setup_volatile_state(arg_directory) < 0)
3904 _exit(EXIT_FAILURE);
3905
3906 r = base_filesystem_create(arg_directory);
3907 if (r < 0)
3908 _exit(EXIT_FAILURE);
3909
3910 if (arg_read_only) {
3911 r = bind_remount_recursive(arg_directory, true);
3912 if (r < 0) {
3913 log_error_errno(r, "Failed to make tree read-only: %m");
3914 _exit(EXIT_FAILURE);
3915 }
3916 }
3917
3918 if (mount_all(arg_directory) < 0)
3919 _exit(EXIT_FAILURE);
3920
3921 if (copy_devnodes(arg_directory) < 0)
3922 _exit(EXIT_FAILURE);
3923
3924 if (setup_ptmx(arg_directory) < 0)
3925 _exit(EXIT_FAILURE);
3926
3927 dev_setup(arg_directory);
3928
3929 if (setup_propagate(arg_directory) < 0)
3930 _exit(EXIT_FAILURE);
3931
3932 if (setup_seccomp() < 0)
3933 _exit(EXIT_FAILURE);
3934
3935 if (setup_dev_console(arg_directory, console) < 0)
3936 _exit(EXIT_FAILURE);
3937
3938 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3939 _exit(EXIT_FAILURE);
3940 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3941
3942 if (send_rtnl(rtnl_socket_pair[1]) < 0)
3943 _exit(EXIT_FAILURE);
3944 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3945
3946 /* Tell the parent that we are ready, and that
3947 * it can cgroupify us to that we lack access
3948 * to certain devices and resources. */
3949 (void) barrier_place(&barrier);
3950
3951 if (setup_boot_id(arg_directory) < 0)
3952 _exit(EXIT_FAILURE);
3953
3954 if (setup_timezone(arg_directory) < 0)
3955 _exit(EXIT_FAILURE);
3956
3957 if (setup_resolv_conf(arg_directory) < 0)
3958 _exit(EXIT_FAILURE);
3959
3960 if (setup_journal(arg_directory) < 0)
3961 _exit(EXIT_FAILURE);
3962
3963 if (mount_binds(arg_directory, arg_bind, false) < 0)
3964 _exit(EXIT_FAILURE);
3965
3966 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3967 _exit(EXIT_FAILURE);
3968
3969 if (mount_tmpfs(arg_directory) < 0)
3970 _exit(EXIT_FAILURE);
3971
3972 /* Wait until we are cgroup-ified, so that we
3973 * can mount the right cgroup path writable */
3974 (void) barrier_sync_next(&barrier);
3975
3976 if (mount_cgroup(arg_directory) < 0)
3977 _exit(EXIT_FAILURE);
3978
3979 if (chdir(arg_directory) < 0) {
3980 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3981 _exit(EXIT_FAILURE);
3982 }
3983
3984 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3985 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3986 _exit(EXIT_FAILURE);
3987 }
3988
3989 if (chroot(".") < 0) {
3990 log_error_errno(errno, "chroot() failed: %m");
3991 _exit(EXIT_FAILURE);
3992 }
3993
3994 if (chdir("/") < 0) {
3995 log_error_errno(errno, "chdir() failed: %m");
3996 _exit(EXIT_FAILURE);
3997 }
3998
3999 umask(0022);
4000
4001 if (arg_private_network)
4002 loopback_setup();
4003
4004 if (drop_capabilities() < 0) {
4005 log_error_errno(errno, "drop_capabilities() failed: %m");
4006 _exit(EXIT_FAILURE);
4007 }
4008
4009 r = change_uid_gid(&home);
4010 if (r < 0)
4011 _exit(EXIT_FAILURE);
4012
4013 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4014 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4015 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4016 log_oom();
4017 _exit(EXIT_FAILURE);
4018 }
4019
4020 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4021 char as_uuid[37];
4022
4023 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4024 log_oom();
4025 _exit(EXIT_FAILURE);
4026 }
4027 }
4028
4029 if (fdset_size(fds) > 0) {
4030 r = fdset_cloexec(fds, false);
4031 if (r < 0) {
4032 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4033 _exit(EXIT_FAILURE);
4034 }
4035
4036 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4037 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4038 log_oom();
4039 _exit(EXIT_FAILURE);
4040 }
4041 }
4042
4043 setup_hostname();
4044
4045 if (arg_personality != 0xffffffffLU) {
4046 if (personality(arg_personality) < 0) {
4047 log_error_errno(errno, "personality() failed: %m");
4048 _exit(EXIT_FAILURE);
4049 }
4050 } else if (secondary) {
4051 if (personality(PER_LINUX32) < 0) {
4052 log_error_errno(errno, "personality() failed: %m");
4053 _exit(EXIT_FAILURE);
4054 }
4055 }
4056
4057 #ifdef HAVE_SELINUX
4058 if (arg_selinux_context)
4059 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4060 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4061 _exit(EXIT_FAILURE);
4062 }
4063 #endif
4064
4065 if (!strv_isempty(arg_setenv)) {
4066 char **n;
4067
4068 n = strv_env_merge(2, envp, arg_setenv);
4069 if (!n) {
4070 log_oom();
4071 _exit(EXIT_FAILURE);
4072 }
4073
4074 env_use = n;
4075 } else
4076 env_use = (char**) envp;
4077
4078 /* Wait until the parent is ready with the setup, too... */
4079 if (!barrier_place_and_sync(&barrier))
4080 _exit(EXIT_FAILURE);
4081
4082 if (arg_boot) {
4083 char **a;
4084 size_t l;
4085
4086 /* Automatically search for the init system */
4087
4088 l = 1 + argc - optind;
4089 a = newa(char*, l + 1);
4090 memcpy(a + 1, argv + optind, l * sizeof(char*));
4091
4092 a[0] = (char*) "/usr/lib/systemd/systemd";
4093 execve(a[0], a, env_use);
4094
4095 a[0] = (char*) "/lib/systemd/systemd";
4096 execve(a[0], a, env_use);
4097
4098 a[0] = (char*) "/sbin/init";
4099 execve(a[0], a, env_use);
4100 } else if (argc > optind)
4101 execvpe(argv[optind], argv + optind, env_use);
4102 else {
4103 chdir(home ? home : "/root");
4104 execle("/bin/bash", "-bash", NULL, env_use);
4105 execle("/bin/sh", "-sh", NULL, env_use);
4106 }
4107
4108 log_error_errno(errno, "execv() failed: %m");
4109 _exit(EXIT_FAILURE);
4110 }
4111
4112 barrier_set_role(&barrier, BARRIER_PARENT);
4113 fdset_free(fds);
4114 fds = NULL;
4115
4116 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4117 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4118
4119 /* Wait for the most basic Child-setup to be done,
4120 * before we add hardware to it, and place it in a
4121 * cgroup. */
4122 if (barrier_sync_next(&barrier)) {
4123 int ifi = 0;
4124
4125 r = move_network_interfaces(pid);
4126 if (r < 0)
4127 goto finish;
4128
4129 r = setup_veth(pid, veth_name, &ifi);
4130 if (r < 0)
4131 goto finish;
4132
4133 r = setup_bridge(veth_name, &ifi);
4134 if (r < 0)
4135 goto finish;
4136
4137 r = setup_macvlan(pid);
4138 if (r < 0)
4139 goto finish;
4140
4141 r = setup_ipvlan(pid);
4142 if (r < 0)
4143 goto finish;
4144
4145 r = register_machine(pid, ifi);
4146 if (r < 0)
4147 goto finish;
4148
4149 /* Block SIGCHLD here, before notifying child.
4150 * process_pty() will handle it with the other signals. */
4151 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4152 if (r < 0)
4153 goto finish;
4154
4155 /* Reset signal to default */
4156 r = default_signals(SIGCHLD, -1);
4157 if (r < 0)
4158 goto finish;
4159
4160 /* Notify the child that the parent is ready with all
4161 * its setup, and that the child can now hand over
4162 * control to the code to run inside the container. */
4163 (void) barrier_place(&barrier);
4164
4165 /* And wait that the child is completely ready now. */
4166 if (barrier_place_and_sync(&barrier)) {
4167 _cleanup_event_unref_ sd_event *event = NULL;
4168 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4169 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4170 char last_char = 0;
4171
4172 sd_notifyf(false,
4173 "READY=1\n"
4174 "STATUS=Container running.\n"
4175 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4176
4177 r = sd_event_new(&event);
4178 if (r < 0) {
4179 log_error_errno(r, "Failed to get default event source: %m");
4180 goto finish;
4181 }
4182
4183 if (arg_boot) {
4184 /* Try to kill the init system on SIGINT or SIGTERM */
4185 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4186 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4187 } else {
4188 /* Immediately exit */
4189 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4190 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4191 }
4192
4193 /* simply exit on sigchld */
4194 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4195
4196 if (arg_expose_ports) {
4197 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4198 if (r < 0)
4199 goto finish;
4200
4201 (void) expose_ports(rtnl, &exposed);
4202 }
4203
4204 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4205
4206 r = pty_forward_new(event, master, true, &forward);
4207 if (r < 0) {
4208 log_error_errno(r, "Failed to create PTY forwarder: %m");
4209 goto finish;
4210 }
4211
4212 r = sd_event_loop(event);
4213 if (r < 0) {
4214 log_error_errno(r, "Failed to run event loop: %m");
4215 goto finish;
4216 }
4217
4218 pty_forward_get_last_char(forward, &last_char);
4219
4220 forward = pty_forward_free(forward);
4221
4222 if (!arg_quiet && last_char != '\n')
4223 putc('\n', stdout);
4224
4225 /* Kill if it is not dead yet anyway */
4226 terminate_machine(pid);
4227 }
4228 }
4229
4230 /* Normally redundant, but better safe than sorry */
4231 kill(pid, SIGKILL);
4232
4233 r = wait_for_container(pid, &container_status);
4234 pid = 0;
4235
4236 if (r < 0)
4237 /* We failed to wait for the container, or the
4238 * container exited abnormally */
4239 goto finish;
4240 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4241 /* The container exited with a non-zero
4242 * status, or with zero status and no reboot
4243 * was requested. */
4244 ret = r;
4245 break;
4246 }
4247
4248 /* CONTAINER_REBOOTED, loop again */
4249
4250 if (arg_keep_unit) {
4251 /* Special handling if we are running as a
4252 * service: instead of simply restarting the
4253 * machine we want to restart the entire
4254 * service, so let's inform systemd about this
4255 * with the special exit code 133. The service
4256 * file uses RestartForceExitStatus=133 so
4257 * that this results in a full nspawn
4258 * restart. This is necessary since we might
4259 * have cgroup parameters set we want to have
4260 * flushed out. */
4261 ret = 133;
4262 r = 0;
4263 break;
4264 }
4265
4266 flush_ports(&exposed);
4267 }
4268
4269 finish:
4270 sd_notify(false,
4271 "STOPPING=1\n"
4272 "STATUS=Terminating...");
4273
4274 loop_remove(loop_nr, &image_fd);
4275
4276 if (pid > 0)
4277 kill(pid, SIGKILL);
4278
4279 if (remove_subvol && arg_directory) {
4280 int k;
4281
4282 k = btrfs_subvol_remove(arg_directory);
4283 if (k < 0)
4284 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4285 }
4286
4287 if (arg_machine) {
4288 const char *p;
4289
4290 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4291 (void) rm_rf(p, false, true, false);
4292 }
4293
4294 free(arg_directory);
4295 free(arg_template);
4296 free(arg_image);
4297 free(arg_machine);
4298 free(arg_user);
4299 strv_free(arg_setenv);
4300 strv_free(arg_network_interfaces);
4301 strv_free(arg_network_macvlan);
4302 strv_free(arg_network_ipvlan);
4303 strv_free(arg_bind);
4304 strv_free(arg_bind_ro);
4305 strv_free(arg_tmpfs);
4306
4307 flush_ports(&exposed);
4308
4309 while (arg_expose_ports) {
4310 ExposePort *p = arg_expose_ports;
4311 LIST_REMOVE(ports, arg_expose_ports, p);
4312 free(p);
4313 }
4314
4315 return r < 0 ? EXIT_FAILURE : ret;
4316 }