]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
232629d20ad80c9e4b95f894edf730bf5017ebff
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108 int protocol;
109 uint16_t host_port;
110 uint16_t container_port;
111 LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Outcome of a container run; the code consuming these values is outside
 * this part of the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
118
/* Base mode selected by --link-journal= ("no", "auto", "host", "guest").
 * The "try-" variants reuse LINK_HOST/LINK_GUEST and are distinguished by a
 * separate flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
125
/* Mode selected by --volatile[=MODE]: a boolean argument maps to
 * VOLATILE_YES/VOLATILE_NO, the literal "state" maps to VOLATILE_STATE. */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147 (1ULL << CAP_CHOWN) |
148 (1ULL << CAP_DAC_OVERRIDE) |
149 (1ULL << CAP_DAC_READ_SEARCH) |
150 (1ULL << CAP_FOWNER) |
151 (1ULL << CAP_FSETID) |
152 (1ULL << CAP_IPC_OWNER) |
153 (1ULL << CAP_KILL) |
154 (1ULL << CAP_LEASE) |
155 (1ULL << CAP_LINUX_IMMUTABLE) |
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
159 (1ULL << CAP_SETGID) |
160 (1ULL << CAP_SETFCAP) |
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
164 (1ULL << CAP_SYS_CHROOT) |
165 (1ULL << CAP_SYS_NICE) |
166 (1ULL << CAP_SYS_PTRACE) |
167 (1ULL << CAP_SYS_TTY_CONFIG) |
168 (1ULL << CAP_SYS_RESOURCE) |
169 (1ULL << CAP_SYS_BOOT) |
170 (1ULL << CAP_AUDIT_WRITE) |
171 (1ULL << CAP_AUDIT_CONTROL) |
172 (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190 static char **arg_property = NULL;
191
192 static void help(void) {
193 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
194 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
195 " -h --help Show this help\n"
196 " --version Print version string\n"
197 " -q --quiet Do not show status information\n"
198 " -D --directory=PATH Root directory for the container\n"
199 " --template=PATH Initialize root directory from template directory,\n"
200 " if missing\n"
201 " -x --ephemeral Run container with snapshot of root directory, and\n"
202 " remove it after exit\n"
203 " -i --image=PATH File system device or disk image for the container\n"
204 " -b --boot Boot up full system (i.e. invoke init)\n"
205 " -u --user=USER Run the command under specified user or uid\n"
206 " -M --machine=NAME Set the machine name for the container\n"
207 " --uuid=UUID Set a specific machine UUID for the container\n"
208 " -S --slice=SLICE Place the container in the specified slice\n"
209 " --property=NAME=VALUE Set scope unit property\n"
210 " --private-network Disable network in container\n"
211 " --network-interface=INTERFACE\n"
212 " Assign an existing network interface to the\n"
213 " container\n"
214 " --network-macvlan=INTERFACE\n"
215 " Create a macvlan network interface based on an\n"
216 " existing network interface to the container\n"
217 " --network-ipvlan=INTERFACE\n"
218 " Create a ipvlan network interface based on an\n"
219 " existing network interface to the container\n"
220 " -n --network-veth Add a virtual ethernet connection between host\n"
221 " and container\n"
222 " --network-bridge=INTERFACE\n"
223 " Add a virtual ethernet connection between host\n"
224 " and container and add it to an existing bridge on\n"
225 " the host\n"
226 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
227 " Expose a container IP port on the host\n"
228 " -Z --selinux-context=SECLABEL\n"
229 " Set the SELinux security context to be used by\n"
230 " processes in the container\n"
231 " -L --selinux-apifs-context=SECLABEL\n"
232 " Set the SELinux security context to be used by\n"
233 " API/tmpfs file systems in the container\n"
234 " --capability=CAP In addition to the default, retain specified\n"
235 " capability\n"
236 " --drop-capability=CAP Drop the specified capability from the default set\n"
237 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
238 " try-guest, try-host\n"
239 " -j Equivalent to --link-journal=try-guest\n"
240 " --read-only Mount the root directory read-only\n"
241 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
242 " the container\n"
243 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
244 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
245 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
246 " --share-system Share system namespaces with host\n"
247 " --register=BOOLEAN Register container as machine\n"
248 " --keep-unit Do not register a scope for the machine, reuse\n"
249 " the service unit nspawn is running in\n"
250 " --volatile[=MODE] Run the system in volatile mode\n"
251 , program_invocation_short_name);
252 }
253
/* Replaces *b with a cleaned-up absolute version of 'path'.
 *
 * The path is canonicalized if it exists. If it does not exist (ENOENT) it
 * is merely made absolute relative to the current working directory. The
 * result is passed through path_kill_slashes() before being stored, and the
 * previous value of *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason
 * other than ENOENT, and -ENOMEM if the fallback allocation fails. On
 * failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Target does not exist (yet): settle for an absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
274
275 static int parse_argv(int argc, char *argv[]) {
276
277 enum {
278 ARG_VERSION = 0x100,
279 ARG_PRIVATE_NETWORK,
280 ARG_UUID,
281 ARG_READ_ONLY,
282 ARG_CAPABILITY,
283 ARG_DROP_CAPABILITY,
284 ARG_LINK_JOURNAL,
285 ARG_BIND,
286 ARG_BIND_RO,
287 ARG_TMPFS,
288 ARG_SETENV,
289 ARG_SHARE_SYSTEM,
290 ARG_REGISTER,
291 ARG_KEEP_UNIT,
292 ARG_NETWORK_INTERFACE,
293 ARG_NETWORK_MACVLAN,
294 ARG_NETWORK_IPVLAN,
295 ARG_NETWORK_BRIDGE,
296 ARG_PERSONALITY,
297 ARG_VOLATILE,
298 ARG_TEMPLATE,
299 ARG_PROPERTY,
300 };
301
302 static const struct option options[] = {
303 { "help", no_argument, NULL, 'h' },
304 { "version", no_argument, NULL, ARG_VERSION },
305 { "directory", required_argument, NULL, 'D' },
306 { "template", required_argument, NULL, ARG_TEMPLATE },
307 { "ephemeral", no_argument, NULL, 'x' },
308 { "user", required_argument, NULL, 'u' },
309 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
310 { "boot", no_argument, NULL, 'b' },
311 { "uuid", required_argument, NULL, ARG_UUID },
312 { "read-only", no_argument, NULL, ARG_READ_ONLY },
313 { "capability", required_argument, NULL, ARG_CAPABILITY },
314 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
315 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
316 { "bind", required_argument, NULL, ARG_BIND },
317 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
318 { "tmpfs", required_argument, NULL, ARG_TMPFS },
319 { "machine", required_argument, NULL, 'M' },
320 { "slice", required_argument, NULL, 'S' },
321 { "setenv", required_argument, NULL, ARG_SETENV },
322 { "selinux-context", required_argument, NULL, 'Z' },
323 { "selinux-apifs-context", required_argument, NULL, 'L' },
324 { "quiet", no_argument, NULL, 'q' },
325 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
326 { "register", required_argument, NULL, ARG_REGISTER },
327 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
328 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
329 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
330 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
331 { "network-veth", no_argument, NULL, 'n' },
332 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
333 { "personality", required_argument, NULL, ARG_PERSONALITY },
334 { "image", required_argument, NULL, 'i' },
335 { "volatile", optional_argument, NULL, ARG_VOLATILE },
336 { "port", required_argument, NULL, 'p' },
337 { "property", required_argument, NULL, ARG_PROPERTY },
338 {}
339 };
340
341 int c, r;
342 uint64_t plus = 0, minus = 0;
343
344 assert(argc >= 0);
345 assert(argv);
346
347 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
348
349 switch (c) {
350
351 case 'h':
352 help();
353 return 0;
354
355 case ARG_VERSION:
356 puts(PACKAGE_STRING);
357 puts(SYSTEMD_FEATURES);
358 return 0;
359
360 case 'D':
361 r = set_sanitized_path(&arg_directory, optarg);
362 if (r < 0)
363 return log_error_errno(r, "Invalid root directory: %m");
364
365 break;
366
367 case ARG_TEMPLATE:
368 r = set_sanitized_path(&arg_template, optarg);
369 if (r < 0)
370 return log_error_errno(r, "Invalid template directory: %m");
371
372 break;
373
374 case 'i':
375 r = set_sanitized_path(&arg_image, optarg);
376 if (r < 0)
377 return log_error_errno(r, "Invalid image path: %m");
378
379 break;
380
381 case 'x':
382 arg_ephemeral = true;
383 break;
384
385 case 'u':
386 free(arg_user);
387 arg_user = strdup(optarg);
388 if (!arg_user)
389 return log_oom();
390
391 break;
392
393 case ARG_NETWORK_BRIDGE:
394 arg_network_bridge = optarg;
395
396 /* fall through */
397
398 case 'n':
399 arg_network_veth = true;
400 arg_private_network = true;
401 break;
402
403 case ARG_NETWORK_INTERFACE:
404 if (strv_extend(&arg_network_interfaces, optarg) < 0)
405 return log_oom();
406
407 arg_private_network = true;
408 break;
409
410 case ARG_NETWORK_MACVLAN:
411 if (strv_extend(&arg_network_macvlan, optarg) < 0)
412 return log_oom();
413
414 arg_private_network = true;
415 break;
416
417 case ARG_NETWORK_IPVLAN:
418 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
419 return log_oom();
420
421 /* fall through */
422
423 case ARG_PRIVATE_NETWORK:
424 arg_private_network = true;
425 break;
426
427 case 'b':
428 arg_boot = true;
429 break;
430
431 case ARG_UUID:
432 r = sd_id128_from_string(optarg, &arg_uuid);
433 if (r < 0) {
434 log_error("Invalid UUID: %s", optarg);
435 return r;
436 }
437 break;
438
439 case 'S':
440 arg_slice = optarg;
441 break;
442
443 case 'M':
444 if (isempty(optarg)) {
445 free(arg_machine);
446 arg_machine = NULL;
447 } else {
448 if (!machine_name_is_valid(optarg)) {
449 log_error("Invalid machine name: %s", optarg);
450 return -EINVAL;
451 }
452
453 r = free_and_strdup(&arg_machine, optarg);
454 if (r < 0)
455 return log_oom();
456
457 break;
458 }
459
460 case 'Z':
461 arg_selinux_context = optarg;
462 break;
463
464 case 'L':
465 arg_selinux_apifs_context = optarg;
466 break;
467
468 case ARG_READ_ONLY:
469 arg_read_only = true;
470 break;
471
472 case ARG_CAPABILITY:
473 case ARG_DROP_CAPABILITY: {
474 const char *state, *word;
475 size_t length;
476
477 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
478 _cleanup_free_ char *t;
479
480 t = strndup(word, length);
481 if (!t)
482 return log_oom();
483
484 if (streq(t, "all")) {
485 if (c == ARG_CAPABILITY)
486 plus = (uint64_t) -1;
487 else
488 minus = (uint64_t) -1;
489 } else {
490 int cap;
491
492 cap = capability_from_name(t);
493 if (cap < 0) {
494 log_error("Failed to parse capability %s.", t);
495 return -EINVAL;
496 }
497
498 if (c == ARG_CAPABILITY)
499 plus |= 1ULL << (uint64_t) cap;
500 else
501 minus |= 1ULL << (uint64_t) cap;
502 }
503 }
504
505 break;
506 }
507
508 case 'j':
509 arg_link_journal = LINK_GUEST;
510 arg_link_journal_try = true;
511 break;
512
513 case ARG_LINK_JOURNAL:
514 if (streq(optarg, "auto")) {
515 arg_link_journal = LINK_AUTO;
516 arg_link_journal_try = false;
517 } else if (streq(optarg, "no")) {
518 arg_link_journal = LINK_NO;
519 arg_link_journal_try = false;
520 } else if (streq(optarg, "guest")) {
521 arg_link_journal = LINK_GUEST;
522 arg_link_journal_try = false;
523 } else if (streq(optarg, "host")) {
524 arg_link_journal = LINK_HOST;
525 arg_link_journal_try = false;
526 } else if (streq(optarg, "try-guest")) {
527 arg_link_journal = LINK_GUEST;
528 arg_link_journal_try = true;
529 } else if (streq(optarg, "try-host")) {
530 arg_link_journal = LINK_HOST;
531 arg_link_journal_try = true;
532 } else {
533 log_error("Failed to parse link journal mode %s", optarg);
534 return -EINVAL;
535 }
536
537 break;
538
539 case ARG_BIND:
540 case ARG_BIND_RO: {
541 _cleanup_free_ char *a = NULL, *b = NULL;
542 char *e;
543 char ***x;
544
545 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
546
547 e = strchr(optarg, ':');
548 if (e) {
549 a = strndup(optarg, e - optarg);
550 b = strdup(e + 1);
551 } else {
552 a = strdup(optarg);
553 b = strdup(optarg);
554 }
555
556 if (!a || !b)
557 return log_oom();
558
559 if (!path_is_absolute(a) || !path_is_absolute(b)) {
560 log_error("Invalid bind mount specification: %s", optarg);
561 return -EINVAL;
562 }
563
564 r = strv_extend(x, a);
565 if (r < 0)
566 return log_oom();
567
568 r = strv_extend(x, b);
569 if (r < 0)
570 return log_oom();
571
572 break;
573 }
574
575 case ARG_TMPFS: {
576 _cleanup_free_ char *a = NULL, *b = NULL;
577 char *e;
578
579 e = strchr(optarg, ':');
580 if (e) {
581 a = strndup(optarg, e - optarg);
582 b = strdup(e + 1);
583 } else {
584 a = strdup(optarg);
585 b = strdup("mode=0755");
586 }
587
588 if (!a || !b)
589 return log_oom();
590
591 if (!path_is_absolute(a)) {
592 log_error("Invalid tmpfs specification: %s", optarg);
593 return -EINVAL;
594 }
595
596 r = strv_push(&arg_tmpfs, a);
597 if (r < 0)
598 return log_oom();
599
600 a = NULL;
601
602 r = strv_push(&arg_tmpfs, b);
603 if (r < 0)
604 return log_oom();
605
606 b = NULL;
607
608 break;
609 }
610
611 case ARG_SETENV: {
612 char **n;
613
614 if (!env_assignment_is_valid(optarg)) {
615 log_error("Environment variable assignment '%s' is not valid.", optarg);
616 return -EINVAL;
617 }
618
619 n = strv_env_set(arg_setenv, optarg);
620 if (!n)
621 return log_oom();
622
623 strv_free(arg_setenv);
624 arg_setenv = n;
625 break;
626 }
627
628 case 'q':
629 arg_quiet = true;
630 break;
631
632 case ARG_SHARE_SYSTEM:
633 arg_share_system = true;
634 break;
635
636 case ARG_REGISTER:
637 r = parse_boolean(optarg);
638 if (r < 0) {
639 log_error("Failed to parse --register= argument: %s", optarg);
640 return r;
641 }
642
643 arg_register = r;
644 break;
645
646 case ARG_KEEP_UNIT:
647 arg_keep_unit = true;
648 break;
649
650 case ARG_PERSONALITY:
651
652 arg_personality = personality_from_string(optarg);
653 if (arg_personality == 0xffffffffLU) {
654 log_error("Unknown or unsupported personality '%s'.", optarg);
655 return -EINVAL;
656 }
657
658 break;
659
660 case ARG_VOLATILE:
661
662 if (!optarg)
663 arg_volatile = VOLATILE_YES;
664 else {
665 r = parse_boolean(optarg);
666 if (r < 0) {
667 if (streq(optarg, "state"))
668 arg_volatile = VOLATILE_STATE;
669 else {
670 log_error("Failed to parse --volatile= argument: %s", optarg);
671 return r;
672 }
673 } else
674 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
675 }
676
677 break;
678
679 case 'p': {
680 const char *split, *e;
681 uint16_t container_port, host_port;
682 int protocol;
683 ExposePort *p;
684
685 if ((e = startswith(optarg, "tcp:")))
686 protocol = IPPROTO_TCP;
687 else if ((e = startswith(optarg, "udp:")))
688 protocol = IPPROTO_UDP;
689 else {
690 e = optarg;
691 protocol = IPPROTO_TCP;
692 }
693
694 split = strchr(e, ':');
695 if (split) {
696 char v[split - e + 1];
697
698 memcpy(v, e, split - e);
699 v[split - e] = 0;
700
701 r = safe_atou16(v, &host_port);
702 if (r < 0 || host_port <= 0) {
703 log_error("Failed to parse host port: %s", optarg);
704 return -EINVAL;
705 }
706
707 r = safe_atou16(split + 1, &container_port);
708 } else {
709 r = safe_atou16(e, &container_port);
710 host_port = container_port;
711 }
712
713 if (r < 0 || container_port <= 0) {
714 log_error("Failed to parse host port: %s", optarg);
715 return -EINVAL;
716 }
717
718 LIST_FOREACH(ports, p, arg_expose_ports) {
719 if (p->protocol == protocol && p->host_port == host_port) {
720 log_error("Duplicate port specification: %s", optarg);
721 return -EINVAL;
722 }
723 }
724
725 p = new(ExposePort, 1);
726 if (!p)
727 return log_oom();
728
729 p->protocol = protocol;
730 p->host_port = host_port;
731 p->container_port = container_port;
732
733 LIST_PREPEND(ports, arg_expose_ports, p);
734
735 break;
736 }
737
738 case ARG_PROPERTY:
739 if (strv_extend(&arg_property, optarg) < 0)
740 return log_oom();
741
742 break;
743
744 case '?':
745 return -EINVAL;
746
747 default:
748 assert_not_reached("Unhandled option");
749 }
750
751 if (arg_share_system)
752 arg_register = false;
753
754 if (arg_boot && arg_share_system) {
755 log_error("--boot and --share-system may not be combined.");
756 return -EINVAL;
757 }
758
759 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
760 log_error("--keep-unit may not be used when invoked from a user session.");
761 return -EINVAL;
762 }
763
764 if (arg_directory && arg_image) {
765 log_error("--directory= and --image= may not be combined.");
766 return -EINVAL;
767 }
768
769 if (arg_template && arg_image) {
770 log_error("--template= and --image= may not be combined.");
771 return -EINVAL;
772 }
773
774 if (arg_template && !(arg_directory || arg_machine)) {
775 log_error("--template= needs --directory= or --machine=.");
776 return -EINVAL;
777 }
778
779 if (arg_ephemeral && arg_template) {
780 log_error("--ephemeral and --template= may not be combined.");
781 return -EINVAL;
782 }
783
784 if (arg_ephemeral && arg_image) {
785 log_error("--ephemeral and --image= may not be combined.");
786 return -EINVAL;
787 }
788
789 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
790 log_error("--ephemeral and --link-journal= may not be combined.");
791 return -EINVAL;
792 }
793
794 if (arg_volatile != VOLATILE_NO && arg_read_only) {
795 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
796 return -EINVAL;
797 }
798
799 if (arg_expose_ports && !arg_private_network) {
800 log_error("Cannot use --port= without private networking.");
801 return -EINVAL;
802 }
803
804 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
805
806 return 1;
807 }
808
809 static int mount_all(const char *dest) {
810
811 typedef struct MountPoint {
812 const char *what;
813 const char *where;
814 const char *type;
815 const char *options;
816 unsigned long flags;
817 bool fatal;
818 } MountPoint;
819
820 static const MountPoint mount_table[] = {
821 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
822 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
823 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
824 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
825 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
826 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
827 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
828 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
829 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
830 #ifdef HAVE_SELINUX
831 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
832 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
833 #endif
834 };
835
836 unsigned k;
837 int r = 0;
838
839 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
840 _cleanup_free_ char *where = NULL;
841 #ifdef HAVE_SELINUX
842 _cleanup_free_ char *options = NULL;
843 #endif
844 const char *o;
845 int t;
846
847 where = strjoin(dest, "/", mount_table[k].where, NULL);
848 if (!where)
849 return log_oom();
850
851 t = path_is_mount_point(where, true);
852 if (t < 0) {
853 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
854
855 if (r == 0)
856 r = t;
857
858 continue;
859 }
860
861 /* Skip this entry if it is not a remount. */
862 if (mount_table[k].what && t > 0)
863 continue;
864
865 t = mkdir_p(where, 0755);
866 if (t < 0) {
867 if (mount_table[k].fatal) {
868 log_error_errno(t, "Failed to create directory %s: %m", where);
869
870 if (r == 0)
871 r = t;
872 } else
873 log_warning_errno(t, "Failed to create directory %s: %m", where);
874
875 continue;
876 }
877
878 #ifdef HAVE_SELINUX
879 if (arg_selinux_apifs_context &&
880 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
881 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
882 if (!options)
883 return log_oom();
884
885 o = options;
886 } else
887 #endif
888 o = mount_table[k].options;
889
890
891 if (mount(mount_table[k].what,
892 where,
893 mount_table[k].type,
894 mount_table[k].flags,
895 o) < 0) {
896
897 if (mount_table[k].fatal) {
898 log_error_errno(errno, "mount(%s) failed: %m", where);
899
900 if (r == 0)
901 r = -errno;
902 } else
903 log_warning_errno(errno, "mount(%s) failed: %m", where);
904 }
905 }
906
907 return r;
908 }
909
910 static int mount_binds(const char *dest, char **l, bool ro) {
911 char **x, **y;
912
913 STRV_FOREACH_PAIR(x, y, l) {
914 _cleanup_free_ char *where = NULL;
915 struct stat source_st, dest_st;
916 int r;
917
918 if (stat(*x, &source_st) < 0)
919 return log_error_errno(errno, "Failed to stat %s: %m", *x);
920
921 where = strappend(dest, *y);
922 if (!where)
923 return log_oom();
924
925 r = stat(where, &dest_st);
926 if (r == 0) {
927 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
928 log_error("Cannot bind mount directory %s on file %s.", *x, where);
929 return -EINVAL;
930 }
931 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
932 log_error("Cannot bind mount file %s on directory %s.", *x, where);
933 return -EINVAL;
934 }
935 } else if (errno == ENOENT) {
936 r = mkdir_parents_label(where, 0755);
937 if (r < 0)
938 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
939 } else {
940 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
941 return -errno;
942 }
943
944 /* Create the mount point. Any non-directory file can be
945 * mounted on any non-directory file (regular, fifo, socket,
946 * char, block).
947 */
948 if (S_ISDIR(source_st.st_mode)) {
949 r = mkdir_label(where, 0755);
950 if (r < 0 && errno != EEXIST)
951 return log_error_errno(r, "Failed to create mount point %s: %m", where);
952 } else {
953 r = touch(where);
954 if (r < 0)
955 return log_error_errno(r, "Failed to create mount point %s: %m", where);
956 }
957
958 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
959 return log_error_errno(errno, "mount(%s) failed: %m", where);
960
961 if (ro) {
962 r = bind_remount_recursive(where, true);
963 if (r < 0)
964 return log_error_errno(r, "Read-Only bind mount failed: %m");
965 }
966 }
967
968 return 0;
969 }
970
971 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
972 char *to;
973 int r;
974
975 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
976
977 r = path_is_mount_point(to, false);
978 if (r < 0)
979 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
980 if (r > 0)
981 return 0;
982
983 mkdir_p(to, 0755);
984
985 /* The superblock mount options of the mount point need to be
986 * identical to the hosts', and hence writable... */
987 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
988 return log_error_errno(errno, "Failed to mount to %s: %m", to);
989
990 /* ... hence let's only make the bind mount read-only, not the
991 * superblock. */
992 if (read_only) {
993 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
994 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
995 }
996 return 1;
997 }
998
999 static int mount_cgroup(const char *dest) {
1000 _cleanup_set_free_free_ Set *controllers = NULL;
1001 _cleanup_free_ char *own_cgroup_path = NULL;
1002 const char *cgroup_root, *systemd_root, *systemd_own;
1003 int r;
1004
1005 controllers = set_new(&string_hash_ops);
1006 if (!controllers)
1007 return log_oom();
1008
1009 r = cg_kernel_controllers(controllers);
1010 if (r < 0)
1011 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1012
1013 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1014 if (r < 0)
1015 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1016
1017 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1018 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1019 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1020
1021 for (;;) {
1022 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1023
1024 controller = set_steal_first(controllers);
1025 if (!controller)
1026 break;
1027
1028 origin = strappend("/sys/fs/cgroup/", controller);
1029 if (!origin)
1030 return log_oom();
1031
1032 r = readlink_malloc(origin, &combined);
1033 if (r == -EINVAL) {
1034 /* Not a symbolic link, but directly a single cgroup hierarchy */
1035
1036 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1037 if (r < 0)
1038 return r;
1039
1040 } else if (r < 0)
1041 return log_error_errno(r, "Failed to read link %s: %m", origin);
1042 else {
1043 _cleanup_free_ char *target = NULL;
1044
1045 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1046 if (!target)
1047 return log_oom();
1048
1049 /* A symbolic link, a combination of controllers in one hierarchy */
1050
1051 if (!filename_is_valid(combined)) {
1052 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1053 continue;
1054 }
1055
1056 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1057 if (r < 0)
1058 return r;
1059
1060 if (symlink(combined, target) < 0)
1061 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1062 }
1063 }
1064
1065 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1066 if (r < 0)
1067 return r;
1068
1069 /* Make our own cgroup a (writable) bind mount */
1070 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1071 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1072 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1073
1074 /* And then remount the systemd cgroup root read-only */
1075 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1076 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1077 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1078
1079 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1080 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1081
1082 return 0;
1083 }
1084
1085 static int mount_tmpfs(const char *dest) {
1086 char **i, **o;
1087
1088 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1089 _cleanup_free_ char *where = NULL;
1090 int r;
1091
1092 where = strappend(dest, *i);
1093 if (!where)
1094 return log_oom();
1095
1096 r = mkdir_label(where, 0755);
1097 if (r < 0 && r != -EEXIST)
1098 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1099
1100 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1101 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1102 }
1103
1104 return 0;
1105 }
1106
1107 static int setup_timezone(const char *dest) {
1108 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1109 char *z, *y;
1110 int r;
1111
1112 assert(dest);
1113
1114 /* Fix the timezone, if possible */
1115 r = readlink_malloc("/etc/localtime", &p);
1116 if (r < 0) {
1117 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1118 return 0;
1119 }
1120
1121 z = path_startswith(p, "../usr/share/zoneinfo/");
1122 if (!z)
1123 z = path_startswith(p, "/usr/share/zoneinfo/");
1124 if (!z) {
1125 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1126 return 0;
1127 }
1128
1129 where = strappend(dest, "/etc/localtime");
1130 if (!where)
1131 return log_oom();
1132
1133 r = readlink_malloc(where, &q);
1134 if (r >= 0) {
1135 y = path_startswith(q, "../usr/share/zoneinfo/");
1136 if (!y)
1137 y = path_startswith(q, "/usr/share/zoneinfo/");
1138
1139 /* Already pointing to the right place? Then do nothing .. */
1140 if (y && streq(y, z))
1141 return 0;
1142 }
1143
1144 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1145 if (!check)
1146 return log_oom();
1147
1148 if (access(check, F_OK) < 0) {
1149 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1150 return 0;
1151 }
1152
1153 what = strappend("../usr/share/zoneinfo/", z);
1154 if (!what)
1155 return log_oom();
1156
1157 r = mkdir_parents(where, 0755);
1158 if (r < 0) {
1159 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1160
1161 return 0;
1162 }
1163
1164 r = unlink(where);
1165 if (r < 0 && errno != ENOENT) {
1166 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1167
1168 return 0;
1169 }
1170
1171 if (symlink(what, where) < 0) {
1172 log_error_errno(errno, "Failed to correct timezone of container: %m");
1173 return 0;
1174 }
1175
1176 return 0;
1177 }
1178
1179 static int setup_resolv_conf(const char *dest) {
1180 _cleanup_free_ char *where = NULL;
1181 int r;
1182
1183 assert(dest);
1184
1185 if (arg_private_network)
1186 return 0;
1187
1188 /* Fix resolv.conf, if possible */
1189 where = strappend(dest, "/etc/resolv.conf");
1190 if (!where)
1191 return log_oom();
1192
1193 /* We don't really care for the results of this really. If it
1194 * fails, it fails, but meh... */
1195 r = mkdir_parents(where, 0755);
1196 if (r < 0) {
1197 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1198
1199 return 0;
1200 }
1201
1202 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1203 if (r < 0) {
1204 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1205
1206 return 0;
1207 }
1208
1209 return 0;
1210 }
1211
1212 static int setup_volatile_state(const char *directory) {
1213 const char *p;
1214 int r;
1215
1216 assert(directory);
1217
1218 if (arg_volatile != VOLATILE_STATE)
1219 return 0;
1220
1221 /* --volatile=state means we simply overmount /var
1222 with a tmpfs, and the rest read-only. */
1223
1224 r = bind_remount_recursive(directory, true);
1225 if (r < 0)
1226 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1227
1228 p = strjoina(directory, "/var");
1229 r = mkdir(p, 0755);
1230 if (r < 0 && errno != EEXIST)
1231 return log_error_errno(errno, "Failed to create %s: %m", directory);
1232
1233 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1234 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1235
1236 return 0;
1237 }
1238
1239 static int setup_volatile(const char *directory) {
1240 bool tmpfs_mounted = false, bind_mounted = false;
1241 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1242 const char *f, *t;
1243 int r;
1244
1245 assert(directory);
1246
1247 if (arg_volatile != VOLATILE_YES)
1248 return 0;
1249
1250 /* --volatile=yes means we mount a tmpfs to the root dir, and
1251 the original /usr to use inside it, and that read-only. */
1252
1253 if (!mkdtemp(template))
1254 return log_error_errno(errno, "Failed to create temporary directory: %m");
1255
1256 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1257 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1258 r = -errno;
1259 goto fail;
1260 }
1261
1262 tmpfs_mounted = true;
1263
1264 f = strjoina(directory, "/usr");
1265 t = strjoina(template, "/usr");
1266
1267 r = mkdir(t, 0755);
1268 if (r < 0 && errno != EEXIST) {
1269 log_error_errno(errno, "Failed to create %s: %m", t);
1270 r = -errno;
1271 goto fail;
1272 }
1273
1274 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1275 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1276 r = -errno;
1277 goto fail;
1278 }
1279
1280 bind_mounted = true;
1281
1282 r = bind_remount_recursive(t, true);
1283 if (r < 0) {
1284 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1285 goto fail;
1286 }
1287
1288 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1289 log_error_errno(errno, "Failed to move root mount: %m");
1290 r = -errno;
1291 goto fail;
1292 }
1293
1294 rmdir(template);
1295
1296 return 0;
1297
1298 fail:
1299 if (bind_mounted)
1300 umount(t);
1301 if (tmpfs_mounted)
1302 umount(template);
1303 rmdir(template);
1304 return r;
1305 }
1306
1307 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1308
1309 snprintf(s, 37,
1310 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1311 SD_ID128_FORMAT_VAL(id));
1312
1313 return s;
1314 }
1315
1316 static int setup_boot_id(const char *dest) {
1317 _cleanup_free_ char *from = NULL, *to = NULL;
1318 sd_id128_t rnd = {};
1319 char as_uuid[37];
1320 int r;
1321
1322 assert(dest);
1323
1324 if (arg_share_system)
1325 return 0;
1326
1327 /* Generate a new randomized boot ID, so that each boot-up of
1328 * the container gets a new one */
1329
1330 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1331 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1332 if (!from || !to)
1333 return log_oom();
1334
1335 r = sd_id128_randomize(&rnd);
1336 if (r < 0)
1337 return log_error_errno(r, "Failed to generate random boot id: %m");
1338
1339 id128_format_as_uuid(rnd, as_uuid);
1340
1341 r = write_string_file(from, as_uuid);
1342 if (r < 0)
1343 return log_error_errno(r, "Failed to write boot id: %m");
1344
1345 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1346 log_error_errno(errno, "Failed to bind mount boot id: %m");
1347 r = -errno;
1348 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1349 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1350
1351 unlink(from);
1352 return r;
1353 }
1354
1355 static int copy_devnodes(const char *dest) {
1356
1357 static const char devnodes[] =
1358 "null\0"
1359 "zero\0"
1360 "full\0"
1361 "random\0"
1362 "urandom\0"
1363 "tty\0"
1364 "net/tun\0";
1365
1366 const char *d;
1367 int r = 0;
1368 _cleanup_umask_ mode_t u;
1369
1370 assert(dest);
1371
1372 u = umask(0000);
1373
1374 NULSTR_FOREACH(d, devnodes) {
1375 _cleanup_free_ char *from = NULL, *to = NULL;
1376 struct stat st;
1377
1378 from = strappend("/dev/", d);
1379 to = strjoin(dest, "/dev/", d, NULL);
1380 if (!from || !to)
1381 return log_oom();
1382
1383 if (stat(from, &st) < 0) {
1384
1385 if (errno != ENOENT)
1386 return log_error_errno(errno, "Failed to stat %s: %m", from);
1387
1388 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1389
1390 log_error("%s is not a char or block device, cannot copy", from);
1391 return -EIO;
1392
1393 } else {
1394 r = mkdir_parents(to, 0775);
1395 if (r < 0) {
1396 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1397 return -r;
1398 }
1399
1400 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1401 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1402 }
1403 }
1404
1405 return r;
1406 }
1407
1408 static int setup_ptmx(const char *dest) {
1409 _cleanup_free_ char *p = NULL;
1410
1411 p = strappend(dest, "/dev/ptmx");
1412 if (!p)
1413 return log_oom();
1414
1415 if (symlink("pts/ptmx", p) < 0)
1416 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1417
1418 return 0;
1419 }
1420
1421 static int setup_dev_console(const char *dest, const char *console) {
1422 _cleanup_umask_ mode_t u;
1423 const char *to;
1424 struct stat st;
1425 int r;
1426
1427 assert(dest);
1428 assert(console);
1429
1430 u = umask(0000);
1431
1432 if (stat("/dev/null", &st) < 0)
1433 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1434
1435 r = chmod_and_chown(console, 0600, 0, 0);
1436 if (r < 0)
1437 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1438
1439 /* We need to bind mount the right tty to /dev/console since
1440 * ptys can only exist on pts file systems. To have something
1441 * to bind mount things on we create a device node first, and
1442 * use /dev/null for that since we the cgroups device policy
1443 * allows us to create that freely, while we cannot create
1444 * /dev/console. (Note that the major minor doesn't actually
1445 * matter here, since we mount it over anyway). */
1446
1447 to = strjoina(dest, "/dev/console");
1448 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1449 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1450
1451 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1452 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1453
1454 return 0;
1455 }
1456
1457 static int setup_kmsg(const char *dest, int kmsg_socket) {
1458 _cleanup_free_ char *from = NULL, *to = NULL;
1459 _cleanup_umask_ mode_t u;
1460 int r, fd, k;
1461 union {
1462 struct cmsghdr cmsghdr;
1463 uint8_t buf[CMSG_SPACE(sizeof(int))];
1464 } control = {};
1465 struct msghdr mh = {
1466 .msg_control = &control,
1467 .msg_controllen = sizeof(control),
1468 };
1469 struct cmsghdr *cmsg;
1470
1471 assert(dest);
1472 assert(kmsg_socket >= 0);
1473
1474 u = umask(0000);
1475
1476 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1477 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1478 * on the reading side behave very similar to /proc/kmsg,
1479 * their writing side behaves differently from /dev/kmsg in
1480 * that writing blocks when nothing is reading. In order to
1481 * avoid any problems with containers deadlocking due to this
1482 * we simply make /dev/kmsg unavailable to the container. */
1483 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1484 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1485 return log_oom();
1486
1487 if (mkfifo(from, 0600) < 0)
1488 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1489
1490 r = chmod_and_chown(from, 0600, 0, 0);
1491 if (r < 0)
1492 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1493
1494 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1495 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1496
1497 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1498 if (fd < 0)
1499 return log_error_errno(errno, "Failed to open fifo: %m");
1500
1501 cmsg = CMSG_FIRSTHDR(&mh);
1502 cmsg->cmsg_level = SOL_SOCKET;
1503 cmsg->cmsg_type = SCM_RIGHTS;
1504 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1505 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1506
1507 mh.msg_controllen = cmsg->cmsg_len;
1508
1509 /* Store away the fd in the socket, so that it stays open as
1510 * long as we run the child */
1511 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1512 safe_close(fd);
1513
1514 if (k < 0)
1515 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1516
1517 /* And now make the FIFO unavailable as /dev/kmsg... */
1518 unlink(from);
1519 return 0;
1520 }
1521
1522 static int send_rtnl(int send_fd) {
1523 union {
1524 struct cmsghdr cmsghdr;
1525 uint8_t buf[CMSG_SPACE(sizeof(int))];
1526 } control = {};
1527 struct msghdr mh = {
1528 .msg_control = &control,
1529 .msg_controllen = sizeof(control),
1530 };
1531 struct cmsghdr *cmsg;
1532 _cleanup_close_ int fd = -1;
1533 ssize_t k;
1534
1535 assert(send_fd >= 0);
1536
1537 if (!arg_expose_ports)
1538 return 0;
1539
1540 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1541 if (fd < 0)
1542 return log_error_errno(errno, "failed to allocate container netlink: %m");
1543
1544 cmsg = CMSG_FIRSTHDR(&mh);
1545 cmsg->cmsg_level = SOL_SOCKET;
1546 cmsg->cmsg_type = SCM_RIGHTS;
1547 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1548 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1549
1550 mh.msg_controllen = cmsg->cmsg_len;
1551
1552 /* Store away the fd in the socket, so that it stays open as
1553 * long as we run the child */
1554 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1555 if (k < 0)
1556 return log_error_errno(errno, "Failed to send netlink fd: %m");
1557
1558 return 0;
1559 }
1560
1561 static int flush_ports(union in_addr_union *exposed) {
1562 ExposePort *p;
1563 int r, af = AF_INET;
1564
1565 assert(exposed);
1566
1567 if (!arg_expose_ports)
1568 return 0;
1569
1570 if (in_addr_is_null(af, exposed))
1571 return 0;
1572
1573 log_debug("Lost IP address.");
1574
1575 LIST_FOREACH(ports, p, arg_expose_ports) {
1576 r = fw_add_local_dnat(false,
1577 af,
1578 p->protocol,
1579 NULL,
1580 NULL, 0,
1581 NULL, 0,
1582 p->host_port,
1583 exposed,
1584 p->container_port,
1585 NULL);
1586 if (r < 0)
1587 log_warning_errno(r, "Failed to modify firewall: %m");
1588 }
1589
1590 *exposed = IN_ADDR_NULL;
1591 return 0;
1592 }
1593
1594 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1595 _cleanup_free_ struct local_address *addresses = NULL;
1596 _cleanup_free_ char *pretty = NULL;
1597 union in_addr_union new_exposed;
1598 ExposePort *p;
1599 bool add;
1600 int af = AF_INET, r;
1601
1602 assert(exposed);
1603
1604 /* Invoked each time an address is added or removed inside the
1605 * container */
1606
1607 if (!arg_expose_ports)
1608 return 0;
1609
1610 r = local_addresses(rtnl, 0, af, &addresses);
1611 if (r < 0)
1612 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1613
1614 add = r > 0 &&
1615 addresses[0].family == af &&
1616 addresses[0].scope < RT_SCOPE_LINK;
1617
1618 if (!add)
1619 return flush_ports(exposed);
1620
1621 new_exposed = addresses[0].address;
1622 if (in_addr_equal(af, exposed, &new_exposed))
1623 return 0;
1624
1625 in_addr_to_string(af, &new_exposed, &pretty);
1626 log_debug("New container IP is %s.", strna(pretty));
1627
1628 LIST_FOREACH(ports, p, arg_expose_ports) {
1629
1630 r = fw_add_local_dnat(true,
1631 af,
1632 p->protocol,
1633 NULL,
1634 NULL, 0,
1635 NULL, 0,
1636 p->host_port,
1637 &new_exposed,
1638 p->container_port,
1639 in_addr_is_null(af, exposed) ? NULL : exposed);
1640 if (r < 0)
1641 log_warning_errno(r, "Failed to modify firewall: %m");
1642 }
1643
1644 *exposed = new_exposed;
1645 return 0;
1646 }
1647
1648 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1649 union in_addr_union *exposed = userdata;
1650
1651 assert(rtnl);
1652 assert(m);
1653 assert(exposed);
1654
1655 expose_ports(rtnl, exposed);
1656 return 0;
1657 }
1658
1659 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1660 union {
1661 struct cmsghdr cmsghdr;
1662 uint8_t buf[CMSG_SPACE(sizeof(int))];
1663 } control = {};
1664 struct msghdr mh = {
1665 .msg_control = &control,
1666 .msg_controllen = sizeof(control),
1667 };
1668 struct cmsghdr *cmsg;
1669 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1670 int fd, r;
1671 ssize_t k;
1672
1673 assert(event);
1674 assert(recv_fd >= 0);
1675 assert(ret);
1676
1677 if (!arg_expose_ports)
1678 return 0;
1679
1680 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1681 if (k < 0)
1682 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1683
1684 cmsg = CMSG_FIRSTHDR(&mh);
1685 assert(cmsg->cmsg_level == SOL_SOCKET);
1686 assert(cmsg->cmsg_type == SCM_RIGHTS);
1687 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1688 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1689
1690 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1691 if (r < 0) {
1692 safe_close(fd);
1693 return log_error_errno(r, "Failed to create rtnl object: %m");
1694 }
1695
1696 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1697 if (r < 0)
1698 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1699
1700 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1701 if (r < 0)
1702 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1703
1704 r = sd_rtnl_attach_event(rtnl, event, 0);
1705 if (r < 0)
1706 return log_error_errno(r, "Failed to add to even loop: %m");
1707
1708 *ret = rtnl;
1709 rtnl = NULL;
1710
1711 return 0;
1712 }
1713
1714 static int setup_hostname(void) {
1715
1716 if (arg_share_system)
1717 return 0;
1718
1719 if (sethostname_idempotent(arg_machine) < 0)
1720 return -errno;
1721
1722 return 0;
1723 }
1724
1725 static int setup_journal(const char *directory) {
1726 sd_id128_t machine_id, this_id;
1727 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1728 char *id;
1729 int r;
1730
1731 /* Don't link journals in ephemeral mode */
1732 if (arg_ephemeral)
1733 return 0;
1734
1735 p = strappend(directory, "/etc/machine-id");
1736 if (!p)
1737 return log_oom();
1738
1739 r = read_one_line_file(p, &b);
1740 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1741 return 0;
1742 else if (r < 0)
1743 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1744
1745 id = strstrip(b);
1746 if (isempty(id) && arg_link_journal == LINK_AUTO)
1747 return 0;
1748
1749 /* Verify validity */
1750 r = sd_id128_from_string(id, &machine_id);
1751 if (r < 0)
1752 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1753
1754 r = sd_id128_get_machine(&this_id);
1755 if (r < 0)
1756 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1757
1758 if (sd_id128_equal(machine_id, this_id)) {
1759 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1760 "Host and machine ids are equal (%s): refusing to link journals", id);
1761 if (arg_link_journal == LINK_AUTO)
1762 return 0;
1763 return -EEXIST;
1764 }
1765
1766 if (arg_link_journal == LINK_NO)
1767 return 0;
1768
1769 free(p);
1770 p = strappend("/var/log/journal/", id);
1771 q = strjoin(directory, "/var/log/journal/", id, NULL);
1772 if (!p || !q)
1773 return log_oom();
1774
1775 if (path_is_mount_point(p, false) > 0) {
1776 if (arg_link_journal != LINK_AUTO) {
1777 log_error("%s: already a mount point, refusing to use for journal", p);
1778 return -EEXIST;
1779 }
1780
1781 return 0;
1782 }
1783
1784 if (path_is_mount_point(q, false) > 0) {
1785 if (arg_link_journal != LINK_AUTO) {
1786 log_error("%s: already a mount point, refusing to use for journal", q);
1787 return -EEXIST;
1788 }
1789
1790 return 0;
1791 }
1792
1793 r = readlink_and_make_absolute(p, &d);
1794 if (r >= 0) {
1795 if ((arg_link_journal == LINK_GUEST ||
1796 arg_link_journal == LINK_AUTO) &&
1797 path_equal(d, q)) {
1798
1799 r = mkdir_p(q, 0755);
1800 if (r < 0)
1801 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1802 return 0;
1803 }
1804
1805 if (unlink(p) < 0)
1806 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1807 } else if (r == -EINVAL) {
1808
1809 if (arg_link_journal == LINK_GUEST &&
1810 rmdir(p) < 0) {
1811
1812 if (errno == ENOTDIR) {
1813 log_error("%s already exists and is neither a symlink nor a directory", p);
1814 return r;
1815 } else {
1816 log_error_errno(errno, "Failed to remove %s: %m", p);
1817 return -errno;
1818 }
1819 }
1820 } else if (r != -ENOENT) {
1821 log_error_errno(errno, "readlink(%s) failed: %m", p);
1822 return r;
1823 }
1824
1825 if (arg_link_journal == LINK_GUEST) {
1826
1827 if (symlink(q, p) < 0) {
1828 if (arg_link_journal_try) {
1829 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1830 return 0;
1831 } else {
1832 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1833 return -errno;
1834 }
1835 }
1836
1837 r = mkdir_p(q, 0755);
1838 if (r < 0)
1839 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1840 return 0;
1841 }
1842
1843 if (arg_link_journal == LINK_HOST) {
1844 /* don't create parents here -- if the host doesn't have
1845 * permanent journal set up, don't force it here */
1846 r = mkdir(p, 0755);
1847 if (r < 0) {
1848 if (arg_link_journal_try) {
1849 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1850 return 0;
1851 } else {
1852 log_error_errno(errno, "Failed to create %s: %m", p);
1853 return r;
1854 }
1855 }
1856
1857 } else if (access(p, F_OK) < 0)
1858 return 0;
1859
1860 if (dir_is_empty(q) == 0)
1861 log_warning("%s is not empty, proceeding anyway.", q);
1862
1863 r = mkdir_p(q, 0755);
1864 if (r < 0) {
1865 log_error_errno(errno, "Failed to create %s: %m", q);
1866 return r;
1867 }
1868
1869 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1870 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1871
1872 return 0;
1873 }
1874
1875 static int drop_capabilities(void) {
1876 return capability_bounding_set_drop(~arg_retain, false);
1877 }
1878
1879 static int register_machine(pid_t pid, int local_ifindex) {
1880 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1881 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1882 int r;
1883
1884 if (!arg_register)
1885 return 0;
1886
1887 r = sd_bus_default_system(&bus);
1888 if (r < 0)
1889 return log_error_errno(r, "Failed to open system bus: %m");
1890
1891 if (arg_keep_unit) {
1892 r = sd_bus_call_method(
1893 bus,
1894 "org.freedesktop.machine1",
1895 "/org/freedesktop/machine1",
1896 "org.freedesktop.machine1.Manager",
1897 "RegisterMachineWithNetwork",
1898 &error,
1899 NULL,
1900 "sayssusai",
1901 arg_machine,
1902 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1903 "nspawn",
1904 "container",
1905 (uint32_t) pid,
1906 strempty(arg_directory),
1907 local_ifindex > 0 ? 1 : 0, local_ifindex);
1908 } else {
1909 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1910 char **i;
1911
1912 r = sd_bus_message_new_method_call(
1913 bus,
1914 &m,
1915 "org.freedesktop.machine1",
1916 "/org/freedesktop/machine1",
1917 "org.freedesktop.machine1.Manager",
1918 "CreateMachineWithNetwork");
1919 if (r < 0)
1920 return bus_log_create_error(r);
1921
1922 r = sd_bus_message_append(
1923 m,
1924 "sayssusai",
1925 arg_machine,
1926 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1927 "nspawn",
1928 "container",
1929 (uint32_t) pid,
1930 strempty(arg_directory),
1931 local_ifindex > 0 ? 1 : 0, local_ifindex);
1932 if (r < 0)
1933 return bus_log_create_error(r);
1934
1935 r = sd_bus_message_open_container(m, 'a', "(sv)");
1936 if (r < 0)
1937 return bus_log_create_error(r);
1938
1939 if (!isempty(arg_slice)) {
1940 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1941 if (r < 0)
1942 return bus_log_create_error(r);
1943 }
1944
1945 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1946 if (r < 0)
1947 return bus_log_create_error(r);
1948
1949 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1950 /* Allow the container to
1951 * access and create the API
1952 * device nodes, so that
1953 * PrivateDevices= in the
1954 * container can work
1955 * fine */
1956 "/dev/null", "rwm",
1957 "/dev/zero", "rwm",
1958 "/dev/full", "rwm",
1959 "/dev/random", "rwm",
1960 "/dev/urandom", "rwm",
1961 "/dev/tty", "rwm",
1962 "/dev/net/tun", "rwm",
1963 /* Allow the container
1964 * access to ptys. However,
1965 * do not permit the
1966 * container to ever create
1967 * these device nodes. */
1968 "/dev/pts/ptmx", "rw",
1969 "char-pts", "rw");
1970 if (r < 0)
1971 return log_error_errno(r, "Failed to add device whitelist: %m");
1972
1973 STRV_FOREACH(i, arg_property) {
1974 r = sd_bus_message_open_container(m, 'r', "sv");
1975 if (r < 0)
1976 return bus_log_create_error(r);
1977
1978 r = bus_append_unit_property_assignment(m, *i);
1979 if (r < 0)
1980 return r;
1981
1982 r = sd_bus_message_close_container(m);
1983 if (r < 0)
1984 return bus_log_create_error(r);
1985 }
1986
1987 r = sd_bus_message_close_container(m);
1988 if (r < 0)
1989 return bus_log_create_error(r);
1990
1991 r = sd_bus_call(bus, m, 0, &error, NULL);
1992 }
1993
1994 if (r < 0) {
1995 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1996 return r;
1997 }
1998
1999 return 0;
2000 }
2001
2002 static int terminate_machine(pid_t pid) {
2003 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2004 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2005 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2006 const char *path;
2007 int r;
2008
2009 if (!arg_register)
2010 return 0;
2011
2012 r = sd_bus_default_system(&bus);
2013 if (r < 0)
2014 return log_error_errno(r, "Failed to open system bus: %m");
2015
2016 r = sd_bus_call_method(
2017 bus,
2018 "org.freedesktop.machine1",
2019 "/org/freedesktop/machine1",
2020 "org.freedesktop.machine1.Manager",
2021 "GetMachineByPID",
2022 &error,
2023 &reply,
2024 "u",
2025 (uint32_t) pid);
2026 if (r < 0) {
2027 /* Note that the machine might already have been
2028 * cleaned up automatically, hence don't consider it a
2029 * failure if we cannot get the machine object. */
2030 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2031 return 0;
2032 }
2033
2034 r = sd_bus_message_read(reply, "o", &path);
2035 if (r < 0)
2036 return bus_log_parse_error(r);
2037
2038 r = sd_bus_call_method(
2039 bus,
2040 "org.freedesktop.machine1",
2041 path,
2042 "org.freedesktop.machine1.Machine",
2043 "Terminate",
2044 &error,
2045 NULL,
2046 NULL);
2047 if (r < 0) {
2048 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2049 return 0;
2050 }
2051
2052 return 0;
2053 }
2054
2055 static int reset_audit_loginuid(void) {
2056 _cleanup_free_ char *p = NULL;
2057 int r;
2058
2059 if (arg_share_system)
2060 return 0;
2061
2062 r = read_one_line_file("/proc/self/loginuid", &p);
2063 if (r == -ENOENT)
2064 return 0;
2065 if (r < 0)
2066 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2067
2068 /* Already reset? */
2069 if (streq(p, "4294967295"))
2070 return 0;
2071
2072 r = write_string_file("/proc/self/loginuid", "4294967295");
2073 if (r < 0) {
2074 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2075 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2076 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2077 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2078 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2079
2080 sleep(5);
2081 }
2082
2083 return 0;
2084 }
2085
2086 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2087 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2088 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2089
2090 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2091 uint8_t result[8];
2092 size_t l, sz;
2093 uint8_t *v, *i;
2094 int r;
2095
2096 l = strlen(arg_machine);
2097 sz = sizeof(sd_id128_t) + l;
2098 if (idx > 0)
2099 sz += sizeof(idx);
2100
2101 v = alloca(sz);
2102
2103 /* fetch some persistent data unique to the host */
2104 r = sd_id128_get_machine((sd_id128_t*) v);
2105 if (r < 0)
2106 return r;
2107
2108 /* combine with some data unique (on this host) to this
2109 * container instance */
2110 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2111 if (idx > 0) {
2112 idx = htole64(idx);
2113 memcpy(i, &idx, sizeof(idx));
2114 }
2115
2116 /* Let's hash the host machine ID plus the container name. We
2117 * use a fixed, but originally randomly created hash key here. */
2118 siphash24(result, v, sz, hash_key.bytes);
2119
2120 assert_cc(ETH_ALEN <= sizeof(result));
2121 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2122
2123 /* see eth_random_addr in the kernel */
2124 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2125 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2126
2127 return 0;
2128 }
2129
2130 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2131 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2132 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2133 struct ether_addr mac_host, mac_container;
2134 int r, i;
2135
2136 if (!arg_private_network)
2137 return 0;
2138
2139 if (!arg_network_veth)
2140 return 0;
2141
2142 /* Use two different interface name prefixes depending whether
2143 * we are in bridge mode or not. */
2144 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2145 arg_network_bridge ? "vb" : "ve", arg_machine);
2146
2147 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2148 if (r < 0)
2149 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2150
2151 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2152 if (r < 0)
2153 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2154
2155 r = sd_rtnl_open(&rtnl, 0);
2156 if (r < 0)
2157 return log_error_errno(r, "Failed to connect to netlink: %m");
2158
2159 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2160 if (r < 0)
2161 return log_error_errno(r, "Failed to allocate netlink message: %m");
2162
2163 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2164 if (r < 0)
2165 return log_error_errno(r, "Failed to add netlink interface name: %m");
2166
2167 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2168 if (r < 0)
2169 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2170
2171 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2172 if (r < 0)
2173 return log_error_errno(r, "Failed to open netlink container: %m");
2174
2175 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2176 if (r < 0)
2177 return log_error_errno(r, "Failed to open netlink container: %m");
2178
2179 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2180 if (r < 0)
2181 return log_error_errno(r, "Failed to open netlink container: %m");
2182
2183 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2184 if (r < 0)
2185 return log_error_errno(r, "Failed to add netlink interface name: %m");
2186
2187 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2188 if (r < 0)
2189 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2190
2191 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2192 if (r < 0)
2193 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2194
2195 r = sd_rtnl_message_close_container(m);
2196 if (r < 0)
2197 return log_error_errno(r, "Failed to close netlink container: %m");
2198
2199 r = sd_rtnl_message_close_container(m);
2200 if (r < 0)
2201 return log_error_errno(r, "Failed to close netlink container: %m");
2202
2203 r = sd_rtnl_message_close_container(m);
2204 if (r < 0)
2205 return log_error_errno(r, "Failed to close netlink container: %m");
2206
2207 r = sd_rtnl_call(rtnl, m, 0, NULL);
2208 if (r < 0)
2209 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2210
2211 i = (int) if_nametoindex(iface_name);
2212 if (i <= 0)
2213 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2214
2215 *ifi = i;
2216
2217 return 0;
2218 }
2219
2220 static int setup_bridge(const char veth_name[], int *ifi) {
2221 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2222 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2223 int r, bridge;
2224
2225 if (!arg_private_network)
2226 return 0;
2227
2228 if (!arg_network_veth)
2229 return 0;
2230
2231 if (!arg_network_bridge)
2232 return 0;
2233
2234 bridge = (int) if_nametoindex(arg_network_bridge);
2235 if (bridge <= 0)
2236 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2237
2238 *ifi = bridge;
2239
2240 r = sd_rtnl_open(&rtnl, 0);
2241 if (r < 0)
2242 return log_error_errno(r, "Failed to connect to netlink: %m");
2243
2244 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2245 if (r < 0)
2246 return log_error_errno(r, "Failed to allocate netlink message: %m");
2247
2248 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2249 if (r < 0)
2250 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2251
2252 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2253 if (r < 0)
2254 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2255
2256 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2257 if (r < 0)
2258 return log_error_errno(r, "Failed to add netlink master field: %m");
2259
2260 r = sd_rtnl_call(rtnl, m, 0, NULL);
2261 if (r < 0)
2262 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2263
2264 return 0;
2265 }
2266
2267 static int parse_interface(struct udev *udev, const char *name) {
2268 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2269 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2270 int ifi;
2271
2272 ifi = (int) if_nametoindex(name);
2273 if (ifi <= 0)
2274 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2275
2276 sprintf(ifi_str, "n%i", ifi);
2277 d = udev_device_new_from_device_id(udev, ifi_str);
2278 if (!d)
2279 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2280
2281 if (udev_device_get_is_initialized(d) <= 0) {
2282 log_error("Network interface %s is not initialized yet.", name);
2283 return -EBUSY;
2284 }
2285
2286 return ifi;
2287 }
2288
2289 static int move_network_interfaces(pid_t pid) {
2290 _cleanup_udev_unref_ struct udev *udev = NULL;
2291 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2292 char **i;
2293 int r;
2294
2295 if (!arg_private_network)
2296 return 0;
2297
2298 if (strv_isempty(arg_network_interfaces))
2299 return 0;
2300
2301 r = sd_rtnl_open(&rtnl, 0);
2302 if (r < 0)
2303 return log_error_errno(r, "Failed to connect to netlink: %m");
2304
2305 udev = udev_new();
2306 if (!udev) {
2307 log_error("Failed to connect to udev.");
2308 return -ENOMEM;
2309 }
2310
2311 STRV_FOREACH(i, arg_network_interfaces) {
2312 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2313 int ifi;
2314
2315 ifi = parse_interface(udev, *i);
2316 if (ifi < 0)
2317 return ifi;
2318
2319 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2320 if (r < 0)
2321 return log_error_errno(r, "Failed to allocate netlink message: %m");
2322
2323 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2324 if (r < 0)
2325 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2326
2327 r = sd_rtnl_call(rtnl, m, 0, NULL);
2328 if (r < 0)
2329 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2330 }
2331
2332 return 0;
2333 }
2334
2335 static int setup_macvlan(pid_t pid) {
2336 _cleanup_udev_unref_ struct udev *udev = NULL;
2337 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2338 unsigned idx = 0;
2339 char **i;
2340 int r;
2341
2342 if (!arg_private_network)
2343 return 0;
2344
2345 if (strv_isempty(arg_network_macvlan))
2346 return 0;
2347
2348 r = sd_rtnl_open(&rtnl, 0);
2349 if (r < 0)
2350 return log_error_errno(r, "Failed to connect to netlink: %m");
2351
2352 udev = udev_new();
2353 if (!udev) {
2354 log_error("Failed to connect to udev.");
2355 return -ENOMEM;
2356 }
2357
2358 STRV_FOREACH(i, arg_network_macvlan) {
2359 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2360 _cleanup_free_ char *n = NULL;
2361 struct ether_addr mac;
2362 int ifi;
2363
2364 ifi = parse_interface(udev, *i);
2365 if (ifi < 0)
2366 return ifi;
2367
2368 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2369 if (r < 0)
2370 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2371
2372 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2373 if (r < 0)
2374 return log_error_errno(r, "Failed to allocate netlink message: %m");
2375
2376 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2377 if (r < 0)
2378 return log_error_errno(r, "Failed to add netlink interface index: %m");
2379
2380 n = strappend("mv-", *i);
2381 if (!n)
2382 return log_oom();
2383
2384 strshorten(n, IFNAMSIZ-1);
2385
2386 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2387 if (r < 0)
2388 return log_error_errno(r, "Failed to add netlink interface name: %m");
2389
2390 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2391 if (r < 0)
2392 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2393
2394 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2395 if (r < 0)
2396 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2397
2398 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2399 if (r < 0)
2400 return log_error_errno(r, "Failed to open netlink container: %m");
2401
2402 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2403 if (r < 0)
2404 return log_error_errno(r, "Failed to open netlink container: %m");
2405
2406 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2407 if (r < 0)
2408 return log_error_errno(r, "Failed to append macvlan mode: %m");
2409
2410 r = sd_rtnl_message_close_container(m);
2411 if (r < 0)
2412 return log_error_errno(r, "Failed to close netlink container: %m");
2413
2414 r = sd_rtnl_message_close_container(m);
2415 if (r < 0)
2416 return log_error_errno(r, "Failed to close netlink container: %m");
2417
2418 r = sd_rtnl_call(rtnl, m, 0, NULL);
2419 if (r < 0)
2420 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2421 }
2422
2423 return 0;
2424 }
2425
2426 static int setup_ipvlan(pid_t pid) {
2427 _cleanup_udev_unref_ struct udev *udev = NULL;
2428 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2429 char **i;
2430 int r;
2431
2432 if (!arg_private_network)
2433 return 0;
2434
2435 if (strv_isempty(arg_network_ipvlan))
2436 return 0;
2437
2438 r = sd_rtnl_open(&rtnl, 0);
2439 if (r < 0)
2440 return log_error_errno(r, "Failed to connect to netlink: %m");
2441
2442 udev = udev_new();
2443 if (!udev) {
2444 log_error("Failed to connect to udev.");
2445 return -ENOMEM;
2446 }
2447
2448 STRV_FOREACH(i, arg_network_ipvlan) {
2449 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2450 _cleanup_free_ char *n = NULL;
2451 int ifi;
2452
2453 ifi = parse_interface(udev, *i);
2454 if (ifi < 0)
2455 return ifi;
2456
2457 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2458 if (r < 0)
2459 return log_error_errno(r, "Failed to allocate netlink message: %m");
2460
2461 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2462 if (r < 0)
2463 return log_error_errno(r, "Failed to add netlink interface index: %m");
2464
2465 n = strappend("iv-", *i);
2466 if (!n)
2467 return log_oom();
2468
2469 strshorten(n, IFNAMSIZ-1);
2470
2471 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2472 if (r < 0)
2473 return log_error_errno(r, "Failed to add netlink interface name: %m");
2474
2475 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2476 if (r < 0)
2477 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2478
2479 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2480 if (r < 0)
2481 return log_error_errno(r, "Failed to open netlink container: %m");
2482
2483 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2484 if (r < 0)
2485 return log_error_errno(r, "Failed to open netlink container: %m");
2486
2487 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2488 if (r < 0)
2489 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2490
2491 r = sd_rtnl_message_close_container(m);
2492 if (r < 0)
2493 return log_error_errno(r, "Failed to close netlink container: %m");
2494
2495 r = sd_rtnl_message_close_container(m);
2496 if (r < 0)
2497 return log_error_errno(r, "Failed to close netlink container: %m");
2498
2499 r = sd_rtnl_call(rtnl, m, 0, NULL);
2500 if (r < 0)
2501 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2502 }
2503
2504 return 0;
2505 }
2506
/*
 * Builds and loads the seccomp filter for the container payload.
 *
 * The filter allows everything by default and makes the following fail:
 *   - the syscalls in blacklist[], with EPERM;
 *   - the kernel module syscalls, with EPERM, unless CAP_SYS_MODULE is part
 *     of arg_retain;
 *   - socket(AF_NETLINK, *, NETLINK_AUDIT), with EAFNOSUPPORT.
 *
 * Returns 0 on success (or when built without seccomp support), a negative
 * errno-style value on failure.
 */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        static const int kmod_blacklist[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int ret;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        ret = seccomp_add_secondary_archs(ctx);
        if (ret < 0) {
                log_error_errno(ret, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                ret = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k], 0);

                /* -EFAULT indicates an unknown syscall, which is simply skipped */
                if (ret < 0 && ret != -EFAULT) {
                        log_error_errno(ret, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* The kmod syscalls are blocked as well, unless the
         * CAP_SYS_MODULE capability was requested */
        if (!(arg_retain & (1ULL << CAP_SYS_MODULE)))
                for (k = 0; k < ELEMENTSOF(kmod_blacklist); k++) {
                        ret = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), kmod_blacklist[k], 0);

                        /* -EFAULT indicates an unknown syscall, which is simply skipped */
                        if (ret < 0 && ret != -EFAULT) {
                                log_error_errno(ret, "Failed to block syscall: %m");
                                goto finish;
                        }
                }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        ret = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0) {
                log_error_errno(ret, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Turn off the filter's SCMP_FLTATR_CTL_NNP attribute before loading */
        ret = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (ret < 0) {
                log_error_errno(ret, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        ret = seccomp_load(ctx);
        if (ret < 0)
                log_error_errno(ret, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return ret;
#else
        return 0;
#endif

}
2603
2604 static int setup_propagate(const char *root) {
2605 const char *p, *q;
2606
2607 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2608 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2609 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2610 (void) mkdir_p(p, 0600);
2611
2612 q = strjoina(root, "/run/systemd/nspawn/incoming");
2613 mkdir_parents(q, 0755);
2614 mkdir_p(q, 0600);
2615
2616 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2617 return log_error_errno(errno, "Failed to install propagation bind mount.");
2618
2619 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2620 return log_error_errno(errno, "Failed to make propagation mount read-only");
2621
2622 return 0;
2623 }
2624
2625 static int setup_image(char **device_path, int *loop_nr) {
2626 struct loop_info64 info = {
2627 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2628 };
2629 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2630 _cleanup_free_ char* loopdev = NULL;
2631 struct stat st;
2632 int r, nr;
2633
2634 assert(device_path);
2635 assert(loop_nr);
2636 assert(arg_image);
2637
2638 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2639 if (fd < 0)
2640 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2641
2642 if (fstat(fd, &st) < 0)
2643 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2644
2645 if (S_ISBLK(st.st_mode)) {
2646 char *p;
2647
2648 p = strdup(arg_image);
2649 if (!p)
2650 return log_oom();
2651
2652 *device_path = p;
2653
2654 *loop_nr = -1;
2655
2656 r = fd;
2657 fd = -1;
2658
2659 return r;
2660 }
2661
2662 if (!S_ISREG(st.st_mode)) {
2663 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2664 return -EINVAL;
2665 }
2666
2667 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2668 if (control < 0)
2669 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2670
2671 nr = ioctl(control, LOOP_CTL_GET_FREE);
2672 if (nr < 0)
2673 return log_error_errno(errno, "Failed to allocate loop device: %m");
2674
2675 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2676 return log_oom();
2677
2678 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2679 if (loop < 0)
2680 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2681
2682 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2683 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2684
2685 if (arg_read_only)
2686 info.lo_flags |= LO_FLAGS_READ_ONLY;
2687
2688 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2689 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2690
2691 *device_path = loopdev;
2692 loopdev = NULL;
2693
2694 *loop_nr = nr;
2695
2696 r = loop;
2697 loop = -1;
2698
2699 return r;
2700 }
2701
/* Explanatory text appended to the error messages that are logged when a disk
 * image has no partition layout we can boot from. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2708
/*
 * Inspects the partition table of the block device open as fd and picks the
 * partitions to mount as root, /home and /srv.
 *
 * GPT images: partitions are classified by their type UUID (home, srv,
 * native root, secondary root, generic Linux); partitions carrying
 * GPT_FLAG_NO_AUTO are ignored. If several partitions have the same specific
 * type, the one with the lowest partition number wins.
 *
 * MBR images: only partitions of type 0x83 whose flags are exactly 0x80
 * (bootable) are considered; they are treated as generic Linux partitions.
 *
 * Root selection order: native root, then secondary root, then the generic
 * partition. A generic partition is only accepted if it is the only one.
 *
 * On success *root_device (and *home_device / *srv_device, if such
 * partitions were found) receive newly allocated device node paths owned by
 * the caller, the matching *_rw flags tell whether the partition may be
 * mounted writable, and *secondary tells whether the secondary architecture
 * root was picked. *home_device and *srv_device are left untouched when no
 * such partition exists.
 *
 * Returns 0 on success, a negative errno-style value on failure (-ENOTSUP if
 * built without blkid support).
 */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has created a device for every partition
         * blkid found; give up after 10 enumeration attempts. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev
                 * enumeration is expected to contain one entry more than
                 * that (the loop below skips the whole-disk device). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* The lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as generic (not as
                                 * root), so that a second bootable Linux
                                 * partition is detected above and rejected
                                 * below instead of silently replacing the
                                 * first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3081
/*
 * Mounts the block device what onto where (or onto where + directory, if
 * directory is non-NULL), with MS_NODEV and the file system type detected by
 * blkid.
 *
 * The mount is read-only if rw is false or arg_read_only is set. LUKS
 * volumes are rejected with -ENOTSUP.
 *
 * Returns 0 on success, a negative errno-style value on failure (-ENOTSUP if
 * built without blkid support).
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. (-1 is a
                 * genuine probing error and is handled below.) */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3145
3146 static int mount_devices(
3147 const char *where,
3148 const char *root_device, bool root_device_rw,
3149 const char *home_device, bool home_device_rw,
3150 const char *srv_device, bool srv_device_rw) {
3151 int r;
3152
3153 assert(where);
3154
3155 if (root_device) {
3156 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3157 if (r < 0)
3158 return log_error_errno(r, "Failed to mount root directory: %m");
3159 }
3160
3161 if (home_device) {
3162 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3163 if (r < 0)
3164 return log_error_errno(r, "Failed to mount home directory: %m");
3165 }
3166
3167 if (srv_device) {
3168 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3169 if (r < 0)
3170 return log_error_errno(r, "Failed to mount server data directory: %m");
3171 }
3172
3173 return 0;
3174 }
3175
3176 static void loop_remove(int nr, int *image_fd) {
3177 _cleanup_close_ int control = -1;
3178 int r;
3179
3180 if (nr < 0)
3181 return;
3182
3183 if (image_fd && *image_fd >= 0) {
3184 r = ioctl(*image_fd, LOOP_CLR_FD);
3185 if (r < 0)
3186 log_debug_errno(errno, "Failed to close loop image: %m");
3187 *image_fd = safe_close(*image_fd);
3188 }
3189
3190 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3191 if (control < 0) {
3192 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3193 return;
3194 }
3195
3196 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3197 if (r < 0)
3198 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3199 }
3200
3201 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3202 int pipe_fds[2];
3203 pid_t pid;
3204
3205 assert(database);
3206 assert(key);
3207 assert(rpid);
3208
3209 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3210 return log_error_errno(errno, "Failed to allocate pipe: %m");
3211
3212 pid = fork();
3213 if (pid < 0)
3214 return log_error_errno(errno, "Failed to fork getent child: %m");
3215 else if (pid == 0) {
3216 int nullfd;
3217 char *empty_env = NULL;
3218
3219 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3220 _exit(EXIT_FAILURE);
3221
3222 if (pipe_fds[0] > 2)
3223 safe_close(pipe_fds[0]);
3224 if (pipe_fds[1] > 2)
3225 safe_close(pipe_fds[1]);
3226
3227 nullfd = open("/dev/null", O_RDWR);
3228 if (nullfd < 0)
3229 _exit(EXIT_FAILURE);
3230
3231 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3232 _exit(EXIT_FAILURE);
3233
3234 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3235 _exit(EXIT_FAILURE);
3236
3237 if (nullfd > 2)
3238 safe_close(nullfd);
3239
3240 reset_all_signal_handlers();
3241 close_all_fds(NULL, 0);
3242
3243 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3244 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3245 _exit(EXIT_FAILURE);
3246 }
3247
3248 pipe_fds[1] = safe_close(pipe_fds[1]);
3249
3250 *rpid = pid;
3251
3252 return pipe_fds[0];
3253 }
3254
3255 static int change_uid_gid(char **_home) {
3256 char line[LINE_MAX], *x, *u, *g, *h;
3257 const char *word, *state;
3258 _cleanup_free_ uid_t *uids = NULL;
3259 _cleanup_free_ char *home = NULL;
3260 _cleanup_fclose_ FILE *f = NULL;
3261 _cleanup_close_ int fd = -1;
3262 unsigned n_uids = 0;
3263 size_t sz = 0, l;
3264 uid_t uid;
3265 gid_t gid;
3266 pid_t pid;
3267 int r;
3268
3269 assert(_home);
3270
3271 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3272 /* Reset everything fully to 0, just in case */
3273
3274 if (setgroups(0, NULL) < 0)
3275 return log_error_errno(errno, "setgroups() failed: %m");
3276
3277 if (setresgid(0, 0, 0) < 0)
3278 return log_error_errno(errno, "setregid() failed: %m");
3279
3280 if (setresuid(0, 0, 0) < 0)
3281 return log_error_errno(errno, "setreuid() failed: %m");
3282
3283 *_home = NULL;
3284 return 0;
3285 }
3286
3287 /* First, get user credentials */
3288 fd = spawn_getent("passwd", arg_user, &pid);
3289 if (fd < 0)
3290 return fd;
3291
3292 f = fdopen(fd, "r");
3293 if (!f)
3294 return log_oom();
3295 fd = -1;
3296
3297 if (!fgets(line, sizeof(line), f)) {
3298
3299 if (!ferror(f)) {
3300 log_error("Failed to resolve user %s.", arg_user);
3301 return -ESRCH;
3302 }
3303
3304 log_error_errno(errno, "Failed to read from getent: %m");
3305 return -errno;
3306 }
3307
3308 truncate_nl(line);
3309
3310 wait_for_terminate_and_warn("getent passwd", pid, true);
3311
3312 x = strchr(line, ':');
3313 if (!x) {
3314 log_error("/etc/passwd entry has invalid user field.");
3315 return -EIO;
3316 }
3317
3318 u = strchr(x+1, ':');
3319 if (!u) {
3320 log_error("/etc/passwd entry has invalid password field.");
3321 return -EIO;
3322 }
3323
3324 u++;
3325 g = strchr(u, ':');
3326 if (!g) {
3327 log_error("/etc/passwd entry has invalid UID field.");
3328 return -EIO;
3329 }
3330
3331 *g = 0;
3332 g++;
3333 x = strchr(g, ':');
3334 if (!x) {
3335 log_error("/etc/passwd entry has invalid GID field.");
3336 return -EIO;
3337 }
3338
3339 *x = 0;
3340 h = strchr(x+1, ':');
3341 if (!h) {
3342 log_error("/etc/passwd entry has invalid GECOS field.");
3343 return -EIO;
3344 }
3345
3346 h++;
3347 x = strchr(h, ':');
3348 if (!x) {
3349 log_error("/etc/passwd entry has invalid home directory field.");
3350 return -EIO;
3351 }
3352
3353 *x = 0;
3354
3355 r = parse_uid(u, &uid);
3356 if (r < 0) {
3357 log_error("Failed to parse UID of user.");
3358 return -EIO;
3359 }
3360
3361 r = parse_gid(g, &gid);
3362 if (r < 0) {
3363 log_error("Failed to parse GID of user.");
3364 return -EIO;
3365 }
3366
3367 home = strdup(h);
3368 if (!home)
3369 return log_oom();
3370
3371 /* Second, get group memberships */
3372 fd = spawn_getent("initgroups", arg_user, &pid);
3373 if (fd < 0)
3374 return fd;
3375
3376 fclose(f);
3377 f = fdopen(fd, "r");
3378 if (!f)
3379 return log_oom();
3380 fd = -1;
3381
3382 if (!fgets(line, sizeof(line), f)) {
3383 if (!ferror(f)) {
3384 log_error("Failed to resolve user %s.", arg_user);
3385 return -ESRCH;
3386 }
3387
3388 log_error_errno(errno, "Failed to read from getent: %m");
3389 return -errno;
3390 }
3391
3392 truncate_nl(line);
3393
3394 wait_for_terminate_and_warn("getent initgroups", pid, true);
3395
3396 /* Skip over the username and subsequent separator whitespace */
3397 x = line;
3398 x += strcspn(x, WHITESPACE);
3399 x += strspn(x, WHITESPACE);
3400
3401 FOREACH_WORD(word, l, x, state) {
3402 char c[l+1];
3403
3404 memcpy(c, word, l);
3405 c[l] = 0;
3406
3407 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3408 return log_oom();
3409
3410 r = parse_uid(c, &uids[n_uids++]);
3411 if (r < 0) {
3412 log_error("Failed to parse group data from getent.");
3413 return -EIO;
3414 }
3415 }
3416
3417 r = mkdir_parents(home, 0775);
3418 if (r < 0)
3419 return log_error_errno(r, "Failed to make home root directory: %m");
3420
3421 r = mkdir_safe(home, 0755, uid, gid);
3422 if (r < 0 && r != -EEXIST)
3423 return log_error_errno(r, "Failed to make home directory: %m");
3424
3425 fchown(STDIN_FILENO, uid, gid);
3426 fchown(STDOUT_FILENO, uid, gid);
3427 fchown(STDERR_FILENO, uid, gid);
3428
3429 if (setgroups(n_uids, uids) < 0)
3430 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3431
3432 if (setresgid(gid, gid, gid) < 0)
3433 return log_error_errno(errno, "setregid() failed: %m");
3434
3435 if (setresuid(uid, uid, uid) < 0)
3436 return log_error_errno(errno, "setreuid() failed: %m");
3437
3438 if (_home) {
3439 *_home = home;
3440 home = NULL;
3441 }
3442
3443 return 0;
3444 }
3445
3446 /*
3447 * Return values:
3448 * < 0 : wait_for_terminate() failed to get the state of the
3449 * container, the container was terminated by a signal, or
3450 * failed for an unknown reason. No change is made to the
3451 * container argument.
3452 * > 0 : The program executed in the container terminated with an
3453 * error. The exit code of the program executed in the
3454 * container is returned. The container argument has been set
3455 * to CONTAINER_TERMINATED.
3456 * 0 : The container is being rebooted, has been shut down or exited
3457 * successfully. The container argument has been set to either
3458 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3459 *
3460 * That is, success is indicated by a return value of zero, and an
3461 * error is indicated by a non-zero value.
3462 */
3463 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3464 siginfo_t status;
3465 int r;
3466
3467 r = wait_for_terminate(pid, &status);
3468 if (r < 0)
3469 return log_warning_errno(r, "Failed to wait for container: %m");
3470
3471 switch (status.si_code) {
3472
3473 case CLD_EXITED:
3474 if (status.si_status == 0) {
3475 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3476
3477 } else
3478 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3479
3480 *container = CONTAINER_TERMINATED;
3481 return status.si_status;
3482
3483 case CLD_KILLED:
3484 if (status.si_status == SIGINT) {
3485
3486 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3487 *container = CONTAINER_TERMINATED;
3488 return 0;
3489
3490 } else if (status.si_status == SIGHUP) {
3491
3492 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3493 *container = CONTAINER_REBOOTED;
3494 return 0;
3495 }
3496
3497 /* CLD_KILLED fallthrough */
3498
3499 case CLD_DUMPED:
3500 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3501 return -EIO;
3502
3503 default:
3504 log_error("Container %s failed due to unknown reason.", arg_machine);
3505 return -EIO;
3506 }
3507
3508 return r;
3509 }
3510
/* Signal handler that deliberately does nothing. main() installs it
 * for SIGCHLD so that the signal is delivered to a handler instead of
 * being subject to its default disposition. */
static void nop_handler(int sig) {
        (void) sig;
}
3512
3513 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3514 pid_t pid;
3515
3516 pid = PTR_TO_UINT32(userdata);
3517 if (pid > 0) {
3518 if (kill(pid, SIGRTMIN+3) >= 0) {
3519 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3520 sd_event_source_set_userdata(s, NULL);
3521 return 0;
3522 }
3523 }
3524
3525 sd_event_exit(sd_event_source_get_event(s), 0);
3526 return 0;
3527 }
3528
3529 static int determine_names(void) {
3530 int r;
3531
3532 if (!arg_image && !arg_directory) {
3533 if (arg_machine) {
3534 _cleanup_(image_unrefp) Image *i = NULL;
3535
3536 r = image_find(arg_machine, &i);
3537 if (r < 0)
3538 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3539 else if (r == 0) {
3540 log_error("No image for machine '%s': %m", arg_machine);
3541 return -ENOENT;
3542 }
3543
3544 if (i->type == IMAGE_RAW)
3545 r = set_sanitized_path(&arg_image, i->path);
3546 else
3547 r = set_sanitized_path(&arg_directory, i->path);
3548 if (r < 0)
3549 return log_error_errno(r, "Invalid image directory: %m");
3550
3551 arg_read_only = arg_read_only || i->read_only;
3552 } else
3553 arg_directory = get_current_dir_name();
3554
3555 if (!arg_directory && !arg_machine) {
3556 log_error("Failed to determine path, please use -D or -i.");
3557 return -EINVAL;
3558 }
3559 }
3560
3561 if (!arg_machine) {
3562 if (arg_directory && path_equal(arg_directory, "/"))
3563 arg_machine = gethostname_malloc();
3564 else
3565 arg_machine = strdup(basename(arg_image ?: arg_directory));
3566
3567 if (!arg_machine)
3568 return log_oom();
3569
3570 hostname_cleanup(arg_machine, false);
3571 if (!machine_name_is_valid(arg_machine)) {
3572 log_error("Failed to determine machine name automatically, please use -M.");
3573 return -EINVAL;
3574 }
3575
3576 if (arg_ephemeral) {
3577 char *b;
3578
3579 /* Add a random suffix when this is an
3580 * ephemeral machine, so that we can run many
3581 * instances at once without manually having
3582 * to specify -M each time. */
3583
3584 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3585 return log_oom();
3586
3587 free(arg_machine);
3588 arg_machine = b;
3589 }
3590 }
3591
3592 return 0;
3593 }
3594
3595 int main(int argc, char *argv[]) {
3596
3597 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3598 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3599 _cleanup_close_ int master = -1, image_fd = -1;
3600 _cleanup_fdset_free_ FDSet *fds = NULL;
3601 int r, n_fd_passed, loop_nr = -1;
3602 char veth_name[IFNAMSIZ];
3603 bool secondary = false, remove_subvol = false;
3604 sigset_t mask, mask_chld;
3605 pid_t pid = 0;
3606 int ret = EXIT_SUCCESS;
3607 union in_addr_union exposed = {};
3608 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3609
3610 log_parse_environment();
3611 log_open();
3612
3613 r = parse_argv(argc, argv);
3614 if (r <= 0)
3615 goto finish;
3616
3617 r = determine_names();
3618 if (r < 0)
3619 goto finish;
3620
3621 if (geteuid() != 0) {
3622 log_error("Need to be root.");
3623 r = -EPERM;
3624 goto finish;
3625 }
3626
3627 if (sd_booted() <= 0) {
3628 log_error("Not running on a systemd system.");
3629 r = -EINVAL;
3630 goto finish;
3631 }
3632
3633 log_close();
3634 n_fd_passed = sd_listen_fds(false);
3635 if (n_fd_passed > 0) {
3636 r = fdset_new_listen_fds(&fds, false);
3637 if (r < 0) {
3638 log_error_errno(r, "Failed to collect file descriptors: %m");
3639 goto finish;
3640 }
3641 }
3642 fdset_close_others(fds);
3643 log_open();
3644
3645 if (arg_directory) {
3646 assert(!arg_image);
3647
3648 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3649 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3650 r = -EINVAL;
3651 goto finish;
3652 }
3653
3654 if (arg_ephemeral) {
3655 char *np;
3656
3657 /* If the specified path is a mount point we
3658 * generate the new snapshot immediately
3659 * inside it under a random name. However if
3660 * the specified is not a mount point we
3661 * create the new snapshot in the parent
3662 * directory, just next to it. */
3663 r = path_is_mount_point(arg_directory, false);
3664 if (r < 0) {
3665 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3666 goto finish;
3667 }
3668 if (r > 0)
3669 r = tempfn_random_child(arg_directory, &np);
3670 else
3671 r = tempfn_random(arg_directory, &np);
3672 if (r < 0) {
3673 log_error_errno(r, "Failed to generate name for snapshot: %m");
3674 goto finish;
3675 }
3676
3677 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3678 if (r < 0) {
3679 log_error_errno(r, "Failed to lock %s: %m", np);
3680 goto finish;
3681 }
3682
3683 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3684 if (r < 0) {
3685 free(np);
3686 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3687 goto finish;
3688 }
3689
3690 free(arg_directory);
3691 arg_directory = np;
3692
3693 remove_subvol = true;
3694
3695 } else {
3696 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3697 if (r == -EBUSY) {
3698 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3699 goto finish;
3700 }
3701 if (r < 0) {
3702 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3703 return r;
3704 }
3705
3706 if (arg_template) {
3707 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3708 if (r == -EEXIST) {
3709 if (!arg_quiet)
3710 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3711 } else if (r < 0) {
3712 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3713 goto finish;
3714 } else {
3715 if (!arg_quiet)
3716 log_info("Populated %s from template %s.", arg_directory, arg_template);
3717 }
3718 }
3719 }
3720
3721 if (arg_boot) {
3722 if (path_is_os_tree(arg_directory) <= 0) {
3723 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3724 r = -EINVAL;
3725 goto finish;
3726 }
3727 } else {
3728 const char *p;
3729
3730 p = strjoina(arg_directory,
3731 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3732 if (access(p, F_OK) < 0) {
3733 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3734 r = -EINVAL;
3735 goto finish;
3736 }
3737 }
3738
3739 } else {
3740 char template[] = "/tmp/nspawn-root-XXXXXX";
3741
3742 assert(arg_image);
3743 assert(!arg_template);
3744
3745 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3746 if (r == -EBUSY) {
3747 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3748 goto finish;
3749 }
3750 if (r < 0) {
3751 r = log_error_errno(r, "Failed to create image lock: %m");
3752 goto finish;
3753 }
3754
3755 if (!mkdtemp(template)) {
3756 log_error_errno(errno, "Failed to create temporary directory: %m");
3757 r = -errno;
3758 goto finish;
3759 }
3760
3761 arg_directory = strdup(template);
3762 if (!arg_directory) {
3763 r = log_oom();
3764 goto finish;
3765 }
3766
3767 image_fd = setup_image(&device_path, &loop_nr);
3768 if (image_fd < 0) {
3769 r = image_fd;
3770 goto finish;
3771 }
3772
3773 r = dissect_image(image_fd,
3774 &root_device, &root_device_rw,
3775 &home_device, &home_device_rw,
3776 &srv_device, &srv_device_rw,
3777 &secondary);
3778 if (r < 0)
3779 goto finish;
3780 }
3781
3782 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3783 if (master < 0) {
3784 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3785 goto finish;
3786 }
3787
3788 r = ptsname_malloc(master, &console);
3789 if (r < 0) {
3790 r = log_error_errno(r, "Failed to determine tty name: %m");
3791 goto finish;
3792 }
3793
3794 if (!arg_quiet)
3795 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3796 arg_machine, arg_image ?: arg_directory);
3797
3798 if (unlockpt(master) < 0) {
3799 r = log_error_errno(errno, "Failed to unlock tty: %m");
3800 goto finish;
3801 }
3802
3803 assert_se(sigemptyset(&mask) == 0);
3804 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3805 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3806
3807 assert_se(sigemptyset(&mask_chld) == 0);
3808 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3809
3810 for (;;) {
3811 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3812 ContainerStatus container_status;
3813 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3814 struct sigaction sa = {
3815 .sa_handler = nop_handler,
3816 .sa_flags = SA_NOCLDSTOP,
3817 };
3818
3819 r = barrier_create(&barrier);
3820 if (r < 0) {
3821 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3822 goto finish;
3823 }
3824
3825 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3826 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3827 goto finish;
3828 }
3829
3830 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3831 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3832 goto finish;
3833 }
3834
3835 /* Child can be killed before execv(), so handle SIGCHLD
3836 * in order to interrupt parent's blocking calls and
3837 * give it a chance to call wait() and terminate. */
3838 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3839 if (r < 0) {
3840 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3841 goto finish;
3842 }
3843
3844 r = sigaction(SIGCHLD, &sa, NULL);
3845 if (r < 0) {
3846 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3847 goto finish;
3848 }
3849
3850 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3851 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3852 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3853 if (pid < 0) {
3854 if (errno == EINVAL)
3855 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3856 else
3857 r = log_error_errno(errno, "clone() failed: %m");
3858
3859 goto finish;
3860 }
3861
3862 if (pid == 0) {
3863 /* child */
3864 _cleanup_free_ char *home = NULL;
3865 unsigned n_env = 2;
3866 const char *envp[] = {
3867 "PATH=" DEFAULT_PATH_SPLIT_USR,
3868 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3869 NULL, /* TERM */
3870 NULL, /* HOME */
3871 NULL, /* USER */
3872 NULL, /* LOGNAME */
3873 NULL, /* container_uuid */
3874 NULL, /* LISTEN_FDS */
3875 NULL, /* LISTEN_PID */
3876 NULL
3877 };
3878 char **env_use;
3879
3880 barrier_set_role(&barrier, BARRIER_CHILD);
3881
3882 envp[n_env] = strv_find_prefix(environ, "TERM=");
3883 if (envp[n_env])
3884 n_env ++;
3885
3886 master = safe_close(master);
3887
3888 close_nointr(STDIN_FILENO);
3889 close_nointr(STDOUT_FILENO);
3890 close_nointr(STDERR_FILENO);
3891
3892 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3893 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3894
3895 reset_all_signal_handlers();
3896 reset_signal_mask();
3897
3898 r = open_terminal(console, O_RDWR);
3899 if (r != STDIN_FILENO) {
3900 if (r >= 0) {
3901 safe_close(r);
3902 r = -EINVAL;
3903 }
3904
3905 log_error_errno(r, "Failed to open console: %m");
3906 _exit(EXIT_FAILURE);
3907 }
3908
3909 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3910 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3911 log_error_errno(errno, "Failed to duplicate console: %m");
3912 _exit(EXIT_FAILURE);
3913 }
3914
3915 if (setsid() < 0) {
3916 log_error_errno(errno, "setsid() failed: %m");
3917 _exit(EXIT_FAILURE);
3918 }
3919
3920 if (reset_audit_loginuid() < 0)
3921 _exit(EXIT_FAILURE);
3922
3923 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3924 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3925 _exit(EXIT_FAILURE);
3926 }
3927
3928 /* Mark everything as slave, so that we still
3929 * receive mounts from the real root, but don't
3930 * propagate mounts to the real root. */
3931 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3932 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3933 _exit(EXIT_FAILURE);
3934 }
3935
3936 if (mount_devices(arg_directory,
3937 root_device, root_device_rw,
3938 home_device, home_device_rw,
3939 srv_device, srv_device_rw) < 0)
3940 _exit(EXIT_FAILURE);
3941
3942 /* Turn directory into bind mount */
3943 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3944 log_error_errno(errno, "Failed to make bind mount: %m");
3945 _exit(EXIT_FAILURE);
3946 }
3947
3948 r = setup_volatile(arg_directory);
3949 if (r < 0)
3950 _exit(EXIT_FAILURE);
3951
3952 if (setup_volatile_state(arg_directory) < 0)
3953 _exit(EXIT_FAILURE);
3954
3955 r = base_filesystem_create(arg_directory);
3956 if (r < 0)
3957 _exit(EXIT_FAILURE);
3958
3959 if (arg_read_only) {
3960 r = bind_remount_recursive(arg_directory, true);
3961 if (r < 0) {
3962 log_error_errno(r, "Failed to make tree read-only: %m");
3963 _exit(EXIT_FAILURE);
3964 }
3965 }
3966
3967 if (mount_all(arg_directory) < 0)
3968 _exit(EXIT_FAILURE);
3969
3970 if (copy_devnodes(arg_directory) < 0)
3971 _exit(EXIT_FAILURE);
3972
3973 if (setup_ptmx(arg_directory) < 0)
3974 _exit(EXIT_FAILURE);
3975
3976 dev_setup(arg_directory);
3977
3978 if (setup_propagate(arg_directory) < 0)
3979 _exit(EXIT_FAILURE);
3980
3981 if (setup_seccomp() < 0)
3982 _exit(EXIT_FAILURE);
3983
3984 if (setup_dev_console(arg_directory, console) < 0)
3985 _exit(EXIT_FAILURE);
3986
3987 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3988 _exit(EXIT_FAILURE);
3989 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3990
3991 if (send_rtnl(rtnl_socket_pair[1]) < 0)
3992 _exit(EXIT_FAILURE);
3993 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3994
3995 /* Tell the parent that we are ready, and that
3996 * it can cgroupify us to that we lack access
3997 * to certain devices and resources. */
3998 (void) barrier_place(&barrier);
3999
4000 if (setup_boot_id(arg_directory) < 0)
4001 _exit(EXIT_FAILURE);
4002
4003 if (setup_timezone(arg_directory) < 0)
4004 _exit(EXIT_FAILURE);
4005
4006 if (setup_resolv_conf(arg_directory) < 0)
4007 _exit(EXIT_FAILURE);
4008
4009 if (setup_journal(arg_directory) < 0)
4010 _exit(EXIT_FAILURE);
4011
4012 if (mount_binds(arg_directory, arg_bind, false) < 0)
4013 _exit(EXIT_FAILURE);
4014
4015 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4016 _exit(EXIT_FAILURE);
4017
4018 if (mount_tmpfs(arg_directory) < 0)
4019 _exit(EXIT_FAILURE);
4020
4021 /* Wait until we are cgroup-ified, so that we
4022 * can mount the right cgroup path writable */
4023 (void) barrier_sync_next(&barrier);
4024
4025 if (mount_cgroup(arg_directory) < 0)
4026 _exit(EXIT_FAILURE);
4027
4028 if (chdir(arg_directory) < 0) {
4029 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4030 _exit(EXIT_FAILURE);
4031 }
4032
4033 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4034 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4035 _exit(EXIT_FAILURE);
4036 }
4037
4038 if (chroot(".") < 0) {
4039 log_error_errno(errno, "chroot() failed: %m");
4040 _exit(EXIT_FAILURE);
4041 }
4042
4043 if (chdir("/") < 0) {
4044 log_error_errno(errno, "chdir() failed: %m");
4045 _exit(EXIT_FAILURE);
4046 }
4047
4048 umask(0022);
4049
4050 if (arg_private_network)
4051 loopback_setup();
4052
4053 if (drop_capabilities() < 0) {
4054 log_error_errno(errno, "drop_capabilities() failed: %m");
4055 _exit(EXIT_FAILURE);
4056 }
4057
4058 r = change_uid_gid(&home);
4059 if (r < 0)
4060 _exit(EXIT_FAILURE);
4061
4062 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4063 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4064 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4065 log_oom();
4066 _exit(EXIT_FAILURE);
4067 }
4068
4069 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4070 char as_uuid[37];
4071
4072 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4073 log_oom();
4074 _exit(EXIT_FAILURE);
4075 }
4076 }
4077
4078 if (fdset_size(fds) > 0) {
4079 r = fdset_cloexec(fds, false);
4080 if (r < 0) {
4081 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4082 _exit(EXIT_FAILURE);
4083 }
4084
4085 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4086 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4087 log_oom();
4088 _exit(EXIT_FAILURE);
4089 }
4090 }
4091
4092 setup_hostname();
4093
4094 if (arg_personality != 0xffffffffLU) {
4095 if (personality(arg_personality) < 0) {
4096 log_error_errno(errno, "personality() failed: %m");
4097 _exit(EXIT_FAILURE);
4098 }
4099 } else if (secondary) {
4100 if (personality(PER_LINUX32) < 0) {
4101 log_error_errno(errno, "personality() failed: %m");
4102 _exit(EXIT_FAILURE);
4103 }
4104 }
4105
4106 #ifdef HAVE_SELINUX
4107 if (arg_selinux_context)
4108 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4109 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4110 _exit(EXIT_FAILURE);
4111 }
4112 #endif
4113
4114 if (!strv_isempty(arg_setenv)) {
4115 char **n;
4116
4117 n = strv_env_merge(2, envp, arg_setenv);
4118 if (!n) {
4119 log_oom();
4120 _exit(EXIT_FAILURE);
4121 }
4122
4123 env_use = n;
4124 } else
4125 env_use = (char**) envp;
4126
4127 /* Wait until the parent is ready with the setup, too... */
4128 if (!barrier_place_and_sync(&barrier))
4129 _exit(EXIT_FAILURE);
4130
4131 if (arg_boot) {
4132 char **a;
4133 size_t l;
4134
4135 /* Automatically search for the init system */
4136
4137 l = 1 + argc - optind;
4138 a = newa(char*, l + 1);
4139 memcpy(a + 1, argv + optind, l * sizeof(char*));
4140
4141 a[0] = (char*) "/usr/lib/systemd/systemd";
4142 execve(a[0], a, env_use);
4143
4144 a[0] = (char*) "/lib/systemd/systemd";
4145 execve(a[0], a, env_use);
4146
4147 a[0] = (char*) "/sbin/init";
4148 execve(a[0], a, env_use);
4149 } else if (argc > optind)
4150 execvpe(argv[optind], argv + optind, env_use);
4151 else {
4152 chdir(home ? home : "/root");
4153 execle("/bin/bash", "-bash", NULL, env_use);
4154 execle("/bin/sh", "-sh", NULL, env_use);
4155 }
4156
4157 log_error_errno(errno, "execv() failed: %m");
4158 _exit(EXIT_FAILURE);
4159 }
4160
4161 barrier_set_role(&barrier, BARRIER_PARENT);
4162 fdset_free(fds);
4163 fds = NULL;
4164
4165 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4166 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4167
4168 /* Wait for the most basic Child-setup to be done,
4169 * before we add hardware to it, and place it in a
4170 * cgroup. */
4171 if (barrier_sync_next(&barrier)) {
4172 int ifi = 0;
4173
4174 r = move_network_interfaces(pid);
4175 if (r < 0)
4176 goto finish;
4177
4178 r = setup_veth(pid, veth_name, &ifi);
4179 if (r < 0)
4180 goto finish;
4181
4182 r = setup_bridge(veth_name, &ifi);
4183 if (r < 0)
4184 goto finish;
4185
4186 r = setup_macvlan(pid);
4187 if (r < 0)
4188 goto finish;
4189
4190 r = setup_ipvlan(pid);
4191 if (r < 0)
4192 goto finish;
4193
4194 r = register_machine(pid, ifi);
4195 if (r < 0)
4196 goto finish;
4197
4198 /* Block SIGCHLD here, before notifying child.
4199 * process_pty() will handle it with the other signals. */
4200 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4201 if (r < 0)
4202 goto finish;
4203
4204 /* Reset signal to default */
4205 r = default_signals(SIGCHLD, -1);
4206 if (r < 0)
4207 goto finish;
4208
4209 /* Notify the child that the parent is ready with all
4210 * its setup, and that the child can now hand over
4211 * control to the code to run inside the container. */
4212 (void) barrier_place(&barrier);
4213
4214 /* And wait that the child is completely ready now. */
4215 if (barrier_place_and_sync(&barrier)) {
4216 _cleanup_event_unref_ sd_event *event = NULL;
4217 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4218 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4219 char last_char = 0;
4220
4221 sd_notifyf(false,
4222 "READY=1\n"
4223 "STATUS=Container running.\n"
4224 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4225
4226 r = sd_event_new(&event);
4227 if (r < 0) {
4228 log_error_errno(r, "Failed to get default event source: %m");
4229 goto finish;
4230 }
4231
4232 if (arg_boot) {
4233 /* Try to kill the init system on SIGINT or SIGTERM */
4234 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4235 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4236 } else {
4237 /* Immediately exit */
4238 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4239 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4240 }
4241
4242 /* simply exit on sigchld */
4243 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4244
4245 if (arg_expose_ports) {
4246 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4247 if (r < 0)
4248 goto finish;
4249
4250 (void) expose_ports(rtnl, &exposed);
4251 }
4252
4253 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4254
4255 r = pty_forward_new(event, master, true, &forward);
4256 if (r < 0) {
4257 log_error_errno(r, "Failed to create PTY forwarder: %m");
4258 goto finish;
4259 }
4260
4261 r = sd_event_loop(event);
4262 if (r < 0) {
4263 log_error_errno(r, "Failed to run event loop: %m");
4264 goto finish;
4265 }
4266
4267 pty_forward_get_last_char(forward, &last_char);
4268
4269 forward = pty_forward_free(forward);
4270
4271 if (!arg_quiet && last_char != '\n')
4272 putc('\n', stdout);
4273
4274 /* Kill if it is not dead yet anyway */
4275 terminate_machine(pid);
4276 }
4277 }
4278
4279 /* Normally redundant, but better safe than sorry */
4280 kill(pid, SIGKILL);
4281
4282 r = wait_for_container(pid, &container_status);
4283 pid = 0;
4284
4285 if (r < 0)
4286 /* We failed to wait for the container, or the
4287 * container exited abnormally */
4288 goto finish;
4289 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4290 /* The container exited with a non-zero
4291 * status, or with zero status and no reboot
4292 * was requested. */
4293 ret = r;
4294 break;
4295 }
4296
4297 /* CONTAINER_REBOOTED, loop again */
4298
4299 if (arg_keep_unit) {
4300 /* Special handling if we are running as a
4301 * service: instead of simply restarting the
4302 * machine we want to restart the entire
4303 * service, so let's inform systemd about this
4304 * with the special exit code 133. The service
4305 * file uses RestartForceExitStatus=133 so
4306 * that this results in a full nspawn
4307 * restart. This is necessary since we might
4308 * have cgroup parameters set we want to have
4309 * flushed out. */
4310 ret = 133;
4311 r = 0;
4312 break;
4313 }
4314
4315 flush_ports(&exposed);
4316 }
4317
4318 finish:
4319 sd_notify(false,
4320 "STOPPING=1\n"
4321 "STATUS=Terminating...");
4322
4323 loop_remove(loop_nr, &image_fd);
4324
4325 if (pid > 0)
4326 kill(pid, SIGKILL);
4327
4328 if (remove_subvol && arg_directory) {
4329 int k;
4330
4331 k = btrfs_subvol_remove(arg_directory);
4332 if (k < 0)
4333 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4334 }
4335
4336 if (arg_machine) {
4337 const char *p;
4338
4339 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4340 (void) rm_rf(p, false, true, false);
4341 }
4342
4343 free(arg_directory);
4344 free(arg_template);
4345 free(arg_image);
4346 free(arg_machine);
4347 free(arg_user);
4348 strv_free(arg_setenv);
4349 strv_free(arg_network_interfaces);
4350 strv_free(arg_network_macvlan);
4351 strv_free(arg_network_ipvlan);
4352 strv_free(arg_bind);
4353 strv_free(arg_bind_ro);
4354 strv_free(arg_tmpfs);
4355
4356 flush_ports(&exposed);
4357
4358 while (arg_expose_ports) {
4359 ExposePort *p = arg_expose_ports;
4360 LIST_REMOVE(ports, arg_expose_ports, p);
4361 free(p);
4362 }
4363
4364 return r < 0 ? EXIT_FAILURE : ret;
4365 }