]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
tty-ask-password: Split out password sending
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "alloc-util.h"
50 #include "barrier.h"
51 #include "base-filesystem.h"
52 #include "blkid-util.h"
53 #include "btrfs-util.h"
54 #include "cap-list.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
57 #include "copy.h"
58 #include "dev-setup.h"
59 #include "env-util.h"
60 #include "fd-util.h"
61 #include "fdset.h"
62 #include "fileio.h"
63 #include "formats-util.h"
64 #include "fs-util.h"
65 #include "gpt.h"
66 #include "hostname-util.h"
67 #include "log.h"
68 #include "loopback-setup.h"
69 #include "machine-image.h"
70 #include "macro.h"
71 #include "missing.h"
72 #include "mkdir.h"
73 #include "mount-util.h"
74 #include "netlink-util.h"
75 #include "nspawn-cgroup.h"
76 #include "nspawn-expose-ports.h"
77 #include "nspawn-mount.h"
78 #include "nspawn-network.h"
79 #include "nspawn-register.h"
80 #include "nspawn-settings.h"
81 #include "nspawn-setuid.h"
82 #include "parse-util.h"
83 #include "path-util.h"
84 #include "process-util.h"
85 #include "ptyfwd.h"
86 #include "random-util.h"
87 #include "rm-rf.h"
88 #ifdef HAVE_SECCOMP
89 #include "seccomp-util.h"
90 #endif
91 #include "signal-util.h"
92 #include "socket-util.h"
93 #include "stat-util.h"
94 #include "stdio-util.h"
95 #include "string-util.h"
96 #include "strv.h"
97 #include "terminal-util.h"
98 #include "udev-util.h"
99 #include "umask-util.h"
100 #include "user-util.h"
101 #include "util.h"
102
/* Two-valued container status: either it terminated or it rebooted.
 * The numeric values match the implicit enumeration order. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
107
/* Journal linking mode, selected with --link-journal= ("no", "auto",
 * "host", "guest"; the "try-" variants additionally set a separate flag). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO = 1,  /* --link-journal=auto */
        LINK_HOST = 2,  /* --link-journal=host / try-host */
        LINK_GUEST = 3  /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
114
115 static char *arg_directory = NULL;
116 static char *arg_template = NULL;
117 static char *arg_user = NULL;
118 static sd_id128_t arg_uuid = {};
119 static char *arg_machine = NULL;
120 static const char *arg_selinux_context = NULL;
121 static const char *arg_selinux_apifs_context = NULL;
122 static const char *arg_slice = NULL;
123 static bool arg_private_network = false;
124 static bool arg_read_only = false;
125 static bool arg_boot = false;
126 static bool arg_ephemeral = false;
127 static LinkJournal arg_link_journal = LINK_AUTO;
128 static bool arg_link_journal_try = false;
129 static uint64_t arg_retain =
130 (1ULL << CAP_CHOWN) |
131 (1ULL << CAP_DAC_OVERRIDE) |
132 (1ULL << CAP_DAC_READ_SEARCH) |
133 (1ULL << CAP_FOWNER) |
134 (1ULL << CAP_FSETID) |
135 (1ULL << CAP_IPC_OWNER) |
136 (1ULL << CAP_KILL) |
137 (1ULL << CAP_LEASE) |
138 (1ULL << CAP_LINUX_IMMUTABLE) |
139 (1ULL << CAP_NET_BIND_SERVICE) |
140 (1ULL << CAP_NET_BROADCAST) |
141 (1ULL << CAP_NET_RAW) |
142 (1ULL << CAP_SETGID) |
143 (1ULL << CAP_SETFCAP) |
144 (1ULL << CAP_SETPCAP) |
145 (1ULL << CAP_SETUID) |
146 (1ULL << CAP_SYS_ADMIN) |
147 (1ULL << CAP_SYS_CHROOT) |
148 (1ULL << CAP_SYS_NICE) |
149 (1ULL << CAP_SYS_PTRACE) |
150 (1ULL << CAP_SYS_TTY_CONFIG) |
151 (1ULL << CAP_SYS_RESOURCE) |
152 (1ULL << CAP_SYS_BOOT) |
153 (1ULL << CAP_AUDIT_WRITE) |
154 (1ULL << CAP_AUDIT_CONTROL) |
155 (1ULL << CAP_MKNOD);
156 static CustomMount *arg_custom_mounts = NULL;
157 static unsigned arg_n_custom_mounts = 0;
158 static char **arg_setenv = NULL;
159 static bool arg_quiet = false;
160 static bool arg_share_system = false;
161 static bool arg_register = true;
162 static bool arg_keep_unit = false;
163 static char **arg_network_interfaces = NULL;
164 static char **arg_network_macvlan = NULL;
165 static char **arg_network_ipvlan = NULL;
166 static bool arg_network_veth = false;
167 static char **arg_network_veth_extra = NULL;
168 static char *arg_network_bridge = NULL;
169 static unsigned long arg_personality = PERSONALITY_INVALID;
170 static char *arg_image = NULL;
171 static VolatileMode arg_volatile_mode = VOLATILE_NO;
172 static ExposePort *arg_expose_ports = NULL;
173 static char **arg_property = NULL;
174 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
175 static bool arg_userns = false;
176 static int arg_kill_signal = 0;
177 static bool arg_unified_cgroup_hierarchy = false;
178 static SettingsMask arg_settings_mask = 0;
179 static int arg_settings_trusted = -1;
180 static char **arg_parameters = NULL;
181 static const char *arg_container_service_name = "systemd-nspawn";
182
183 static void help(void) {
184 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
185 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
186 " -h --help Show this help\n"
187 " --version Print version string\n"
188 " -q --quiet Do not show status information\n"
189 " -D --directory=PATH Root directory for the container\n"
190 " --template=PATH Initialize root directory from template directory,\n"
191 " if missing\n"
192 " -x --ephemeral Run container with snapshot of root directory, and\n"
193 " remove it after exit\n"
194 " -i --image=PATH File system device or disk image for the container\n"
195 " -b --boot Boot up full system (i.e. invoke init)\n"
196 " -u --user=USER Run the command under specified user or uid\n"
197 " -M --machine=NAME Set the machine name for the container\n"
198 " --uuid=UUID Set a specific machine UUID for the container\n"
199 " -S --slice=SLICE Place the container in the specified slice\n"
200 " --property=NAME=VALUE Set scope unit property\n"
201 " --private-users[=UIDBASE[:NUIDS]]\n"
202 " Run within user namespace\n"
203 " --private-network Disable network in container\n"
204 " --network-interface=INTERFACE\n"
205 " Assign an existing network interface to the\n"
206 " container\n"
207 " --network-macvlan=INTERFACE\n"
208 " Create a macvlan network interface based on an\n"
209 " existing network interface to the container\n"
210 " --network-ipvlan=INTERFACE\n"
211 " Create a ipvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " -n --network-veth Add a virtual Ethernet connection between host\n"
214 " and container\n"
215 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
216 " Add an additional virtual Ethernet link between\n"
217 " host and container\n"
218 " --network-bridge=INTERFACE\n"
219 " Add a virtual Ethernet connection between host\n"
220 " and container and add it to an existing bridge on\n"
221 " the host\n"
222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223 " Expose a container IP port on the host\n"
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
230 " --capability=CAP In addition to the default, retain specified\n"
231 " capability\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
237 " --read-only Mount the root directory read-only\n"
238 " --bind=PATH[:PATH[:OPTIONS]]\n"
239 " Bind mount a file or directory from the host into\n"
240 " the container\n"
241 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
242 " Similar, but creates a read-only bind mount\n"
243 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
244 " --overlay=PATH[:PATH...]:PATH\n"
245 " Create an overlay mount from the host to \n"
246 " the container\n"
247 " --overlay-ro=PATH[:PATH...]:PATH\n"
248 " Similar, but creates a read-only overlay mount\n"
249 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
250 " --share-system Share system namespaces with host\n"
251 " --register=BOOLEAN Register container as machine\n"
252 " --keep-unit Do not register a scope for the machine, reuse\n"
253 " the service unit nspawn is running in\n"
254 " --volatile[=MODE] Run the system in volatile mode\n"
255 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
256 , program_invocation_short_name);
257 }
258
259
260 static int custom_mounts_prepare(void) {
261 unsigned i;
262 int r;
263
264 /* Ensure the mounts are applied prefix first. */
265 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
266
267 /* Allocate working directories for the overlay file systems that need it */
268 for (i = 0; i < arg_n_custom_mounts; i++) {
269 CustomMount *m = &arg_custom_mounts[i];
270
271 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
272 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
273 return -EINVAL;
274 }
275
276 if (m->type != CUSTOM_MOUNT_OVERLAY)
277 continue;
278
279 if (m->work_dir)
280 continue;
281
282 if (m->read_only)
283 continue;
284
285 r = tempfn_random(m->source, NULL, &m->work_dir);
286 if (r < 0)
287 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
288 }
289
290 return 0;
291 }
292
293 static int detect_unified_cgroup_hierarchy(void) {
294 const char *e;
295 int r;
296
297 /* Allow the user to control whether the unified hierarchy is used */
298 e = getenv("UNIFIED_CGROUP_HIERARCHY");
299 if (e) {
300 r = parse_boolean(e);
301 if (r < 0)
302 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
303
304 arg_unified_cgroup_hierarchy = r;
305 return 0;
306 }
307
308 /* Otherwise inherit the default from the host system */
309 r = cg_unified();
310 if (r < 0)
311 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
312
313 arg_unified_cgroup_hierarchy = r;
314 return 0;
315 }
316
317 static int parse_argv(int argc, char *argv[]) {
318
319 enum {
320 ARG_VERSION = 0x100,
321 ARG_PRIVATE_NETWORK,
322 ARG_UUID,
323 ARG_READ_ONLY,
324 ARG_CAPABILITY,
325 ARG_DROP_CAPABILITY,
326 ARG_LINK_JOURNAL,
327 ARG_BIND,
328 ARG_BIND_RO,
329 ARG_TMPFS,
330 ARG_OVERLAY,
331 ARG_OVERLAY_RO,
332 ARG_SETENV,
333 ARG_SHARE_SYSTEM,
334 ARG_REGISTER,
335 ARG_KEEP_UNIT,
336 ARG_NETWORK_INTERFACE,
337 ARG_NETWORK_MACVLAN,
338 ARG_NETWORK_IPVLAN,
339 ARG_NETWORK_BRIDGE,
340 ARG_NETWORK_VETH_EXTRA,
341 ARG_PERSONALITY,
342 ARG_VOLATILE,
343 ARG_TEMPLATE,
344 ARG_PROPERTY,
345 ARG_PRIVATE_USERS,
346 ARG_KILL_SIGNAL,
347 ARG_SETTINGS,
348 };
349
350 static const struct option options[] = {
351 { "help", no_argument, NULL, 'h' },
352 { "version", no_argument, NULL, ARG_VERSION },
353 { "directory", required_argument, NULL, 'D' },
354 { "template", required_argument, NULL, ARG_TEMPLATE },
355 { "ephemeral", no_argument, NULL, 'x' },
356 { "user", required_argument, NULL, 'u' },
357 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
358 { "boot", no_argument, NULL, 'b' },
359 { "uuid", required_argument, NULL, ARG_UUID },
360 { "read-only", no_argument, NULL, ARG_READ_ONLY },
361 { "capability", required_argument, NULL, ARG_CAPABILITY },
362 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
363 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
364 { "bind", required_argument, NULL, ARG_BIND },
365 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
366 { "tmpfs", required_argument, NULL, ARG_TMPFS },
367 { "overlay", required_argument, NULL, ARG_OVERLAY },
368 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
369 { "machine", required_argument, NULL, 'M' },
370 { "slice", required_argument, NULL, 'S' },
371 { "setenv", required_argument, NULL, ARG_SETENV },
372 { "selinux-context", required_argument, NULL, 'Z' },
373 { "selinux-apifs-context", required_argument, NULL, 'L' },
374 { "quiet", no_argument, NULL, 'q' },
375 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
376 { "register", required_argument, NULL, ARG_REGISTER },
377 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
378 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
379 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
380 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
381 { "network-veth", no_argument, NULL, 'n' },
382 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
383 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
384 { "personality", required_argument, NULL, ARG_PERSONALITY },
385 { "image", required_argument, NULL, 'i' },
386 { "volatile", optional_argument, NULL, ARG_VOLATILE },
387 { "port", required_argument, NULL, 'p' },
388 { "property", required_argument, NULL, ARG_PROPERTY },
389 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
390 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
391 { "settings", required_argument, NULL, ARG_SETTINGS },
392 {}
393 };
394
395 int c, r;
396 const char *p, *e;
397 uint64_t plus = 0, minus = 0;
398 bool mask_all_settings = false, mask_no_settings = false;
399
400 assert(argc >= 0);
401 assert(argv);
402
403 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
404
405 switch (c) {
406
407 case 'h':
408 help();
409 return 0;
410
411 case ARG_VERSION:
412 return version();
413
414 case 'D':
415 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
416 if (r < 0)
417 return r;
418 break;
419
420 case ARG_TEMPLATE:
421 r = parse_path_argument_and_warn(optarg, false, &arg_template);
422 if (r < 0)
423 return r;
424 break;
425
426 case 'i':
427 r = parse_path_argument_and_warn(optarg, false, &arg_image);
428 if (r < 0)
429 return r;
430 break;
431
432 case 'x':
433 arg_ephemeral = true;
434 break;
435
436 case 'u':
437 r = free_and_strdup(&arg_user, optarg);
438 if (r < 0)
439 return log_oom();
440
441 arg_settings_mask |= SETTING_USER;
442 break;
443
444 case ARG_NETWORK_BRIDGE:
445 r = free_and_strdup(&arg_network_bridge, optarg);
446 if (r < 0)
447 return log_oom();
448
449 /* fall through */
450
451 case 'n':
452 arg_network_veth = true;
453 arg_private_network = true;
454 arg_settings_mask |= SETTING_NETWORK;
455 break;
456
457 case ARG_NETWORK_VETH_EXTRA:
458 r = veth_extra_parse(&arg_network_veth_extra, optarg);
459 if (r < 0)
460 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
461
462 arg_private_network = true;
463 arg_settings_mask |= SETTING_NETWORK;
464 break;
465
466 case ARG_NETWORK_INTERFACE:
467 if (strv_extend(&arg_network_interfaces, optarg) < 0)
468 return log_oom();
469
470 arg_private_network = true;
471 arg_settings_mask |= SETTING_NETWORK;
472 break;
473
474 case ARG_NETWORK_MACVLAN:
475 if (strv_extend(&arg_network_macvlan, optarg) < 0)
476 return log_oom();
477
478 arg_private_network = true;
479 arg_settings_mask |= SETTING_NETWORK;
480 break;
481
482 case ARG_NETWORK_IPVLAN:
483 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
484 return log_oom();
485
486 /* fall through */
487
488 case ARG_PRIVATE_NETWORK:
489 arg_private_network = true;
490 arg_settings_mask |= SETTING_NETWORK;
491 break;
492
493 case 'b':
494 arg_boot = true;
495 arg_settings_mask |= SETTING_BOOT;
496 break;
497
498 case ARG_UUID:
499 r = sd_id128_from_string(optarg, &arg_uuid);
500 if (r < 0) {
501 log_error("Invalid UUID: %s", optarg);
502 return r;
503 }
504
505 arg_settings_mask |= SETTING_MACHINE_ID;
506 break;
507
508 case 'S':
509 arg_slice = optarg;
510 break;
511
512 case 'M':
513 if (isempty(optarg))
514 arg_machine = mfree(arg_machine);
515 else {
516 if (!machine_name_is_valid(optarg)) {
517 log_error("Invalid machine name: %s", optarg);
518 return -EINVAL;
519 }
520
521 r = free_and_strdup(&arg_machine, optarg);
522 if (r < 0)
523 return log_oom();
524
525 break;
526 }
527
528 case 'Z':
529 arg_selinux_context = optarg;
530 break;
531
532 case 'L':
533 arg_selinux_apifs_context = optarg;
534 break;
535
536 case ARG_READ_ONLY:
537 arg_read_only = true;
538 arg_settings_mask |= SETTING_READ_ONLY;
539 break;
540
541 case ARG_CAPABILITY:
542 case ARG_DROP_CAPABILITY: {
543 p = optarg;
544 for(;;) {
545 _cleanup_free_ char *t = NULL;
546
547 r = extract_first_word(&p, &t, ",", 0);
548 if (r < 0)
549 return log_error_errno(r, "Failed to parse capability %s.", t);
550
551 if (r == 0)
552 break;
553
554 if (streq(t, "all")) {
555 if (c == ARG_CAPABILITY)
556 plus = (uint64_t) -1;
557 else
558 minus = (uint64_t) -1;
559 } else {
560 int cap;
561
562 cap = capability_from_name(t);
563 if (cap < 0) {
564 log_error("Failed to parse capability %s.", t);
565 return -EINVAL;
566 }
567
568 if (c == ARG_CAPABILITY)
569 plus |= 1ULL << (uint64_t) cap;
570 else
571 minus |= 1ULL << (uint64_t) cap;
572 }
573 }
574
575 arg_settings_mask |= SETTING_CAPABILITY;
576 break;
577 }
578
579 case 'j':
580 arg_link_journal = LINK_GUEST;
581 arg_link_journal_try = true;
582 break;
583
584 case ARG_LINK_JOURNAL:
585 if (streq(optarg, "auto")) {
586 arg_link_journal = LINK_AUTO;
587 arg_link_journal_try = false;
588 } else if (streq(optarg, "no")) {
589 arg_link_journal = LINK_NO;
590 arg_link_journal_try = false;
591 } else if (streq(optarg, "guest")) {
592 arg_link_journal = LINK_GUEST;
593 arg_link_journal_try = false;
594 } else if (streq(optarg, "host")) {
595 arg_link_journal = LINK_HOST;
596 arg_link_journal_try = false;
597 } else if (streq(optarg, "try-guest")) {
598 arg_link_journal = LINK_GUEST;
599 arg_link_journal_try = true;
600 } else if (streq(optarg, "try-host")) {
601 arg_link_journal = LINK_HOST;
602 arg_link_journal_try = true;
603 } else {
604 log_error("Failed to parse link journal mode %s", optarg);
605 return -EINVAL;
606 }
607
608 break;
609
610 case ARG_BIND:
611 case ARG_BIND_RO:
612 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
613 if (r < 0)
614 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
615
616 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
617 break;
618
619 case ARG_TMPFS:
620 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
623
624 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
625 break;
626
627 case ARG_OVERLAY:
628 case ARG_OVERLAY_RO: {
629 _cleanup_free_ char *upper = NULL, *destination = NULL;
630 _cleanup_strv_free_ char **lower = NULL;
631 CustomMount *m;
632 unsigned n = 0;
633 char **i;
634
635 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
636 if (r == -ENOMEM)
637 return log_oom();
638 else if (r < 0) {
639 log_error("Invalid overlay specification: %s", optarg);
640 return r;
641 }
642
643 STRV_FOREACH(i, lower) {
644 if (!path_is_absolute(*i)) {
645 log_error("Overlay path %s is not absolute.", *i);
646 return -EINVAL;
647 }
648
649 n++;
650 }
651
652 if (n < 2) {
653 log_error("--overlay= needs at least two colon-separated directories specified.");
654 return -EINVAL;
655 }
656
657 if (n == 2) {
658 /* If two parameters are specified,
659 * the first one is the lower, the
660 * second one the upper directory. And
661 * we'll also define the destination
662 * mount point the same as the upper. */
663 upper = lower[1];
664 lower[1] = NULL;
665
666 destination = strdup(upper);
667 if (!destination)
668 return log_oom();
669
670 } else {
671 upper = lower[n - 2];
672 destination = lower[n - 1];
673 lower[n - 2] = NULL;
674 }
675
676 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
677 if (!m)
678 return log_oom();
679
680 m->destination = destination;
681 m->source = upper;
682 m->lower = lower;
683 m->read_only = c == ARG_OVERLAY_RO;
684
685 upper = destination = NULL;
686 lower = NULL;
687
688 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
689 break;
690 }
691
692 case ARG_SETENV: {
693 char **n;
694
695 if (!env_assignment_is_valid(optarg)) {
696 log_error("Environment variable assignment '%s' is not valid.", optarg);
697 return -EINVAL;
698 }
699
700 n = strv_env_set(arg_setenv, optarg);
701 if (!n)
702 return log_oom();
703
704 strv_free(arg_setenv);
705 arg_setenv = n;
706
707 arg_settings_mask |= SETTING_ENVIRONMENT;
708 break;
709 }
710
711 case 'q':
712 arg_quiet = true;
713 break;
714
715 case ARG_SHARE_SYSTEM:
716 arg_share_system = true;
717 break;
718
719 case ARG_REGISTER:
720 r = parse_boolean(optarg);
721 if (r < 0) {
722 log_error("Failed to parse --register= argument: %s", optarg);
723 return r;
724 }
725
726 arg_register = r;
727 break;
728
729 case ARG_KEEP_UNIT:
730 arg_keep_unit = true;
731 break;
732
733 case ARG_PERSONALITY:
734
735 arg_personality = personality_from_string(optarg);
736 if (arg_personality == PERSONALITY_INVALID) {
737 log_error("Unknown or unsupported personality '%s'.", optarg);
738 return -EINVAL;
739 }
740
741 arg_settings_mask |= SETTING_PERSONALITY;
742 break;
743
744 case ARG_VOLATILE:
745
746 if (!optarg)
747 arg_volatile_mode = VOLATILE_YES;
748 else {
749 VolatileMode m;
750
751 m = volatile_mode_from_string(optarg);
752 if (m < 0) {
753 log_error("Failed to parse --volatile= argument: %s", optarg);
754 return -EINVAL;
755 } else
756 arg_volatile_mode = m;
757 }
758
759 arg_settings_mask |= SETTING_VOLATILE_MODE;
760 break;
761
762 case 'p':
763 r = expose_port_parse(&arg_expose_ports, optarg);
764 if (r == -EEXIST)
765 return log_error_errno(r, "Duplicate port specification: %s", optarg);
766 if (r < 0)
767 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
768
769 arg_settings_mask |= SETTING_EXPOSE_PORTS;
770 break;
771
772 case ARG_PROPERTY:
773 if (strv_extend(&arg_property, optarg) < 0)
774 return log_oom();
775
776 break;
777
778 case ARG_PRIVATE_USERS:
779 if (optarg) {
780 _cleanup_free_ char *buffer = NULL;
781 const char *range, *shift;
782
783 range = strchr(optarg, ':');
784 if (range) {
785 buffer = strndup(optarg, range - optarg);
786 if (!buffer)
787 return log_oom();
788 shift = buffer;
789
790 range++;
791 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
792 log_error("Failed to parse UID range: %s", range);
793 return -EINVAL;
794 }
795 } else
796 shift = optarg;
797
798 if (parse_uid(shift, &arg_uid_shift) < 0) {
799 log_error("Failed to parse UID: %s", optarg);
800 return -EINVAL;
801 }
802 }
803
804 arg_userns = true;
805 break;
806
807 case ARG_KILL_SIGNAL:
808 arg_kill_signal = signal_from_string_try_harder(optarg);
809 if (arg_kill_signal < 0) {
810 log_error("Cannot parse signal: %s", optarg);
811 return -EINVAL;
812 }
813
814 arg_settings_mask |= SETTING_KILL_SIGNAL;
815 break;
816
817 case ARG_SETTINGS:
818
819 /* no → do not read files
820 * yes → read files, do not override cmdline, trust only subset
821 * override → read files, override cmdline, trust only subset
822 * trusted → read files, do not override cmdline, trust all
823 */
824
825 r = parse_boolean(optarg);
826 if (r < 0) {
827 if (streq(optarg, "trusted")) {
828 mask_all_settings = false;
829 mask_no_settings = false;
830 arg_settings_trusted = true;
831
832 } else if (streq(optarg, "override")) {
833 mask_all_settings = false;
834 mask_no_settings = true;
835 arg_settings_trusted = -1;
836 } else
837 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
838 } else if (r > 0) {
839 /* yes */
840 mask_all_settings = false;
841 mask_no_settings = false;
842 arg_settings_trusted = -1;
843 } else {
844 /* no */
845 mask_all_settings = true;
846 mask_no_settings = false;
847 arg_settings_trusted = false;
848 }
849
850 break;
851
852 case '?':
853 return -EINVAL;
854
855 default:
856 assert_not_reached("Unhandled option");
857 }
858
859 if (arg_share_system)
860 arg_register = false;
861
862 if (arg_boot && arg_share_system) {
863 log_error("--boot and --share-system may not be combined.");
864 return -EINVAL;
865 }
866
867 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
868 log_error("--keep-unit may not be used when invoked from a user session.");
869 return -EINVAL;
870 }
871
872 if (arg_directory && arg_image) {
873 log_error("--directory= and --image= may not be combined.");
874 return -EINVAL;
875 }
876
877 if (arg_template && arg_image) {
878 log_error("--template= and --image= may not be combined.");
879 return -EINVAL;
880 }
881
882 if (arg_template && !(arg_directory || arg_machine)) {
883 log_error("--template= needs --directory= or --machine=.");
884 return -EINVAL;
885 }
886
887 if (arg_ephemeral && arg_template) {
888 log_error("--ephemeral and --template= may not be combined.");
889 return -EINVAL;
890 }
891
892 if (arg_ephemeral && arg_image) {
893 log_error("--ephemeral and --image= may not be combined.");
894 return -EINVAL;
895 }
896
897 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
898 log_error("--ephemeral and --link-journal= may not be combined.");
899 return -EINVAL;
900 }
901
902 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
903 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
904
905 if (argc > optind) {
906 arg_parameters = strv_copy(argv + optind);
907 if (!arg_parameters)
908 return log_oom();
909
910 arg_settings_mask |= SETTING_BOOT;
911 }
912
913 /* Load all settings from .nspawn files */
914 if (mask_no_settings)
915 arg_settings_mask = 0;
916
917 /* Don't load any settings from .nspawn files */
918 if (mask_all_settings)
919 arg_settings_mask = _SETTINGS_MASK_ALL;
920
921 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
922
923 r = detect_unified_cgroup_hierarchy();
924 if (r < 0)
925 return r;
926
927 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
928 if (e)
929 arg_container_service_name = e;
930
931 return 1;
932 }
933
934 static int verify_arguments(void) {
935
936 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
937 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
938 return -EINVAL;
939 }
940
941 if (arg_expose_ports && !arg_private_network) {
942 log_error("Cannot use --port= without private networking.");
943 return -EINVAL;
944 }
945
946 if (arg_boot && arg_kill_signal <= 0)
947 arg_kill_signal = SIGRTMIN+3;
948
949 return 0;
950 }
951
952 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
953 assert(p);
954
955 if (!arg_userns)
956 return 0;
957
958 if (uid == UID_INVALID && gid == GID_INVALID)
959 return 0;
960
961 if (uid != UID_INVALID) {
962 uid += arg_uid_shift;
963
964 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
965 return -EOVERFLOW;
966 }
967
968 if (gid != GID_INVALID) {
969 gid += (gid_t) arg_uid_shift;
970
971 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
972 return -EOVERFLOW;
973 }
974
975 if (lchown(p, uid, gid) < 0)
976 return -errno;
977
978 return 0;
979 }
980
/* Create the directory "path" below "root" and hand it to the given
 * container-side owner via userns_lchown().
 *
 * An already existing directory is left alone and counts as success.
 * Returns 0 on success, -errno from mkdir(), or the userns_lchown() error. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                /* Ownership is only adjusted for directories created here */
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
993
994 static int setup_timezone(const char *dest) {
995 _cleanup_free_ char *p = NULL, *q = NULL;
996 const char *where, *check, *what;
997 char *z, *y;
998 int r;
999
1000 assert(dest);
1001
1002 /* Fix the timezone, if possible */
1003 r = readlink_malloc("/etc/localtime", &p);
1004 if (r < 0) {
1005 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1006 return 0;
1007 }
1008
1009 z = path_startswith(p, "../usr/share/zoneinfo/");
1010 if (!z)
1011 z = path_startswith(p, "/usr/share/zoneinfo/");
1012 if (!z) {
1013 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1014 return 0;
1015 }
1016
1017 where = prefix_roota(dest, "/etc/localtime");
1018 r = readlink_malloc(where, &q);
1019 if (r >= 0) {
1020 y = path_startswith(q, "../usr/share/zoneinfo/");
1021 if (!y)
1022 y = path_startswith(q, "/usr/share/zoneinfo/");
1023
1024 /* Already pointing to the right place? Then do nothing .. */
1025 if (y && streq(y, z))
1026 return 0;
1027 }
1028
1029 check = strjoina("/usr/share/zoneinfo/", z);
1030 check = prefix_roota(dest, check);
1031 if (laccess(check, F_OK) < 0) {
1032 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1033 return 0;
1034 }
1035
1036 r = unlink(where);
1037 if (r < 0 && errno != ENOENT) {
1038 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1039 return 0;
1040 }
1041
1042 what = strjoina("../usr/share/zoneinfo/", z);
1043 if (symlink(what, where) < 0) {
1044 log_error_errno(errno, "Failed to correct timezone of container: %m");
1045 return 0;
1046 }
1047
1048 r = userns_lchown(where, 0, 0);
1049 if (r < 0)
1050 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1051
1052 return 0;
1053 }
1054
1055 static int setup_resolv_conf(const char *dest) {
1056 const char *where = NULL;
1057 int r;
1058
1059 assert(dest);
1060
1061 if (arg_private_network)
1062 return 0;
1063
1064 /* Fix resolv.conf, if possible */
1065 where = prefix_roota(dest, "/etc/resolv.conf");
1066
1067 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1068 if (r < 0) {
1069 /* If the file already exists as symlink, let's
1070 * suppress the warning, under the assumption that
1071 * resolved or something similar runs inside and the
1072 * symlink points there.
1073 *
1074 * If the disk image is read-only, there's also no
1075 * point in complaining.
1076 */
1077 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1078 "Failed to copy /etc/resolv.conf to %s: %m", where);
1079 return 0;
1080 }
1081
1082 r = userns_lchown(where, 0, 0);
1083 if (r < 0)
1084 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1085
1086 return 0;
1087 }
1088
1089 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1090 assert(s);
1091
1092 snprintf(s, 37,
1093 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1094 SD_ID128_FORMAT_VAL(id));
1095
1096 return s;
1097 }
1098
1099 static int setup_boot_id(const char *dest) {
1100 const char *from, *to;
1101 sd_id128_t rnd = {};
1102 char as_uuid[37];
1103 int r;
1104
1105 if (arg_share_system)
1106 return 0;
1107
1108 /* Generate a new randomized boot ID, so that each boot-up of
1109 * the container gets a new one */
1110
1111 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1112 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1113
1114 r = sd_id128_randomize(&rnd);
1115 if (r < 0)
1116 return log_error_errno(r, "Failed to generate random boot id: %m");
1117
1118 id128_format_as_uuid(rnd, as_uuid);
1119
1120 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1121 if (r < 0)
1122 return log_error_errno(r, "Failed to write boot id: %m");
1123
1124 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1125 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1126 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1127 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1128
1129 unlink(from);
1130 return r;
1131 }
1132
1133 static int copy_devnodes(const char *dest) {
1134
1135 static const char devnodes[] =
1136 "null\0"
1137 "zero\0"
1138 "full\0"
1139 "random\0"
1140 "urandom\0"
1141 "tty\0"
1142 "net/tun\0";
1143
1144 const char *d;
1145 int r = 0;
1146 _cleanup_umask_ mode_t u;
1147
1148 assert(dest);
1149
1150 u = umask(0000);
1151
1152 /* Create /dev/net, so that we can create /dev/net/tun in it */
1153 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1154 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1155
1156 NULSTR_FOREACH(d, devnodes) {
1157 _cleanup_free_ char *from = NULL, *to = NULL;
1158 struct stat st;
1159
1160 from = strappend("/dev/", d);
1161 to = prefix_root(dest, from);
1162
1163 if (stat(from, &st) < 0) {
1164
1165 if (errno != ENOENT)
1166 return log_error_errno(errno, "Failed to stat %s: %m", from);
1167
1168 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1169
1170 log_error("%s is not a char or block device, cannot copy.", from);
1171 return -EIO;
1172
1173 } else {
1174 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1175 if (errno != EPERM)
1176 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1177
1178 /* Some systems abusively restrict mknod but
1179 * allow bind mounts. */
1180 r = touch(to);
1181 if (r < 0)
1182 return log_error_errno(r, "touch (%s) failed: %m", to);
1183 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1184 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1185 }
1186
1187 r = userns_lchown(to, 0, 0);
1188 if (r < 0)
1189 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1190 }
1191 }
1192
1193 return r;
1194 }
1195
1196 static int setup_pts(const char *dest) {
1197 _cleanup_free_ char *options = NULL;
1198 const char *p;
1199 int r;
1200
1201 #ifdef HAVE_SELINUX
1202 if (arg_selinux_apifs_context)
1203 (void) asprintf(&options,
1204 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1205 arg_uid_shift + TTY_GID,
1206 arg_selinux_apifs_context);
1207 else
1208 #endif
1209 (void) asprintf(&options,
1210 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1211 arg_uid_shift + TTY_GID);
1212
1213 if (!options)
1214 return log_oom();
1215
1216 /* Mount /dev/pts itself */
1217 p = prefix_roota(dest, "/dev/pts");
1218 if (mkdir(p, 0755) < 0)
1219 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1220 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1221 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1222 r = userns_lchown(p, 0, 0);
1223 if (r < 0)
1224 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1225
1226 /* Create /dev/ptmx symlink */
1227 p = prefix_roota(dest, "/dev/ptmx");
1228 if (symlink("pts/ptmx", p) < 0)
1229 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1230 r = userns_lchown(p, 0, 0);
1231 if (r < 0)
1232 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1233
1234 /* And fix /dev/pts/ptmx ownership */
1235 p = prefix_roota(dest, "/dev/pts/ptmx");
1236 r = userns_lchown(p, 0, 0);
1237 if (r < 0)
1238 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1239
1240 return 0;
1241 }
1242
1243 static int setup_dev_console(const char *dest, const char *console) {
1244 _cleanup_umask_ mode_t u;
1245 const char *to;
1246 int r;
1247
1248 assert(dest);
1249 assert(console);
1250
1251 u = umask(0000);
1252
1253 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1254 if (r < 0)
1255 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1256
1257 /* We need to bind mount the right tty to /dev/console since
1258 * ptys can only exist on pts file systems. To have something
1259 * to bind mount things on we create a empty regular file. */
1260
1261 to = prefix_roota(dest, "/dev/console");
1262 r = touch(to);
1263 if (r < 0)
1264 return log_error_errno(r, "touch() for /dev/console failed: %m");
1265
1266 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1267 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1268
1269 return 0;
1270 }
1271
1272 static int setup_kmsg(const char *dest, int kmsg_socket) {
1273 const char *from, *to;
1274 _cleanup_umask_ mode_t u;
1275 int fd, r;
1276
1277 assert(kmsg_socket >= 0);
1278
1279 u = umask(0000);
1280
1281 /* We create the kmsg FIFO as /run/kmsg, but immediately
1282 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1283 * on the reading side behave very similar to /proc/kmsg,
1284 * their writing side behaves differently from /dev/kmsg in
1285 * that writing blocks when nothing is reading. In order to
1286 * avoid any problems with containers deadlocking due to this
1287 * we simply make /dev/kmsg unavailable to the container. */
1288 from = prefix_roota(dest, "/run/kmsg");
1289 to = prefix_roota(dest, "/proc/kmsg");
1290
1291 if (mkfifo(from, 0600) < 0)
1292 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1293 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1294 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1295
1296 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1297 if (fd < 0)
1298 return log_error_errno(errno, "Failed to open fifo: %m");
1299
1300 /* Store away the fd in the socket, so that it stays open as
1301 * long as we run the child */
1302 r = send_one_fd(kmsg_socket, fd, 0);
1303 safe_close(fd);
1304
1305 if (r < 0)
1306 return log_error_errno(r, "Failed to send FIFO fd: %m");
1307
1308 /* And now make the FIFO unavailable as /run/kmsg... */
1309 (void) unlink(from);
1310
1311 return 0;
1312 }
1313
1314 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1315 union in_addr_union *exposed = userdata;
1316
1317 assert(rtnl);
1318 assert(m);
1319 assert(exposed);
1320
1321 expose_port_execute(rtnl, arg_expose_ports, exposed);
1322 return 0;
1323 }
1324
1325 static int setup_hostname(void) {
1326
1327 if (arg_share_system)
1328 return 0;
1329
1330 if (sethostname_idempotent(arg_machine) < 0)
1331 return -errno;
1332
1333 return 0;
1334 }
1335
1336 static int setup_journal(const char *directory) {
1337 sd_id128_t machine_id, this_id;
1338 _cleanup_free_ char *b = NULL, *d = NULL;
1339 const char *etc_machine_id, *p, *q;
1340 bool try;
1341 char *id;
1342 int r;
1343
1344 /* Don't link journals in ephemeral mode */
1345 if (arg_ephemeral)
1346 return 0;
1347
1348 if (arg_link_journal == LINK_NO)
1349 return 0;
1350
1351 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1352
1353 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1354
1355 r = read_one_line_file(etc_machine_id, &b);
1356 if (r == -ENOENT && try)
1357 return 0;
1358 else if (r < 0)
1359 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1360
1361 id = strstrip(b);
1362 if (isempty(id) && try)
1363 return 0;
1364
1365 /* Verify validity */
1366 r = sd_id128_from_string(id, &machine_id);
1367 if (r < 0)
1368 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1369
1370 r = sd_id128_get_machine(&this_id);
1371 if (r < 0)
1372 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1373
1374 if (sd_id128_equal(machine_id, this_id)) {
1375 log_full(try ? LOG_WARNING : LOG_ERR,
1376 "Host and machine ids are equal (%s): refusing to link journals", id);
1377 if (try)
1378 return 0;
1379 return -EEXIST;
1380 }
1381
1382 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1383 if (r < 0)
1384 return log_error_errno(r, "Failed to create /var: %m");
1385
1386 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1387 if (r < 0)
1388 return log_error_errno(r, "Failed to create /var/log: %m");
1389
1390 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1391 if (r < 0)
1392 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1393
1394 p = strjoina("/var/log/journal/", id);
1395 q = prefix_roota(directory, p);
1396
1397 if (path_is_mount_point(p, 0) > 0) {
1398 if (try)
1399 return 0;
1400
1401 log_error("%s: already a mount point, refusing to use for journal", p);
1402 return -EEXIST;
1403 }
1404
1405 if (path_is_mount_point(q, 0) > 0) {
1406 if (try)
1407 return 0;
1408
1409 log_error("%s: already a mount point, refusing to use for journal", q);
1410 return -EEXIST;
1411 }
1412
1413 r = readlink_and_make_absolute(p, &d);
1414 if (r >= 0) {
1415 if ((arg_link_journal == LINK_GUEST ||
1416 arg_link_journal == LINK_AUTO) &&
1417 path_equal(d, q)) {
1418
1419 r = userns_mkdir(directory, p, 0755, 0, 0);
1420 if (r < 0)
1421 log_warning_errno(r, "Failed to create directory %s: %m", q);
1422 return 0;
1423 }
1424
1425 if (unlink(p) < 0)
1426 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1427 } else if (r == -EINVAL) {
1428
1429 if (arg_link_journal == LINK_GUEST &&
1430 rmdir(p) < 0) {
1431
1432 if (errno == ENOTDIR) {
1433 log_error("%s already exists and is neither a symlink nor a directory", p);
1434 return r;
1435 } else
1436 return log_error_errno(errno, "Failed to remove %s: %m", p);
1437 }
1438 } else if (r != -ENOENT)
1439 return log_error_errno(r, "readlink(%s) failed: %m", p);
1440
1441 if (arg_link_journal == LINK_GUEST) {
1442
1443 if (symlink(q, p) < 0) {
1444 if (try) {
1445 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1446 return 0;
1447 } else
1448 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1449 }
1450
1451 r = userns_mkdir(directory, p, 0755, 0, 0);
1452 if (r < 0)
1453 log_warning_errno(r, "Failed to create directory %s: %m", q);
1454 return 0;
1455 }
1456
1457 if (arg_link_journal == LINK_HOST) {
1458 /* don't create parents here -- if the host doesn't have
1459 * permanent journal set up, don't force it here */
1460
1461 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
1462 if (try) {
1463 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1464 return 0;
1465 } else
1466 return log_error_errno(errno, "Failed to create %s: %m", p);
1467 }
1468
1469 } else if (access(p, F_OK) < 0)
1470 return 0;
1471
1472 if (dir_is_empty(q) == 0)
1473 log_warning("%s is not empty, proceeding anyway.", q);
1474
1475 r = userns_mkdir(directory, p, 0755, 0, 0);
1476 if (r < 0)
1477 return log_error_errno(r, "Failed to create %s: %m", q);
1478
1479 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1480 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1481
1482 return 0;
1483 }
1484
1485 static int drop_capabilities(void) {
1486 return capability_bounding_set_drop(arg_retain, false);
1487 }
1488
1489 static int reset_audit_loginuid(void) {
1490 _cleanup_free_ char *p = NULL;
1491 int r;
1492
1493 if (arg_share_system)
1494 return 0;
1495
1496 r = read_one_line_file("/proc/self/loginuid", &p);
1497 if (r == -ENOENT)
1498 return 0;
1499 if (r < 0)
1500 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1501
1502 /* Already reset? */
1503 if (streq(p, "4294967295"))
1504 return 0;
1505
1506 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1507 if (r < 0) {
1508 log_error_errno(r,
1509 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1510 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1511 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1512 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1513 "using systemd-nspawn. Sleeping for 5s... (%m)");
1514
1515 sleep(5);
1516 }
1517
1518 return 0;
1519 }
1520
/* Installs a seccomp filter for the container payload. It is default-allow,
 * with two kinds of exceptions:
 *
 *   - the system calls in the table below fail with EPERM, unless the
 *     capability listed next to them is part of arg_retain
 *   - the creation of audit netlink sockets fails with EAFNOSUPPORT
 *
 * Without HAVE_SECCOMP this is a no-op.
 *
 * Returns 0 on success (or if the filter could not be loaded with -EINVAL),
 * a negative errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } filter_table[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx seccomp;
        unsigned idx;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(filter_table); idx++) {
                bool retained;

                /* System calls guarded by a retained capability stay available */
                retained = !!(arg_retain & (1ULL << filter_table[idx].capability));
                if (retained)
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), filter_table[idx].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace audit
         * hookup fails when run inside of one. We do not care and simply
         * turn off the creation of audit sockets:
         * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r == -EINVAL) {
                /* Treated as "seccomp not available", which is not an error */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
1615
1616 static int setup_propagate(const char *root) {
1617 const char *p, *q;
1618 int r;
1619
1620 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1621 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1622 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1623 (void) mkdir_p(p, 0600);
1624
1625 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1626 if (r < 0)
1627 return log_error_errno(r, "Failed to create /run/systemd: %m");
1628
1629 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1630 if (r < 0)
1631 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1632
1633 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1634 if (r < 0)
1635 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1636
1637 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1638 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1639 return log_error_errno(errno, "Failed to install propagation bind mount.");
1640
1641 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1642 return log_error_errno(errno, "Failed to make propagation mount read-only");
1643
1644 return 0;
1645 }
1646
1647 static int setup_image(char **device_path, int *loop_nr) {
1648 struct loop_info64 info = {
1649 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1650 };
1651 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1652 _cleanup_free_ char* loopdev = NULL;
1653 struct stat st;
1654 int r, nr;
1655
1656 assert(device_path);
1657 assert(loop_nr);
1658 assert(arg_image);
1659
1660 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1661 if (fd < 0)
1662 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1663
1664 if (fstat(fd, &st) < 0)
1665 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1666
1667 if (S_ISBLK(st.st_mode)) {
1668 char *p;
1669
1670 p = strdup(arg_image);
1671 if (!p)
1672 return log_oom();
1673
1674 *device_path = p;
1675
1676 *loop_nr = -1;
1677
1678 r = fd;
1679 fd = -1;
1680
1681 return r;
1682 }
1683
1684 if (!S_ISREG(st.st_mode)) {
1685 log_error("%s is not a regular file or block device.", arg_image);
1686 return -EINVAL;
1687 }
1688
1689 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1690 if (control < 0)
1691 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1692
1693 nr = ioctl(control, LOOP_CTL_GET_FREE);
1694 if (nr < 0)
1695 return log_error_errno(errno, "Failed to allocate loop device: %m");
1696
1697 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1698 return log_oom();
1699
1700 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1701 if (loop < 0)
1702 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1703
1704 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1705 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1706
1707 if (arg_read_only)
1708 info.lo_flags |= LO_FLAGS_READ_ONLY;
1709
1710 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1711 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1712
1713 *device_path = loopdev;
1714 loopdev = NULL;
1715
1716 *loop_nr = nr;
1717
1718 r = loop;
1719 loop = -1;
1720
1721 return r;
1722 }
1723
/* Explanation of the supported disk image layouts. It is appended to the
 * error messages logged when no usable partition can be found in a disk
 * image. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a "      \
        "single MBR partition of\n"                                     \
        "type 0x83 that is marked bootable, or a single GPT "           \
        "partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1730
/* Analyzes the partition table of the block device referred to by fd and
 * picks the partitions to use for the root, /home and /srv file systems.
 *
 * GPT images: partitions are selected by their type UUID. If several
 * partitions of the same type exist the one with the lowest partition
 * number wins. Partitions flagged GPT_FLAG_NO_AUTO are ignored. If there is
 * neither a native nor a secondary root partition, a single generic Linux
 * partition is accepted as root.
 *
 * MBR images: exactly one partition of type 0x83 that is marked bootable
 * is accepted as (generic) root.
 *
 * Finding more than one generic candidate is an error, as there is no way
 * to tell which one is meant.
 *
 * On success 0 is returned and:
 *   *root_device / *root_device_rw are always set
 *   *secondary tells whether the secondary architecture's root was chosen
 *   *home_device / *home_device_rw and *srv_device / *srv_device_rw are
 *   set only if such a partition was found, and are left untouched
 *   otherwise
 * The device strings are newly allocated, and owned by the caller.
 *
 * On failure a negative errno-style value is returned; -EOPNOTSUPP if
 * compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* A failure that leaves errno unset is taken as OOM */
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result (-2), or nothing detected (1) */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel lists as many partitions as blkid found,
         * giving up after 10 attempts. On leaving the loop via "break", e
         * refers to the enumeration that matched. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The kernel's
                 * list is expected to have one entry more, presumably
                 * because it includes the whole-disk device itself. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        /* Classify each partition the kernel knows about */
        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        /* For each type below, a candidate is only replaced
                         * by one with a lower partition number */
                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* This has to be remembered in "generic",
                                 * not in "root": otherwise the check above
                                 * never triggers, and with multiple
                                 * bootable partitions the last one would
                                 * be picked silently. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Choose the root partition, in order of preference: native root,
         * secondary root, generic */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2100
/* Mounts the block device "what" on "where", or on the subdirectory
 * "directory" of "where" if directory is non-NULL. The file system type is
 * determined by probing the device with blkid.
 *
 * The mount is read-only if rw is false or arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style value on failure:
 * -EINVAL if the file system type cannot be determined, -EOPNOTSUPP for
 * LUKS devices or when compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns -2 for an ambivalent result
                 * and 1 if nothing was detected: in both cases the type is
                 * simply unknown. A plain error (-1) is handled below, so
                 * that its errno gets reported. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2162
2163 static int mount_devices(
2164 const char *where,
2165 const char *root_device, bool root_device_rw,
2166 const char *home_device, bool home_device_rw,
2167 const char *srv_device, bool srv_device_rw) {
2168 int r;
2169
2170 assert(where);
2171
2172 if (root_device) {
2173 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2174 if (r < 0)
2175 return log_error_errno(r, "Failed to mount root directory: %m");
2176 }
2177
2178 if (home_device) {
2179 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2180 if (r < 0)
2181 return log_error_errno(r, "Failed to mount home directory: %m");
2182 }
2183
2184 if (srv_device) {
2185 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2186 if (r < 0)
2187 return log_error_errno(r, "Failed to mount server data directory: %m");
2188 }
2189
2190 return 0;
2191 }
2192
2193 static void loop_remove(int nr, int *image_fd) {
2194 _cleanup_close_ int control = -1;
2195 int r;
2196
2197 if (nr < 0)
2198 return;
2199
2200 if (image_fd && *image_fd >= 0) {
2201 r = ioctl(*image_fd, LOOP_CLR_FD);
2202 if (r < 0)
2203 log_debug_errno(errno, "Failed to close loop image: %m");
2204 *image_fd = safe_close(*image_fd);
2205 }
2206
2207 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2208 if (control < 0) {
2209 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2210 return;
2211 }
2212
2213 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2214 if (r < 0)
2215 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2216 }
2217
2218 /*
2219 * Return values:
2220 * < 0 : wait_for_terminate() failed to get the state of the
2221 * container, the container was terminated by a signal, or
2222 * failed for an unknown reason. No change is made to the
2223 * container argument.
2224 * > 0 : The program executed in the container terminated with an
2225 * error. The exit code of the program executed in the
2226 * container is returned. The container argument has been set
2227 * to CONTAINER_TERMINATED.
2228 * 0 : The container is being rebooted, has been shut down or exited
2229 * successfully. The container argument has been set to either
2230 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2231 *
2232 * That is, success is indicated by a return value of zero, and an
2233 * error is indicated by a non-zero value.
2234 */
2235 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2236 siginfo_t status;
2237 int r;
2238
2239 r = wait_for_terminate(pid, &status);
2240 if (r < 0)
2241 return log_warning_errno(r, "Failed to wait for container: %m");
2242
2243 switch (status.si_code) {
2244
2245 case CLD_EXITED:
2246 if (status.si_status == 0) {
2247 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2248
2249 } else
2250 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2251
2252 *container = CONTAINER_TERMINATED;
2253 return status.si_status;
2254
2255 case CLD_KILLED:
2256 if (status.si_status == SIGINT) {
2257
2258 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2259 *container = CONTAINER_TERMINATED;
2260 return 0;
2261
2262 } else if (status.si_status == SIGHUP) {
2263
2264 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2265 *container = CONTAINER_REBOOTED;
2266 return 0;
2267 }
2268
2269 /* CLD_KILLED fallthrough */
2270
2271 case CLD_DUMPED:
2272 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2273 return -EIO;
2274
2275 default:
2276 log_error("Container %s failed due to unknown reason.", arg_machine);
2277 return -EIO;
2278 }
2279
2280 return r;
2281 }
2282
2283 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2284 pid_t pid;
2285
2286 pid = PTR_TO_PID(userdata);
2287 if (pid > 0) {
2288 if (kill(pid, arg_kill_signal) >= 0) {
2289 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2290 sd_event_source_set_userdata(s, NULL);
2291 return 0;
2292 }
2293 }
2294
2295 sd_event_exit(sd_event_source_get_event(s), 0);
2296 return 0;
2297 }
2298
2299 static int determine_names(void) {
2300 int r;
2301
2302 if (arg_template && !arg_directory && arg_machine) {
2303
2304 /* If --template= was specified then we should not
2305 * search for a machine, but instead create a new one
2306 * in /var/lib/machine. */
2307
2308 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2309 if (!arg_directory)
2310 return log_oom();
2311 }
2312
2313 if (!arg_image && !arg_directory) {
2314 if (arg_machine) {
2315 _cleanup_(image_unrefp) Image *i = NULL;
2316
2317 r = image_find(arg_machine, &i);
2318 if (r < 0)
2319 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2320 else if (r == 0) {
2321 log_error("No image for machine '%s': %m", arg_machine);
2322 return -ENOENT;
2323 }
2324
2325 if (i->type == IMAGE_RAW)
2326 r = free_and_strdup(&arg_image, i->path);
2327 else
2328 r = free_and_strdup(&arg_directory, i->path);
2329 if (r < 0)
2330 return log_error_errno(r, "Invalid image directory: %m");
2331
2332 if (!arg_ephemeral)
2333 arg_read_only = arg_read_only || i->read_only;
2334 } else
2335 arg_directory = get_current_dir_name();
2336
2337 if (!arg_directory && !arg_machine) {
2338 log_error("Failed to determine path, please use -D or -i.");
2339 return -EINVAL;
2340 }
2341 }
2342
2343 if (!arg_machine) {
2344 if (arg_directory && path_equal(arg_directory, "/"))
2345 arg_machine = gethostname_malloc();
2346 else
2347 arg_machine = strdup(basename(arg_image ?: arg_directory));
2348
2349 if (!arg_machine)
2350 return log_oom();
2351
2352 hostname_cleanup(arg_machine);
2353 if (!machine_name_is_valid(arg_machine)) {
2354 log_error("Failed to determine machine name automatically, please use -M.");
2355 return -EINVAL;
2356 }
2357
2358 if (arg_ephemeral) {
2359 char *b;
2360
2361 /* Add a random suffix when this is an
2362 * ephemeral machine, so that we can run many
2363 * instances at once without manually having
2364 * to specify -M each time. */
2365
2366 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2367 return log_oom();
2368
2369 free(arg_machine);
2370 arg_machine = b;
2371 }
2372 }
2373
2374 return 0;
2375 }
2376
2377 static int determine_uid_shift(const char *directory) {
2378 int r;
2379
2380 if (!arg_userns) {
2381 arg_uid_shift = 0;
2382 return 0;
2383 }
2384
2385 if (arg_uid_shift == UID_INVALID) {
2386 struct stat st;
2387
2388 r = stat(directory, &st);
2389 if (r < 0)
2390 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2391
2392 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2393
2394 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2395 log_error("UID and GID base of %s don't match.", directory);
2396 return -EINVAL;
2397 }
2398
2399 arg_uid_range = UINT32_C(0x10000);
2400 }
2401
2402 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2403 log_error("UID base too high for UID range.");
2404 return -EINVAL;
2405 }
2406
2407 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2408 return 0;
2409 }
2410
2411 static int inner_child(
2412 Barrier *barrier,
2413 const char *directory,
2414 bool secondary,
2415 int kmsg_socket,
2416 int rtnl_socket,
2417 FDSet *fds) {
2418
2419 _cleanup_free_ char *home = NULL;
2420 unsigned n_env = 1;
2421 const char *envp[] = {
2422 "PATH=" DEFAULT_PATH_SPLIT_USR,
2423 NULL, /* container */
2424 NULL, /* TERM */
2425 NULL, /* HOME */
2426 NULL, /* USER */
2427 NULL, /* LOGNAME */
2428 NULL, /* container_uuid */
2429 NULL, /* LISTEN_FDS */
2430 NULL, /* LISTEN_PID */
2431 NULL
2432 };
2433
2434 _cleanup_strv_free_ char **env_use = NULL;
2435 int r;
2436
2437 assert(barrier);
2438 assert(directory);
2439 assert(kmsg_socket >= 0);
2440
2441 cg_unified_flush();
2442
2443 if (arg_userns) {
2444 /* Tell the parent, that it now can write the UID map. */
2445 (void) barrier_place(barrier); /* #1 */
2446
2447 /* Wait until the parent wrote the UID map */
2448 if (!barrier_place_and_sync(barrier)) { /* #2 */
2449 log_error("Parent died too early");
2450 return -ESRCH;
2451 }
2452 }
2453
2454 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2455 if (r < 0)
2456 return r;
2457
2458 r = mount_sysfs(NULL);
2459 if (r < 0)
2460 return r;
2461
2462 /* Wait until we are cgroup-ified, so that we
2463 * can mount the right cgroup path writable */
2464 if (!barrier_place_and_sync(barrier)) { /* #3 */
2465 log_error("Parent died too early");
2466 return -ESRCH;
2467 }
2468
2469 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2470 if (r < 0)
2471 return r;
2472
2473 r = reset_uid_gid();
2474 if (r < 0)
2475 return log_error_errno(r, "Couldn't become new root: %m");
2476
2477 r = setup_boot_id(NULL);
2478 if (r < 0)
2479 return r;
2480
2481 r = setup_kmsg(NULL, kmsg_socket);
2482 if (r < 0)
2483 return r;
2484 kmsg_socket = safe_close(kmsg_socket);
2485
2486 umask(0022);
2487
2488 if (setsid() < 0)
2489 return log_error_errno(errno, "setsid() failed: %m");
2490
2491 if (arg_private_network)
2492 loopback_setup();
2493
2494 if (arg_expose_ports) {
2495 r = expose_port_send_rtnl(rtnl_socket);
2496 if (r < 0)
2497 return r;
2498 rtnl_socket = safe_close(rtnl_socket);
2499 }
2500
2501 r = drop_capabilities();
2502 if (r < 0)
2503 return log_error_errno(r, "drop_capabilities() failed: %m");
2504
2505 setup_hostname();
2506
2507 if (arg_personality != PERSONALITY_INVALID) {
2508 if (personality(arg_personality) < 0)
2509 return log_error_errno(errno, "personality() failed: %m");
2510 } else if (secondary) {
2511 if (personality(PER_LINUX32) < 0)
2512 return log_error_errno(errno, "personality() failed: %m");
2513 }
2514
2515 #ifdef HAVE_SELINUX
2516 if (arg_selinux_context)
2517 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2518 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2519 #endif
2520
2521 r = change_uid_gid(arg_user, &home);
2522 if (r < 0)
2523 return r;
2524
2525 /* LXC sets container=lxc, so follow the scheme here */
2526 envp[n_env++] = strjoina("container=", arg_container_service_name);
2527
2528 envp[n_env] = strv_find_prefix(environ, "TERM=");
2529 if (envp[n_env])
2530 n_env ++;
2531
2532 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2533 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2534 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2535 return log_oom();
2536
2537 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2538 char as_uuid[37];
2539
2540 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2541 return log_oom();
2542 }
2543
2544 if (fdset_size(fds) > 0) {
2545 r = fdset_cloexec(fds, false);
2546 if (r < 0)
2547 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2548
2549 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2550 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2551 return log_oom();
2552 }
2553
2554 env_use = strv_env_merge(2, envp, arg_setenv);
2555 if (!env_use)
2556 return log_oom();
2557
2558 /* Let the parent know that we are ready and
2559 * wait until the parent is ready with the
2560 * setup, too... */
2561 if (!barrier_place_and_sync(barrier)) { /* #4 */
2562 log_error("Parent died too early");
2563 return -ESRCH;
2564 }
2565
2566 /* Now, explicitly close the log, so that we
2567 * then can close all remaining fds. Closing
2568 * the log explicitly first has the benefit
2569 * that the logging subsystem knows about it,
2570 * and is thus ready to be reopened should we
2571 * need it again. Note that the other fds
2572 * closed here are at least the locking and
2573 * barrier fds. */
2574 log_close();
2575 (void) fdset_close_others(fds);
2576
2577 if (arg_boot) {
2578 char **a;
2579 size_t m;
2580
2581 /* Automatically search for the init system */
2582
2583 m = 1 + strv_length(arg_parameters);
2584 a = newa(char*, m + 1);
2585 if (strv_isempty(arg_parameters))
2586 a[1] = NULL;
2587 else
2588 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2589
2590 a[0] = (char*) "/usr/lib/systemd/systemd";
2591 execve(a[0], a, env_use);
2592
2593 a[0] = (char*) "/lib/systemd/systemd";
2594 execve(a[0], a, env_use);
2595
2596 a[0] = (char*) "/sbin/init";
2597 execve(a[0], a, env_use);
2598 } else if (!strv_isempty(arg_parameters))
2599 execvpe(arg_parameters[0], arg_parameters, env_use);
2600 else {
2601 chdir(home ?: "/root");
2602 execle("/bin/bash", "-bash", NULL, env_use);
2603 execle("/bin/sh", "-sh", NULL, env_use);
2604 }
2605
2606 r = -errno;
2607 (void) log_open();
2608 return log_error_errno(r, "execv() failed: %m");
2609 }
2610
2611 static int outer_child(
2612 Barrier *barrier,
2613 const char *directory,
2614 const char *console,
2615 const char *root_device, bool root_device_rw,
2616 const char *home_device, bool home_device_rw,
2617 const char *srv_device, bool srv_device_rw,
2618 bool interactive,
2619 bool secondary,
2620 int pid_socket,
2621 int kmsg_socket,
2622 int rtnl_socket,
2623 int uid_shift_socket,
2624 FDSet *fds) {
2625
2626 pid_t pid;
2627 ssize_t l;
2628 int r;
2629
2630 assert(barrier);
2631 assert(directory);
2632 assert(console);
2633 assert(pid_socket >= 0);
2634 assert(kmsg_socket >= 0);
2635
2636 cg_unified_flush();
2637
2638 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2639 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2640
2641 if (interactive) {
2642 close_nointr(STDIN_FILENO);
2643 close_nointr(STDOUT_FILENO);
2644 close_nointr(STDERR_FILENO);
2645
2646 r = open_terminal(console, O_RDWR);
2647 if (r != STDIN_FILENO) {
2648 if (r >= 0) {
2649 safe_close(r);
2650 r = -EINVAL;
2651 }
2652
2653 return log_error_errno(r, "Failed to open console: %m");
2654 }
2655
2656 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2657 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2658 return log_error_errno(errno, "Failed to duplicate console: %m");
2659 }
2660
2661 r = reset_audit_loginuid();
2662 if (r < 0)
2663 return r;
2664
2665 /* Mark everything as slave, so that we still
2666 * receive mounts from the real root, but don't
2667 * propagate mounts to the real root. */
2668 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2669 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2670
2671 r = mount_devices(directory,
2672 root_device, root_device_rw,
2673 home_device, home_device_rw,
2674 srv_device, srv_device_rw);
2675 if (r < 0)
2676 return r;
2677
2678 r = determine_uid_shift(directory);
2679 if (r < 0)
2680 return r;
2681
2682 if (arg_userns) {
2683 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2684 if (l < 0)
2685 return log_error_errno(errno, "Failed to send UID shift: %m");
2686 if (l != sizeof(arg_uid_shift)) {
2687 log_error("Short write while sending UID shift.");
2688 return -EIO;
2689 }
2690 }
2691
2692 /* Turn directory into bind mount */
2693 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2694 return log_error_errno(errno, "Failed to make bind mount: %m");
2695
2696 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2697 if (r < 0)
2698 return r;
2699
2700 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2701 if (r < 0)
2702 return r;
2703
2704 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2705 if (r < 0)
2706 return r;
2707
2708 if (arg_read_only) {
2709 r = bind_remount_recursive(directory, true);
2710 if (r < 0)
2711 return log_error_errno(r, "Failed to make tree read-only: %m");
2712 }
2713
2714 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2715 if (r < 0)
2716 return r;
2717
2718 r = copy_devnodes(directory);
2719 if (r < 0)
2720 return r;
2721
2722 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2723
2724 r = setup_pts(directory);
2725 if (r < 0)
2726 return r;
2727
2728 r = setup_propagate(directory);
2729 if (r < 0)
2730 return r;
2731
2732 r = setup_dev_console(directory, console);
2733 if (r < 0)
2734 return r;
2735
2736 r = setup_seccomp();
2737 if (r < 0)
2738 return r;
2739
2740 r = setup_timezone(directory);
2741 if (r < 0)
2742 return r;
2743
2744 r = setup_resolv_conf(directory);
2745 if (r < 0)
2746 return r;
2747
2748 r = setup_journal(directory);
2749 if (r < 0)
2750 return r;
2751
2752 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2753 if (r < 0)
2754 return r;
2755
2756 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2757 if (r < 0)
2758 return r;
2759
2760 r = mount_move_root(directory);
2761 if (r < 0)
2762 return log_error_errno(r, "Failed to move root directory: %m");
2763
2764 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2765 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2766 (arg_private_network ? CLONE_NEWNET : 0) |
2767 (arg_userns ? CLONE_NEWUSER : 0),
2768 NULL);
2769 if (pid < 0)
2770 return log_error_errno(errno, "Failed to fork inner child: %m");
2771 if (pid == 0) {
2772 pid_socket = safe_close(pid_socket);
2773 uid_shift_socket = safe_close(uid_shift_socket);
2774
2775 /* The inner child has all namespaces that are
2776 * requested, so that we all are owned by the user if
2777 * user namespaces are turned on. */
2778
2779 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2780 if (r < 0)
2781 _exit(EXIT_FAILURE);
2782
2783 _exit(EXIT_SUCCESS);
2784 }
2785
2786 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2787 if (l < 0)
2788 return log_error_errno(errno, "Failed to send PID: %m");
2789 if (l != sizeof(pid)) {
2790 log_error("Short write while sending PID.");
2791 return -EIO;
2792 }
2793
2794 pid_socket = safe_close(pid_socket);
2795 kmsg_socket = safe_close(kmsg_socket);
2796 rtnl_socket = safe_close(rtnl_socket);
2797
2798 return 0;
2799 }
2800
2801 static int setup_uid_map(pid_t pid) {
2802 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2803 int r;
2804
2805 assert(pid > 1);
2806
2807 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2808 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2809 r = write_string_file(uid_map, line, 0);
2810 if (r < 0)
2811 return log_error_errno(r, "Failed to write UID map: %m");
2812
2813 /* We always assign the same UID and GID ranges */
2814 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2815 r = write_string_file(uid_map, line, 0);
2816 if (r < 0)
2817 return log_error_errno(r, "Failed to write GID map: %m");
2818
2819 return 0;
2820 }
2821
2822 static int load_settings(void) {
2823 _cleanup_(settings_freep) Settings *settings = NULL;
2824 _cleanup_fclose_ FILE *f = NULL;
2825 _cleanup_free_ char *p = NULL;
2826 const char *fn, *i;
2827 int r;
2828
2829 /* If all settings are masked, there's no point in looking for
2830 * the settings file */
2831 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2832 return 0;
2833
2834 fn = strjoina(arg_machine, ".nspawn");
2835
2836 /* We first look in the admin's directories in /etc and /run */
2837 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2838 _cleanup_free_ char *j = NULL;
2839
2840 j = strjoin(i, "/", fn, NULL);
2841 if (!j)
2842 return log_oom();
2843
2844 f = fopen(j, "re");
2845 if (f) {
2846 p = j;
2847 j = NULL;
2848
2849 /* By default, we trust configuration from /etc and /run */
2850 if (arg_settings_trusted < 0)
2851 arg_settings_trusted = true;
2852
2853 break;
2854 }
2855
2856 if (errno != ENOENT)
2857 return log_error_errno(errno, "Failed to open %s: %m", j);
2858 }
2859
2860 if (!f) {
2861 /* After that, let's look for a file next to the
2862 * actual image we shall boot. */
2863
2864 if (arg_image) {
2865 p = file_in_same_dir(arg_image, fn);
2866 if (!p)
2867 return log_oom();
2868 } else if (arg_directory) {
2869 p = file_in_same_dir(arg_directory, fn);
2870 if (!p)
2871 return log_oom();
2872 }
2873
2874 if (p) {
2875 f = fopen(p, "re");
2876 if (!f && errno != ENOENT)
2877 return log_error_errno(errno, "Failed to open %s: %m", p);
2878
2879 /* By default, we do not trust configuration from /var/lib/machines */
2880 if (arg_settings_trusted < 0)
2881 arg_settings_trusted = false;
2882 }
2883 }
2884
2885 if (!f)
2886 return 0;
2887
2888 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2889
2890 r = settings_load(f, p, &settings);
2891 if (r < 0)
2892 return r;
2893
2894 /* Copy over bits from the settings, unless they have been
2895 * explicitly masked by command line switches. */
2896
2897 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2898 settings->boot >= 0) {
2899 arg_boot = settings->boot;
2900
2901 strv_free(arg_parameters);
2902 arg_parameters = settings->parameters;
2903 settings->parameters = NULL;
2904 }
2905
2906 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2907 settings->environment) {
2908 strv_free(arg_setenv);
2909 arg_setenv = settings->environment;
2910 settings->environment = NULL;
2911 }
2912
2913 if ((arg_settings_mask & SETTING_USER) == 0 &&
2914 settings->user) {
2915 free(arg_user);
2916 arg_user = settings->user;
2917 settings->user = NULL;
2918 }
2919
2920 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2921 uint64_t plus;
2922
2923 plus = settings->capability;
2924 if (settings_private_network(settings))
2925 plus |= (1ULL << CAP_NET_ADMIN);
2926
2927 if (!arg_settings_trusted && plus != 0) {
2928 if (settings->capability != 0)
2929 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2930 } else
2931 arg_retain |= plus;
2932
2933 arg_retain &= ~settings->drop_capability;
2934 }
2935
2936 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2937 settings->kill_signal > 0)
2938 arg_kill_signal = settings->kill_signal;
2939
2940 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2941 settings->personality != PERSONALITY_INVALID)
2942 arg_personality = settings->personality;
2943
2944 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2945 !sd_id128_is_null(settings->machine_id)) {
2946
2947 if (!arg_settings_trusted)
2948 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2949 else
2950 arg_uuid = settings->machine_id;
2951 }
2952
2953 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2954 settings->read_only >= 0)
2955 arg_read_only = settings->read_only;
2956
2957 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2958 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2959 arg_volatile_mode = settings->volatile_mode;
2960
2961 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2962 settings->n_custom_mounts > 0) {
2963
2964 if (!arg_settings_trusted)
2965 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2966 else {
2967 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2968 arg_custom_mounts = settings->custom_mounts;
2969 arg_n_custom_mounts = settings->n_custom_mounts;
2970
2971 settings->custom_mounts = NULL;
2972 settings->n_custom_mounts = 0;
2973 }
2974 }
2975
2976 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2977 (settings->private_network >= 0 ||
2978 settings->network_veth >= 0 ||
2979 settings->network_bridge ||
2980 settings->network_interfaces ||
2981 settings->network_macvlan ||
2982 settings->network_ipvlan ||
2983 settings->network_veth_extra)) {
2984
2985 if (!arg_settings_trusted)
2986 log_warning("Ignoring network settings, file %s is not trusted.", p);
2987 else {
2988 arg_network_veth = settings_network_veth(settings);
2989 arg_private_network = settings_private_network(settings);
2990
2991 strv_free(arg_network_interfaces);
2992 arg_network_interfaces = settings->network_interfaces;
2993 settings->network_interfaces = NULL;
2994
2995 strv_free(arg_network_macvlan);
2996 arg_network_macvlan = settings->network_macvlan;
2997 settings->network_macvlan = NULL;
2998
2999 strv_free(arg_network_ipvlan);
3000 arg_network_ipvlan = settings->network_ipvlan;
3001 settings->network_ipvlan = NULL;
3002
3003 strv_free(arg_network_veth_extra);
3004 arg_network_veth_extra = settings->network_veth_extra;
3005 settings->network_veth_extra = NULL;
3006
3007 free(arg_network_bridge);
3008 arg_network_bridge = settings->network_bridge;
3009 settings->network_bridge = NULL;
3010 }
3011 }
3012
3013 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3014 settings->expose_ports) {
3015
3016 if (!arg_settings_trusted)
3017 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3018 else {
3019 expose_port_free_all(arg_expose_ports);
3020 arg_expose_ports = settings->expose_ports;
3021 settings->expose_ports = NULL;
3022 }
3023 }
3024
3025 return 0;
3026 }
3027
3028 int main(int argc, char *argv[]) {
3029
3030 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3031 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3032 _cleanup_close_ int master = -1, image_fd = -1;
3033 _cleanup_fdset_free_ FDSet *fds = NULL;
3034 int r, n_fd_passed, loop_nr = -1;
3035 char veth_name[IFNAMSIZ];
3036 bool secondary = false, remove_subvol = false;
3037 sigset_t mask_chld;
3038 pid_t pid = 0;
3039 int ret = EXIT_SUCCESS;
3040 union in_addr_union exposed = {};
3041 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3042 bool interactive;
3043
3044 log_parse_environment();
3045 log_open();
3046
3047 r = parse_argv(argc, argv);
3048 if (r <= 0)
3049 goto finish;
3050
3051 if (geteuid() != 0) {
3052 log_error("Need to be root.");
3053 r = -EPERM;
3054 goto finish;
3055 }
3056 r = determine_names();
3057 if (r < 0)
3058 goto finish;
3059
3060 r = load_settings();
3061 if (r < 0)
3062 goto finish;
3063
3064 r = verify_arguments();
3065 if (r < 0)
3066 goto finish;
3067
3068 n_fd_passed = sd_listen_fds(false);
3069 if (n_fd_passed > 0) {
3070 r = fdset_new_listen_fds(&fds, false);
3071 if (r < 0) {
3072 log_error_errno(r, "Failed to collect file descriptors: %m");
3073 goto finish;
3074 }
3075 }
3076
3077 if (arg_directory) {
3078 assert(!arg_image);
3079
3080 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3081 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3082 r = -EINVAL;
3083 goto finish;
3084 }
3085
3086 if (arg_ephemeral) {
3087 _cleanup_free_ char *np = NULL;
3088
3089 /* If the specified path is a mount point we
3090 * generate the new snapshot immediately
3091 * inside it under a random name. However if
3092 * the specified is not a mount point we
3093 * create the new snapshot in the parent
3094 * directory, just next to it. */
3095 r = path_is_mount_point(arg_directory, 0);
3096 if (r < 0) {
3097 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3098 goto finish;
3099 }
3100 if (r > 0)
3101 r = tempfn_random_child(arg_directory, "machine.", &np);
3102 else
3103 r = tempfn_random(arg_directory, "machine.", &np);
3104 if (r < 0) {
3105 log_error_errno(r, "Failed to generate name for snapshot: %m");
3106 goto finish;
3107 }
3108
3109 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3110 if (r < 0) {
3111 log_error_errno(r, "Failed to lock %s: %m", np);
3112 goto finish;
3113 }
3114
3115 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3116 if (r < 0) {
3117 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3118 goto finish;
3119 }
3120
3121 free(arg_directory);
3122 arg_directory = np;
3123 np = NULL;
3124
3125 remove_subvol = true;
3126
3127 } else {
3128 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3129 if (r == -EBUSY) {
3130 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3131 goto finish;
3132 }
3133 if (r < 0) {
3134 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3135 return r;
3136 }
3137
3138 if (arg_template) {
3139 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3140 if (r == -EEXIST) {
3141 if (!arg_quiet)
3142 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3143 } else if (r < 0) {
3144 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3145 goto finish;
3146 } else {
3147 if (!arg_quiet)
3148 log_info("Populated %s from template %s.", arg_directory, arg_template);
3149 }
3150 }
3151 }
3152
3153 if (arg_boot) {
3154 if (path_is_os_tree(arg_directory) <= 0) {
3155 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3156 r = -EINVAL;
3157 goto finish;
3158 }
3159 } else {
3160 const char *p;
3161
3162 p = strjoina(arg_directory, "/usr/");
3163 if (laccess(p, F_OK) < 0) {
3164 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3165 r = -EINVAL;
3166 goto finish;
3167 }
3168 }
3169
3170 } else {
3171 char template[] = "/tmp/nspawn-root-XXXXXX";
3172
3173 assert(arg_image);
3174 assert(!arg_template);
3175
3176 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3177 if (r == -EBUSY) {
3178 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3179 goto finish;
3180 }
3181 if (r < 0) {
3182 r = log_error_errno(r, "Failed to create image lock: %m");
3183 goto finish;
3184 }
3185
3186 if (!mkdtemp(template)) {
3187 log_error_errno(errno, "Failed to create temporary directory: %m");
3188 r = -errno;
3189 goto finish;
3190 }
3191
3192 arg_directory = strdup(template);
3193 if (!arg_directory) {
3194 r = log_oom();
3195 goto finish;
3196 }
3197
3198 image_fd = setup_image(&device_path, &loop_nr);
3199 if (image_fd < 0) {
3200 r = image_fd;
3201 goto finish;
3202 }
3203
3204 r = dissect_image(image_fd,
3205 &root_device, &root_device_rw,
3206 &home_device, &home_device_rw,
3207 &srv_device, &srv_device_rw,
3208 &secondary);
3209 if (r < 0)
3210 goto finish;
3211 }
3212
3213 r = custom_mounts_prepare();
3214 if (r < 0)
3215 goto finish;
3216
3217 interactive =
3218 isatty(STDIN_FILENO) > 0 &&
3219 isatty(STDOUT_FILENO) > 0;
3220
3221 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3222 if (master < 0) {
3223 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3224 goto finish;
3225 }
3226
3227 r = ptsname_malloc(master, &console);
3228 if (r < 0) {
3229 r = log_error_errno(r, "Failed to determine tty name: %m");
3230 goto finish;
3231 }
3232
3233 if (unlockpt(master) < 0) {
3234 r = log_error_errno(errno, "Failed to unlock tty: %m");
3235 goto finish;
3236 }
3237
3238 if (!arg_quiet)
3239 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3240 arg_machine, arg_image ?: arg_directory);
3241
3242 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3243
3244 assert_se(sigemptyset(&mask_chld) == 0);
3245 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3246
3247 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3248 r = log_error_errno(errno, "Failed to become subreaper: %m");
3249 goto finish;
3250 }
3251
3252 for (;;) {
3253 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
3254 ContainerStatus container_status;
3255 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3256 static const struct sigaction sa = {
3257 .sa_handler = nop_signal_handler,
3258 .sa_flags = SA_NOCLDSTOP,
3259 };
3260 int ifi = 0;
3261 ssize_t l;
3262 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3263 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3264 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3265 char last_char = 0;
3266
3267 r = barrier_create(&barrier);
3268 if (r < 0) {
3269 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3270 goto finish;
3271 }
3272
3273 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3274 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3275 goto finish;
3276 }
3277
3278 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3279 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3280 goto finish;
3281 }
3282
3283 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3284 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3285 goto finish;
3286 }
3287
3288 if (arg_userns)
3289 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3290 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3291 goto finish;
3292 }
3293
3294 /* Child can be killed before execv(), so handle SIGCHLD
3295 * in order to interrupt parent's blocking calls and
3296 * give it a chance to call wait() and terminate. */
3297 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3298 if (r < 0) {
3299 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3300 goto finish;
3301 }
3302
3303 r = sigaction(SIGCHLD, &sa, NULL);
3304 if (r < 0) {
3305 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3306 goto finish;
3307 }
3308
3309 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3310 if (pid < 0) {
3311 if (errno == EINVAL)
3312 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3313 else
3314 r = log_error_errno(errno, "clone() failed: %m");
3315
3316 goto finish;
3317 }
3318
3319 if (pid == 0) {
3320 /* The outer child only has a file system namespace. */
3321 barrier_set_role(&barrier, BARRIER_CHILD);
3322
3323 master = safe_close(master);
3324
3325 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3326 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3327 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3328 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3329
3330 (void) reset_all_signal_handlers();
3331 (void) reset_signal_mask();
3332
3333 r = outer_child(&barrier,
3334 arg_directory,
3335 console,
3336 root_device, root_device_rw,
3337 home_device, home_device_rw,
3338 srv_device, srv_device_rw,
3339 interactive,
3340 secondary,
3341 pid_socket_pair[1],
3342 kmsg_socket_pair[1],
3343 rtnl_socket_pair[1],
3344 uid_shift_socket_pair[1],
3345 fds);
3346 if (r < 0)
3347 _exit(EXIT_FAILURE);
3348
3349 _exit(EXIT_SUCCESS);
3350 }
3351
3352 barrier_set_role(&barrier, BARRIER_PARENT);
3353
3354 fds = fdset_free(fds);
3355
3356 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3357 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3358 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3359 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3360
3361 /* Wait for the outer child. */
3362 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3363 if (r < 0)
3364 goto finish;
3365 if (r != 0) {
3366 r = -EIO;
3367 goto finish;
3368 }
3369 pid = 0;
3370
3371 /* And now retrieve the PID of the inner child. */
3372 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3373 if (l < 0) {
3374 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3375 goto finish;
3376 }
3377 if (l != sizeof(pid)) {
3378 log_error("Short read while reading inner child PID.");
3379 r = EIO;
3380 goto finish;
3381 }
3382
3383 log_debug("Init process invoked as PID " PID_FMT, pid);
3384
3385 if (arg_userns) {
3386 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3387 log_error("Child died too early.");
3388 r = -ESRCH;
3389 goto finish;
3390 }
3391
3392 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3393 if (l < 0) {
3394 r = log_error_errno(errno, "Failed to read UID shift: %m");
3395 goto finish;
3396 }
3397 if (l != sizeof(arg_uid_shift)) {
3398 log_error("Short read while reading UID shift.");
3399 r = EIO;
3400 goto finish;
3401 }
3402
3403 r = setup_uid_map(pid);
3404 if (r < 0)
3405 goto finish;
3406
3407 (void) barrier_place(&barrier); /* #2 */
3408 }
3409
3410 if (arg_private_network) {
3411
3412 r = move_network_interfaces(pid, arg_network_interfaces);
3413 if (r < 0)
3414 goto finish;
3415
3416 if (arg_network_veth) {
3417 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3418 if (r < 0)
3419 goto finish;
3420 else if (r > 0)
3421 ifi = r;
3422
3423 if (arg_network_bridge) {
3424 r = setup_bridge(veth_name, arg_network_bridge);
3425 if (r < 0)
3426 goto finish;
3427 if (r > 0)
3428 ifi = r;
3429 }
3430 }
3431
3432 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3433 if (r < 0)
3434 goto finish;
3435
3436 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3437 if (r < 0)
3438 goto finish;
3439
3440 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3441 if (r < 0)
3442 goto finish;
3443 }
3444
3445 if (arg_register) {
3446 r = register_machine(
3447 arg_machine,
3448 pid,
3449 arg_directory,
3450 arg_uuid,
3451 ifi,
3452 arg_slice,
3453 arg_custom_mounts, arg_n_custom_mounts,
3454 arg_kill_signal,
3455 arg_property,
3456 arg_keep_unit,
3457 arg_container_service_name);
3458 if (r < 0)
3459 goto finish;
3460 }
3461
3462 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3463 if (r < 0)
3464 goto finish;
3465
3466 if (arg_keep_unit) {
3467 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3468 if (r < 0)
3469 goto finish;
3470 }
3471
3472 r = chown_cgroup(pid, arg_uid_shift);
3473 if (r < 0)
3474 goto finish;
3475
3476 /* Notify the child that the parent is ready with all
3477 * its setup (including cgroup-ification), and that
3478 * the child can now hand over control to the code to
3479 * run inside the container. */
3480 (void) barrier_place(&barrier); /* #3 */
3481
3482 /* Block SIGCHLD here, before notifying child.
3483 * process_pty() will handle it with the other signals. */
3484 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3485
3486 /* Reset signal to default */
3487 r = default_signals(SIGCHLD, -1);
3488 if (r < 0) {
3489 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3490 goto finish;
3491 }
3492
3493 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3494 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3495 log_error("Child died too early.");
3496 r = -ESRCH;
3497 goto finish;
3498 }
3499
3500 sd_notifyf(false,
3501 "READY=1\n"
3502 "STATUS=Container running.\n"
3503 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3504
3505 r = sd_event_new(&event);
3506 if (r < 0) {
3507 log_error_errno(r, "Failed to get default event source: %m");
3508 goto finish;
3509 }
3510
3511 if (arg_kill_signal > 0) {
3512 /* Try to kill the init system on SIGINT or SIGTERM */
3513 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3514 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
3515 } else {
3516 /* Immediately exit */
3517 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3518 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3519 }
3520
3521 /* simply exit on sigchld */
3522 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3523
3524 if (arg_expose_ports) {
3525 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3526 if (r < 0)
3527 goto finish;
3528
3529 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3530 }
3531
3532 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3533
3534 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3535 if (r < 0) {
3536 log_error_errno(r, "Failed to create PTY forwarder: %m");
3537 goto finish;
3538 }
3539
3540 r = sd_event_loop(event);
3541 if (r < 0) {
3542 log_error_errno(r, "Failed to run event loop: %m");
3543 goto finish;
3544 }
3545
3546 pty_forward_get_last_char(forward, &last_char);
3547
3548 forward = pty_forward_free(forward);
3549
3550 if (!arg_quiet && last_char != '\n')
3551 putc('\n', stdout);
3552
3553 /* Kill if it is not dead yet anyway */
3554 if (arg_register && !arg_keep_unit)
3555 terminate_machine(pid);
3556
3557 /* Normally redundant, but better safe than sorry */
3558 kill(pid, SIGKILL);
3559
3560 r = wait_for_container(pid, &container_status);
3561 pid = 0;
3562
3563 if (r < 0)
3564 /* We failed to wait for the container, or the
3565 * container exited abnormally */
3566 goto finish;
3567 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3568 /* The container exited with a non-zero
3569 * status, or with zero status and no reboot
3570 * was requested. */
3571 ret = r;
3572 break;
3573 }
3574
3575 /* CONTAINER_REBOOTED, loop again */
3576
3577 if (arg_keep_unit) {
3578 /* Special handling if we are running as a
3579 * service: instead of simply restarting the
3580 * machine we want to restart the entire
3581 * service, so let's inform systemd about this
3582 * with the special exit code 133. The service
3583 * file uses RestartForceExitStatus=133 so
3584 * that this results in a full nspawn
3585 * restart. This is necessary since we might
3586 * have cgroup parameters set we want to have
3587 * flushed out. */
3588 ret = 133;
3589 r = 0;
3590 break;
3591 }
3592
3593 expose_port_flush(arg_expose_ports, &exposed);
3594 }
3595
3596 finish:
3597 sd_notify(false,
3598 "STOPPING=1\n"
3599 "STATUS=Terminating...");
3600
3601 if (pid > 0)
3602 kill(pid, SIGKILL);
3603
3604 /* Try to flush whatever is still queued in the pty */
3605 if (master >= 0)
3606 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3607
3608 loop_remove(loop_nr, &image_fd);
3609
3610 if (remove_subvol && arg_directory) {
3611 int k;
3612
3613 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3614 if (k < 0)
3615 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3616 }
3617
3618 if (arg_machine) {
3619 const char *p;
3620
3621 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3622 (void) rm_rf(p, REMOVE_ROOT);
3623 }
3624
3625 expose_port_flush(arg_expose_ports, &exposed);
3626
3627 free(arg_directory);
3628 free(arg_template);
3629 free(arg_image);
3630 free(arg_machine);
3631 free(arg_user);
3632 strv_free(arg_setenv);
3633 free(arg_network_bridge);
3634 strv_free(arg_network_interfaces);
3635 strv_free(arg_network_macvlan);
3636 strv_free(arg_network_ipvlan);
3637 strv_free(arg_network_veth_extra);
3638 strv_free(arg_parameters);
3639 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3640 expose_port_free_all(arg_expose_ports);
3641
3642 return r < 0 ? EXIT_FAILURE : ret;
3643 }