]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #1975 from ssahani/vxlan2
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "alloc-util.h"
50 #include "barrier.h"
51 #include "base-filesystem.h"
52 #include "blkid-util.h"
53 #include "btrfs-util.h"
54 #include "cap-list.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
57 #include "copy.h"
58 #include "dev-setup.h"
59 #include "env-util.h"
60 #include "fd-util.h"
61 #include "fdset.h"
62 #include "fileio.h"
63 #include "formats-util.h"
64 #include "fs-util.h"
65 #include "gpt.h"
66 #include "hostname-util.h"
67 #include "log.h"
68 #include "loopback-setup.h"
69 #include "machine-image.h"
70 #include "macro.h"
71 #include "missing.h"
72 #include "mkdir.h"
73 #include "mount-util.h"
74 #include "netlink-util.h"
75 #include "nspawn-cgroup.h"
76 #include "nspawn-expose-ports.h"
77 #include "nspawn-mount.h"
78 #include "nspawn-network.h"
79 #include "nspawn-register.h"
80 #include "nspawn-settings.h"
81 #include "nspawn-setuid.h"
82 #include "parse-util.h"
83 #include "path-util.h"
84 #include "process-util.h"
85 #include "ptyfwd.h"
86 #include "random-util.h"
87 #include "rm-rf.h"
88 #ifdef HAVE_SECCOMP
89 #include "seccomp-util.h"
90 #endif
91 #include "signal-util.h"
92 #include "socket-util.h"
93 #include "stat-util.h"
94 #include "stdio-util.h"
95 #include "string-util.h"
96 #include "strv.h"
97 #include "terminal-util.h"
98 #include "udev-util.h"
99 #include "umask-util.h"
100 #include "user-util.h"
101 #include "util.h"
102
/* How a container run ended. NOTE(review): the users of this enum are outside the visible
 * part of the file; judging by the names it distinguishes a final exit from a reboot
 * request — confirm against the code that consumes it. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
107
/* Journal linking mode, selected with --link-journal= (or -j) in parse_argv(). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; also the built-in default */
        LINK_HOST,      /* --link-journal=host and try-host */
        LINK_GUEST      /* --link-journal=guest and try-guest, and -j */
} LinkJournal;
114
/* Settings collected from the command line by parse_argv(). */

/* -D/--directory=, --template=, -u/--user=, --uuid=, -M/--machine= */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_user = NULL;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL;
/* -Z, -L and -S: these point straight at the option argument (optarg), they are not copies */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static const char *arg_slice = NULL;
/* Set by --private-network and implied by every other --network-* option */
static bool arg_private_network = false;
static bool arg_read_only = false;
static bool arg_boot = false;
static bool arg_ephemeral = false;
/* --link-journal=/-j; the "try" flag is set for -j and the try-guest/try-host modes */
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Bit mask (1ULL << CAP_*) of capabilities retained by default. parse_argv() ORs in the
 * --capability= set, adds CAP_NET_ADMIN when private networking is enabled, and finally
 * masks out the --drop-capability= set. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* Array (and its length) filled by --bind=, --bind-ro=, --tmpfs=, --overlay=, --overlay-ro= */
static CustomMount *arg_custom_mounts = NULL;
static unsigned arg_n_custom_mounts = 0;
/* --setenv= assignments */
static char **arg_setenv = NULL;
static bool arg_quiet = false;
/* --share-system; parse_argv() forces arg_register off when this is set */
static bool arg_share_system = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
/* --network-interface=, --network-macvlan=, --network-ipvlan= (one list entry per use) */
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
/* -n/--network-veth; also switched on by --network-bridge= */
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
/* --personality=; PERSONALITY_INVALID means none was requested */
static unsigned long arg_personality = PERSONALITY_INVALID;
/* -i/--image= */
static char *arg_image = NULL;
/* --volatile[=MODE]; a bare --volatile selects VOLATILE_YES */
static VolatileMode arg_volatile_mode = VOLATILE_NO;
/* -p/--port= */
static ExposePort *arg_expose_ports = NULL;
/* --property= */
static char **arg_property = NULL;
/* --private-users[=UIDBASE[:NUIDS]]: base and size of the UID/GID range; the shift stays
 * UID_INVALID when no base was given on the command line */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;
/* --kill-signal=; verify_arguments() defaults it to SIGRTMIN+3 when booting */
static int arg_kill_signal = 0;
/* Decided by detect_unified_cgroup_hierarchy() */
static bool arg_unified_cgroup_hierarchy = false;
/* SETTING_* bits accumulated by parse_argv() for options given on the command line;
 * reset to 0 or _SETTINGS_MASK_ALL depending on --settings= */
static SettingsMask arg_settings_mask = 0;
/* Tri-state set by --settings=: -1 (yes/override, and the default), true (trusted), false (no) */
static int arg_settings_trusted = -1;
/* Copy of the non-option arguments that follow the options */
static char **arg_parameters = NULL;
/* May be overridden through $SYSTEMD_NSPAWN_CONTAINER_SERVICE */
static const char *arg_container_service_name = "systemd-nspawn";
182
183 static void help(void) {
184 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
185 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
186 " -h --help Show this help\n"
187 " --version Print version string\n"
188 " -q --quiet Do not show status information\n"
189 " -D --directory=PATH Root directory for the container\n"
190 " --template=PATH Initialize root directory from template directory,\n"
191 " if missing\n"
192 " -x --ephemeral Run container with snapshot of root directory, and\n"
193 " remove it after exit\n"
194 " -i --image=PATH File system device or disk image for the container\n"
195 " -b --boot Boot up full system (i.e. invoke init)\n"
196 " -u --user=USER Run the command under specified user or uid\n"
197 " -M --machine=NAME Set the machine name for the container\n"
198 " --uuid=UUID Set a specific machine UUID for the container\n"
199 " -S --slice=SLICE Place the container in the specified slice\n"
200 " --property=NAME=VALUE Set scope unit property\n"
201 " --private-users[=UIDBASE[:NUIDS]]\n"
202 " Run within user namespace\n"
203 " --private-network Disable network in container\n"
204 " --network-interface=INTERFACE\n"
205 " Assign an existing network interface to the\n"
206 " container\n"
207 " --network-macvlan=INTERFACE\n"
208 " Create a macvlan network interface based on an\n"
209 " existing network interface to the container\n"
210 " --network-ipvlan=INTERFACE\n"
211 " Create a ipvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " -n --network-veth Add a virtual Ethernet connection between host\n"
214 " and container\n"
215 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
216 " Add an additional virtual Ethernet link between\n"
217 " host and container\n"
218 " --network-bridge=INTERFACE\n"
219 " Add a virtual Ethernet connection between host\n"
220 " and container and add it to an existing bridge on\n"
221 " the host\n"
222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223 " Expose a container IP port on the host\n"
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
230 " --capability=CAP In addition to the default, retain specified\n"
231 " capability\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
237 " --read-only Mount the root directory read-only\n"
238 " --bind=PATH[:PATH[:OPTIONS]]\n"
239 " Bind mount a file or directory from the host into\n"
240 " the container\n"
241 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
242 " Similar, but creates a read-only bind mount\n"
243 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
244 " --overlay=PATH[:PATH...]:PATH\n"
245 " Create an overlay mount from the host to \n"
246 " the container\n"
247 " --overlay-ro=PATH[:PATH...]:PATH\n"
248 " Similar, but creates a read-only overlay mount\n"
249 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
250 " --share-system Share system namespaces with host\n"
251 " --register=BOOLEAN Register container as machine\n"
252 " --keep-unit Do not register a scope for the machine, reuse\n"
253 " the service unit nspawn is running in\n"
254 " --volatile[=MODE] Run the system in volatile mode\n"
255 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
256 , program_invocation_short_name);
257 }
258
259
260 static int custom_mounts_prepare(void) {
261 unsigned i;
262 int r;
263
264 /* Ensure the mounts are applied prefix first. */
265 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
266
267 /* Allocate working directories for the overlay file systems that need it */
268 for (i = 0; i < arg_n_custom_mounts; i++) {
269 CustomMount *m = &arg_custom_mounts[i];
270
271 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
272 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
273 return -EINVAL;
274 }
275
276 if (m->type != CUSTOM_MOUNT_OVERLAY)
277 continue;
278
279 if (m->work_dir)
280 continue;
281
282 if (m->read_only)
283 continue;
284
285 r = tempfn_random(m->source, NULL, &m->work_dir);
286 if (r < 0)
287 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
288 }
289
290 return 0;
291 }
292
293 static int detect_unified_cgroup_hierarchy(void) {
294 const char *e;
295 int r;
296
297 /* Allow the user to control whether the unified hierarchy is used */
298 e = getenv("UNIFIED_CGROUP_HIERARCHY");
299 if (e) {
300 r = parse_boolean(e);
301 if (r < 0)
302 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
303
304 arg_unified_cgroup_hierarchy = r;
305 return 0;
306 }
307
308 /* Otherwise inherit the default from the host system */
309 r = cg_unified();
310 if (r < 0)
311 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
312
313 arg_unified_cgroup_hierarchy = r;
314 return 0;
315 }
316
317 static int parse_argv(int argc, char *argv[]) {
318
319 enum {
320 ARG_VERSION = 0x100,
321 ARG_PRIVATE_NETWORK,
322 ARG_UUID,
323 ARG_READ_ONLY,
324 ARG_CAPABILITY,
325 ARG_DROP_CAPABILITY,
326 ARG_LINK_JOURNAL,
327 ARG_BIND,
328 ARG_BIND_RO,
329 ARG_TMPFS,
330 ARG_OVERLAY,
331 ARG_OVERLAY_RO,
332 ARG_SETENV,
333 ARG_SHARE_SYSTEM,
334 ARG_REGISTER,
335 ARG_KEEP_UNIT,
336 ARG_NETWORK_INTERFACE,
337 ARG_NETWORK_MACVLAN,
338 ARG_NETWORK_IPVLAN,
339 ARG_NETWORK_BRIDGE,
340 ARG_NETWORK_VETH_EXTRA,
341 ARG_PERSONALITY,
342 ARG_VOLATILE,
343 ARG_TEMPLATE,
344 ARG_PROPERTY,
345 ARG_PRIVATE_USERS,
346 ARG_KILL_SIGNAL,
347 ARG_SETTINGS,
348 };
349
350 static const struct option options[] = {
351 { "help", no_argument, NULL, 'h' },
352 { "version", no_argument, NULL, ARG_VERSION },
353 { "directory", required_argument, NULL, 'D' },
354 { "template", required_argument, NULL, ARG_TEMPLATE },
355 { "ephemeral", no_argument, NULL, 'x' },
356 { "user", required_argument, NULL, 'u' },
357 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
358 { "boot", no_argument, NULL, 'b' },
359 { "uuid", required_argument, NULL, ARG_UUID },
360 { "read-only", no_argument, NULL, ARG_READ_ONLY },
361 { "capability", required_argument, NULL, ARG_CAPABILITY },
362 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
363 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
364 { "bind", required_argument, NULL, ARG_BIND },
365 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
366 { "tmpfs", required_argument, NULL, ARG_TMPFS },
367 { "overlay", required_argument, NULL, ARG_OVERLAY },
368 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
369 { "machine", required_argument, NULL, 'M' },
370 { "slice", required_argument, NULL, 'S' },
371 { "setenv", required_argument, NULL, ARG_SETENV },
372 { "selinux-context", required_argument, NULL, 'Z' },
373 { "selinux-apifs-context", required_argument, NULL, 'L' },
374 { "quiet", no_argument, NULL, 'q' },
375 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
376 { "register", required_argument, NULL, ARG_REGISTER },
377 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
378 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
379 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
380 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
381 { "network-veth", no_argument, NULL, 'n' },
382 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
383 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
384 { "personality", required_argument, NULL, ARG_PERSONALITY },
385 { "image", required_argument, NULL, 'i' },
386 { "volatile", optional_argument, NULL, ARG_VOLATILE },
387 { "port", required_argument, NULL, 'p' },
388 { "property", required_argument, NULL, ARG_PROPERTY },
389 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
390 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
391 { "settings", required_argument, NULL, ARG_SETTINGS },
392 {}
393 };
394
395 int c, r;
396 const char *p, *e;
397 uint64_t plus = 0, minus = 0;
398 bool mask_all_settings = false, mask_no_settings = false;
399
400 assert(argc >= 0);
401 assert(argv);
402
403 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
404
405 switch (c) {
406
407 case 'h':
408 help();
409 return 0;
410
411 case ARG_VERSION:
412 return version();
413
414 case 'D':
415 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
416 if (r < 0)
417 return r;
418 break;
419
420 case ARG_TEMPLATE:
421 r = parse_path_argument_and_warn(optarg, false, &arg_template);
422 if (r < 0)
423 return r;
424 break;
425
426 case 'i':
427 r = parse_path_argument_and_warn(optarg, false, &arg_image);
428 if (r < 0)
429 return r;
430 break;
431
432 case 'x':
433 arg_ephemeral = true;
434 break;
435
436 case 'u':
437 r = free_and_strdup(&arg_user, optarg);
438 if (r < 0)
439 return log_oom();
440
441 arg_settings_mask |= SETTING_USER;
442 break;
443
444 case ARG_NETWORK_BRIDGE:
445 r = free_and_strdup(&arg_network_bridge, optarg);
446 if (r < 0)
447 return log_oom();
448
449 /* fall through */
450
451 case 'n':
452 arg_network_veth = true;
453 arg_private_network = true;
454 arg_settings_mask |= SETTING_NETWORK;
455 break;
456
457 case ARG_NETWORK_VETH_EXTRA:
458 r = veth_extra_parse(&arg_network_veth_extra, optarg);
459 if (r < 0)
460 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
461
462 arg_private_network = true;
463 arg_settings_mask |= SETTING_NETWORK;
464 break;
465
466 case ARG_NETWORK_INTERFACE:
467 if (strv_extend(&arg_network_interfaces, optarg) < 0)
468 return log_oom();
469
470 arg_private_network = true;
471 arg_settings_mask |= SETTING_NETWORK;
472 break;
473
474 case ARG_NETWORK_MACVLAN:
475 if (strv_extend(&arg_network_macvlan, optarg) < 0)
476 return log_oom();
477
478 arg_private_network = true;
479 arg_settings_mask |= SETTING_NETWORK;
480 break;
481
482 case ARG_NETWORK_IPVLAN:
483 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
484 return log_oom();
485
486 /* fall through */
487
488 case ARG_PRIVATE_NETWORK:
489 arg_private_network = true;
490 arg_settings_mask |= SETTING_NETWORK;
491 break;
492
493 case 'b':
494 arg_boot = true;
495 arg_settings_mask |= SETTING_BOOT;
496 break;
497
498 case ARG_UUID:
499 r = sd_id128_from_string(optarg, &arg_uuid);
500 if (r < 0) {
501 log_error("Invalid UUID: %s", optarg);
502 return r;
503 }
504
505 arg_settings_mask |= SETTING_MACHINE_ID;
506 break;
507
508 case 'S':
509 arg_slice = optarg;
510 break;
511
512 case 'M':
513 if (isempty(optarg))
514 arg_machine = mfree(arg_machine);
515 else {
516 if (!machine_name_is_valid(optarg)) {
517 log_error("Invalid machine name: %s", optarg);
518 return -EINVAL;
519 }
520
521 r = free_and_strdup(&arg_machine, optarg);
522 if (r < 0)
523 return log_oom();
524
525 break;
526 }
527
528 case 'Z':
529 arg_selinux_context = optarg;
530 break;
531
532 case 'L':
533 arg_selinux_apifs_context = optarg;
534 break;
535
536 case ARG_READ_ONLY:
537 arg_read_only = true;
538 arg_settings_mask |= SETTING_READ_ONLY;
539 break;
540
541 case ARG_CAPABILITY:
542 case ARG_DROP_CAPABILITY: {
543 p = optarg;
544 for(;;) {
545 _cleanup_free_ char *t = NULL;
546
547 r = extract_first_word(&p, &t, ",", 0);
548 if (r < 0)
549 return log_error_errno(r, "Failed to parse capability %s.", t);
550
551 if (r == 0)
552 break;
553
554 if (streq(t, "all")) {
555 if (c == ARG_CAPABILITY)
556 plus = (uint64_t) -1;
557 else
558 minus = (uint64_t) -1;
559 } else {
560 int cap;
561
562 cap = capability_from_name(t);
563 if (cap < 0) {
564 log_error("Failed to parse capability %s.", t);
565 return -EINVAL;
566 }
567
568 if (c == ARG_CAPABILITY)
569 plus |= 1ULL << (uint64_t) cap;
570 else
571 minus |= 1ULL << (uint64_t) cap;
572 }
573 }
574
575 arg_settings_mask |= SETTING_CAPABILITY;
576 break;
577 }
578
579 case 'j':
580 arg_link_journal = LINK_GUEST;
581 arg_link_journal_try = true;
582 break;
583
584 case ARG_LINK_JOURNAL:
585 if (streq(optarg, "auto")) {
586 arg_link_journal = LINK_AUTO;
587 arg_link_journal_try = false;
588 } else if (streq(optarg, "no")) {
589 arg_link_journal = LINK_NO;
590 arg_link_journal_try = false;
591 } else if (streq(optarg, "guest")) {
592 arg_link_journal = LINK_GUEST;
593 arg_link_journal_try = false;
594 } else if (streq(optarg, "host")) {
595 arg_link_journal = LINK_HOST;
596 arg_link_journal_try = false;
597 } else if (streq(optarg, "try-guest")) {
598 arg_link_journal = LINK_GUEST;
599 arg_link_journal_try = true;
600 } else if (streq(optarg, "try-host")) {
601 arg_link_journal = LINK_HOST;
602 arg_link_journal_try = true;
603 } else {
604 log_error("Failed to parse link journal mode %s", optarg);
605 return -EINVAL;
606 }
607
608 break;
609
610 case ARG_BIND:
611 case ARG_BIND_RO:
612 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
613 if (r < 0)
614 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
615
616 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
617 break;
618
619 case ARG_TMPFS:
620 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
623
624 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
625 break;
626
627 case ARG_OVERLAY:
628 case ARG_OVERLAY_RO: {
629 _cleanup_free_ char *upper = NULL, *destination = NULL;
630 _cleanup_strv_free_ char **lower = NULL;
631 CustomMount *m;
632 unsigned n = 0;
633 char **i;
634
635 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
636 if (r == -ENOMEM)
637 return log_oom();
638 else if (r < 0) {
639 log_error("Invalid overlay specification: %s", optarg);
640 return r;
641 }
642
643 STRV_FOREACH(i, lower) {
644 if (!path_is_absolute(*i)) {
645 log_error("Overlay path %s is not absolute.", *i);
646 return -EINVAL;
647 }
648
649 n++;
650 }
651
652 if (n < 2) {
653 log_error("--overlay= needs at least two colon-separated directories specified.");
654 return -EINVAL;
655 }
656
657 if (n == 2) {
658 /* If two parameters are specified,
659 * the first one is the lower, the
660 * second one the upper directory. And
661 * we'll also define the destination
662 * mount point the same as the upper. */
663 upper = lower[1];
664 lower[1] = NULL;
665
666 destination = strdup(upper);
667 if (!destination)
668 return log_oom();
669
670 } else {
671 upper = lower[n - 2];
672 destination = lower[n - 1];
673 lower[n - 2] = NULL;
674 }
675
676 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
677 if (!m)
678 return log_oom();
679
680 m->destination = destination;
681 m->source = upper;
682 m->lower = lower;
683 m->read_only = c == ARG_OVERLAY_RO;
684
685 upper = destination = NULL;
686 lower = NULL;
687
688 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
689 break;
690 }
691
692 case ARG_SETENV: {
693 char **n;
694
695 if (!env_assignment_is_valid(optarg)) {
696 log_error("Environment variable assignment '%s' is not valid.", optarg);
697 return -EINVAL;
698 }
699
700 n = strv_env_set(arg_setenv, optarg);
701 if (!n)
702 return log_oom();
703
704 strv_free(arg_setenv);
705 arg_setenv = n;
706
707 arg_settings_mask |= SETTING_ENVIRONMENT;
708 break;
709 }
710
711 case 'q':
712 arg_quiet = true;
713 break;
714
715 case ARG_SHARE_SYSTEM:
716 arg_share_system = true;
717 break;
718
719 case ARG_REGISTER:
720 r = parse_boolean(optarg);
721 if (r < 0) {
722 log_error("Failed to parse --register= argument: %s", optarg);
723 return r;
724 }
725
726 arg_register = r;
727 break;
728
729 case ARG_KEEP_UNIT:
730 arg_keep_unit = true;
731 break;
732
733 case ARG_PERSONALITY:
734
735 arg_personality = personality_from_string(optarg);
736 if (arg_personality == PERSONALITY_INVALID) {
737 log_error("Unknown or unsupported personality '%s'.", optarg);
738 return -EINVAL;
739 }
740
741 arg_settings_mask |= SETTING_PERSONALITY;
742 break;
743
744 case ARG_VOLATILE:
745
746 if (!optarg)
747 arg_volatile_mode = VOLATILE_YES;
748 else {
749 VolatileMode m;
750
751 m = volatile_mode_from_string(optarg);
752 if (m < 0) {
753 log_error("Failed to parse --volatile= argument: %s", optarg);
754 return -EINVAL;
755 } else
756 arg_volatile_mode = m;
757 }
758
759 arg_settings_mask |= SETTING_VOLATILE_MODE;
760 break;
761
762 case 'p':
763 r = expose_port_parse(&arg_expose_ports, optarg);
764 if (r == -EEXIST)
765 return log_error_errno(r, "Duplicate port specification: %s", optarg);
766 if (r < 0)
767 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
768
769 arg_settings_mask |= SETTING_EXPOSE_PORTS;
770 break;
771
772 case ARG_PROPERTY:
773 if (strv_extend(&arg_property, optarg) < 0)
774 return log_oom();
775
776 break;
777
778 case ARG_PRIVATE_USERS:
779 if (optarg) {
780 _cleanup_free_ char *buffer = NULL;
781 const char *range, *shift;
782
783 range = strchr(optarg, ':');
784 if (range) {
785 buffer = strndup(optarg, range - optarg);
786 if (!buffer)
787 return log_oom();
788 shift = buffer;
789
790 range++;
791 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
792 log_error("Failed to parse UID range: %s", range);
793 return -EINVAL;
794 }
795 } else
796 shift = optarg;
797
798 if (parse_uid(shift, &arg_uid_shift) < 0) {
799 log_error("Failed to parse UID: %s", optarg);
800 return -EINVAL;
801 }
802 }
803
804 arg_userns = true;
805 break;
806
807 case ARG_KILL_SIGNAL:
808 arg_kill_signal = signal_from_string_try_harder(optarg);
809 if (arg_kill_signal < 0) {
810 log_error("Cannot parse signal: %s", optarg);
811 return -EINVAL;
812 }
813
814 arg_settings_mask |= SETTING_KILL_SIGNAL;
815 break;
816
817 case ARG_SETTINGS:
818
819 /* no → do not read files
820 * yes → read files, do not override cmdline, trust only subset
821 * override → read files, override cmdline, trust only subset
822 * trusted → read files, do not override cmdline, trust all
823 */
824
825 r = parse_boolean(optarg);
826 if (r < 0) {
827 if (streq(optarg, "trusted")) {
828 mask_all_settings = false;
829 mask_no_settings = false;
830 arg_settings_trusted = true;
831
832 } else if (streq(optarg, "override")) {
833 mask_all_settings = false;
834 mask_no_settings = true;
835 arg_settings_trusted = -1;
836 } else
837 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
838 } else if (r > 0) {
839 /* yes */
840 mask_all_settings = false;
841 mask_no_settings = false;
842 arg_settings_trusted = -1;
843 } else {
844 /* no */
845 mask_all_settings = true;
846 mask_no_settings = false;
847 arg_settings_trusted = false;
848 }
849
850 break;
851
852 case '?':
853 return -EINVAL;
854
855 default:
856 assert_not_reached("Unhandled option");
857 }
858
859 if (arg_share_system)
860 arg_register = false;
861
862 if (arg_boot && arg_share_system) {
863 log_error("--boot and --share-system may not be combined.");
864 return -EINVAL;
865 }
866
867 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
868 log_error("--keep-unit may not be used when invoked from a user session.");
869 return -EINVAL;
870 }
871
872 if (arg_directory && arg_image) {
873 log_error("--directory= and --image= may not be combined.");
874 return -EINVAL;
875 }
876
877 if (arg_template && arg_image) {
878 log_error("--template= and --image= may not be combined.");
879 return -EINVAL;
880 }
881
882 if (arg_template && !(arg_directory || arg_machine)) {
883 log_error("--template= needs --directory= or --machine=.");
884 return -EINVAL;
885 }
886
887 if (arg_ephemeral && arg_template) {
888 log_error("--ephemeral and --template= may not be combined.");
889 return -EINVAL;
890 }
891
892 if (arg_ephemeral && arg_image) {
893 log_error("--ephemeral and --image= may not be combined.");
894 return -EINVAL;
895 }
896
897 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
898 log_error("--ephemeral and --link-journal= may not be combined.");
899 return -EINVAL;
900 }
901
902 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
903 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
904
905 if (argc > optind) {
906 arg_parameters = strv_copy(argv + optind);
907 if (!arg_parameters)
908 return log_oom();
909
910 arg_settings_mask |= SETTING_BOOT;
911 }
912
913 /* Load all settings from .nspawn files */
914 if (mask_no_settings)
915 arg_settings_mask = 0;
916
917 /* Don't load any settings from .nspawn files */
918 if (mask_all_settings)
919 arg_settings_mask = _SETTINGS_MASK_ALL;
920
921 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
922
923 r = detect_unified_cgroup_hierarchy();
924 if (r < 0)
925 return r;
926
927 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
928 if (e)
929 arg_container_service_name = e;
930
931 return 1;
932 }
933
934 static int verify_arguments(void) {
935
936 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
937 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
938 return -EINVAL;
939 }
940
941 if (arg_expose_ports && !arg_private_network) {
942 log_error("Cannot use --port= without private networking.");
943 return -EINVAL;
944 }
945
946 if (arg_boot && arg_kill_signal <= 0)
947 arg_kill_signal = SIGRTMIN+3;
948
949 return 0;
950 }
951
952 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
953 assert(p);
954
955 if (!arg_userns)
956 return 0;
957
958 if (uid == UID_INVALID && gid == GID_INVALID)
959 return 0;
960
961 if (uid != UID_INVALID) {
962 uid += arg_uid_shift;
963
964 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
965 return -EOVERFLOW;
966 }
967
968 if (gid != GID_INVALID) {
969 gid += (gid_t) arg_uid_shift;
970
971 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
972 return -EOVERFLOW;
973 }
974
975 if (lchown(p, uid, gid) < 0)
976 return -errno;
977
978 return 0;
979 }
980
/* Creates the directory "path" below "root" with the given mode and, if it was newly
 * created, hands it to userns_lchown() with the given uid/gid.
 *
 * Returns 0 if the directory already exists (ownership is left untouched then),
 * -errno if mkdir() fails for another reason, otherwise the result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
993
994 static int setup_timezone(const char *dest) {
995 _cleanup_free_ char *p = NULL, *q = NULL;
996 const char *where, *check, *what;
997 char *z, *y;
998 int r;
999
1000 assert(dest);
1001
1002 /* Fix the timezone, if possible */
1003 r = readlink_malloc("/etc/localtime", &p);
1004 if (r < 0) {
1005 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1006 return 0;
1007 }
1008
1009 z = path_startswith(p, "../usr/share/zoneinfo/");
1010 if (!z)
1011 z = path_startswith(p, "/usr/share/zoneinfo/");
1012 if (!z) {
1013 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1014 return 0;
1015 }
1016
1017 where = prefix_roota(dest, "/etc/localtime");
1018 r = readlink_malloc(where, &q);
1019 if (r >= 0) {
1020 y = path_startswith(q, "../usr/share/zoneinfo/");
1021 if (!y)
1022 y = path_startswith(q, "/usr/share/zoneinfo/");
1023
1024 /* Already pointing to the right place? Then do nothing .. */
1025 if (y && streq(y, z))
1026 return 0;
1027 }
1028
1029 check = strjoina("/usr/share/zoneinfo/", z);
1030 check = prefix_roota(dest, check);
1031 if (laccess(check, F_OK) < 0) {
1032 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1033 return 0;
1034 }
1035
1036 r = unlink(where);
1037 if (r < 0 && errno != ENOENT) {
1038 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1039 return 0;
1040 }
1041
1042 what = strjoina("../usr/share/zoneinfo/", z);
1043 if (symlink(what, where) < 0) {
1044 log_error_errno(errno, "Failed to correct timezone of container: %m");
1045 return 0;
1046 }
1047
1048 r = userns_lchown(where, 0, 0);
1049 if (r < 0)
1050 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1051
1052 return 0;
1053 }
1054
1055 static int setup_resolv_conf(const char *dest) {
1056 const char *where = NULL;
1057 int r;
1058
1059 assert(dest);
1060
1061 if (arg_private_network)
1062 return 0;
1063
1064 /* Fix resolv.conf, if possible */
1065 where = prefix_roota(dest, "/etc/resolv.conf");
1066
1067 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1068 if (r < 0) {
1069 /* If the file already exists as symlink, let's
1070 * suppress the warning, under the assumption that
1071 * resolved or something similar runs inside and the
1072 * symlink points there.
1073 *
1074 * If the disk image is read-only, there's also no
1075 * point in complaining.
1076 */
1077 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1078 "Failed to copy /etc/resolv.conf to %s: %m", where);
1079 return 0;
1080 }
1081
1082 r = userns_lchown(where, 0, 0);
1083 if (r < 0)
1084 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1085
1086 return 0;
1087 }
1088
1089 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1090 assert(s);
1091
1092 snprintf(s, 37,
1093 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1094 SD_ID128_FORMAT_VAL(id));
1095
1096 return s;
1097 }
1098
1099 static int setup_boot_id(const char *dest) {
1100 const char *from, *to;
1101 sd_id128_t rnd = {};
1102 char as_uuid[37];
1103 int r;
1104
1105 if (arg_share_system)
1106 return 0;
1107
1108 /* Generate a new randomized boot ID, so that each boot-up of
1109 * the container gets a new one */
1110
1111 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1112 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1113
1114 r = sd_id128_randomize(&rnd);
1115 if (r < 0)
1116 return log_error_errno(r, "Failed to generate random boot id: %m");
1117
1118 id128_format_as_uuid(rnd, as_uuid);
1119
1120 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1121 if (r < 0)
1122 return log_error_errno(r, "Failed to write boot id: %m");
1123
1124 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1125 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1126 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1127 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1128
1129 unlink(from);
1130 return r;
1131 }
1132
1133 static int copy_devnodes(const char *dest) {
1134
1135 static const char devnodes[] =
1136 "null\0"
1137 "zero\0"
1138 "full\0"
1139 "random\0"
1140 "urandom\0"
1141 "tty\0"
1142 "net/tun\0";
1143
1144 const char *d;
1145 int r = 0;
1146 _cleanup_umask_ mode_t u;
1147
1148 assert(dest);
1149
1150 u = umask(0000);
1151
1152 /* Create /dev/net, so that we can create /dev/net/tun in it */
1153 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1154 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1155
1156 NULSTR_FOREACH(d, devnodes) {
1157 _cleanup_free_ char *from = NULL, *to = NULL;
1158 struct stat st;
1159
1160 from = strappend("/dev/", d);
1161 to = prefix_root(dest, from);
1162
1163 if (stat(from, &st) < 0) {
1164
1165 if (errno != ENOENT)
1166 return log_error_errno(errno, "Failed to stat %s: %m", from);
1167
1168 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1169
1170 log_error("%s is not a char or block device, cannot copy.", from);
1171 return -EIO;
1172
1173 } else {
1174 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1175 if (errno != EPERM)
1176 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1177
1178 /* Some systems abusively restrict mknod but
1179 * allow bind mounts. */
1180 r = touch(to);
1181 if (r < 0)
1182 return log_error_errno(r, "touch (%s) failed: %m", to);
1183 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1184 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1185 }
1186
1187 r = userns_lchown(to, 0, 0);
1188 if (r < 0)
1189 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1190 }
1191 }
1192
1193 return r;
1194 }
1195
1196 static int setup_pts(const char *dest) {
1197 _cleanup_free_ char *options = NULL;
1198 const char *p;
1199 int r;
1200
1201 #ifdef HAVE_SELINUX
1202 if (arg_selinux_apifs_context)
1203 (void) asprintf(&options,
1204 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1205 arg_uid_shift + TTY_GID,
1206 arg_selinux_apifs_context);
1207 else
1208 #endif
1209 (void) asprintf(&options,
1210 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1211 arg_uid_shift + TTY_GID);
1212
1213 if (!options)
1214 return log_oom();
1215
1216 /* Mount /dev/pts itself */
1217 p = prefix_roota(dest, "/dev/pts");
1218 if (mkdir(p, 0755) < 0)
1219 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1220 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1221 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1222 r = userns_lchown(p, 0, 0);
1223 if (r < 0)
1224 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1225
1226 /* Create /dev/ptmx symlink */
1227 p = prefix_roota(dest, "/dev/ptmx");
1228 if (symlink("pts/ptmx", p) < 0)
1229 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1230 r = userns_lchown(p, 0, 0);
1231 if (r < 0)
1232 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1233
1234 /* And fix /dev/pts/ptmx ownership */
1235 p = prefix_roota(dest, "/dev/pts/ptmx");
1236 r = userns_lchown(p, 0, 0);
1237 if (r < 0)
1238 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1239
1240 return 0;
1241 }
1242
1243 static int setup_dev_console(const char *dest, const char *console) {
1244 _cleanup_umask_ mode_t u;
1245 const char *to;
1246 int r;
1247
1248 assert(dest);
1249 assert(console);
1250
1251 u = umask(0000);
1252
1253 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1254 if (r < 0)
1255 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1256
1257 /* We need to bind mount the right tty to /dev/console since
1258 * ptys can only exist on pts file systems. To have something
1259 * to bind mount things on we create a empty regular file. */
1260
1261 to = prefix_roota(dest, "/dev/console");
1262 r = touch(to);
1263 if (r < 0)
1264 return log_error_errno(r, "touch() for /dev/console failed: %m");
1265
1266 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1267 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1268
1269 return 0;
1270 }
1271
1272 static int setup_kmsg(const char *dest, int kmsg_socket) {
1273 const char *from, *to;
1274 _cleanup_umask_ mode_t u;
1275 int fd, r;
1276
1277 assert(kmsg_socket >= 0);
1278
1279 u = umask(0000);
1280
1281 /* We create the kmsg FIFO as /run/kmsg, but immediately
1282 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1283 * on the reading side behave very similar to /proc/kmsg,
1284 * their writing side behaves differently from /dev/kmsg in
1285 * that writing blocks when nothing is reading. In order to
1286 * avoid any problems with containers deadlocking due to this
1287 * we simply make /dev/kmsg unavailable to the container. */
1288 from = prefix_roota(dest, "/run/kmsg");
1289 to = prefix_roota(dest, "/proc/kmsg");
1290
1291 if (mkfifo(from, 0600) < 0)
1292 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1293 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1294 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1295
1296 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1297 if (fd < 0)
1298 return log_error_errno(errno, "Failed to open fifo: %m");
1299
1300 /* Store away the fd in the socket, so that it stays open as
1301 * long as we run the child */
1302 r = send_one_fd(kmsg_socket, fd, 0);
1303 safe_close(fd);
1304
1305 if (r < 0)
1306 return log_error_errno(r, "Failed to send FIFO fd: %m");
1307
1308 /* And now make the FIFO unavailable as /run/kmsg... */
1309 (void) unlink(from);
1310
1311 return 0;
1312 }
1313
1314 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1315 union in_addr_union *exposed = userdata;
1316
1317 assert(rtnl);
1318 assert(m);
1319 assert(exposed);
1320
1321 expose_port_execute(rtnl, arg_expose_ports, exposed);
1322 return 0;
1323 }
1324
1325 static int setup_hostname(void) {
1326
1327 if (arg_share_system)
1328 return 0;
1329
1330 if (sethostname_idempotent(arg_machine) < 0)
1331 return -errno;
1332
1333 return 0;
1334 }
1335
1336 static int setup_journal(const char *directory) {
1337 sd_id128_t machine_id, this_id;
1338 _cleanup_free_ char *b = NULL, *d = NULL;
1339 const char *etc_machine_id, *p, *q;
1340 char *id;
1341 int r;
1342
1343 /* Don't link journals in ephemeral mode */
1344 if (arg_ephemeral)
1345 return 0;
1346
1347 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1348
1349 r = read_one_line_file(etc_machine_id, &b);
1350 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1351 return 0;
1352 else if (r < 0)
1353 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1354
1355 id = strstrip(b);
1356 if (isempty(id) && arg_link_journal == LINK_AUTO)
1357 return 0;
1358
1359 /* Verify validity */
1360 r = sd_id128_from_string(id, &machine_id);
1361 if (r < 0)
1362 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1363
1364 r = sd_id128_get_machine(&this_id);
1365 if (r < 0)
1366 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1367
1368 if (sd_id128_equal(machine_id, this_id)) {
1369 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1370 "Host and machine ids are equal (%s): refusing to link journals", id);
1371 if (arg_link_journal == LINK_AUTO)
1372 return 0;
1373 return -EEXIST;
1374 }
1375
1376 if (arg_link_journal == LINK_NO)
1377 return 0;
1378
1379 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1380 if (r < 0)
1381 return log_error_errno(r, "Failed to create /var: %m");
1382
1383 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1384 if (r < 0)
1385 return log_error_errno(r, "Failed to create /var/log: %m");
1386
1387 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1388 if (r < 0)
1389 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1390
1391 p = strjoina("/var/log/journal/", id);
1392 q = prefix_roota(directory, p);
1393
1394 if (path_is_mount_point(p, 0) > 0) {
1395 if (arg_link_journal != LINK_AUTO) {
1396 log_error("%s: already a mount point, refusing to use for journal", p);
1397 return -EEXIST;
1398 }
1399
1400 return 0;
1401 }
1402
1403 if (path_is_mount_point(q, 0) > 0) {
1404 if (arg_link_journal != LINK_AUTO) {
1405 log_error("%s: already a mount point, refusing to use for journal", q);
1406 return -EEXIST;
1407 }
1408
1409 return 0;
1410 }
1411
1412 r = readlink_and_make_absolute(p, &d);
1413 if (r >= 0) {
1414 if ((arg_link_journal == LINK_GUEST ||
1415 arg_link_journal == LINK_AUTO) &&
1416 path_equal(d, q)) {
1417
1418 r = userns_mkdir(directory, p, 0755, 0, 0);
1419 if (r < 0)
1420 log_warning_errno(r, "Failed to create directory %s: %m", q);
1421 return 0;
1422 }
1423
1424 if (unlink(p) < 0)
1425 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1426 } else if (r == -EINVAL) {
1427
1428 if (arg_link_journal == LINK_GUEST &&
1429 rmdir(p) < 0) {
1430
1431 if (errno == ENOTDIR) {
1432 log_error("%s already exists and is neither a symlink nor a directory", p);
1433 return r;
1434 } else
1435 return log_error_errno(errno, "Failed to remove %s: %m", p);
1436 }
1437 } else if (r != -ENOENT)
1438 return log_error_errno(r, "readlink(%s) failed: %m", p);
1439
1440 if (arg_link_journal == LINK_GUEST) {
1441
1442 if (symlink(q, p) < 0) {
1443 if (arg_link_journal_try) {
1444 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1445 return 0;
1446 } else
1447 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1448 }
1449
1450 r = userns_mkdir(directory, p, 0755, 0, 0);
1451 if (r < 0)
1452 log_warning_errno(r, "Failed to create directory %s: %m", q);
1453 return 0;
1454 }
1455
1456 if (arg_link_journal == LINK_HOST) {
1457 /* don't create parents here -- if the host doesn't have
1458 * permanent journal set up, don't force it here */
1459 r = mkdir(p, 0755);
1460 if (r < 0) {
1461 if (arg_link_journal_try) {
1462 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1463 return 0;
1464 } else
1465 return log_error_errno(errno, "Failed to create %s: %m", p);
1466 }
1467
1468 } else if (access(p, F_OK) < 0)
1469 return 0;
1470
1471 if (dir_is_empty(q) == 0)
1472 log_warning("%s is not empty, proceeding anyway.", q);
1473
1474 r = userns_mkdir(directory, p, 0755, 0, 0);
1475 if (r < 0)
1476 return log_error_errno(r, "Failed to create %s: %m", q);
1477
1478 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1479 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1480
1481 return 0;
1482 }
1483
1484 static int drop_capabilities(void) {
1485 return capability_bounding_set_drop(arg_retain, false);
1486 }
1487
1488 static int reset_audit_loginuid(void) {
1489 _cleanup_free_ char *p = NULL;
1490 int r;
1491
1492 if (arg_share_system)
1493 return 0;
1494
1495 r = read_one_line_file("/proc/self/loginuid", &p);
1496 if (r == -ENOENT)
1497 return 0;
1498 if (r < 0)
1499 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1500
1501 /* Already reset? */
1502 if (streq(p, "4294967295"))
1503 return 0;
1504
1505 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1506 if (r < 0) {
1507 log_error_errno(r,
1508 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1509 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1510 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1511 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1512 "using systemd-nspawn. Sleeping for 5s... (%m)");
1513
1514 sleep(5);
1515 }
1516
1517 return 0;
1518 }
1519
/* Installs the seccomp filter for the container payload.
 *
 * The filter allows everything by default. It makes a fixed set of system
 * calls fail with EPERM unless the capability listed next to them is part of
 * arg_retain, and it makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
 * EAFNOSUPPORT.
 *
 * Returns 0 on success (also if the kernel rejects the filter with EINVAL, or
 * if built without seccomp support), negative errno-style otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } gated_syscalls[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(gated_syscalls); k++) {

                /* If the capability is retained, the system call stays available */
                if (arg_retain & (1ULL << gated_syscalls[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated_syscalls[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers, and much of the userspace
         * audit hookup fails when run inside of one. We don't care and
         * simply turn off the creation of audit sockets:
         * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                /* Not fatal, we continue without the filter */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto finish;
        }
        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1614
1615 static int setup_propagate(const char *root) {
1616 const char *p, *q;
1617 int r;
1618
1619 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1620 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1621 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1622 (void) mkdir_p(p, 0600);
1623
1624 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1625 if (r < 0)
1626 return log_error_errno(r, "Failed to create /run/systemd: %m");
1627
1628 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1629 if (r < 0)
1630 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1631
1632 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1633 if (r < 0)
1634 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1635
1636 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1637 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1638 return log_error_errno(errno, "Failed to install propagation bind mount.");
1639
1640 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1641 return log_error_errno(errno, "Failed to make propagation mount read-only");
1642
1643 return 0;
1644 }
1645
1646 static int setup_image(char **device_path, int *loop_nr) {
1647 struct loop_info64 info = {
1648 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1649 };
1650 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1651 _cleanup_free_ char* loopdev = NULL;
1652 struct stat st;
1653 int r, nr;
1654
1655 assert(device_path);
1656 assert(loop_nr);
1657 assert(arg_image);
1658
1659 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1660 if (fd < 0)
1661 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1662
1663 if (fstat(fd, &st) < 0)
1664 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1665
1666 if (S_ISBLK(st.st_mode)) {
1667 char *p;
1668
1669 p = strdup(arg_image);
1670 if (!p)
1671 return log_oom();
1672
1673 *device_path = p;
1674
1675 *loop_nr = -1;
1676
1677 r = fd;
1678 fd = -1;
1679
1680 return r;
1681 }
1682
1683 if (!S_ISREG(st.st_mode)) {
1684 log_error("%s is not a regular file or block device.", arg_image);
1685 return -EINVAL;
1686 }
1687
1688 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1689 if (control < 0)
1690 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1691
1692 nr = ioctl(control, LOOP_CTL_GET_FREE);
1693 if (nr < 0)
1694 return log_error_errno(errno, "Failed to allocate loop device: %m");
1695
1696 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1697 return log_oom();
1698
1699 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1700 if (loop < 0)
1701 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1702
1703 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1704 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1705
1706 if (arg_read_only)
1707 info.lo_flags |= LO_FLAGS_READ_ONLY;
1708
1709 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1710 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1711
1712 *device_path = loopdev;
1713 loopdev = NULL;
1714
1715 *loop_nr = nr;
1716
1717 r = loop;
1718 loop = -1;
1719
1720 return r;
1721 }
1722
/* Explanatory text that is appended to error messages about disk images
 * whose partition table cannot be used. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"     \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                 \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                      \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"         \
        "to be bootable with systemd-nspawn."
1729
/* Identifies the partitions to mount from the block device open as 'fd'.
 *
 * The partition table (GPT or MBR) is read with libblkid, and the partition
 * device nodes are looked up through udev once the kernel has enumerated as
 * many partitions as blkid found.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. The lowest numbered
 *      partition of each of the types home, srv, native root and secondary
 *      root is picked; generic Linux partitions are remembered as fallback
 *      for the root.
 * MBR: only partitions of type 0x83 whose flags equal 0x80 (bootable) are
 *      considered, and they count as generic Linux partitions.
 *
 * The root is chosen in the order native root, secondary root, generic. A
 * generic partition is only accepted if it is the only one.
 *
 * On success 0 is returned. *root_device, and if found *home_device and
 * *srv_device, receive newly allocated device node paths owned by the
 * caller, the matching *_rw flags tell whether the partition may be mounted
 * writable, and *secondary tells whether the secondary root was chosen. On
 * failure a negative errno-style value is returned. Without blkid support
 * the function always fails with -EOPNOTSUPP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has enumerated as many partitions as
         * blkid found, giving up after 10 attempts. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The
                 * enumeration above matches one more device than there
                 * are partitions, hence compare against m + 1. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Of several candidates the one with the lowest number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* MBR partitions carry no further meaning, hence
                         * treat them as generic ones. The node has to be
                         * stored in 'generic' (and not in 'root'), as
                         * otherwise the check above never triggers and a
                         * second bootable partition would silently
                         * replace the first one. */
                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2099
/* Mounts the block device 'what' on 'where', or on 'where' with 'directory'
 * appended if 'directory' is non-NULL.
 *
 * The file system type is probed with libblkid. LUKS volumes are refused.
 * The mount is made read-only if 'rw' is false or arg_read_only is set.
 *
 * Returns 0 on success and a negative errno-style value on failure. Without
 * blkid support the function always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        /* blkid_do_safeprobe() returns 0 on success, 1 if nothing was
         * detected, -2 if the result is ambivalent and -1 on error. The
         * first two non-zero cases mean the type cannot be determined,
         * only -1 is an actual probing failure. */
        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2161
2162 static int mount_devices(
2163 const char *where,
2164 const char *root_device, bool root_device_rw,
2165 const char *home_device, bool home_device_rw,
2166 const char *srv_device, bool srv_device_rw) {
2167 int r;
2168
2169 assert(where);
2170
2171 if (root_device) {
2172 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2173 if (r < 0)
2174 return log_error_errno(r, "Failed to mount root directory: %m");
2175 }
2176
2177 if (home_device) {
2178 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2179 if (r < 0)
2180 return log_error_errno(r, "Failed to mount home directory: %m");
2181 }
2182
2183 if (srv_device) {
2184 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2185 if (r < 0)
2186 return log_error_errno(r, "Failed to mount server data directory: %m");
2187 }
2188
2189 return 0;
2190 }
2191
2192 static void loop_remove(int nr, int *image_fd) {
2193 _cleanup_close_ int control = -1;
2194 int r;
2195
2196 if (nr < 0)
2197 return;
2198
2199 if (image_fd && *image_fd >= 0) {
2200 r = ioctl(*image_fd, LOOP_CLR_FD);
2201 if (r < 0)
2202 log_debug_errno(errno, "Failed to close loop image: %m");
2203 *image_fd = safe_close(*image_fd);
2204 }
2205
2206 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2207 if (control < 0) {
2208 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2209 return;
2210 }
2211
2212 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2213 if (r < 0)
2214 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2215 }
2216
2217 /*
2218 * Return values:
2219 * < 0 : wait_for_terminate() failed to get the state of the
2220 * container, the container was terminated by a signal, or
2221 * failed for an unknown reason. No change is made to the
2222 * container argument.
2223 * > 0 : The program executed in the container terminated with an
2224 * error. The exit code of the program executed in the
2225 * container is returned. The container argument has been set
2226 * to CONTAINER_TERMINATED.
2227 * 0 : The container is being rebooted, has been shut down or exited
2228 * successfully. The container argument has been set to either
2229 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2230 *
2231 * That is, success is indicated by a return value of zero, and an
2232 * error is indicated by a non-zero value.
2233 */
2234 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2235 siginfo_t status;
2236 int r;
2237
2238 r = wait_for_terminate(pid, &status);
2239 if (r < 0)
2240 return log_warning_errno(r, "Failed to wait for container: %m");
2241
2242 switch (status.si_code) {
2243
2244 case CLD_EXITED:
2245 if (status.si_status == 0) {
2246 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2247
2248 } else
2249 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2250
2251 *container = CONTAINER_TERMINATED;
2252 return status.si_status;
2253
2254 case CLD_KILLED:
2255 if (status.si_status == SIGINT) {
2256
2257 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2258 *container = CONTAINER_TERMINATED;
2259 return 0;
2260
2261 } else if (status.si_status == SIGHUP) {
2262
2263 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2264 *container = CONTAINER_REBOOTED;
2265 return 0;
2266 }
2267
2268 /* CLD_KILLED fallthrough */
2269
2270 case CLD_DUMPED:
2271 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2272 return -EIO;
2273
2274 default:
2275 log_error("Container %s failed due to unknown reason.", arg_machine);
2276 return -EIO;
2277 }
2278
2279 return r;
2280 }
2281
2282 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2283 pid_t pid;
2284
2285 pid = PTR_TO_PID(userdata);
2286 if (pid > 0) {
2287 if (kill(pid, arg_kill_signal) >= 0) {
2288 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2289 sd_event_source_set_userdata(s, NULL);
2290 return 0;
2291 }
2292 }
2293
2294 sd_event_exit(sd_event_source_get_event(s), 0);
2295 return 0;
2296 }
2297
2298 static int determine_names(void) {
2299 int r;
2300
2301 if (arg_template && !arg_directory && arg_machine) {
2302
2303 /* If --template= was specified then we should not
2304 * search for a machine, but instead create a new one
2305 * in /var/lib/machine. */
2306
2307 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2308 if (!arg_directory)
2309 return log_oom();
2310 }
2311
2312 if (!arg_image && !arg_directory) {
2313 if (arg_machine) {
2314 _cleanup_(image_unrefp) Image *i = NULL;
2315
2316 r = image_find(arg_machine, &i);
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2319 else if (r == 0) {
2320 log_error("No image for machine '%s': %m", arg_machine);
2321 return -ENOENT;
2322 }
2323
2324 if (i->type == IMAGE_RAW)
2325 r = free_and_strdup(&arg_image, i->path);
2326 else
2327 r = free_and_strdup(&arg_directory, i->path);
2328 if (r < 0)
2329 return log_error_errno(r, "Invalid image directory: %m");
2330
2331 if (!arg_ephemeral)
2332 arg_read_only = arg_read_only || i->read_only;
2333 } else
2334 arg_directory = get_current_dir_name();
2335
2336 if (!arg_directory && !arg_machine) {
2337 log_error("Failed to determine path, please use -D or -i.");
2338 return -EINVAL;
2339 }
2340 }
2341
2342 if (!arg_machine) {
2343 if (arg_directory && path_equal(arg_directory, "/"))
2344 arg_machine = gethostname_malloc();
2345 else
2346 arg_machine = strdup(basename(arg_image ?: arg_directory));
2347
2348 if (!arg_machine)
2349 return log_oom();
2350
2351 hostname_cleanup(arg_machine);
2352 if (!machine_name_is_valid(arg_machine)) {
2353 log_error("Failed to determine machine name automatically, please use -M.");
2354 return -EINVAL;
2355 }
2356
2357 if (arg_ephemeral) {
2358 char *b;
2359
2360 /* Add a random suffix when this is an
2361 * ephemeral machine, so that we can run many
2362 * instances at once without manually having
2363 * to specify -M each time. */
2364
2365 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2366 return log_oom();
2367
2368 free(arg_machine);
2369 arg_machine = b;
2370 }
2371 }
2372
2373 return 0;
2374 }
2375
2376 static int determine_uid_shift(const char *directory) {
2377 int r;
2378
2379 if (!arg_userns) {
2380 arg_uid_shift = 0;
2381 return 0;
2382 }
2383
2384 if (arg_uid_shift == UID_INVALID) {
2385 struct stat st;
2386
2387 r = stat(directory, &st);
2388 if (r < 0)
2389 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2390
2391 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2392
2393 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2394 log_error("UID and GID base of %s don't match.", directory);
2395 return -EINVAL;
2396 }
2397
2398 arg_uid_range = UINT32_C(0x10000);
2399 }
2400
2401 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2402 log_error("UID base too high for UID range.");
2403 return -EINVAL;
2404 }
2405
2406 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2407 return 0;
2408 }
2409
2410 static int inner_child(
2411 Barrier *barrier,
2412 const char *directory,
2413 bool secondary,
2414 int kmsg_socket,
2415 int rtnl_socket,
2416 FDSet *fds) {
2417
2418 _cleanup_free_ char *home = NULL;
2419 unsigned n_env = 1;
2420 const char *envp[] = {
2421 "PATH=" DEFAULT_PATH_SPLIT_USR,
2422 NULL, /* container */
2423 NULL, /* TERM */
2424 NULL, /* HOME */
2425 NULL, /* USER */
2426 NULL, /* LOGNAME */
2427 NULL, /* container_uuid */
2428 NULL, /* LISTEN_FDS */
2429 NULL, /* LISTEN_PID */
2430 NULL
2431 };
2432
2433 _cleanup_strv_free_ char **env_use = NULL;
2434 int r;
2435
2436 assert(barrier);
2437 assert(directory);
2438 assert(kmsg_socket >= 0);
2439
2440 cg_unified_flush();
2441
2442 if (arg_userns) {
2443 /* Tell the parent, that it now can write the UID map. */
2444 (void) barrier_place(barrier); /* #1 */
2445
2446 /* Wait until the parent wrote the UID map */
2447 if (!barrier_place_and_sync(barrier)) { /* #2 */
2448 log_error("Parent died too early");
2449 return -ESRCH;
2450 }
2451 }
2452
2453 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2454 if (r < 0)
2455 return r;
2456
2457 r = mount_sysfs(NULL);
2458 if (r < 0)
2459 return r;
2460
2461 /* Wait until we are cgroup-ified, so that we
2462 * can mount the right cgroup path writable */
2463 if (!barrier_place_and_sync(barrier)) { /* #3 */
2464 log_error("Parent died too early");
2465 return -ESRCH;
2466 }
2467
2468 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2469 if (r < 0)
2470 return r;
2471
2472 r = reset_uid_gid();
2473 if (r < 0)
2474 return log_error_errno(r, "Couldn't become new root: %m");
2475
2476 r = setup_boot_id(NULL);
2477 if (r < 0)
2478 return r;
2479
2480 r = setup_kmsg(NULL, kmsg_socket);
2481 if (r < 0)
2482 return r;
2483 kmsg_socket = safe_close(kmsg_socket);
2484
2485 umask(0022);
2486
2487 if (setsid() < 0)
2488 return log_error_errno(errno, "setsid() failed: %m");
2489
2490 if (arg_private_network)
2491 loopback_setup();
2492
2493 if (arg_expose_ports) {
2494 r = expose_port_send_rtnl(rtnl_socket);
2495 if (r < 0)
2496 return r;
2497 rtnl_socket = safe_close(rtnl_socket);
2498 }
2499
2500 r = drop_capabilities();
2501 if (r < 0)
2502 return log_error_errno(r, "drop_capabilities() failed: %m");
2503
2504 setup_hostname();
2505
2506 if (arg_personality != PERSONALITY_INVALID) {
2507 if (personality(arg_personality) < 0)
2508 return log_error_errno(errno, "personality() failed: %m");
2509 } else if (secondary) {
2510 if (personality(PER_LINUX32) < 0)
2511 return log_error_errno(errno, "personality() failed: %m");
2512 }
2513
2514 #ifdef HAVE_SELINUX
2515 if (arg_selinux_context)
2516 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2517 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2518 #endif
2519
2520 r = change_uid_gid(arg_user, &home);
2521 if (r < 0)
2522 return r;
2523
2524 /* LXC sets container=lxc, so follow the scheme here */
2525 envp[n_env++] = strjoina("container=", arg_container_service_name);
2526
2527 envp[n_env] = strv_find_prefix(environ, "TERM=");
2528 if (envp[n_env])
2529 n_env ++;
2530
2531 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2532 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2533 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2534 return log_oom();
2535
2536 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2537 char as_uuid[37];
2538
2539 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2540 return log_oom();
2541 }
2542
2543 if (fdset_size(fds) > 0) {
2544 r = fdset_cloexec(fds, false);
2545 if (r < 0)
2546 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2547
2548 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2549 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2550 return log_oom();
2551 }
2552
2553 env_use = strv_env_merge(2, envp, arg_setenv);
2554 if (!env_use)
2555 return log_oom();
2556
2557 /* Let the parent know that we are ready and
2558 * wait until the parent is ready with the
2559 * setup, too... */
2560 if (!barrier_place_and_sync(barrier)) { /* #4 */
2561 log_error("Parent died too early");
2562 return -ESRCH;
2563 }
2564
2565 /* Now, explicitly close the log, so that we
2566 * then can close all remaining fds. Closing
2567 * the log explicitly first has the benefit
2568 * that the logging subsystem knows about it,
2569 * and is thus ready to be reopened should we
2570 * need it again. Note that the other fds
2571 * closed here are at least the locking and
2572 * barrier fds. */
2573 log_close();
2574 (void) fdset_close_others(fds);
2575
2576 if (arg_boot) {
2577 char **a;
2578 size_t m;
2579
2580 /* Automatically search for the init system */
2581
2582 m = 1 + strv_length(arg_parameters);
2583 a = newa(char*, m + 1);
2584 if (strv_isempty(arg_parameters))
2585 a[1] = NULL;
2586 else
2587 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2588
2589 a[0] = (char*) "/usr/lib/systemd/systemd";
2590 execve(a[0], a, env_use);
2591
2592 a[0] = (char*) "/lib/systemd/systemd";
2593 execve(a[0], a, env_use);
2594
2595 a[0] = (char*) "/sbin/init";
2596 execve(a[0], a, env_use);
2597 } else if (!strv_isempty(arg_parameters))
2598 execvpe(arg_parameters[0], arg_parameters, env_use);
2599 else {
2600 chdir(home ?: "/root");
2601 execle("/bin/bash", "-bash", NULL, env_use);
2602 execle("/bin/sh", "-sh", NULL, env_use);
2603 }
2604
2605 r = -errno;
2606 (void) log_open();
2607 return log_error_errno(r, "execv() failed: %m");
2608 }
2609
2610 static int outer_child(
2611 Barrier *barrier,
2612 const char *directory,
2613 const char *console,
2614 const char *root_device, bool root_device_rw,
2615 const char *home_device, bool home_device_rw,
2616 const char *srv_device, bool srv_device_rw,
2617 bool interactive,
2618 bool secondary,
2619 int pid_socket,
2620 int kmsg_socket,
2621 int rtnl_socket,
2622 int uid_shift_socket,
2623 FDSet *fds) {
2624
2625 pid_t pid;
2626 ssize_t l;
2627 int r;
2628
2629 assert(barrier);
2630 assert(directory);
2631 assert(console);
2632 assert(pid_socket >= 0);
2633 assert(kmsg_socket >= 0);
2634
2635 cg_unified_flush();
2636
2637 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2638 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2639
2640 if (interactive) {
2641 close_nointr(STDIN_FILENO);
2642 close_nointr(STDOUT_FILENO);
2643 close_nointr(STDERR_FILENO);
2644
2645 r = open_terminal(console, O_RDWR);
2646 if (r != STDIN_FILENO) {
2647 if (r >= 0) {
2648 safe_close(r);
2649 r = -EINVAL;
2650 }
2651
2652 return log_error_errno(r, "Failed to open console: %m");
2653 }
2654
2655 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2656 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2657 return log_error_errno(errno, "Failed to duplicate console: %m");
2658 }
2659
2660 r = reset_audit_loginuid();
2661 if (r < 0)
2662 return r;
2663
2664 /* Mark everything as slave, so that we still
2665 * receive mounts from the real root, but don't
2666 * propagate mounts to the real root. */
2667 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2668 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2669
2670 r = mount_devices(directory,
2671 root_device, root_device_rw,
2672 home_device, home_device_rw,
2673 srv_device, srv_device_rw);
2674 if (r < 0)
2675 return r;
2676
2677 r = determine_uid_shift(directory);
2678 if (r < 0)
2679 return r;
2680
2681 if (arg_userns) {
2682 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2683 if (l < 0)
2684 return log_error_errno(errno, "Failed to send UID shift: %m");
2685 if (l != sizeof(arg_uid_shift)) {
2686 log_error("Short write while sending UID shift.");
2687 return -EIO;
2688 }
2689 }
2690
2691 /* Turn directory into bind mount */
2692 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2693 return log_error_errno(errno, "Failed to make bind mount: %m");
2694
2695 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2696 if (r < 0)
2697 return r;
2698
2699 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2700 if (r < 0)
2701 return r;
2702
2703 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2704 if (r < 0)
2705 return r;
2706
2707 if (arg_read_only) {
2708 r = bind_remount_recursive(directory, true);
2709 if (r < 0)
2710 return log_error_errno(r, "Failed to make tree read-only: %m");
2711 }
2712
2713 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2714 if (r < 0)
2715 return r;
2716
2717 r = copy_devnodes(directory);
2718 if (r < 0)
2719 return r;
2720
2721 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2722
2723 r = setup_pts(directory);
2724 if (r < 0)
2725 return r;
2726
2727 r = setup_propagate(directory);
2728 if (r < 0)
2729 return r;
2730
2731 r = setup_dev_console(directory, console);
2732 if (r < 0)
2733 return r;
2734
2735 r = setup_seccomp();
2736 if (r < 0)
2737 return r;
2738
2739 r = setup_timezone(directory);
2740 if (r < 0)
2741 return r;
2742
2743 r = setup_resolv_conf(directory);
2744 if (r < 0)
2745 return r;
2746
2747 r = setup_journal(directory);
2748 if (r < 0)
2749 return r;
2750
2751 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2752 if (r < 0)
2753 return r;
2754
2755 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2756 if (r < 0)
2757 return r;
2758
2759 r = mount_move_root(directory);
2760 if (r < 0)
2761 return log_error_errno(r, "Failed to move root directory: %m");
2762
2763 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2764 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2765 (arg_private_network ? CLONE_NEWNET : 0) |
2766 (arg_userns ? CLONE_NEWUSER : 0),
2767 NULL);
2768 if (pid < 0)
2769 return log_error_errno(errno, "Failed to fork inner child: %m");
2770 if (pid == 0) {
2771 pid_socket = safe_close(pid_socket);
2772 uid_shift_socket = safe_close(uid_shift_socket);
2773
2774 /* The inner child has all namespaces that are
2775 * requested, so that we all are owned by the user if
2776 * user namespaces are turned on. */
2777
2778 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2779 if (r < 0)
2780 _exit(EXIT_FAILURE);
2781
2782 _exit(EXIT_SUCCESS);
2783 }
2784
2785 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2786 if (l < 0)
2787 return log_error_errno(errno, "Failed to send PID: %m");
2788 if (l != sizeof(pid)) {
2789 log_error("Short write while sending PID.");
2790 return -EIO;
2791 }
2792
2793 pid_socket = safe_close(pid_socket);
2794 kmsg_socket = safe_close(kmsg_socket);
2795 rtnl_socket = safe_close(rtnl_socket);
2796
2797 return 0;
2798 }
2799
2800 static int setup_uid_map(pid_t pid) {
2801 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2802 int r;
2803
2804 assert(pid > 1);
2805
2806 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2807 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2808 r = write_string_file(uid_map, line, 0);
2809 if (r < 0)
2810 return log_error_errno(r, "Failed to write UID map: %m");
2811
2812 /* We always assign the same UID and GID ranges */
2813 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2814 r = write_string_file(uid_map, line, 0);
2815 if (r < 0)
2816 return log_error_errno(r, "Failed to write GID map: %m");
2817
2818 return 0;
2819 }
2820
2821 static int load_settings(void) {
2822 _cleanup_(settings_freep) Settings *settings = NULL;
2823 _cleanup_fclose_ FILE *f = NULL;
2824 _cleanup_free_ char *p = NULL;
2825 const char *fn, *i;
2826 int r;
2827
2828 /* If all settings are masked, there's no point in looking for
2829 * the settings file */
2830 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2831 return 0;
2832
2833 fn = strjoina(arg_machine, ".nspawn");
2834
2835 /* We first look in the admin's directories in /etc and /run */
2836 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2837 _cleanup_free_ char *j = NULL;
2838
2839 j = strjoin(i, "/", fn, NULL);
2840 if (!j)
2841 return log_oom();
2842
2843 f = fopen(j, "re");
2844 if (f) {
2845 p = j;
2846 j = NULL;
2847
2848 /* By default, we trust configuration from /etc and /run */
2849 if (arg_settings_trusted < 0)
2850 arg_settings_trusted = true;
2851
2852 break;
2853 }
2854
2855 if (errno != ENOENT)
2856 return log_error_errno(errno, "Failed to open %s: %m", j);
2857 }
2858
2859 if (!f) {
2860 /* After that, let's look for a file next to the
2861 * actual image we shall boot. */
2862
2863 if (arg_image) {
2864 p = file_in_same_dir(arg_image, fn);
2865 if (!p)
2866 return log_oom();
2867 } else if (arg_directory) {
2868 p = file_in_same_dir(arg_directory, fn);
2869 if (!p)
2870 return log_oom();
2871 }
2872
2873 if (p) {
2874 f = fopen(p, "re");
2875 if (!f && errno != ENOENT)
2876 return log_error_errno(errno, "Failed to open %s: %m", p);
2877
2878 /* By default, we do not trust configuration from /var/lib/machines */
2879 if (arg_settings_trusted < 0)
2880 arg_settings_trusted = false;
2881 }
2882 }
2883
2884 if (!f)
2885 return 0;
2886
2887 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2888
2889 r = settings_load(f, p, &settings);
2890 if (r < 0)
2891 return r;
2892
2893 /* Copy over bits from the settings, unless they have been
2894 * explicitly masked by command line switches. */
2895
2896 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2897 settings->boot >= 0) {
2898 arg_boot = settings->boot;
2899
2900 strv_free(arg_parameters);
2901 arg_parameters = settings->parameters;
2902 settings->parameters = NULL;
2903 }
2904
2905 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2906 settings->environment) {
2907 strv_free(arg_setenv);
2908 arg_setenv = settings->environment;
2909 settings->environment = NULL;
2910 }
2911
2912 if ((arg_settings_mask & SETTING_USER) == 0 &&
2913 settings->user) {
2914 free(arg_user);
2915 arg_user = settings->user;
2916 settings->user = NULL;
2917 }
2918
2919 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2920 uint64_t plus;
2921
2922 plus = settings->capability;
2923 if (settings_private_network(settings))
2924 plus |= (1ULL << CAP_NET_ADMIN);
2925
2926 if (!arg_settings_trusted && plus != 0) {
2927 if (settings->capability != 0)
2928 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2929 } else
2930 arg_retain |= plus;
2931
2932 arg_retain &= ~settings->drop_capability;
2933 }
2934
2935 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2936 settings->kill_signal > 0)
2937 arg_kill_signal = settings->kill_signal;
2938
2939 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2940 settings->personality != PERSONALITY_INVALID)
2941 arg_personality = settings->personality;
2942
2943 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2944 !sd_id128_is_null(settings->machine_id)) {
2945
2946 if (!arg_settings_trusted)
2947 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2948 else
2949 arg_uuid = settings->machine_id;
2950 }
2951
2952 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2953 settings->read_only >= 0)
2954 arg_read_only = settings->read_only;
2955
2956 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2957 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2958 arg_volatile_mode = settings->volatile_mode;
2959
2960 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2961 settings->n_custom_mounts > 0) {
2962
2963 if (!arg_settings_trusted)
2964 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2965 else {
2966 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2967 arg_custom_mounts = settings->custom_mounts;
2968 arg_n_custom_mounts = settings->n_custom_mounts;
2969
2970 settings->custom_mounts = NULL;
2971 settings->n_custom_mounts = 0;
2972 }
2973 }
2974
2975 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2976 (settings->private_network >= 0 ||
2977 settings->network_veth >= 0 ||
2978 settings->network_bridge ||
2979 settings->network_interfaces ||
2980 settings->network_macvlan ||
2981 settings->network_ipvlan ||
2982 settings->network_veth_extra)) {
2983
2984 if (!arg_settings_trusted)
2985 log_warning("Ignoring network settings, file %s is not trusted.", p);
2986 else {
2987 arg_network_veth = settings_network_veth(settings);
2988 arg_private_network = settings_private_network(settings);
2989
2990 strv_free(arg_network_interfaces);
2991 arg_network_interfaces = settings->network_interfaces;
2992 settings->network_interfaces = NULL;
2993
2994 strv_free(arg_network_macvlan);
2995 arg_network_macvlan = settings->network_macvlan;
2996 settings->network_macvlan = NULL;
2997
2998 strv_free(arg_network_ipvlan);
2999 arg_network_ipvlan = settings->network_ipvlan;
3000 settings->network_ipvlan = NULL;
3001
3002 strv_free(arg_network_veth_extra);
3003 arg_network_veth_extra = settings->network_veth_extra;
3004 settings->network_veth_extra = NULL;
3005
3006 free(arg_network_bridge);
3007 arg_network_bridge = settings->network_bridge;
3008 settings->network_bridge = NULL;
3009 }
3010 }
3011
3012 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3013 settings->expose_ports) {
3014
3015 if (!arg_settings_trusted)
3016 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3017 else {
3018 expose_port_free_all(arg_expose_ports);
3019 arg_expose_ports = settings->expose_ports;
3020 settings->expose_ports = NULL;
3021 }
3022 }
3023
3024 return 0;
3025 }
3026
3027 int main(int argc, char *argv[]) {
3028
3029 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3030 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3031 _cleanup_close_ int master = -1, image_fd = -1;
3032 _cleanup_fdset_free_ FDSet *fds = NULL;
3033 int r, n_fd_passed, loop_nr = -1;
3034 char veth_name[IFNAMSIZ];
3035 bool secondary = false, remove_subvol = false;
3036 sigset_t mask_chld;
3037 pid_t pid = 0;
3038 int ret = EXIT_SUCCESS;
3039 union in_addr_union exposed = {};
3040 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3041 bool interactive;
3042
3043 log_parse_environment();
3044 log_open();
3045
3046 r = parse_argv(argc, argv);
3047 if (r <= 0)
3048 goto finish;
3049
3050 if (geteuid() != 0) {
3051 log_error("Need to be root.");
3052 r = -EPERM;
3053 goto finish;
3054 }
3055 r = determine_names();
3056 if (r < 0)
3057 goto finish;
3058
3059 r = load_settings();
3060 if (r < 0)
3061 goto finish;
3062
3063 r = verify_arguments();
3064 if (r < 0)
3065 goto finish;
3066
3067 n_fd_passed = sd_listen_fds(false);
3068 if (n_fd_passed > 0) {
3069 r = fdset_new_listen_fds(&fds, false);
3070 if (r < 0) {
3071 log_error_errno(r, "Failed to collect file descriptors: %m");
3072 goto finish;
3073 }
3074 }
3075
3076 if (arg_directory) {
3077 assert(!arg_image);
3078
3079 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3080 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3081 r = -EINVAL;
3082 goto finish;
3083 }
3084
3085 if (arg_ephemeral) {
3086 _cleanup_free_ char *np = NULL;
3087
3088 /* If the specified path is a mount point we
3089 * generate the new snapshot immediately
3090 * inside it under a random name. However if
3091 * the specified is not a mount point we
3092 * create the new snapshot in the parent
3093 * directory, just next to it. */
3094 r = path_is_mount_point(arg_directory, 0);
3095 if (r < 0) {
3096 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3097 goto finish;
3098 }
3099 if (r > 0)
3100 r = tempfn_random_child(arg_directory, "machine.", &np);
3101 else
3102 r = tempfn_random(arg_directory, "machine.", &np);
3103 if (r < 0) {
3104 log_error_errno(r, "Failed to generate name for snapshot: %m");
3105 goto finish;
3106 }
3107
3108 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3109 if (r < 0) {
3110 log_error_errno(r, "Failed to lock %s: %m", np);
3111 goto finish;
3112 }
3113
3114 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3115 if (r < 0) {
3116 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3117 goto finish;
3118 }
3119
3120 free(arg_directory);
3121 arg_directory = np;
3122 np = NULL;
3123
3124 remove_subvol = true;
3125
3126 } else {
3127 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3128 if (r == -EBUSY) {
3129 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3130 goto finish;
3131 }
3132 if (r < 0) {
3133 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3134 return r;
3135 }
3136
3137 if (arg_template) {
3138 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3139 if (r == -EEXIST) {
3140 if (!arg_quiet)
3141 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3142 } else if (r < 0) {
3143 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3144 goto finish;
3145 } else {
3146 if (!arg_quiet)
3147 log_info("Populated %s from template %s.", arg_directory, arg_template);
3148 }
3149 }
3150 }
3151
3152 if (arg_boot) {
3153 if (path_is_os_tree(arg_directory) <= 0) {
3154 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3155 r = -EINVAL;
3156 goto finish;
3157 }
3158 } else {
3159 const char *p;
3160
3161 p = strjoina(arg_directory, "/usr/");
3162 if (laccess(p, F_OK) < 0) {
3163 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3164 r = -EINVAL;
3165 goto finish;
3166 }
3167 }
3168
3169 } else {
3170 char template[] = "/tmp/nspawn-root-XXXXXX";
3171
3172 assert(arg_image);
3173 assert(!arg_template);
3174
3175 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3176 if (r == -EBUSY) {
3177 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3178 goto finish;
3179 }
3180 if (r < 0) {
3181 r = log_error_errno(r, "Failed to create image lock: %m");
3182 goto finish;
3183 }
3184
3185 if (!mkdtemp(template)) {
3186 log_error_errno(errno, "Failed to create temporary directory: %m");
3187 r = -errno;
3188 goto finish;
3189 }
3190
3191 arg_directory = strdup(template);
3192 if (!arg_directory) {
3193 r = log_oom();
3194 goto finish;
3195 }
3196
3197 image_fd = setup_image(&device_path, &loop_nr);
3198 if (image_fd < 0) {
3199 r = image_fd;
3200 goto finish;
3201 }
3202
3203 r = dissect_image(image_fd,
3204 &root_device, &root_device_rw,
3205 &home_device, &home_device_rw,
3206 &srv_device, &srv_device_rw,
3207 &secondary);
3208 if (r < 0)
3209 goto finish;
3210 }
3211
3212 r = custom_mounts_prepare();
3213 if (r < 0)
3214 goto finish;
3215
3216 interactive =
3217 isatty(STDIN_FILENO) > 0 &&
3218 isatty(STDOUT_FILENO) > 0;
3219
3220 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3221 if (master < 0) {
3222 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3223 goto finish;
3224 }
3225
3226 r = ptsname_malloc(master, &console);
3227 if (r < 0) {
3228 r = log_error_errno(r, "Failed to determine tty name: %m");
3229 goto finish;
3230 }
3231
3232 if (unlockpt(master) < 0) {
3233 r = log_error_errno(errno, "Failed to unlock tty: %m");
3234 goto finish;
3235 }
3236
3237 if (!arg_quiet)
3238 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3239 arg_machine, arg_image ?: arg_directory);
3240
3241 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3242
3243 assert_se(sigemptyset(&mask_chld) == 0);
3244 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3245
3246 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3247 r = log_error_errno(errno, "Failed to become subreaper: %m");
3248 goto finish;
3249 }
3250
3251 for (;;) {
3252 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
3253 ContainerStatus container_status;
3254 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3255 static const struct sigaction sa = {
3256 .sa_handler = nop_signal_handler,
3257 .sa_flags = SA_NOCLDSTOP,
3258 };
3259 int ifi = 0;
3260 ssize_t l;
3261 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3262 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3263 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3264 char last_char = 0;
3265
3266 r = barrier_create(&barrier);
3267 if (r < 0) {
3268 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3269 goto finish;
3270 }
3271
3272 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3273 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3274 goto finish;
3275 }
3276
3277 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3278 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3279 goto finish;
3280 }
3281
3282 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3283 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3284 goto finish;
3285 }
3286
3287 if (arg_userns)
3288 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3289 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3290 goto finish;
3291 }
3292
3293 /* Child can be killed before execv(), so handle SIGCHLD
3294 * in order to interrupt parent's blocking calls and
3295 * give it a chance to call wait() and terminate. */
3296 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3297 if (r < 0) {
3298 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3299 goto finish;
3300 }
3301
3302 r = sigaction(SIGCHLD, &sa, NULL);
3303 if (r < 0) {
3304 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3305 goto finish;
3306 }
3307
3308 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3309 if (pid < 0) {
3310 if (errno == EINVAL)
3311 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3312 else
3313 r = log_error_errno(errno, "clone() failed: %m");
3314
3315 goto finish;
3316 }
3317
3318 if (pid == 0) {
3319 /* The outer child only has a file system namespace. */
3320 barrier_set_role(&barrier, BARRIER_CHILD);
3321
3322 master = safe_close(master);
3323
3324 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3325 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3326 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3327 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3328
3329 (void) reset_all_signal_handlers();
3330 (void) reset_signal_mask();
3331
3332 r = outer_child(&barrier,
3333 arg_directory,
3334 console,
3335 root_device, root_device_rw,
3336 home_device, home_device_rw,
3337 srv_device, srv_device_rw,
3338 interactive,
3339 secondary,
3340 pid_socket_pair[1],
3341 kmsg_socket_pair[1],
3342 rtnl_socket_pair[1],
3343 uid_shift_socket_pair[1],
3344 fds);
3345 if (r < 0)
3346 _exit(EXIT_FAILURE);
3347
3348 _exit(EXIT_SUCCESS);
3349 }
3350
3351 barrier_set_role(&barrier, BARRIER_PARENT);
3352
3353 fds = fdset_free(fds);
3354
3355 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3356 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3357 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3358 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3359
3360 /* Wait for the outer child. */
3361 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3362 if (r < 0)
3363 goto finish;
3364 if (r != 0) {
3365 r = -EIO;
3366 goto finish;
3367 }
3368 pid = 0;
3369
3370 /* And now retrieve the PID of the inner child. */
3371 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3372 if (l < 0) {
3373 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3374 goto finish;
3375 }
3376 if (l != sizeof(pid)) {
3377 log_error("Short read while reading inner child PID.");
3378 r = EIO;
3379 goto finish;
3380 }
3381
3382 log_debug("Init process invoked as PID " PID_FMT, pid);
3383
3384 if (arg_userns) {
3385 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3386 log_error("Child died too early.");
3387 r = -ESRCH;
3388 goto finish;
3389 }
3390
3391 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3392 if (l < 0) {
3393 r = log_error_errno(errno, "Failed to read UID shift: %m");
3394 goto finish;
3395 }
3396 if (l != sizeof(arg_uid_shift)) {
3397 log_error("Short read while reading UID shift.");
3398 r = EIO;
3399 goto finish;
3400 }
3401
3402 r = setup_uid_map(pid);
3403 if (r < 0)
3404 goto finish;
3405
3406 (void) barrier_place(&barrier); /* #2 */
3407 }
3408
3409 if (arg_private_network) {
3410
3411 r = move_network_interfaces(pid, arg_network_interfaces);
3412 if (r < 0)
3413 goto finish;
3414
3415 if (arg_network_veth) {
3416 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3417 if (r < 0)
3418 goto finish;
3419 else if (r > 0)
3420 ifi = r;
3421
3422 if (arg_network_bridge) {
3423 r = setup_bridge(veth_name, arg_network_bridge);
3424 if (r < 0)
3425 goto finish;
3426 if (r > 0)
3427 ifi = r;
3428 }
3429 }
3430
3431 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3432 if (r < 0)
3433 goto finish;
3434
3435 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3436 if (r < 0)
3437 goto finish;
3438
3439 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3440 if (r < 0)
3441 goto finish;
3442 }
3443
3444 if (arg_register) {
3445 r = register_machine(
3446 arg_machine,
3447 pid,
3448 arg_directory,
3449 arg_uuid,
3450 ifi,
3451 arg_slice,
3452 arg_custom_mounts, arg_n_custom_mounts,
3453 arg_kill_signal,
3454 arg_property,
3455 arg_keep_unit,
3456 arg_container_service_name);
3457 if (r < 0)
3458 goto finish;
3459 }
3460
3461 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3462 if (r < 0)
3463 goto finish;
3464
3465 if (arg_keep_unit) {
3466 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3467 if (r < 0)
3468 goto finish;
3469 }
3470
3471 r = chown_cgroup(pid, arg_uid_shift);
3472 if (r < 0)
3473 goto finish;
3474
3475 /* Notify the child that the parent is ready with all
3476 * its setup (including cgroup-ification), and that
3477 * the child can now hand over control to the code to
3478 * run inside the container. */
3479 (void) barrier_place(&barrier); /* #3 */
3480
3481 /* Block SIGCHLD here, before notifying child.
3482 * process_pty() will handle it with the other signals. */
3483 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3484
3485 /* Reset signal to default */
3486 r = default_signals(SIGCHLD, -1);
3487 if (r < 0) {
3488 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3489 goto finish;
3490 }
3491
3492 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3493 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3494 log_error("Child died too early.");
3495 r = -ESRCH;
3496 goto finish;
3497 }
3498
3499 sd_notifyf(false,
3500 "READY=1\n"
3501 "STATUS=Container running.\n"
3502 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3503
3504 r = sd_event_new(&event);
3505 if (r < 0) {
3506 log_error_errno(r, "Failed to get default event source: %m");
3507 goto finish;
3508 }
3509
3510 if (arg_kill_signal > 0) {
3511 /* Try to kill the init system on SIGINT or SIGTERM */
3512 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3513 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
3514 } else {
3515 /* Immediately exit */
3516 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3517 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3518 }
3519
3520 /* simply exit on sigchld */
3521 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3522
3523 if (arg_expose_ports) {
3524 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3525 if (r < 0)
3526 goto finish;
3527
3528 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3529 }
3530
3531 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3532
3533 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3534 if (r < 0) {
3535 log_error_errno(r, "Failed to create PTY forwarder: %m");
3536 goto finish;
3537 }
3538
3539 r = sd_event_loop(event);
3540 if (r < 0) {
3541 log_error_errno(r, "Failed to run event loop: %m");
3542 goto finish;
3543 }
3544
3545 pty_forward_get_last_char(forward, &last_char);
3546
3547 forward = pty_forward_free(forward);
3548
3549 if (!arg_quiet && last_char != '\n')
3550 putc('\n', stdout);
3551
3552 /* Kill if it is not dead yet anyway */
3553 if (arg_register && !arg_keep_unit)
3554 terminate_machine(pid);
3555
3556 /* Normally redundant, but better safe than sorry */
3557 kill(pid, SIGKILL);
3558
3559 r = wait_for_container(pid, &container_status);
3560 pid = 0;
3561
3562 if (r < 0)
3563 /* We failed to wait for the container, or the
3564 * container exited abnormally */
3565 goto finish;
3566 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3567 /* The container exited with a non-zero
3568 * status, or with zero status and no reboot
3569 * was requested. */
3570 ret = r;
3571 break;
3572 }
3573
3574 /* CONTAINER_REBOOTED, loop again */
3575
3576 if (arg_keep_unit) {
3577 /* Special handling if we are running as a
3578 * service: instead of simply restarting the
3579 * machine we want to restart the entire
3580 * service, so let's inform systemd about this
3581 * with the special exit code 133. The service
3582 * file uses RestartForceExitStatus=133 so
3583 * that this results in a full nspawn
3584 * restart. This is necessary since we might
3585 * have cgroup parameters set we want to have
3586 * flushed out. */
3587 ret = 133;
3588 r = 0;
3589 break;
3590 }
3591
3592 expose_port_flush(arg_expose_ports, &exposed);
3593 }
3594
3595 finish:
3596 sd_notify(false,
3597 "STOPPING=1\n"
3598 "STATUS=Terminating...");
3599
3600 if (pid > 0)
3601 kill(pid, SIGKILL);
3602
3603 /* Try to flush whatever is still queued in the pty */
3604 if (master >= 0)
3605 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3606
3607 loop_remove(loop_nr, &image_fd);
3608
3609 if (remove_subvol && arg_directory) {
3610 int k;
3611
3612 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3613 if (k < 0)
3614 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3615 }
3616
3617 if (arg_machine) {
3618 const char *p;
3619
3620 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3621 (void) rm_rf(p, REMOVE_ROOT);
3622 }
3623
3624 expose_port_flush(arg_expose_ports, &exposed);
3625
3626 free(arg_directory);
3627 free(arg_template);
3628 free(arg_image);
3629 free(arg_machine);
3630 free(arg_user);
3631 strv_free(arg_setenv);
3632 free(arg_network_bridge);
3633 strv_free(arg_network_interfaces);
3634 strv_free(arg_network_macvlan);
3635 strv_free(arg_network_ipvlan);
3636 strv_free(arg_network_veth_extra);
3637 strv_free(arg_parameters);
3638 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3639 expose_port_free_all(arg_expose_ports);
3640
3641 return r < 0 ? EXIT_FAILURE : ret;
3642 }