]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
util: unify implementation of NOP signal handler
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "build.h"
54 #include "cap-list.h"
55 #include "capability.h"
56 #include "cgroup-util.h"
57 #include "copy.h"
58 #include "dev-setup.h"
59 #include "env-util.h"
60 #include "event-util.h"
61 #include "fdset.h"
62 #include "fileio.h"
63 #include "formats-util.h"
64 #include "gpt.h"
65 #include "hostname-util.h"
66 #include "log.h"
67 #include "loopback-setup.h"
68 #include "machine-image.h"
69 #include "macro.h"
70 #include "missing.h"
71 #include "mkdir.h"
72 #include "netlink-util.h"
73 #include "path-util.h"
74 #include "process-util.h"
75 #include "ptyfwd.h"
76 #include "random-util.h"
77 #include "rm-rf.h"
78 #ifdef HAVE_SECCOMP
79 #include "seccomp-util.h"
80 #endif
81 #include "signal-util.h"
82 #include "strv.h"
83 #include "terminal-util.h"
84 #include "udev-util.h"
85 #include "util.h"
86
87 #include "nspawn-settings.h"
88 #include "nspawn-mount.h"
89 #include "nspawn-network.h"
90 #include "nspawn-expose-ports.h"
91 #include "nspawn-cgroup.h"
92 #include "nspawn-register.h"
93 #include "nspawn-setuid.h"
94
/* The two container end states distinguished by this file: the container
 * terminated, or it asked to be rebooted. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
99
/* Journal linking modes selectable through --link-journal= (and -j). The
 * "try-" variants of the option are expressed through a separate flag, not
 * through additional enumerators. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
106
107 static char *arg_directory = NULL;
108 static char *arg_template = NULL;
109 static char *arg_user = NULL;
110 static sd_id128_t arg_uuid = {};
111 static char *arg_machine = NULL;
112 static const char *arg_selinux_context = NULL;
113 static const char *arg_selinux_apifs_context = NULL;
114 static const char *arg_slice = NULL;
115 static bool arg_private_network = false;
116 static bool arg_read_only = false;
117 static bool arg_boot = false;
118 static bool arg_ephemeral = false;
119 static LinkJournal arg_link_journal = LINK_AUTO;
120 static bool arg_link_journal_try = false;
121 static uint64_t arg_retain =
122 (1ULL << CAP_CHOWN) |
123 (1ULL << CAP_DAC_OVERRIDE) |
124 (1ULL << CAP_DAC_READ_SEARCH) |
125 (1ULL << CAP_FOWNER) |
126 (1ULL << CAP_FSETID) |
127 (1ULL << CAP_IPC_OWNER) |
128 (1ULL << CAP_KILL) |
129 (1ULL << CAP_LEASE) |
130 (1ULL << CAP_LINUX_IMMUTABLE) |
131 (1ULL << CAP_NET_BIND_SERVICE) |
132 (1ULL << CAP_NET_BROADCAST) |
133 (1ULL << CAP_NET_RAW) |
134 (1ULL << CAP_SETGID) |
135 (1ULL << CAP_SETFCAP) |
136 (1ULL << CAP_SETPCAP) |
137 (1ULL << CAP_SETUID) |
138 (1ULL << CAP_SYS_ADMIN) |
139 (1ULL << CAP_SYS_CHROOT) |
140 (1ULL << CAP_SYS_NICE) |
141 (1ULL << CAP_SYS_PTRACE) |
142 (1ULL << CAP_SYS_TTY_CONFIG) |
143 (1ULL << CAP_SYS_RESOURCE) |
144 (1ULL << CAP_SYS_BOOT) |
145 (1ULL << CAP_AUDIT_WRITE) |
146 (1ULL << CAP_AUDIT_CONTROL) |
147 (1ULL << CAP_MKNOD);
148 static CustomMount *arg_custom_mounts = NULL;
149 static unsigned arg_n_custom_mounts = 0;
150 static char **arg_setenv = NULL;
151 static bool arg_quiet = false;
152 static bool arg_share_system = false;
153 static bool arg_register = true;
154 static bool arg_keep_unit = false;
155 static char **arg_network_interfaces = NULL;
156 static char **arg_network_macvlan = NULL;
157 static char **arg_network_ipvlan = NULL;
158 static bool arg_network_veth = false;
159 static char *arg_network_bridge = NULL;
160 static unsigned long arg_personality = PERSONALITY_INVALID;
161 static char *arg_image = NULL;
162 static VolatileMode arg_volatile_mode = VOLATILE_NO;
163 static ExposePort *arg_expose_ports = NULL;
164 static char **arg_property = NULL;
165 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
166 static bool arg_userns = false;
167 static int arg_kill_signal = 0;
168 static bool arg_unified_cgroup_hierarchy = false;
169 static SettingsMask arg_settings_mask = 0;
170 static int arg_settings_trusted = -1;
171 static char **arg_parameters = NULL;
172
173 static void help(void) {
174 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
175 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
176 " -h --help Show this help\n"
177 " --version Print version string\n"
178 " -q --quiet Do not show status information\n"
179 " -D --directory=PATH Root directory for the container\n"
180 " --template=PATH Initialize root directory from template directory,\n"
181 " if missing\n"
182 " -x --ephemeral Run container with snapshot of root directory, and\n"
183 " remove it after exit\n"
184 " -i --image=PATH File system device or disk image for the container\n"
185 " -b --boot Boot up full system (i.e. invoke init)\n"
186 " -u --user=USER Run the command under specified user or uid\n"
187 " -M --machine=NAME Set the machine name for the container\n"
188 " --uuid=UUID Set a specific machine UUID for the container\n"
189 " -S --slice=SLICE Place the container in the specified slice\n"
190 " --property=NAME=VALUE Set scope unit property\n"
191 " --private-users[=UIDBASE[:NUIDS]]\n"
192 " Run within user namespace\n"
193 " --private-network Disable network in container\n"
194 " --network-interface=INTERFACE\n"
195 " Assign an existing network interface to the\n"
196 " container\n"
197 " --network-macvlan=INTERFACE\n"
198 " Create a macvlan network interface based on an\n"
199 " existing network interface to the container\n"
200 " --network-ipvlan=INTERFACE\n"
201 " Create a ipvlan network interface based on an\n"
202 " existing network interface to the container\n"
203 " -n --network-veth Add a virtual ethernet connection between host\n"
204 " and container\n"
205 " --network-bridge=INTERFACE\n"
206 " Add a virtual ethernet connection between host\n"
207 " and container and add it to an existing bridge on\n"
208 " the host\n"
209 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
210 " Expose a container IP port on the host\n"
211 " -Z --selinux-context=SECLABEL\n"
212 " Set the SELinux security context to be used by\n"
213 " processes in the container\n"
214 " -L --selinux-apifs-context=SECLABEL\n"
215 " Set the SELinux security context to be used by\n"
216 " API/tmpfs file systems in the container\n"
217 " --capability=CAP In addition to the default, retain specified\n"
218 " capability\n"
219 " --drop-capability=CAP Drop the specified capability from the default set\n"
220 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
221 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
222 " try-guest, try-host\n"
223 " -j Equivalent to --link-journal=try-guest\n"
224 " --read-only Mount the root directory read-only\n"
225 " --bind=PATH[:PATH[:OPTIONS]]\n"
226 " Bind mount a file or directory from the host into\n"
227 " the container\n"
228 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
229 " Similar, but creates a read-only bind mount\n"
230 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
231 " --overlay=PATH[:PATH...]:PATH\n"
232 " Create an overlay mount from the host to \n"
233 " the container\n"
234 " --overlay-ro=PATH[:PATH...]:PATH\n"
235 " Similar, but creates a read-only overlay mount\n"
236 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
237 " --share-system Share system namespaces with host\n"
238 " --register=BOOLEAN Register container as machine\n"
239 " --keep-unit Do not register a scope for the machine, reuse\n"
240 " the service unit nspawn is running in\n"
241 " --volatile[=MODE] Run the system in volatile mode\n"
242 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
243 , program_invocation_short_name);
244 }
245
246
247 static int custom_mounts_prepare(void) {
248 unsigned i;
249 int r;
250
251 /* Ensure the mounts are applied prefix first. */
252 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
253
254 /* Allocate working directories for the overlay file systems that need it */
255 for (i = 0; i < arg_n_custom_mounts; i++) {
256 CustomMount *m = &arg_custom_mounts[i];
257
258 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
259 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
260 return -EINVAL;
261 }
262
263 if (m->type != CUSTOM_MOUNT_OVERLAY)
264 continue;
265
266 if (m->work_dir)
267 continue;
268
269 if (m->read_only)
270 continue;
271
272 r = tempfn_random(m->source, NULL, &m->work_dir);
273 if (r < 0)
274 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
275 }
276
277 return 0;
278 }
279
/* Replaces *b with a cleaned-up version of 'path'.
 *
 * The path is canonicalized if it exists. If it does not exist (ENOENT) it is
 * merely made absolute relative to the current working directory. The previous
 * string in *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for a reason other
 * than ENOENT, or -ENOMEM if the absolute path could not be allocated. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Path does not exist (yet), settle for an absolute one */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
300
301 static int detect_unified_cgroup_hierarchy(void) {
302 const char *e;
303 int r;
304
305 /* Allow the user to control whether the unified hierarchy is used */
306 e = getenv("UNIFIED_CGROUP_HIERARCHY");
307 if (e) {
308 r = parse_boolean(e);
309 if (r < 0)
310 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
311
312 arg_unified_cgroup_hierarchy = r;
313 return 0;
314 }
315
316 /* Otherwise inherit the default from the host system */
317 r = cg_unified();
318 if (r < 0)
319 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
320
321 arg_unified_cgroup_hierarchy = r;
322 return 0;
323 }
324
325 static int parse_argv(int argc, char *argv[]) {
326
327 enum {
328 ARG_VERSION = 0x100,
329 ARG_PRIVATE_NETWORK,
330 ARG_UUID,
331 ARG_READ_ONLY,
332 ARG_CAPABILITY,
333 ARG_DROP_CAPABILITY,
334 ARG_LINK_JOURNAL,
335 ARG_BIND,
336 ARG_BIND_RO,
337 ARG_TMPFS,
338 ARG_OVERLAY,
339 ARG_OVERLAY_RO,
340 ARG_SETENV,
341 ARG_SHARE_SYSTEM,
342 ARG_REGISTER,
343 ARG_KEEP_UNIT,
344 ARG_NETWORK_INTERFACE,
345 ARG_NETWORK_MACVLAN,
346 ARG_NETWORK_IPVLAN,
347 ARG_NETWORK_BRIDGE,
348 ARG_PERSONALITY,
349 ARG_VOLATILE,
350 ARG_TEMPLATE,
351 ARG_PROPERTY,
352 ARG_PRIVATE_USERS,
353 ARG_KILL_SIGNAL,
354 ARG_SETTINGS,
355 };
356
357 static const struct option options[] = {
358 { "help", no_argument, NULL, 'h' },
359 { "version", no_argument, NULL, ARG_VERSION },
360 { "directory", required_argument, NULL, 'D' },
361 { "template", required_argument, NULL, ARG_TEMPLATE },
362 { "ephemeral", no_argument, NULL, 'x' },
363 { "user", required_argument, NULL, 'u' },
364 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
365 { "boot", no_argument, NULL, 'b' },
366 { "uuid", required_argument, NULL, ARG_UUID },
367 { "read-only", no_argument, NULL, ARG_READ_ONLY },
368 { "capability", required_argument, NULL, ARG_CAPABILITY },
369 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
370 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
371 { "bind", required_argument, NULL, ARG_BIND },
372 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
373 { "tmpfs", required_argument, NULL, ARG_TMPFS },
374 { "overlay", required_argument, NULL, ARG_OVERLAY },
375 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
376 { "machine", required_argument, NULL, 'M' },
377 { "slice", required_argument, NULL, 'S' },
378 { "setenv", required_argument, NULL, ARG_SETENV },
379 { "selinux-context", required_argument, NULL, 'Z' },
380 { "selinux-apifs-context", required_argument, NULL, 'L' },
381 { "quiet", no_argument, NULL, 'q' },
382 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
383 { "register", required_argument, NULL, ARG_REGISTER },
384 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
385 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
386 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
387 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
388 { "network-veth", no_argument, NULL, 'n' },
389 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
390 { "personality", required_argument, NULL, ARG_PERSONALITY },
391 { "image", required_argument, NULL, 'i' },
392 { "volatile", optional_argument, NULL, ARG_VOLATILE },
393 { "port", required_argument, NULL, 'p' },
394 { "property", required_argument, NULL, ARG_PROPERTY },
395 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
396 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
397 { "settings", required_argument, NULL, ARG_SETTINGS },
398 {}
399 };
400
401 int c, r;
402 uint64_t plus = 0, minus = 0;
403 bool mask_all_settings = false, mask_no_settings = false;
404
405 assert(argc >= 0);
406 assert(argv);
407
408 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
409
410 switch (c) {
411
412 case 'h':
413 help();
414 return 0;
415
416 case ARG_VERSION:
417 puts(PACKAGE_STRING);
418 puts(SYSTEMD_FEATURES);
419 return 0;
420
421 case 'D':
422 r = set_sanitized_path(&arg_directory, optarg);
423 if (r < 0)
424 return log_error_errno(r, "Invalid root directory: %m");
425
426 break;
427
428 case ARG_TEMPLATE:
429 r = set_sanitized_path(&arg_template, optarg);
430 if (r < 0)
431 return log_error_errno(r, "Invalid template directory: %m");
432
433 break;
434
435 case 'i':
436 r = set_sanitized_path(&arg_image, optarg);
437 if (r < 0)
438 return log_error_errno(r, "Invalid image path: %m");
439
440 break;
441
442 case 'x':
443 arg_ephemeral = true;
444 break;
445
446 case 'u':
447 r = free_and_strdup(&arg_user, optarg);
448 if (r < 0)
449 return log_oom();
450
451 arg_settings_mask |= SETTING_USER;
452 break;
453
454 case ARG_NETWORK_BRIDGE:
455 r = free_and_strdup(&arg_network_bridge, optarg);
456 if (r < 0)
457 return log_oom();
458
459 /* fall through */
460
461 case 'n':
462 arg_network_veth = true;
463 arg_private_network = true;
464 arg_settings_mask |= SETTING_NETWORK;
465 break;
466
467 case ARG_NETWORK_INTERFACE:
468 if (strv_extend(&arg_network_interfaces, optarg) < 0)
469 return log_oom();
470
471 arg_private_network = true;
472 arg_settings_mask |= SETTING_NETWORK;
473 break;
474
475 case ARG_NETWORK_MACVLAN:
476 if (strv_extend(&arg_network_macvlan, optarg) < 0)
477 return log_oom();
478
479 arg_private_network = true;
480 arg_settings_mask |= SETTING_NETWORK;
481 break;
482
483 case ARG_NETWORK_IPVLAN:
484 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
485 return log_oom();
486
487 /* fall through */
488
489 case ARG_PRIVATE_NETWORK:
490 arg_private_network = true;
491 arg_settings_mask |= SETTING_NETWORK;
492 break;
493
494 case 'b':
495 arg_boot = true;
496 arg_settings_mask |= SETTING_BOOT;
497 break;
498
499 case ARG_UUID:
500 r = sd_id128_from_string(optarg, &arg_uuid);
501 if (r < 0) {
502 log_error("Invalid UUID: %s", optarg);
503 return r;
504 }
505
506 arg_settings_mask |= SETTING_MACHINE_ID;
507 break;
508
509 case 'S':
510 arg_slice = optarg;
511 break;
512
513 case 'M':
514 if (isempty(optarg))
515 arg_machine = mfree(arg_machine);
516 else {
517 if (!machine_name_is_valid(optarg)) {
518 log_error("Invalid machine name: %s", optarg);
519 return -EINVAL;
520 }
521
522 r = free_and_strdup(&arg_machine, optarg);
523 if (r < 0)
524 return log_oom();
525
526 break;
527 }
528
529 case 'Z':
530 arg_selinux_context = optarg;
531 break;
532
533 case 'L':
534 arg_selinux_apifs_context = optarg;
535 break;
536
537 case ARG_READ_ONLY:
538 arg_read_only = true;
539 arg_settings_mask |= SETTING_READ_ONLY;
540 break;
541
542 case ARG_CAPABILITY:
543 case ARG_DROP_CAPABILITY: {
544 const char *state, *word;
545 size_t length;
546
547 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
548 _cleanup_free_ char *t;
549
550 t = strndup(word, length);
551 if (!t)
552 return log_oom();
553
554 if (streq(t, "all")) {
555 if (c == ARG_CAPABILITY)
556 plus = (uint64_t) -1;
557 else
558 minus = (uint64_t) -1;
559 } else {
560 int cap;
561
562 cap = capability_from_name(t);
563 if (cap < 0) {
564 log_error("Failed to parse capability %s.", t);
565 return -EINVAL;
566 }
567
568 if (c == ARG_CAPABILITY)
569 plus |= 1ULL << (uint64_t) cap;
570 else
571 minus |= 1ULL << (uint64_t) cap;
572 }
573 }
574
575 arg_settings_mask |= SETTING_CAPABILITY;
576 break;
577 }
578
579 case 'j':
580 arg_link_journal = LINK_GUEST;
581 arg_link_journal_try = true;
582 break;
583
584 case ARG_LINK_JOURNAL:
585 if (streq(optarg, "auto")) {
586 arg_link_journal = LINK_AUTO;
587 arg_link_journal_try = false;
588 } else if (streq(optarg, "no")) {
589 arg_link_journal = LINK_NO;
590 arg_link_journal_try = false;
591 } else if (streq(optarg, "guest")) {
592 arg_link_journal = LINK_GUEST;
593 arg_link_journal_try = false;
594 } else if (streq(optarg, "host")) {
595 arg_link_journal = LINK_HOST;
596 arg_link_journal_try = false;
597 } else if (streq(optarg, "try-guest")) {
598 arg_link_journal = LINK_GUEST;
599 arg_link_journal_try = true;
600 } else if (streq(optarg, "try-host")) {
601 arg_link_journal = LINK_HOST;
602 arg_link_journal_try = true;
603 } else {
604 log_error("Failed to parse link journal mode %s", optarg);
605 return -EINVAL;
606 }
607
608 break;
609
610 case ARG_BIND:
611 case ARG_BIND_RO:
612 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
613 if (r < 0)
614 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
615
616 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
617 break;
618
619 case ARG_TMPFS:
620 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
623
624 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
625 break;
626
627 case ARG_OVERLAY:
628 case ARG_OVERLAY_RO: {
629 _cleanup_free_ char *upper = NULL, *destination = NULL;
630 _cleanup_strv_free_ char **lower = NULL;
631 CustomMount *m;
632 unsigned n = 0;
633 char **i;
634
635 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
636 if (r == -ENOMEM)
637 return log_oom();
638 else if (r < 0) {
639 log_error("Invalid overlay specification: %s", optarg);
640 return r;
641 }
642
643 STRV_FOREACH(i, lower) {
644 if (!path_is_absolute(*i)) {
645 log_error("Overlay path %s is not absolute.", *i);
646 return -EINVAL;
647 }
648
649 n++;
650 }
651
652 if (n < 2) {
653 log_error("--overlay= needs at least two colon-separated directories specified.");
654 return -EINVAL;
655 }
656
657 if (n == 2) {
658 /* If two parameters are specified,
659 * the first one is the lower, the
660 * second one the upper directory. And
661 * we'll also define the destination
662 * mount point the same as the upper. */
663 upper = lower[1];
664 lower[1] = NULL;
665
666 destination = strdup(upper);
667 if (!destination)
668 return log_oom();
669
670 } else {
671 upper = lower[n - 2];
672 destination = lower[n - 1];
673 lower[n - 2] = NULL;
674 }
675
676 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
677 if (!m)
678 return log_oom();
679
680 m->destination = destination;
681 m->source = upper;
682 m->lower = lower;
683 m->read_only = c == ARG_OVERLAY_RO;
684
685 upper = destination = NULL;
686 lower = NULL;
687
688 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
689 break;
690 }
691
692 case ARG_SETENV: {
693 char **n;
694
695 if (!env_assignment_is_valid(optarg)) {
696 log_error("Environment variable assignment '%s' is not valid.", optarg);
697 return -EINVAL;
698 }
699
700 n = strv_env_set(arg_setenv, optarg);
701 if (!n)
702 return log_oom();
703
704 strv_free(arg_setenv);
705 arg_setenv = n;
706
707 arg_settings_mask |= SETTING_ENVIRONMENT;
708 break;
709 }
710
711 case 'q':
712 arg_quiet = true;
713 break;
714
715 case ARG_SHARE_SYSTEM:
716 arg_share_system = true;
717 break;
718
719 case ARG_REGISTER:
720 r = parse_boolean(optarg);
721 if (r < 0) {
722 log_error("Failed to parse --register= argument: %s", optarg);
723 return r;
724 }
725
726 arg_register = r;
727 break;
728
729 case ARG_KEEP_UNIT:
730 arg_keep_unit = true;
731 break;
732
733 case ARG_PERSONALITY:
734
735 arg_personality = personality_from_string(optarg);
736 if (arg_personality == PERSONALITY_INVALID) {
737 log_error("Unknown or unsupported personality '%s'.", optarg);
738 return -EINVAL;
739 }
740
741 arg_settings_mask |= SETTING_PERSONALITY;
742 break;
743
744 case ARG_VOLATILE:
745
746 if (!optarg)
747 arg_volatile_mode = VOLATILE_YES;
748 else {
749 VolatileMode m;
750
751 m = volatile_mode_from_string(optarg);
752 if (m < 0) {
753 log_error("Failed to parse --volatile= argument: %s", optarg);
754 return -EINVAL;
755 } else
756 arg_volatile_mode = m;
757 }
758
759 arg_settings_mask |= SETTING_VOLATILE_MODE;
760 break;
761
762 case 'p':
763 r = expose_port_parse(&arg_expose_ports, optarg);
764 if (r == -EEXIST)
765 return log_error_errno(r, "Duplicate port specification: %s", optarg);
766 if (r < 0)
767 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
768
769 arg_settings_mask |= SETTING_EXPOSE_PORTS;
770 break;
771
772 case ARG_PROPERTY:
773 if (strv_extend(&arg_property, optarg) < 0)
774 return log_oom();
775
776 break;
777
778 case ARG_PRIVATE_USERS:
779 if (optarg) {
780 _cleanup_free_ char *buffer = NULL;
781 const char *range, *shift;
782
783 range = strchr(optarg, ':');
784 if (range) {
785 buffer = strndup(optarg, range - optarg);
786 if (!buffer)
787 return log_oom();
788 shift = buffer;
789
790 range++;
791 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
792 log_error("Failed to parse UID range: %s", range);
793 return -EINVAL;
794 }
795 } else
796 shift = optarg;
797
798 if (parse_uid(shift, &arg_uid_shift) < 0) {
799 log_error("Failed to parse UID: %s", optarg);
800 return -EINVAL;
801 }
802 }
803
804 arg_userns = true;
805 break;
806
807 case ARG_KILL_SIGNAL:
808 arg_kill_signal = signal_from_string_try_harder(optarg);
809 if (arg_kill_signal < 0) {
810 log_error("Cannot parse signal: %s", optarg);
811 return -EINVAL;
812 }
813
814 arg_settings_mask |= SETTING_KILL_SIGNAL;
815 break;
816
817 case ARG_SETTINGS:
818
819 /* no → do not read files
820 * yes → read files, do not override cmdline, trust only subset
821 * override → read files, override cmdline, trust only subset
822 * trusted → read files, do not override cmdline, trust all
823 */
824
825 r = parse_boolean(optarg);
826 if (r < 0) {
827 if (streq(optarg, "trusted")) {
828 mask_all_settings = false;
829 mask_no_settings = false;
830 arg_settings_trusted = true;
831
832 } else if (streq(optarg, "override")) {
833 mask_all_settings = false;
834 mask_no_settings = true;
835 arg_settings_trusted = -1;
836 } else
837 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
838 } else if (r > 0) {
839 /* yes */
840 mask_all_settings = false;
841 mask_no_settings = false;
842 arg_settings_trusted = -1;
843 } else {
844 /* no */
845 mask_all_settings = true;
846 mask_no_settings = false;
847 arg_settings_trusted = false;
848 }
849
850 break;
851
852 case '?':
853 return -EINVAL;
854
855 default:
856 assert_not_reached("Unhandled option");
857 }
858
859 if (arg_share_system)
860 arg_register = false;
861
862 if (arg_boot && arg_share_system) {
863 log_error("--boot and --share-system may not be combined.");
864 return -EINVAL;
865 }
866
867 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
868 log_error("--keep-unit may not be used when invoked from a user session.");
869 return -EINVAL;
870 }
871
872 if (arg_directory && arg_image) {
873 log_error("--directory= and --image= may not be combined.");
874 return -EINVAL;
875 }
876
877 if (arg_template && arg_image) {
878 log_error("--template= and --image= may not be combined.");
879 return -EINVAL;
880 }
881
882 if (arg_template && !(arg_directory || arg_machine)) {
883 log_error("--template= needs --directory= or --machine=.");
884 return -EINVAL;
885 }
886
887 if (arg_ephemeral && arg_template) {
888 log_error("--ephemeral and --template= may not be combined.");
889 return -EINVAL;
890 }
891
892 if (arg_ephemeral && arg_image) {
893 log_error("--ephemeral and --image= may not be combined.");
894 return -EINVAL;
895 }
896
897 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
898 log_error("--ephemeral and --link-journal= may not be combined.");
899 return -EINVAL;
900 }
901
902 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
903 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
904
905 if (argc > optind) {
906 arg_parameters = strv_copy(argv + optind);
907 if (!arg_parameters)
908 return log_oom();
909
910 arg_settings_mask |= SETTING_BOOT;
911 }
912
913 /* Load all settings from .nspawn files */
914 if (mask_no_settings)
915 arg_settings_mask = 0;
916
917 /* Don't load any settings from .nspawn files */
918 if (mask_all_settings)
919 arg_settings_mask = _SETTINGS_MASK_ALL;
920
921 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
922
923 r = detect_unified_cgroup_hierarchy();
924 if (r < 0)
925 return r;
926
927 return 1;
928 }
929
930 static int verify_arguments(void) {
931
932 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
933 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
934 return -EINVAL;
935 }
936
937 if (arg_expose_ports && !arg_private_network) {
938 log_error("Cannot use --port= without private networking.");
939 return -EINVAL;
940 }
941
942 if (arg_boot && arg_kill_signal <= 0)
943 arg_kill_signal = SIGRTMIN+3;
944
945 return 0;
946 }
947
948 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
949 assert(p);
950
951 if (!arg_userns)
952 return 0;
953
954 if (uid == UID_INVALID && gid == GID_INVALID)
955 return 0;
956
957 if (uid != UID_INVALID) {
958 uid += arg_uid_shift;
959
960 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
961 return -EOVERFLOW;
962 }
963
964 if (gid != GID_INVALID) {
965 gid += (gid_t) arg_uid_shift;
966
967 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
968 return -EOVERFLOW;
969 }
970
971 if (lchown(p, uid, gid) < 0)
972 return -errno;
973
974 return 0;
975 }
976
/* Creates directory 'path' below 'root' with the given mode and, if it was
 * newly created, hands it to the (shifted) uid/gid via userns_lchown().
 *
 * A directory that already exists is left alone, ownership included.
 *
 * Returns 0 on success or if the directory already exists, -errno if mkdir()
 * fails otherwise, or the error of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
989
990 static int setup_timezone(const char *dest) {
991 _cleanup_free_ char *p = NULL, *q = NULL;
992 const char *where, *check, *what;
993 char *z, *y;
994 int r;
995
996 assert(dest);
997
998 /* Fix the timezone, if possible */
999 r = readlink_malloc("/etc/localtime", &p);
1000 if (r < 0) {
1001 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1002 return 0;
1003 }
1004
1005 z = path_startswith(p, "../usr/share/zoneinfo/");
1006 if (!z)
1007 z = path_startswith(p, "/usr/share/zoneinfo/");
1008 if (!z) {
1009 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1010 return 0;
1011 }
1012
1013 where = prefix_roota(dest, "/etc/localtime");
1014 r = readlink_malloc(where, &q);
1015 if (r >= 0) {
1016 y = path_startswith(q, "../usr/share/zoneinfo/");
1017 if (!y)
1018 y = path_startswith(q, "/usr/share/zoneinfo/");
1019
1020 /* Already pointing to the right place? Then do nothing .. */
1021 if (y && streq(y, z))
1022 return 0;
1023 }
1024
1025 check = strjoina("/usr/share/zoneinfo/", z);
1026 check = prefix_root(dest, check);
1027 if (laccess(check, F_OK) < 0) {
1028 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1029 return 0;
1030 }
1031
1032 r = unlink(where);
1033 if (r < 0 && errno != ENOENT) {
1034 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1035 return 0;
1036 }
1037
1038 what = strjoina("../usr/share/zoneinfo/", z);
1039 if (symlink(what, where) < 0) {
1040 log_error_errno(errno, "Failed to correct timezone of container: %m");
1041 return 0;
1042 }
1043
1044 r = userns_lchown(where, 0, 0);
1045 if (r < 0)
1046 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1047
1048 return 0;
1049 }
1050
1051 static int setup_resolv_conf(const char *dest) {
1052 const char *where = NULL;
1053 int r;
1054
1055 assert(dest);
1056
1057 if (arg_private_network)
1058 return 0;
1059
1060 /* Fix resolv.conf, if possible */
1061 where = prefix_roota(dest, "/etc/resolv.conf");
1062
1063 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1064 if (r < 0) {
1065 /* If the file already exists as symlink, let's
1066 * suppress the warning, under the assumption that
1067 * resolved or something similar runs inside and the
1068 * symlink points there.
1069 *
1070 * If the disk image is read-only, there's also no
1071 * point in complaining.
1072 */
1073 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1074 "Failed to copy /etc/resolv.conf to %s: %m", where);
1075 return 0;
1076 }
1077
1078 r = userns_lchown(where, 0, 0);
1079 if (r < 0)
1080 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1081
1082 return 0;
1083 }
1084
1085 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1086 assert(s);
1087
1088 snprintf(s, 37,
1089 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1090 SD_ID128_FORMAT_VAL(id));
1091
1092 return s;
1093 }
1094
1095 static int setup_boot_id(const char *dest) {
1096 const char *from, *to;
1097 sd_id128_t rnd = {};
1098 char as_uuid[37];
1099 int r;
1100
1101 if (arg_share_system)
1102 return 0;
1103
1104 /* Generate a new randomized boot ID, so that each boot-up of
1105 * the container gets a new one */
1106
1107 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1108 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1109
1110 r = sd_id128_randomize(&rnd);
1111 if (r < 0)
1112 return log_error_errno(r, "Failed to generate random boot id: %m");
1113
1114 id128_format_as_uuid(rnd, as_uuid);
1115
1116 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1117 if (r < 0)
1118 return log_error_errno(r, "Failed to write boot id: %m");
1119
1120 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1121 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1122 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1123 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1124
1125 unlink(from);
1126 return r;
1127 }
1128
1129 static int copy_devnodes(const char *dest) {
1130
1131 static const char devnodes[] =
1132 "null\0"
1133 "zero\0"
1134 "full\0"
1135 "random\0"
1136 "urandom\0"
1137 "tty\0"
1138 "net/tun\0";
1139
1140 const char *d;
1141 int r = 0;
1142 _cleanup_umask_ mode_t u;
1143
1144 assert(dest);
1145
1146 u = umask(0000);
1147
1148 /* Create /dev/net, so that we can create /dev/net/tun in it */
1149 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1150 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1151
1152 NULSTR_FOREACH(d, devnodes) {
1153 _cleanup_free_ char *from = NULL, *to = NULL;
1154 struct stat st;
1155
1156 from = strappend("/dev/", d);
1157 to = prefix_root(dest, from);
1158
1159 if (stat(from, &st) < 0) {
1160
1161 if (errno != ENOENT)
1162 return log_error_errno(errno, "Failed to stat %s: %m", from);
1163
1164 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1165
1166 log_error("%s is not a char or block device, cannot copy.", from);
1167 return -EIO;
1168
1169 } else {
1170 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1171 if (errno != EPERM)
1172 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1173
1174 /* Some systems abusively restrict mknod but
1175 * allow bind mounts. */
1176 r = touch(to);
1177 if (r < 0)
1178 return log_error_errno(r, "touch (%s) failed: %m", to);
1179 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1180 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1181 }
1182
1183 r = userns_lchown(to, 0, 0);
1184 if (r < 0)
1185 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1186 }
1187 }
1188
1189 return r;
1190 }
1191
1192 static int setup_pts(const char *dest) {
1193 _cleanup_free_ char *options = NULL;
1194 const char *p;
1195
1196 #ifdef HAVE_SELINUX
1197 if (arg_selinux_apifs_context)
1198 (void) asprintf(&options,
1199 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1200 arg_uid_shift + TTY_GID,
1201 arg_selinux_apifs_context);
1202 else
1203 #endif
1204 (void) asprintf(&options,
1205 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1206 arg_uid_shift + TTY_GID);
1207
1208 if (!options)
1209 return log_oom();
1210
1211 /* Mount /dev/pts itself */
1212 p = prefix_roota(dest, "/dev/pts");
1213 if (mkdir(p, 0755) < 0)
1214 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1215 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1216 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1217 if (userns_lchown(p, 0, 0) < 0)
1218 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1219
1220 /* Create /dev/ptmx symlink */
1221 p = prefix_roota(dest, "/dev/ptmx");
1222 if (symlink("pts/ptmx", p) < 0)
1223 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1224 if (userns_lchown(p, 0, 0) < 0)
1225 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1226
1227 /* And fix /dev/pts/ptmx ownership */
1228 p = prefix_roota(dest, "/dev/pts/ptmx");
1229 if (userns_lchown(p, 0, 0) < 0)
1230 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1231
1232 return 0;
1233 }
1234
1235 static int setup_dev_console(const char *dest, const char *console) {
1236 _cleanup_umask_ mode_t u;
1237 const char *to;
1238 int r;
1239
1240 assert(dest);
1241 assert(console);
1242
1243 u = umask(0000);
1244
1245 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1246 if (r < 0)
1247 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1248
1249 /* We need to bind mount the right tty to /dev/console since
1250 * ptys can only exist on pts file systems. To have something
1251 * to bind mount things on we create a empty regular file. */
1252
1253 to = prefix_roota(dest, "/dev/console");
1254 r = touch(to);
1255 if (r < 0)
1256 return log_error_errno(r, "touch() for /dev/console failed: %m");
1257
1258 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1259 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1260
1261 return 0;
1262 }
1263
1264 static int setup_kmsg(const char *dest, int kmsg_socket) {
1265 const char *from, *to;
1266 _cleanup_umask_ mode_t u;
1267 int fd, r;
1268
1269 assert(kmsg_socket >= 0);
1270
1271 u = umask(0000);
1272
1273 /* We create the kmsg FIFO as /run/kmsg, but immediately
1274 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1275 * on the reading side behave very similar to /proc/kmsg,
1276 * their writing side behaves differently from /dev/kmsg in
1277 * that writing blocks when nothing is reading. In order to
1278 * avoid any problems with containers deadlocking due to this
1279 * we simply make /dev/kmsg unavailable to the container. */
1280 from = prefix_roota(dest, "/run/kmsg");
1281 to = prefix_roota(dest, "/proc/kmsg");
1282
1283 if (mkfifo(from, 0600) < 0)
1284 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1285 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1286 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1287
1288 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1289 if (fd < 0)
1290 return log_error_errno(errno, "Failed to open fifo: %m");
1291
1292 /* Store away the fd in the socket, so that it stays open as
1293 * long as we run the child */
1294 r = send_one_fd(kmsg_socket, fd, 0);
1295 safe_close(fd);
1296
1297 if (r < 0)
1298 return log_error_errno(r, "Failed to send FIFO fd: %m");
1299
1300 /* And now make the FIFO unavailable as /run/kmsg... */
1301 (void) unlink(from);
1302
1303 return 0;
1304 }
1305
1306 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1307 union in_addr_union *exposed = userdata;
1308
1309 assert(rtnl);
1310 assert(m);
1311 assert(exposed);
1312
1313 expose_port_execute(rtnl, arg_expose_ports, exposed);
1314 return 0;
1315 }
1316
1317 static int setup_hostname(void) {
1318
1319 if (arg_share_system)
1320 return 0;
1321
1322 if (sethostname_idempotent(arg_machine) < 0)
1323 return -errno;
1324
1325 return 0;
1326 }
1327
1328 static int setup_journal(const char *directory) {
1329 sd_id128_t machine_id, this_id;
1330 _cleanup_free_ char *b = NULL, *d = NULL;
1331 const char *etc_machine_id, *p, *q;
1332 char *id;
1333 int r;
1334
1335 /* Don't link journals in ephemeral mode */
1336 if (arg_ephemeral)
1337 return 0;
1338
1339 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1340
1341 r = read_one_line_file(etc_machine_id, &b);
1342 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1343 return 0;
1344 else if (r < 0)
1345 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1346
1347 id = strstrip(b);
1348 if (isempty(id) && arg_link_journal == LINK_AUTO)
1349 return 0;
1350
1351 /* Verify validity */
1352 r = sd_id128_from_string(id, &machine_id);
1353 if (r < 0)
1354 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1355
1356 r = sd_id128_get_machine(&this_id);
1357 if (r < 0)
1358 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1359
1360 if (sd_id128_equal(machine_id, this_id)) {
1361 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1362 "Host and machine ids are equal (%s): refusing to link journals", id);
1363 if (arg_link_journal == LINK_AUTO)
1364 return 0;
1365 return -EEXIST;
1366 }
1367
1368 if (arg_link_journal == LINK_NO)
1369 return 0;
1370
1371 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1372 if (r < 0)
1373 return log_error_errno(r, "Failed to create /var: %m");
1374
1375 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1376 if (r < 0)
1377 return log_error_errno(r, "Failed to create /var/log: %m");
1378
1379 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1380 if (r < 0)
1381 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1382
1383 p = strjoina("/var/log/journal/", id);
1384 q = prefix_roota(directory, p);
1385
1386 if (path_is_mount_point(p, 0) > 0) {
1387 if (arg_link_journal != LINK_AUTO) {
1388 log_error("%s: already a mount point, refusing to use for journal", p);
1389 return -EEXIST;
1390 }
1391
1392 return 0;
1393 }
1394
1395 if (path_is_mount_point(q, 0) > 0) {
1396 if (arg_link_journal != LINK_AUTO) {
1397 log_error("%s: already a mount point, refusing to use for journal", q);
1398 return -EEXIST;
1399 }
1400
1401 return 0;
1402 }
1403
1404 r = readlink_and_make_absolute(p, &d);
1405 if (r >= 0) {
1406 if ((arg_link_journal == LINK_GUEST ||
1407 arg_link_journal == LINK_AUTO) &&
1408 path_equal(d, q)) {
1409
1410 r = userns_mkdir(directory, p, 0755, 0, 0);
1411 if (r < 0)
1412 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1413 return 0;
1414 }
1415
1416 if (unlink(p) < 0)
1417 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1418 } else if (r == -EINVAL) {
1419
1420 if (arg_link_journal == LINK_GUEST &&
1421 rmdir(p) < 0) {
1422
1423 if (errno == ENOTDIR) {
1424 log_error("%s already exists and is neither a symlink nor a directory", p);
1425 return r;
1426 } else {
1427 log_error_errno(errno, "Failed to remove %s: %m", p);
1428 return -errno;
1429 }
1430 }
1431 } else if (r != -ENOENT) {
1432 log_error_errno(errno, "readlink(%s) failed: %m", p);
1433 return r;
1434 }
1435
1436 if (arg_link_journal == LINK_GUEST) {
1437
1438 if (symlink(q, p) < 0) {
1439 if (arg_link_journal_try) {
1440 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1441 return 0;
1442 } else {
1443 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1444 return -errno;
1445 }
1446 }
1447
1448 r = userns_mkdir(directory, p, 0755, 0, 0);
1449 if (r < 0)
1450 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1451 return 0;
1452 }
1453
1454 if (arg_link_journal == LINK_HOST) {
1455 /* don't create parents here -- if the host doesn't have
1456 * permanent journal set up, don't force it here */
1457 r = mkdir(p, 0755);
1458 if (r < 0) {
1459 if (arg_link_journal_try) {
1460 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1461 return 0;
1462 } else {
1463 log_error_errno(errno, "Failed to create %s: %m", p);
1464 return r;
1465 }
1466 }
1467
1468 } else if (access(p, F_OK) < 0)
1469 return 0;
1470
1471 if (dir_is_empty(q) == 0)
1472 log_warning("%s is not empty, proceeding anyway.", q);
1473
1474 r = userns_mkdir(directory, p, 0755, 0, 0);
1475 if (r < 0) {
1476 log_error_errno(errno, "Failed to create %s: %m", q);
1477 return r;
1478 }
1479
1480 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1481 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1482
1483 return 0;
1484 }
1485
1486 static int drop_capabilities(void) {
1487 return capability_bounding_set_drop(~arg_retain, false);
1488 }
1489
1490 static int reset_audit_loginuid(void) {
1491 _cleanup_free_ char *p = NULL;
1492 int r;
1493
1494 if (arg_share_system)
1495 return 0;
1496
1497 r = read_one_line_file("/proc/self/loginuid", &p);
1498 if (r == -ENOENT)
1499 return 0;
1500 if (r < 0)
1501 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1502
1503 /* Already reset? */
1504 if (streq(p, "4294967295"))
1505 return 0;
1506
1507 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1508 if (r < 0) {
1509 log_error_errno(r,
1510 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1511 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1512 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1513 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1514 "using systemd-nspawn. Sleeping for 5s... (%m)");
1515
1516 sleep(5);
1517 }
1518
1519 return 0;
1520 }
1521
/* Installs a seccomp filter that allows everything by default, but makes
 * the system calls of the table below fail with EPERM when the capability
 * they are paired with is not part of arg_retain, and makes creation of
 * NETLINK_AUDIT sockets fail with EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a NOP. Returns 0 on success (including the
 * case where loading the filter fails with EINVAL), a negative
 * errno-style value otherwise. */
static int setup_seccomp(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx filter;
        unsigned k;
        int r;

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* Entries whose capability is retained stay permitted. */
                if ((arg_retain & (1ULL << blacklist[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(filter, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                /* -EFAULT is returned for unknown syscalls, which is
                 * not considered an error here. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace audit
         * hookup fails when running inside of one. We don't care, and
         * just turn off the creation of audit sockets:
         * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(filter);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(filter);
        return r;
#endif

}
1616
1617 static int setup_propagate(const char *root) {
1618 const char *p, *q;
1619
1620 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1621 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1622 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1623 (void) mkdir_p(p, 0600);
1624
1625 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1626 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1627
1628 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1629 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1630
1631 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1632 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1633
1634 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1635 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1636 return log_error_errno(errno, "Failed to install propagation bind mount.");
1637
1638 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1639 return log_error_errno(errno, "Failed to make propagation mount read-only");
1640
1641 return 0;
1642 }
1643
1644 static int setup_image(char **device_path, int *loop_nr) {
1645 struct loop_info64 info = {
1646 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1647 };
1648 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1649 _cleanup_free_ char* loopdev = NULL;
1650 struct stat st;
1651 int r, nr;
1652
1653 assert(device_path);
1654 assert(loop_nr);
1655 assert(arg_image);
1656
1657 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1658 if (fd < 0)
1659 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1660
1661 if (fstat(fd, &st) < 0)
1662 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1663
1664 if (S_ISBLK(st.st_mode)) {
1665 char *p;
1666
1667 p = strdup(arg_image);
1668 if (!p)
1669 return log_oom();
1670
1671 *device_path = p;
1672
1673 *loop_nr = -1;
1674
1675 r = fd;
1676 fd = -1;
1677
1678 return r;
1679 }
1680
1681 if (!S_ISREG(st.st_mode)) {
1682 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1683 return -EINVAL;
1684 }
1685
1686 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1687 if (control < 0)
1688 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1689
1690 nr = ioctl(control, LOOP_CTL_GET_FREE);
1691 if (nr < 0)
1692 return log_error_errno(errno, "Failed to allocate loop device: %m");
1693
1694 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1695 return log_oom();
1696
1697 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1698 if (loop < 0)
1699 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1700
1701 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1702 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1703
1704 if (arg_read_only)
1705 info.lo_flags |= LO_FLAGS_READ_ONLY;
1706
1707 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1708 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1709
1710 *device_path = loopdev;
1711 loopdev = NULL;
1712
1713 *loop_nr = nr;
1714
1715 r = loop;
1716 loop = -1;
1717
1718 return r;
1719 }
1720
/* Explanatory text that is appended, via string literal concatenation, to
 * the error messages logged when no usable partition can be identified in
 * a disk image. It spells out which partition table layouts are accepted. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1727
/* Probes the block device open at 'fd' for a GPT or MBR partition table
 * and determines which partitions to use as root, /home and /srv.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. Partitions are
 * matched by type ID; if several partitions of the same dedicated type
 * exist, the one with the lowest partition number wins. The root
 * partition is picked in this order: native root, secondary root, a
 * generic Linux partition (only if there is exactly one).
 *
 * MBR: only partitions of type 0x83 whose flags equal 0x80 (bootable) are
 * considered, and exactly one of them must exist.
 *
 * On success returns 0 and stores newly allocated device node paths in
 * *root_device, and, if such partitions were found, in *home_device and
 * *srv_device (to be freed by the caller). The matching *_rw booleans
 * tell whether the partition may be mounted writable, and *secondary
 * whether the secondary root partition was chosen. On failure a negative
 * errno-style value is returned. Without blkid support -EOPNOTSUPP is
 * returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has enumerated as many partitions as
         * blkid found, giving up after 10 attempts. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev
                 * enumeration is expected to contain one entry more
                 * than that. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the partition with the lowest number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* This has to be tracked in 'generic',
                                 * the variable tested above, so that a
                                 * second bootable Linux partition is
                                 * detected rather than silently
                                 * replacing the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2100
/* Mounts the block device 'what' onto 'where', or onto 'where' with
 * 'directory' appended if 'directory' is non-NULL. The file system type
 * is detected with blkid; LUKS volumes are refused. The mount is
 * read-only unless 'rw' is true and arg_read_only is unset.
 *
 * Returns 0 on success, a negative errno-style value on failure. Without
 * blkid support -EOPNOTSUPP is returned. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2 is an ambivalent result and 1 means nothing was
                 * detected. -1 on the other hand is a genuine probing
                 * error, and is handled by the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2164
2165 static int mount_devices(
2166 const char *where,
2167 const char *root_device, bool root_device_rw,
2168 const char *home_device, bool home_device_rw,
2169 const char *srv_device, bool srv_device_rw) {
2170 int r;
2171
2172 assert(where);
2173
2174 if (root_device) {
2175 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2176 if (r < 0)
2177 return log_error_errno(r, "Failed to mount root directory: %m");
2178 }
2179
2180 if (home_device) {
2181 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2182 if (r < 0)
2183 return log_error_errno(r, "Failed to mount home directory: %m");
2184 }
2185
2186 if (srv_device) {
2187 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2188 if (r < 0)
2189 return log_error_errno(r, "Failed to mount server data directory: %m");
2190 }
2191
2192 return 0;
2193 }
2194
2195 static void loop_remove(int nr, int *image_fd) {
2196 _cleanup_close_ int control = -1;
2197 int r;
2198
2199 if (nr < 0)
2200 return;
2201
2202 if (image_fd && *image_fd >= 0) {
2203 r = ioctl(*image_fd, LOOP_CLR_FD);
2204 if (r < 0)
2205 log_debug_errno(errno, "Failed to close loop image: %m");
2206 *image_fd = safe_close(*image_fd);
2207 }
2208
2209 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2210 if (control < 0) {
2211 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2212 return;
2213 }
2214
2215 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2216 if (r < 0)
2217 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2218 }
2219
2220 /*
2221 * Return values:
2222 * < 0 : wait_for_terminate() failed to get the state of the
2223 * container, the container was terminated by a signal, or
2224 * failed for an unknown reason. No change is made to the
2225 * container argument.
2226 * > 0 : The program executed in the container terminated with an
2227 * error. The exit code of the program executed in the
2228 * container is returned. The container argument has been set
2229 * to CONTAINER_TERMINATED.
2230 * 0 : The container is being rebooted, has been shut down or exited
2231 * successfully. The container argument has been set to either
2232 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2233 *
2234 * That is, success is indicated by a return value of zero, and an
2235 * error is indicated by a non-zero value.
2236 */
2237 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2238 siginfo_t status;
2239 int r;
2240
2241 r = wait_for_terminate(pid, &status);
2242 if (r < 0)
2243 return log_warning_errno(r, "Failed to wait for container: %m");
2244
2245 switch (status.si_code) {
2246
2247 case CLD_EXITED:
2248 if (status.si_status == 0) {
2249 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2250
2251 } else
2252 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2253
2254 *container = CONTAINER_TERMINATED;
2255 return status.si_status;
2256
2257 case CLD_KILLED:
2258 if (status.si_status == SIGINT) {
2259
2260 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2261 *container = CONTAINER_TERMINATED;
2262 return 0;
2263
2264 } else if (status.si_status == SIGHUP) {
2265
2266 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2267 *container = CONTAINER_REBOOTED;
2268 return 0;
2269 }
2270
2271 /* CLD_KILLED fallthrough */
2272
2273 case CLD_DUMPED:
2274 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2275 return -EIO;
2276
2277 default:
2278 log_error("Container %s failed due to unknown reason.", arg_machine);
2279 return -EIO;
2280 }
2281
2282 return r;
2283 }
2284
2285 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2286 pid_t pid;
2287
2288 pid = PTR_TO_UINT32(userdata);
2289 if (pid > 0) {
2290 if (kill(pid, arg_kill_signal) >= 0) {
2291 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2292 sd_event_source_set_userdata(s, NULL);
2293 return 0;
2294 }
2295 }
2296
2297 sd_event_exit(sd_event_source_get_event(s), 0);
2298 return 0;
2299 }
2300
2301 static int determine_names(void) {
2302 int r;
2303
2304 if (arg_template && !arg_directory && arg_machine) {
2305
2306 /* If --template= was specified then we should not
2307 * search for a machine, but instead create a new one
2308 * in /var/lib/machine. */
2309
2310 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2311 if (!arg_directory)
2312 return log_oom();
2313 }
2314
2315 if (!arg_image && !arg_directory) {
2316 if (arg_machine) {
2317 _cleanup_(image_unrefp) Image *i = NULL;
2318
2319 r = image_find(arg_machine, &i);
2320 if (r < 0)
2321 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2322 else if (r == 0) {
2323 log_error("No image for machine '%s': %m", arg_machine);
2324 return -ENOENT;
2325 }
2326
2327 if (i->type == IMAGE_RAW)
2328 r = set_sanitized_path(&arg_image, i->path);
2329 else
2330 r = set_sanitized_path(&arg_directory, i->path);
2331 if (r < 0)
2332 return log_error_errno(r, "Invalid image directory: %m");
2333
2334 if (!arg_ephemeral)
2335 arg_read_only = arg_read_only || i->read_only;
2336 } else
2337 arg_directory = get_current_dir_name();
2338
2339 if (!arg_directory && !arg_machine) {
2340 log_error("Failed to determine path, please use -D or -i.");
2341 return -EINVAL;
2342 }
2343 }
2344
2345 if (!arg_machine) {
2346 if (arg_directory && path_equal(arg_directory, "/"))
2347 arg_machine = gethostname_malloc();
2348 else
2349 arg_machine = strdup(basename(arg_image ?: arg_directory));
2350
2351 if (!arg_machine)
2352 return log_oom();
2353
2354 hostname_cleanup(arg_machine);
2355 if (!machine_name_is_valid(arg_machine)) {
2356 log_error("Failed to determine machine name automatically, please use -M.");
2357 return -EINVAL;
2358 }
2359
2360 if (arg_ephemeral) {
2361 char *b;
2362
2363 /* Add a random suffix when this is an
2364 * ephemeral machine, so that we can run many
2365 * instances at once without manually having
2366 * to specify -M each time. */
2367
2368 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2369 return log_oom();
2370
2371 free(arg_machine);
2372 arg_machine = b;
2373 }
2374 }
2375
2376 return 0;
2377 }
2378
2379 static int determine_uid_shift(const char *directory) {
2380 int r;
2381
2382 if (!arg_userns) {
2383 arg_uid_shift = 0;
2384 return 0;
2385 }
2386
2387 if (arg_uid_shift == UID_INVALID) {
2388 struct stat st;
2389
2390 r = stat(directory, &st);
2391 if (r < 0)
2392 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2393
2394 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2395
2396 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2397 log_error("UID and GID base of %s don't match.", directory);
2398 return -EINVAL;
2399 }
2400
2401 arg_uid_range = UINT32_C(0x10000);
2402 }
2403
2404 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2405 log_error("UID base too high for UID range.");
2406 return -EINVAL;
2407 }
2408
2409 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2410 return 0;
2411 }
2412
2413 static int inner_child(
2414 Barrier *barrier,
2415 const char *directory,
2416 bool secondary,
2417 int kmsg_socket,
2418 int rtnl_socket,
2419 FDSet *fds) {
2420
2421 _cleanup_free_ char *home = NULL;
2422 unsigned n_env = 2;
2423 const char *envp[] = {
2424 "PATH=" DEFAULT_PATH_SPLIT_USR,
2425 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2426 NULL, /* TERM */
2427 NULL, /* HOME */
2428 NULL, /* USER */
2429 NULL, /* LOGNAME */
2430 NULL, /* container_uuid */
2431 NULL, /* LISTEN_FDS */
2432 NULL, /* LISTEN_PID */
2433 NULL
2434 };
2435
2436 _cleanup_strv_free_ char **env_use = NULL;
2437 int r;
2438
2439 assert(barrier);
2440 assert(directory);
2441 assert(kmsg_socket >= 0);
2442
2443 cg_unified_flush();
2444
2445 if (arg_userns) {
2446 /* Tell the parent, that it now can write the UID map. */
2447 (void) barrier_place(barrier); /* #1 */
2448
2449 /* Wait until the parent wrote the UID map */
2450 if (!barrier_place_and_sync(barrier)) { /* #2 */
2451 log_error("Parent died too early");
2452 return -ESRCH;
2453 }
2454 }
2455
2456 r = mount_all(NULL, true, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2457 if (r < 0)
2458 return r;
2459
2460 /* Wait until we are cgroup-ified, so that we
2461 * can mount the right cgroup path writable */
2462 if (!barrier_place_and_sync(barrier)) { /* #3 */
2463 log_error("Parent died too early");
2464 return -ESRCH;
2465 }
2466
2467 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2468 if (r < 0)
2469 return r;
2470
2471 r = reset_uid_gid();
2472 if (r < 0)
2473 return log_error_errno(r, "Couldn't become new root: %m");
2474
2475 r = setup_boot_id(NULL);
2476 if (r < 0)
2477 return r;
2478
2479 r = setup_kmsg(NULL, kmsg_socket);
2480 if (r < 0)
2481 return r;
2482 kmsg_socket = safe_close(kmsg_socket);
2483
2484 umask(0022);
2485
2486 if (setsid() < 0)
2487 return log_error_errno(errno, "setsid() failed: %m");
2488
2489 if (arg_private_network)
2490 loopback_setup();
2491
2492 if (arg_expose_ports) {
2493 r = expose_port_send_rtnl(rtnl_socket);
2494 if (r < 0)
2495 return r;
2496 rtnl_socket = safe_close(rtnl_socket);
2497 }
2498
2499 if (drop_capabilities() < 0)
2500 return log_error_errno(errno, "drop_capabilities() failed: %m");
2501
2502 setup_hostname();
2503
2504 if (arg_personality != PERSONALITY_INVALID) {
2505 if (personality(arg_personality) < 0)
2506 return log_error_errno(errno, "personality() failed: %m");
2507 } else if (secondary) {
2508 if (personality(PER_LINUX32) < 0)
2509 return log_error_errno(errno, "personality() failed: %m");
2510 }
2511
2512 #ifdef HAVE_SELINUX
2513 if (arg_selinux_context)
2514 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2515 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2516 #endif
2517
2518 r = change_uid_gid(arg_user, &home);
2519 if (r < 0)
2520 return r;
2521
2522 envp[n_env] = strv_find_prefix(environ, "TERM=");
2523 if (envp[n_env])
2524 n_env ++;
2525
2526 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2527 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2528 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2529 return log_oom();
2530
2531 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2532 char as_uuid[37];
2533
2534 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2535 return log_oom();
2536 }
2537
2538 if (fdset_size(fds) > 0) {
2539 r = fdset_cloexec(fds, false);
2540 if (r < 0)
2541 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2542
2543 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2544 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2545 return log_oom();
2546 }
2547
2548 env_use = strv_env_merge(2, envp, arg_setenv);
2549 if (!env_use)
2550 return log_oom();
2551
2552 /* Let the parent know that we are ready and
2553 * wait until the parent is ready with the
2554 * setup, too... */
2555 if (!barrier_place_and_sync(barrier)) { /* #4 */
2556 log_error("Parent died too early");
2557 return -ESRCH;
2558 }
2559
2560 /* Now, explicitly close the log, so that we
2561 * then can close all remaining fds. Closing
2562 * the log explicitly first has the benefit
2563 * that the logging subsystem knows about it,
2564 * and is thus ready to be reopened should we
2565 * need it again. Note that the other fds
2566 * closed here are at least the locking and
2567 * barrier fds. */
2568 log_close();
2569 (void) fdset_close_others(fds);
2570
2571 if (arg_boot) {
2572 char **a;
2573 size_t m;
2574
2575 /* Automatically search for the init system */
2576
2577 m = 1 + strv_length(arg_parameters);
2578 a = newa(char*, m + 1);
2579 if (strv_isempty(arg_parameters))
2580 a[1] = NULL;
2581 else
2582 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2583
2584 a[0] = (char*) "/usr/lib/systemd/systemd";
2585 execve(a[0], a, env_use);
2586
2587 a[0] = (char*) "/lib/systemd/systemd";
2588 execve(a[0], a, env_use);
2589
2590 a[0] = (char*) "/sbin/init";
2591 execve(a[0], a, env_use);
2592 } else if (!strv_isempty(arg_parameters))
2593 execvpe(arg_parameters[0], arg_parameters, env_use);
2594 else {
2595 chdir(home ?: "/root");
2596 execle("/bin/bash", "-bash", NULL, env_use);
2597 execle("/bin/sh", "-sh", NULL, env_use);
2598 }
2599
2600 (void) log_open();
2601 return log_error_errno(errno, "execv() failed: %m");
2602 }
2603
2604 static int outer_child(
2605 Barrier *barrier,
2606 const char *directory,
2607 const char *console,
2608 const char *root_device, bool root_device_rw,
2609 const char *home_device, bool home_device_rw,
2610 const char *srv_device, bool srv_device_rw,
2611 bool interactive,
2612 bool secondary,
2613 int pid_socket,
2614 int kmsg_socket,
2615 int rtnl_socket,
2616 int uid_shift_socket,
2617 FDSet *fds) {
2618
2619 pid_t pid;
2620 ssize_t l;
2621 int r;
2622
2623 assert(barrier);
2624 assert(directory);
2625 assert(console);
2626 assert(pid_socket >= 0);
2627 assert(kmsg_socket >= 0);
2628
2629 cg_unified_flush();
2630
2631 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2632 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2633
2634 if (interactive) {
2635 close_nointr(STDIN_FILENO);
2636 close_nointr(STDOUT_FILENO);
2637 close_nointr(STDERR_FILENO);
2638
2639 r = open_terminal(console, O_RDWR);
2640 if (r != STDIN_FILENO) {
2641 if (r >= 0) {
2642 safe_close(r);
2643 r = -EINVAL;
2644 }
2645
2646 return log_error_errno(r, "Failed to open console: %m");
2647 }
2648
2649 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2650 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2651 return log_error_errno(errno, "Failed to duplicate console: %m");
2652 }
2653
2654 r = reset_audit_loginuid();
2655 if (r < 0)
2656 return r;
2657
2658 /* Mark everything as slave, so that we still
2659 * receive mounts from the real root, but don't
2660 * propagate mounts to the real root. */
2661 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2662 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2663
2664 r = mount_devices(directory,
2665 root_device, root_device_rw,
2666 home_device, home_device_rw,
2667 srv_device, srv_device_rw);
2668 if (r < 0)
2669 return r;
2670
2671 r = determine_uid_shift(directory);
2672 if (r < 0)
2673 return r;
2674
2675 if (arg_userns) {
2676 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2677 if (l < 0)
2678 return log_error_errno(errno, "Failed to send UID shift: %m");
2679 if (l != sizeof(arg_uid_shift)) {
2680 log_error("Short write while sending UID shift.");
2681 return -EIO;
2682 }
2683 }
2684
2685 /* Turn directory into bind mount */
2686 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2687 return log_error_errno(errno, "Failed to make bind mount: %m");
2688
2689 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2690 if (r < 0)
2691 return r;
2692
2693 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2694 if (r < 0)
2695 return r;
2696
2697 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2698 if (r < 0)
2699 return r;
2700
2701 if (arg_read_only) {
2702 r = bind_remount_recursive(directory, true);
2703 if (r < 0)
2704 return log_error_errno(r, "Failed to make tree read-only: %m");
2705 }
2706
2707 r = mount_all(directory, false, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2708 if (r < 0)
2709 return r;
2710
2711 r = copy_devnodes(directory);
2712 if (r < 0)
2713 return r;
2714
2715 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2716
2717 r = setup_pts(directory);
2718 if (r < 0)
2719 return r;
2720
2721 r = setup_propagate(directory);
2722 if (r < 0)
2723 return r;
2724
2725 r = setup_dev_console(directory, console);
2726 if (r < 0)
2727 return r;
2728
2729 r = setup_seccomp();
2730 if (r < 0)
2731 return r;
2732
2733 r = setup_timezone(directory);
2734 if (r < 0)
2735 return r;
2736
2737 r = setup_resolv_conf(directory);
2738 if (r < 0)
2739 return r;
2740
2741 r = setup_journal(directory);
2742 if (r < 0)
2743 return r;
2744
2745 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2746 if (r < 0)
2747 return r;
2748
2749 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2750 if (r < 0)
2751 return r;
2752
2753 r = mount_move_root(directory);
2754 if (r < 0)
2755 return log_error_errno(r, "Failed to move root directory: %m");
2756
2757 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2758 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2759 (arg_private_network ? CLONE_NEWNET : 0) |
2760 (arg_userns ? CLONE_NEWUSER : 0),
2761 NULL);
2762 if (pid < 0)
2763 return log_error_errno(errno, "Failed to fork inner child: %m");
2764 if (pid == 0) {
2765 pid_socket = safe_close(pid_socket);
2766 uid_shift_socket = safe_close(uid_shift_socket);
2767
2768 /* The inner child has all namespaces that are
2769 * requested, so that we all are owned by the user if
2770 * user namespaces are turned on. */
2771
2772 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2773 if (r < 0)
2774 _exit(EXIT_FAILURE);
2775
2776 _exit(EXIT_SUCCESS);
2777 }
2778
2779 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2780 if (l < 0)
2781 return log_error_errno(errno, "Failed to send PID: %m");
2782 if (l != sizeof(pid)) {
2783 log_error("Short write while sending PID.");
2784 return -EIO;
2785 }
2786
2787 pid_socket = safe_close(pid_socket);
2788 kmsg_socket = safe_close(kmsg_socket);
2789 rtnl_socket = safe_close(rtnl_socket);
2790
2791 return 0;
2792 }
2793
2794 static int setup_uid_map(pid_t pid) {
2795 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2796 int r;
2797
2798 assert(pid > 1);
2799
2800 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2801 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2802 r = write_string_file(uid_map, line, 0);
2803 if (r < 0)
2804 return log_error_errno(r, "Failed to write UID map: %m");
2805
2806 /* We always assign the same UID and GID ranges */
2807 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2808 r = write_string_file(uid_map, line, 0);
2809 if (r < 0)
2810 return log_error_errno(r, "Failed to write GID map: %m");
2811
2812 return 0;
2813 }
2814
2815 static int load_settings(void) {
2816 _cleanup_(settings_freep) Settings *settings = NULL;
2817 _cleanup_fclose_ FILE *f = NULL;
2818 _cleanup_free_ char *p = NULL;
2819 const char *fn, *i;
2820 int r;
2821
2822 /* If all settings are masked, there's no point in looking for
2823 * the settings file */
2824 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2825 return 0;
2826
2827 fn = strjoina(arg_machine, ".nspawn");
2828
2829 /* We first look in the admin's directories in /etc and /run */
2830 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2831 _cleanup_free_ char *j = NULL;
2832
2833 j = strjoin(i, "/", fn, NULL);
2834 if (!j)
2835 return log_oom();
2836
2837 f = fopen(j, "re");
2838 if (f) {
2839 p = j;
2840 j = NULL;
2841
2842 /* By default we trust configuration from /etc and /run */
2843 if (arg_settings_trusted < 0)
2844 arg_settings_trusted = true;
2845
2846 break;
2847 }
2848
2849 if (errno != ENOENT)
2850 return log_error_errno(errno, "Failed to open %s: %m", j);
2851 }
2852
2853 if (!f) {
2854 /* After that, let's look for a file next to the
2855 * actual image we shall boot. */
2856
2857 if (arg_image) {
2858 p = file_in_same_dir(arg_image, fn);
2859 if (!p)
2860 return log_oom();
2861 } else if (arg_directory) {
2862 p = file_in_same_dir(arg_directory, fn);
2863 if (!p)
2864 return log_oom();
2865 }
2866
2867 if (p) {
2868 f = fopen(p, "re");
2869 if (!f && errno != ENOENT)
2870 return log_error_errno(errno, "Failed to open %s: %m", p);
2871
2872 /* By default we do not trust configuration from /var/lib/machines */
2873 if (arg_settings_trusted < 0)
2874 arg_settings_trusted = false;
2875 }
2876 }
2877
2878 if (!f)
2879 return 0;
2880
2881 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2882
2883 r = settings_load(f, p, &settings);
2884 if (r < 0)
2885 return r;
2886
2887 /* Copy over bits from the settings, unless they have been
2888 * explicitly masked by command line switches. */
2889
2890 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2891 settings->boot >= 0) {
2892 arg_boot = settings->boot;
2893
2894 strv_free(arg_parameters);
2895 arg_parameters = settings->parameters;
2896 settings->parameters = NULL;
2897 }
2898
2899 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2900 settings->environment) {
2901 strv_free(arg_setenv);
2902 arg_setenv = settings->environment;
2903 settings->environment = NULL;
2904 }
2905
2906 if ((arg_settings_mask & SETTING_USER) == 0 &&
2907 settings->user) {
2908 free(arg_user);
2909 arg_user = settings->user;
2910 settings->user = NULL;
2911 }
2912
2913 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2914
2915 if (!arg_settings_trusted && settings->capability != 0)
2916 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2917 else
2918 arg_retain |= settings->capability;
2919
2920 arg_retain &= ~settings->drop_capability;
2921 }
2922
2923 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2924 settings->kill_signal > 0)
2925 arg_kill_signal = settings->kill_signal;
2926
2927 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2928 settings->personality != PERSONALITY_INVALID)
2929 arg_personality = settings->personality;
2930
2931 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2932 !sd_id128_is_null(settings->machine_id)) {
2933
2934 if (!arg_settings_trusted)
2935 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2936 else
2937 arg_uuid = settings->machine_id;
2938 }
2939
2940 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2941 settings->read_only >= 0)
2942 arg_read_only = settings->read_only;
2943
2944 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2945 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2946 arg_volatile_mode = settings->volatile_mode;
2947
2948 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2949 settings->n_custom_mounts > 0) {
2950
2951 if (!arg_settings_trusted)
2952 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2953 else {
2954 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2955 arg_custom_mounts = settings->custom_mounts;
2956 arg_n_custom_mounts = settings->n_custom_mounts;
2957
2958 settings->custom_mounts = NULL;
2959 settings->n_custom_mounts = 0;
2960 }
2961 }
2962
2963 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2964 (settings->private_network >= 0 ||
2965 settings->network_veth >= 0 ||
2966 settings->network_bridge ||
2967 settings->network_interfaces ||
2968 settings->network_macvlan ||
2969 settings->network_ipvlan)) {
2970
2971 if (!arg_settings_trusted)
2972 log_warning("Ignoring network settings, file %s is not trusted.", p);
2973 else {
2974 strv_free(arg_network_interfaces);
2975 arg_network_interfaces = settings->network_interfaces;
2976 settings->network_interfaces = NULL;
2977
2978 strv_free(arg_network_macvlan);
2979 arg_network_macvlan = settings->network_macvlan;
2980 settings->network_macvlan = NULL;
2981
2982 strv_free(arg_network_ipvlan);
2983 arg_network_ipvlan = settings->network_ipvlan;
2984 settings->network_ipvlan = NULL;
2985
2986 free(arg_network_bridge);
2987 arg_network_bridge = settings->network_bridge;
2988 settings->network_bridge = NULL;
2989
2990 arg_network_veth = settings->network_veth > 0 || settings->network_bridge;
2991
2992 arg_private_network = true; /* all these settings imply private networking */
2993 }
2994 }
2995
2996 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2997 settings->expose_ports) {
2998
2999 if (!arg_settings_trusted)
3000 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3001 else {
3002 expose_port_free_all(arg_expose_ports);
3003 arg_expose_ports = settings->expose_ports;
3004 settings->expose_ports = NULL;
3005 }
3006 }
3007
3008 return 0;
3009 }
3010
3011 int main(int argc, char *argv[]) {
3012
3013 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3014 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3015 _cleanup_close_ int master = -1, image_fd = -1;
3016 _cleanup_fdset_free_ FDSet *fds = NULL;
3017 int r, n_fd_passed, loop_nr = -1;
3018 char veth_name[IFNAMSIZ];
3019 bool secondary = false, remove_subvol = false;
3020 sigset_t mask_chld;
3021 pid_t pid = 0;
3022 int ret = EXIT_SUCCESS;
3023 union in_addr_union exposed = {};
3024 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3025 bool interactive;
3026
3027 log_parse_environment();
3028 log_open();
3029
3030 r = parse_argv(argc, argv);
3031 if (r <= 0)
3032 goto finish;
3033
3034 if (geteuid() != 0) {
3035 log_error("Need to be root.");
3036 r = -EPERM;
3037 goto finish;
3038 }
3039 r = determine_names();
3040 if (r < 0)
3041 goto finish;
3042
3043 r = load_settings();
3044 if (r < 0)
3045 goto finish;
3046
3047 r = verify_arguments();
3048 if (r < 0)
3049 goto finish;
3050
3051 n_fd_passed = sd_listen_fds(false);
3052 if (n_fd_passed > 0) {
3053 r = fdset_new_listen_fds(&fds, false);
3054 if (r < 0) {
3055 log_error_errno(r, "Failed to collect file descriptors: %m");
3056 goto finish;
3057 }
3058 }
3059
3060 if (arg_directory) {
3061 assert(!arg_image);
3062
3063 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3064 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3065 r = -EINVAL;
3066 goto finish;
3067 }
3068
3069 if (arg_ephemeral) {
3070 _cleanup_free_ char *np = NULL;
3071
3072 /* If the specified path is a mount point we
3073 * generate the new snapshot immediately
3074 * inside it under a random name. However if
3075 * the specified is not a mount point we
3076 * create the new snapshot in the parent
3077 * directory, just next to it. */
3078 r = path_is_mount_point(arg_directory, 0);
3079 if (r < 0) {
3080 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3081 goto finish;
3082 }
3083 if (r > 0)
3084 r = tempfn_random_child(arg_directory, "machine.", &np);
3085 else
3086 r = tempfn_random(arg_directory, "machine.", &np);
3087 if (r < 0) {
3088 log_error_errno(r, "Failed to generate name for snapshot: %m");
3089 goto finish;
3090 }
3091
3092 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3093 if (r < 0) {
3094 log_error_errno(r, "Failed to lock %s: %m", np);
3095 goto finish;
3096 }
3097
3098 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3099 if (r < 0) {
3100 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3101 goto finish;
3102 }
3103
3104 free(arg_directory);
3105 arg_directory = np;
3106 np = NULL;
3107
3108 remove_subvol = true;
3109
3110 } else {
3111 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3112 if (r == -EBUSY) {
3113 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3114 goto finish;
3115 }
3116 if (r < 0) {
3117 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3118 return r;
3119 }
3120
3121 if (arg_template) {
3122 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3123 if (r == -EEXIST) {
3124 if (!arg_quiet)
3125 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3126 } else if (r < 0) {
3127 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3128 goto finish;
3129 } else {
3130 if (!arg_quiet)
3131 log_info("Populated %s from template %s.", arg_directory, arg_template);
3132 }
3133 }
3134 }
3135
3136 if (arg_boot) {
3137 if (path_is_os_tree(arg_directory) <= 0) {
3138 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3139 r = -EINVAL;
3140 goto finish;
3141 }
3142 } else {
3143 const char *p;
3144
3145 p = strjoina(arg_directory,
3146 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3147 if (access(p, F_OK) < 0) {
3148 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3149 r = -EINVAL;
3150 goto finish;
3151 }
3152 }
3153
3154 } else {
3155 char template[] = "/tmp/nspawn-root-XXXXXX";
3156
3157 assert(arg_image);
3158 assert(!arg_template);
3159
3160 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3161 if (r == -EBUSY) {
3162 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3163 goto finish;
3164 }
3165 if (r < 0) {
3166 r = log_error_errno(r, "Failed to create image lock: %m");
3167 goto finish;
3168 }
3169
3170 if (!mkdtemp(template)) {
3171 log_error_errno(errno, "Failed to create temporary directory: %m");
3172 r = -errno;
3173 goto finish;
3174 }
3175
3176 arg_directory = strdup(template);
3177 if (!arg_directory) {
3178 r = log_oom();
3179 goto finish;
3180 }
3181
3182 image_fd = setup_image(&device_path, &loop_nr);
3183 if (image_fd < 0) {
3184 r = image_fd;
3185 goto finish;
3186 }
3187
3188 r = dissect_image(image_fd,
3189 &root_device, &root_device_rw,
3190 &home_device, &home_device_rw,
3191 &srv_device, &srv_device_rw,
3192 &secondary);
3193 if (r < 0)
3194 goto finish;
3195 }
3196
3197 r = custom_mounts_prepare();
3198 if (r < 0)
3199 goto finish;
3200
3201 interactive =
3202 isatty(STDIN_FILENO) > 0 &&
3203 isatty(STDOUT_FILENO) > 0;
3204
3205 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3206 if (master < 0) {
3207 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3208 goto finish;
3209 }
3210
3211 r = ptsname_malloc(master, &console);
3212 if (r < 0) {
3213 r = log_error_errno(r, "Failed to determine tty name: %m");
3214 goto finish;
3215 }
3216
3217 if (unlockpt(master) < 0) {
3218 r = log_error_errno(errno, "Failed to unlock tty: %m");
3219 goto finish;
3220 }
3221
3222 if (!arg_quiet)
3223 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3224 arg_machine, arg_image ?: arg_directory);
3225
3226 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3227
3228 assert_se(sigemptyset(&mask_chld) == 0);
3229 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3230
3231 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3232 r = log_error_errno(errno, "Failed to become subreaper: %m");
3233 goto finish;
3234 }
3235
3236 for (;;) {
3237 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3238 uid_shift_socket_pair[2] = { -1, -1 };
3239 ContainerStatus container_status;
3240 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3241 static const struct sigaction sa = {
3242 .sa_handler = nop_signal_handler,
3243 .sa_flags = SA_NOCLDSTOP,
3244 };
3245 int ifi = 0;
3246 ssize_t l;
3247 _cleanup_event_unref_ sd_event *event = NULL;
3248 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3249 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3250 char last_char = 0;
3251
3252 r = barrier_create(&barrier);
3253 if (r < 0) {
3254 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3255 goto finish;
3256 }
3257
3258 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3259 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3260 goto finish;
3261 }
3262
3263 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3264 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3265 goto finish;
3266 }
3267
3268 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3269 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3270 goto finish;
3271 }
3272
3273 if (arg_userns)
3274 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3275 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3276 goto finish;
3277 }
3278
3279 /* Child can be killed before execv(), so handle SIGCHLD
3280 * in order to interrupt parent's blocking calls and
3281 * give it a chance to call wait() and terminate. */
3282 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3283 if (r < 0) {
3284 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3285 goto finish;
3286 }
3287
3288 r = sigaction(SIGCHLD, &sa, NULL);
3289 if (r < 0) {
3290 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3291 goto finish;
3292 }
3293
3294 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3295 if (pid < 0) {
3296 if (errno == EINVAL)
3297 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3298 else
3299 r = log_error_errno(errno, "clone() failed: %m");
3300
3301 goto finish;
3302 }
3303
3304 if (pid == 0) {
3305 /* The outer child only has a file system namespace. */
3306 barrier_set_role(&barrier, BARRIER_CHILD);
3307
3308 master = safe_close(master);
3309
3310 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3311 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3312 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3313 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3314
3315 (void) reset_all_signal_handlers();
3316 (void) reset_signal_mask();
3317
3318 r = outer_child(&barrier,
3319 arg_directory,
3320 console,
3321 root_device, root_device_rw,
3322 home_device, home_device_rw,
3323 srv_device, srv_device_rw,
3324 interactive,
3325 secondary,
3326 pid_socket_pair[1],
3327 kmsg_socket_pair[1],
3328 rtnl_socket_pair[1],
3329 uid_shift_socket_pair[1],
3330 fds);
3331 if (r < 0)
3332 _exit(EXIT_FAILURE);
3333
3334 _exit(EXIT_SUCCESS);
3335 }
3336
3337 barrier_set_role(&barrier, BARRIER_PARENT);
3338
3339 fds = fdset_free(fds);
3340
3341 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3342 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3343 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3344 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3345
3346 /* Wait for the outer child. */
3347 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3348 if (r < 0)
3349 goto finish;
3350 if (r != 0) {
3351 r = -EIO;
3352 goto finish;
3353 }
3354 pid = 0;
3355
3356 /* And now retrieve the PID of the inner child. */
3357 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3358 if (l < 0) {
3359 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3360 goto finish;
3361 }
3362 if (l != sizeof(pid)) {
3363 log_error("Short read while reading inner child PID.");
3364 r = EIO;
3365 goto finish;
3366 }
3367
3368 log_debug("Init process invoked as PID " PID_FMT, pid);
3369
3370 if (arg_userns) {
3371 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3372 log_error("Child died too early.");
3373 r = -ESRCH;
3374 goto finish;
3375 }
3376
3377 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3378 if (l < 0) {
3379 r = log_error_errno(errno, "Failed to read UID shift: %m");
3380 goto finish;
3381 }
3382 if (l != sizeof(arg_uid_shift)) {
3383 log_error("Short read while reading UID shift.");
3384 r = EIO;
3385 goto finish;
3386 }
3387
3388 r = setup_uid_map(pid);
3389 if (r < 0)
3390 goto finish;
3391
3392 (void) barrier_place(&barrier); /* #2 */
3393 }
3394
3395 if (arg_private_network) {
3396
3397 r = move_network_interfaces(pid, arg_network_interfaces);
3398 if (r < 0)
3399 goto finish;
3400
3401 if (arg_network_veth) {
3402 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3403 if (r < 0)
3404 goto finish;
3405 else if (r > 0)
3406 ifi = r;
3407
3408 if (arg_network_bridge) {
3409 r = setup_bridge(veth_name, arg_network_bridge);
3410 if (r < 0)
3411 goto finish;
3412 if (r > 0)
3413 ifi = r;
3414 }
3415 }
3416
3417 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3418 if (r < 0)
3419 goto finish;
3420
3421 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3422 if (r < 0)
3423 goto finish;
3424 }
3425
3426 if (arg_register) {
3427 r = register_machine(
3428 arg_machine,
3429 pid,
3430 arg_directory,
3431 arg_uuid,
3432 ifi,
3433 arg_slice,
3434 arg_custom_mounts, arg_n_custom_mounts,
3435 arg_kill_signal,
3436 arg_property,
3437 arg_keep_unit);
3438 if (r < 0)
3439 goto finish;
3440 }
3441
3442 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3443 if (r < 0)
3444 goto finish;
3445
3446 if (arg_keep_unit) {
3447 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3448 if (r < 0)
3449 goto finish;
3450 }
3451
3452 r = chown_cgroup(pid, arg_uid_shift);
3453 if (r < 0)
3454 goto finish;
3455
3456 /* Notify the child that the parent is ready with all
3457 * its setup (including cgroup-ification), and that
3458 * the child can now hand over control to the code to
3459 * run inside the container. */
3460 (void) barrier_place(&barrier); /* #3 */
3461
3462 /* Block SIGCHLD here, before notifying child.
3463 * process_pty() will handle it with the other signals. */
3464 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3465
3466 /* Reset signal to default */
3467 r = default_signals(SIGCHLD, -1);
3468 if (r < 0) {
3469 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3470 goto finish;
3471 }
3472
3473 /* Let the child know that we are ready and wait that the child is completely ready now. */
3474 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3475 log_error("Child died too early.");
3476 r = -ESRCH;
3477 goto finish;
3478 }
3479
3480 sd_notifyf(false,
3481 "READY=1\n"
3482 "STATUS=Container running.\n"
3483 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3484
3485 r = sd_event_new(&event);
3486 if (r < 0) {
3487 log_error_errno(r, "Failed to get default event source: %m");
3488 goto finish;
3489 }
3490
3491 if (arg_kill_signal > 0) {
3492 /* Try to kill the init system on SIGINT or SIGTERM */
3493 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3494 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3495 } else {
3496 /* Immediately exit */
3497 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3498 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3499 }
3500
3501 /* simply exit on sigchld */
3502 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3503
3504 if (arg_expose_ports) {
3505 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3506 if (r < 0)
3507 goto finish;
3508
3509 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3510 }
3511
3512 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3513
3514 r = pty_forward_new(event, master, true, !interactive, &forward);
3515 if (r < 0) {
3516 log_error_errno(r, "Failed to create PTY forwarder: %m");
3517 goto finish;
3518 }
3519
3520 r = sd_event_loop(event);
3521 if (r < 0) {
3522 log_error_errno(r, "Failed to run event loop: %m");
3523 goto finish;
3524 }
3525
3526 pty_forward_get_last_char(forward, &last_char);
3527
3528 forward = pty_forward_free(forward);
3529
3530 if (!arg_quiet && last_char != '\n')
3531 putc('\n', stdout);
3532
3533 /* Kill if it is not dead yet anyway */
3534 if (arg_register && !arg_keep_unit)
3535 terminate_machine(pid);
3536
3537 /* Normally redundant, but better safe than sorry */
3538 kill(pid, SIGKILL);
3539
3540 r = wait_for_container(pid, &container_status);
3541 pid = 0;
3542
3543 if (r < 0)
3544 /* We failed to wait for the container, or the
3545 * container exited abnormally */
3546 goto finish;
3547 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3548 /* The container exited with a non-zero
3549 * status, or with zero status and no reboot
3550 * was requested. */
3551 ret = r;
3552 break;
3553 }
3554
3555 /* CONTAINER_REBOOTED, loop again */
3556
3557 if (arg_keep_unit) {
3558 /* Special handling if we are running as a
3559 * service: instead of simply restarting the
3560 * machine we want to restart the entire
3561 * service, so let's inform systemd about this
3562 * with the special exit code 133. The service
3563 * file uses RestartForceExitStatus=133 so
3564 * that this results in a full nspawn
3565 * restart. This is necessary since we might
3566 * have cgroup parameters set we want to have
3567 * flushed out. */
3568 ret = 133;
3569 r = 0;
3570 break;
3571 }
3572
3573 expose_port_flush(arg_expose_ports, &exposed);
3574 }
3575
3576 finish:
3577 sd_notify(false,
3578 "STOPPING=1\n"
3579 "STATUS=Terminating...");
3580
3581 if (pid > 0)
3582 kill(pid, SIGKILL);
3583
3584 /* Try to flush whatever is still queued in the pty */
3585 if (master >= 0)
3586 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3587
3588 loop_remove(loop_nr, &image_fd);
3589
3590 if (remove_subvol && arg_directory) {
3591 int k;
3592
3593 k = btrfs_subvol_remove(arg_directory, true);
3594 if (k < 0)
3595 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3596 }
3597
3598 if (arg_machine) {
3599 const char *p;
3600
3601 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3602 (void) rm_rf(p, REMOVE_ROOT);
3603 }
3604
3605 expose_port_flush(arg_expose_ports, &exposed);
3606
3607 free(arg_directory);
3608 free(arg_template);
3609 free(arg_image);
3610 free(arg_machine);
3611 free(arg_user);
3612 strv_free(arg_setenv);
3613 free(arg_network_bridge);
3614 strv_free(arg_network_interfaces);
3615 strv_free(arg_network_macvlan);
3616 strv_free(arg_network_ipvlan);
3617 strv_free(arg_parameters);
3618 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3619 expose_port_free_all(arg_expose_ports);
3620
3621 return r < 0 ? EXIT_FAILURE : ret;
3622 }