]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #1374 from olof/autoconf_gcrypt_dep
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "build.h"
54 #include "cap-list.h"
55 #include "capability.h"
56 #include "cgroup-util.h"
57 #include "copy.h"
58 #include "dev-setup.h"
59 #include "env-util.h"
60 #include "event-util.h"
61 #include "fdset.h"
62 #include "fileio.h"
63 #include "formats-util.h"
64 #include "gpt.h"
65 #include "hostname-util.h"
66 #include "log.h"
67 #include "loopback-setup.h"
68 #include "machine-image.h"
69 #include "macro.h"
70 #include "missing.h"
71 #include "mkdir.h"
72 #include "netlink-util.h"
73 #include "path-util.h"
74 #include "process-util.h"
75 #include "ptyfwd.h"
76 #include "random-util.h"
77 #include "rm-rf.h"
78 #ifdef HAVE_SECCOMP
79 #include "seccomp-util.h"
80 #endif
81 #include "signal-util.h"
82 #include "strv.h"
83 #include "terminal-util.h"
84 #include "udev-util.h"
85 #include "util.h"
86
87 #include "nspawn-settings.h"
88 #include "nspawn-mount.h"
89 #include "nspawn-network.h"
90 #include "nspawn-expose-ports.h"
91 #include "nspawn-cgroup.h"
92 #include "nspawn-register.h"
93 #include "nspawn-setuid.h"
94
/* How the container payload ended. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
99
/* Journal linking mode, selected with --link-journal= (or -j) in parse_argv(). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
106
107 static char *arg_directory = NULL;
108 static char *arg_template = NULL;
109 static char *arg_user = NULL;
110 static sd_id128_t arg_uuid = {};
111 static char *arg_machine = NULL;
112 static const char *arg_selinux_context = NULL;
113 static const char *arg_selinux_apifs_context = NULL;
114 static const char *arg_slice = NULL;
115 static bool arg_private_network = false;
116 static bool arg_read_only = false;
117 static bool arg_boot = false;
118 static bool arg_ephemeral = false;
119 static LinkJournal arg_link_journal = LINK_AUTO;
120 static bool arg_link_journal_try = false;
121 static uint64_t arg_retain =
122 (1ULL << CAP_CHOWN) |
123 (1ULL << CAP_DAC_OVERRIDE) |
124 (1ULL << CAP_DAC_READ_SEARCH) |
125 (1ULL << CAP_FOWNER) |
126 (1ULL << CAP_FSETID) |
127 (1ULL << CAP_IPC_OWNER) |
128 (1ULL << CAP_KILL) |
129 (1ULL << CAP_LEASE) |
130 (1ULL << CAP_LINUX_IMMUTABLE) |
131 (1ULL << CAP_NET_BIND_SERVICE) |
132 (1ULL << CAP_NET_BROADCAST) |
133 (1ULL << CAP_NET_RAW) |
134 (1ULL << CAP_SETGID) |
135 (1ULL << CAP_SETFCAP) |
136 (1ULL << CAP_SETPCAP) |
137 (1ULL << CAP_SETUID) |
138 (1ULL << CAP_SYS_ADMIN) |
139 (1ULL << CAP_SYS_CHROOT) |
140 (1ULL << CAP_SYS_NICE) |
141 (1ULL << CAP_SYS_PTRACE) |
142 (1ULL << CAP_SYS_TTY_CONFIG) |
143 (1ULL << CAP_SYS_RESOURCE) |
144 (1ULL << CAP_SYS_BOOT) |
145 (1ULL << CAP_AUDIT_WRITE) |
146 (1ULL << CAP_AUDIT_CONTROL) |
147 (1ULL << CAP_MKNOD);
148 static CustomMount *arg_custom_mounts = NULL;
149 static unsigned arg_n_custom_mounts = 0;
150 static char **arg_setenv = NULL;
151 static bool arg_quiet = false;
152 static bool arg_share_system = false;
153 static bool arg_register = true;
154 static bool arg_keep_unit = false;
155 static char **arg_network_interfaces = NULL;
156 static char **arg_network_macvlan = NULL;
157 static char **arg_network_ipvlan = NULL;
158 static bool arg_network_veth = false;
159 static char *arg_network_bridge = NULL;
160 static unsigned long arg_personality = PERSONALITY_INVALID;
161 static char *arg_image = NULL;
162 static VolatileMode arg_volatile_mode = VOLATILE_NO;
163 static ExposePort *arg_expose_ports = NULL;
164 static char **arg_property = NULL;
165 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
166 static bool arg_userns = false;
167 static int arg_kill_signal = 0;
168 static bool arg_unified_cgroup_hierarchy = false;
169 static SettingsMask arg_settings_mask = 0;
170 static int arg_settings_trusted = -1;
171 static char **arg_parameters = NULL;
172
173 static void help(void) {
174 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
175 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
176 " -h --help Show this help\n"
177 " --version Print version string\n"
178 " -q --quiet Do not show status information\n"
179 " -D --directory=PATH Root directory for the container\n"
180 " --template=PATH Initialize root directory from template directory,\n"
181 " if missing\n"
182 " -x --ephemeral Run container with snapshot of root directory, and\n"
183 " remove it after exit\n"
184 " -i --image=PATH File system device or disk image for the container\n"
185 " -b --boot Boot up full system (i.e. invoke init)\n"
186 " -u --user=USER Run the command under specified user or uid\n"
187 " -M --machine=NAME Set the machine name for the container\n"
188 " --uuid=UUID Set a specific machine UUID for the container\n"
189 " -S --slice=SLICE Place the container in the specified slice\n"
190 " --property=NAME=VALUE Set scope unit property\n"
191 " --private-users[=UIDBASE[:NUIDS]]\n"
192 " Run within user namespace\n"
193 " --private-network Disable network in container\n"
194 " --network-interface=INTERFACE\n"
195 " Assign an existing network interface to the\n"
196 " container\n"
197 " --network-macvlan=INTERFACE\n"
198 " Create a macvlan network interface based on an\n"
199 " existing network interface to the container\n"
200 " --network-ipvlan=INTERFACE\n"
201 " Create a ipvlan network interface based on an\n"
202 " existing network interface to the container\n"
203 " -n --network-veth Add a virtual ethernet connection between host\n"
204 " and container\n"
205 " --network-bridge=INTERFACE\n"
206 " Add a virtual ethernet connection between host\n"
207 " and container and add it to an existing bridge on\n"
208 " the host\n"
209 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
210 " Expose a container IP port on the host\n"
211 " -Z --selinux-context=SECLABEL\n"
212 " Set the SELinux security context to be used by\n"
213 " processes in the container\n"
214 " -L --selinux-apifs-context=SECLABEL\n"
215 " Set the SELinux security context to be used by\n"
216 " API/tmpfs file systems in the container\n"
217 " --capability=CAP In addition to the default, retain specified\n"
218 " capability\n"
219 " --drop-capability=CAP Drop the specified capability from the default set\n"
220 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
221 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
222 " try-guest, try-host\n"
223 " -j Equivalent to --link-journal=try-guest\n"
224 " --read-only Mount the root directory read-only\n"
225 " --bind=PATH[:PATH[:OPTIONS]]\n"
226 " Bind mount a file or directory from the host into\n"
227 " the container\n"
228 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
229 " Similar, but creates a read-only bind mount\n"
230 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
231 " --overlay=PATH[:PATH...]:PATH\n"
232 " Create an overlay mount from the host to \n"
233 " the container\n"
234 " --overlay-ro=PATH[:PATH...]:PATH\n"
235 " Similar, but creates a read-only overlay mount\n"
236 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
237 " --share-system Share system namespaces with host\n"
238 " --register=BOOLEAN Register container as machine\n"
239 " --keep-unit Do not register a scope for the machine, reuse\n"
240 " the service unit nspawn is running in\n"
241 " --volatile[=MODE] Run the system in volatile mode\n"
242 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
243 , program_invocation_short_name);
244 }
245
246
247 static int custom_mounts_prepare(void) {
248 unsigned i;
249 int r;
250
251 /* Ensure the mounts are applied prefix first. */
252 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
253
254 /* Allocate working directories for the overlay file systems that need it */
255 for (i = 0; i < arg_n_custom_mounts; i++) {
256 CustomMount *m = &arg_custom_mounts[i];
257
258 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
259 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
260 return -EINVAL;
261 }
262
263 if (m->type != CUSTOM_MOUNT_OVERLAY)
264 continue;
265
266 if (m->work_dir)
267 continue;
268
269 if (m->read_only)
270 continue;
271
272 r = tempfn_random(m->source, NULL, &m->work_dir);
273 if (r < 0)
274 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
275 }
276
277 return 0;
278 }
279
/* Replaces *b with a cleaned-up, absolute version of path.
 *
 * The path is canonicalized if it exists. If it does not exist (ENOENT) it is
 * merely made absolute relative to the current working directory. The result
 * is passed through path_kill_slashes() before it is stored, and the previous
 * value of *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason other
 * than ENOENT, or -ENOMEM. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* The path does not exist (yet), so it cannot be resolved. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
300
301 static int detect_unified_cgroup_hierarchy(void) {
302 const char *e;
303 int r;
304
305 /* Allow the user to control whether the unified hierarchy is used */
306 e = getenv("UNIFIED_CGROUP_HIERARCHY");
307 if (e) {
308 r = parse_boolean(e);
309 if (r < 0)
310 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
311
312 arg_unified_cgroup_hierarchy = r;
313 return 0;
314 }
315
316 /* Otherwise inherit the default from the host system */
317 r = cg_unified();
318 if (r < 0)
319 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
320
321 arg_unified_cgroup_hierarchy = r;
322 return 0;
323 }
324
325 static int parse_argv(int argc, char *argv[]) {
326
327 enum {
328 ARG_VERSION = 0x100,
329 ARG_PRIVATE_NETWORK,
330 ARG_UUID,
331 ARG_READ_ONLY,
332 ARG_CAPABILITY,
333 ARG_DROP_CAPABILITY,
334 ARG_LINK_JOURNAL,
335 ARG_BIND,
336 ARG_BIND_RO,
337 ARG_TMPFS,
338 ARG_OVERLAY,
339 ARG_OVERLAY_RO,
340 ARG_SETENV,
341 ARG_SHARE_SYSTEM,
342 ARG_REGISTER,
343 ARG_KEEP_UNIT,
344 ARG_NETWORK_INTERFACE,
345 ARG_NETWORK_MACVLAN,
346 ARG_NETWORK_IPVLAN,
347 ARG_NETWORK_BRIDGE,
348 ARG_PERSONALITY,
349 ARG_VOLATILE,
350 ARG_TEMPLATE,
351 ARG_PROPERTY,
352 ARG_PRIVATE_USERS,
353 ARG_KILL_SIGNAL,
354 ARG_SETTINGS,
355 };
356
357 static const struct option options[] = {
358 { "help", no_argument, NULL, 'h' },
359 { "version", no_argument, NULL, ARG_VERSION },
360 { "directory", required_argument, NULL, 'D' },
361 { "template", required_argument, NULL, ARG_TEMPLATE },
362 { "ephemeral", no_argument, NULL, 'x' },
363 { "user", required_argument, NULL, 'u' },
364 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
365 { "boot", no_argument, NULL, 'b' },
366 { "uuid", required_argument, NULL, ARG_UUID },
367 { "read-only", no_argument, NULL, ARG_READ_ONLY },
368 { "capability", required_argument, NULL, ARG_CAPABILITY },
369 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
370 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
371 { "bind", required_argument, NULL, ARG_BIND },
372 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
373 { "tmpfs", required_argument, NULL, ARG_TMPFS },
374 { "overlay", required_argument, NULL, ARG_OVERLAY },
375 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
376 { "machine", required_argument, NULL, 'M' },
377 { "slice", required_argument, NULL, 'S' },
378 { "setenv", required_argument, NULL, ARG_SETENV },
379 { "selinux-context", required_argument, NULL, 'Z' },
380 { "selinux-apifs-context", required_argument, NULL, 'L' },
381 { "quiet", no_argument, NULL, 'q' },
382 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
383 { "register", required_argument, NULL, ARG_REGISTER },
384 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
385 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
386 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
387 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
388 { "network-veth", no_argument, NULL, 'n' },
389 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
390 { "personality", required_argument, NULL, ARG_PERSONALITY },
391 { "image", required_argument, NULL, 'i' },
392 { "volatile", optional_argument, NULL, ARG_VOLATILE },
393 { "port", required_argument, NULL, 'p' },
394 { "property", required_argument, NULL, ARG_PROPERTY },
395 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
396 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
397 { "settings", required_argument, NULL, ARG_SETTINGS },
398 {}
399 };
400
401 int c, r;
402 uint64_t plus = 0, minus = 0;
403 bool mask_all_settings = false, mask_no_settings = false;
404
405 assert(argc >= 0);
406 assert(argv);
407
408 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
409
410 switch (c) {
411
412 case 'h':
413 help();
414 return 0;
415
416 case ARG_VERSION:
417 puts(PACKAGE_STRING);
418 puts(SYSTEMD_FEATURES);
419 return 0;
420
421 case 'D':
422 r = set_sanitized_path(&arg_directory, optarg);
423 if (r < 0)
424 return log_error_errno(r, "Invalid root directory: %m");
425
426 break;
427
428 case ARG_TEMPLATE:
429 r = set_sanitized_path(&arg_template, optarg);
430 if (r < 0)
431 return log_error_errno(r, "Invalid template directory: %m");
432
433 break;
434
435 case 'i':
436 r = set_sanitized_path(&arg_image, optarg);
437 if (r < 0)
438 return log_error_errno(r, "Invalid image path: %m");
439
440 break;
441
442 case 'x':
443 arg_ephemeral = true;
444 break;
445
446 case 'u':
447 r = free_and_strdup(&arg_user, optarg);
448 if (r < 0)
449 return log_oom();
450
451 arg_settings_mask |= SETTING_USER;
452 break;
453
454 case ARG_NETWORK_BRIDGE:
455 r = free_and_strdup(&arg_network_bridge, optarg);
456 if (r < 0)
457 return log_oom();
458
459 /* fall through */
460
461 case 'n':
462 arg_network_veth = true;
463 arg_private_network = true;
464 arg_settings_mask |= SETTING_NETWORK;
465 break;
466
467 case ARG_NETWORK_INTERFACE:
468 if (strv_extend(&arg_network_interfaces, optarg) < 0)
469 return log_oom();
470
471 arg_private_network = true;
472 arg_settings_mask |= SETTING_NETWORK;
473 break;
474
475 case ARG_NETWORK_MACVLAN:
476 if (strv_extend(&arg_network_macvlan, optarg) < 0)
477 return log_oom();
478
479 arg_private_network = true;
480 arg_settings_mask |= SETTING_NETWORK;
481 break;
482
483 case ARG_NETWORK_IPVLAN:
484 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
485 return log_oom();
486
487 /* fall through */
488
489 case ARG_PRIVATE_NETWORK:
490 arg_private_network = true;
491 arg_settings_mask |= SETTING_NETWORK;
492 break;
493
494 case 'b':
495 arg_boot = true;
496 arg_settings_mask |= SETTING_BOOT;
497 break;
498
499 case ARG_UUID:
500 r = sd_id128_from_string(optarg, &arg_uuid);
501 if (r < 0) {
502 log_error("Invalid UUID: %s", optarg);
503 return r;
504 }
505
506 arg_settings_mask |= SETTING_MACHINE_ID;
507 break;
508
509 case 'S':
510 arg_slice = optarg;
511 break;
512
513 case 'M':
514 if (isempty(optarg))
515 arg_machine = mfree(arg_machine);
516 else {
517 if (!machine_name_is_valid(optarg)) {
518 log_error("Invalid machine name: %s", optarg);
519 return -EINVAL;
520 }
521
522 r = free_and_strdup(&arg_machine, optarg);
523 if (r < 0)
524 return log_oom();
525
526 break;
527 }
528
529 case 'Z':
530 arg_selinux_context = optarg;
531 break;
532
533 case 'L':
534 arg_selinux_apifs_context = optarg;
535 break;
536
537 case ARG_READ_ONLY:
538 arg_read_only = true;
539 arg_settings_mask |= SETTING_READ_ONLY;
540 break;
541
542 case ARG_CAPABILITY:
543 case ARG_DROP_CAPABILITY: {
544 const char *state, *word;
545 size_t length;
546
547 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
548 _cleanup_free_ char *t;
549
550 t = strndup(word, length);
551 if (!t)
552 return log_oom();
553
554 if (streq(t, "all")) {
555 if (c == ARG_CAPABILITY)
556 plus = (uint64_t) -1;
557 else
558 minus = (uint64_t) -1;
559 } else {
560 int cap;
561
562 cap = capability_from_name(t);
563 if (cap < 0) {
564 log_error("Failed to parse capability %s.", t);
565 return -EINVAL;
566 }
567
568 if (c == ARG_CAPABILITY)
569 plus |= 1ULL << (uint64_t) cap;
570 else
571 minus |= 1ULL << (uint64_t) cap;
572 }
573 }
574
575 arg_settings_mask |= SETTING_CAPABILITY;
576 break;
577 }
578
579 case 'j':
580 arg_link_journal = LINK_GUEST;
581 arg_link_journal_try = true;
582 break;
583
584 case ARG_LINK_JOURNAL:
585 if (streq(optarg, "auto")) {
586 arg_link_journal = LINK_AUTO;
587 arg_link_journal_try = false;
588 } else if (streq(optarg, "no")) {
589 arg_link_journal = LINK_NO;
590 arg_link_journal_try = false;
591 } else if (streq(optarg, "guest")) {
592 arg_link_journal = LINK_GUEST;
593 arg_link_journal_try = false;
594 } else if (streq(optarg, "host")) {
595 arg_link_journal = LINK_HOST;
596 arg_link_journal_try = false;
597 } else if (streq(optarg, "try-guest")) {
598 arg_link_journal = LINK_GUEST;
599 arg_link_journal_try = true;
600 } else if (streq(optarg, "try-host")) {
601 arg_link_journal = LINK_HOST;
602 arg_link_journal_try = true;
603 } else {
604 log_error("Failed to parse link journal mode %s", optarg);
605 return -EINVAL;
606 }
607
608 break;
609
610 case ARG_BIND:
611 case ARG_BIND_RO:
612 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
613 if (r < 0)
614 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
615
616 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
617 break;
618
619 case ARG_TMPFS:
620 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
623
624 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
625 break;
626
627 case ARG_OVERLAY:
628 case ARG_OVERLAY_RO: {
629 _cleanup_free_ char *upper = NULL, *destination = NULL;
630 _cleanup_strv_free_ char **lower = NULL;
631 CustomMount *m;
632 unsigned n = 0;
633 char **i;
634
635 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
636 if (r == -ENOMEM)
637 return log_oom();
638 else if (r < 0) {
639 log_error("Invalid overlay specification: %s", optarg);
640 return r;
641 }
642
643 STRV_FOREACH(i, lower) {
644 if (!path_is_absolute(*i)) {
645 log_error("Overlay path %s is not absolute.", *i);
646 return -EINVAL;
647 }
648
649 n++;
650 }
651
652 if (n < 2) {
653 log_error("--overlay= needs at least two colon-separated directories specified.");
654 return -EINVAL;
655 }
656
657 if (n == 2) {
658 /* If two parameters are specified,
659 * the first one is the lower, the
660 * second one the upper directory. And
661 * we'll also define the destination
662 * mount point the same as the upper. */
663 upper = lower[1];
664 lower[1] = NULL;
665
666 destination = strdup(upper);
667 if (!destination)
668 return log_oom();
669
670 } else {
671 upper = lower[n - 2];
672 destination = lower[n - 1];
673 lower[n - 2] = NULL;
674 }
675
676 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
677 if (!m)
678 return log_oom();
679
680 m->destination = destination;
681 m->source = upper;
682 m->lower = lower;
683 m->read_only = c == ARG_OVERLAY_RO;
684
685 upper = destination = NULL;
686 lower = NULL;
687
688 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
689 break;
690 }
691
692 case ARG_SETENV: {
693 char **n;
694
695 if (!env_assignment_is_valid(optarg)) {
696 log_error("Environment variable assignment '%s' is not valid.", optarg);
697 return -EINVAL;
698 }
699
700 n = strv_env_set(arg_setenv, optarg);
701 if (!n)
702 return log_oom();
703
704 strv_free(arg_setenv);
705 arg_setenv = n;
706
707 arg_settings_mask |= SETTING_ENVIRONMENT;
708 break;
709 }
710
711 case 'q':
712 arg_quiet = true;
713 break;
714
715 case ARG_SHARE_SYSTEM:
716 arg_share_system = true;
717 break;
718
719 case ARG_REGISTER:
720 r = parse_boolean(optarg);
721 if (r < 0) {
722 log_error("Failed to parse --register= argument: %s", optarg);
723 return r;
724 }
725
726 arg_register = r;
727 break;
728
729 case ARG_KEEP_UNIT:
730 arg_keep_unit = true;
731 break;
732
733 case ARG_PERSONALITY:
734
735 arg_personality = personality_from_string(optarg);
736 if (arg_personality == PERSONALITY_INVALID) {
737 log_error("Unknown or unsupported personality '%s'.", optarg);
738 return -EINVAL;
739 }
740
741 arg_settings_mask |= SETTING_PERSONALITY;
742 break;
743
744 case ARG_VOLATILE:
745
746 if (!optarg)
747 arg_volatile_mode = VOLATILE_YES;
748 else {
749 VolatileMode m;
750
751 m = volatile_mode_from_string(optarg);
752 if (m < 0) {
753 log_error("Failed to parse --volatile= argument: %s", optarg);
754 return -EINVAL;
755 } else
756 arg_volatile_mode = m;
757 }
758
759 arg_settings_mask |= SETTING_VOLATILE_MODE;
760 break;
761
762 case 'p':
763 r = expose_port_parse(&arg_expose_ports, optarg);
764 if (r == -EEXIST)
765 return log_error_errno(r, "Duplicate port specification: %s", optarg);
766 if (r < 0)
767 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
768
769 arg_settings_mask |= SETTING_EXPOSE_PORTS;
770 break;
771
772 case ARG_PROPERTY:
773 if (strv_extend(&arg_property, optarg) < 0)
774 return log_oom();
775
776 break;
777
778 case ARG_PRIVATE_USERS:
779 if (optarg) {
780 _cleanup_free_ char *buffer = NULL;
781 const char *range, *shift;
782
783 range = strchr(optarg, ':');
784 if (range) {
785 buffer = strndup(optarg, range - optarg);
786 if (!buffer)
787 return log_oom();
788 shift = buffer;
789
790 range++;
791 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
792 log_error("Failed to parse UID range: %s", range);
793 return -EINVAL;
794 }
795 } else
796 shift = optarg;
797
798 if (parse_uid(shift, &arg_uid_shift) < 0) {
799 log_error("Failed to parse UID: %s", optarg);
800 return -EINVAL;
801 }
802 }
803
804 arg_userns = true;
805 break;
806
807 case ARG_KILL_SIGNAL:
808 arg_kill_signal = signal_from_string_try_harder(optarg);
809 if (arg_kill_signal < 0) {
810 log_error("Cannot parse signal: %s", optarg);
811 return -EINVAL;
812 }
813
814 arg_settings_mask |= SETTING_KILL_SIGNAL;
815 break;
816
817 case ARG_SETTINGS:
818
819 /* no → do not read files
820 * yes → read files, do not override cmdline, trust only subset
821 * override → read files, override cmdline, trust only subset
822 * trusted → read files, do not override cmdline, trust all
823 */
824
825 r = parse_boolean(optarg);
826 if (r < 0) {
827 if (streq(optarg, "trusted")) {
828 mask_all_settings = false;
829 mask_no_settings = false;
830 arg_settings_trusted = true;
831
832 } else if (streq(optarg, "override")) {
833 mask_all_settings = false;
834 mask_no_settings = true;
835 arg_settings_trusted = -1;
836 } else
837 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
838 } else if (r > 0) {
839 /* yes */
840 mask_all_settings = false;
841 mask_no_settings = false;
842 arg_settings_trusted = -1;
843 } else {
844 /* no */
845 mask_all_settings = true;
846 mask_no_settings = false;
847 arg_settings_trusted = false;
848 }
849
850 break;
851
852 case '?':
853 return -EINVAL;
854
855 default:
856 assert_not_reached("Unhandled option");
857 }
858
859 if (arg_share_system)
860 arg_register = false;
861
862 if (arg_boot && arg_share_system) {
863 log_error("--boot and --share-system may not be combined.");
864 return -EINVAL;
865 }
866
867 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
868 log_error("--keep-unit may not be used when invoked from a user session.");
869 return -EINVAL;
870 }
871
872 if (arg_directory && arg_image) {
873 log_error("--directory= and --image= may not be combined.");
874 return -EINVAL;
875 }
876
877 if (arg_template && arg_image) {
878 log_error("--template= and --image= may not be combined.");
879 return -EINVAL;
880 }
881
882 if (arg_template && !(arg_directory || arg_machine)) {
883 log_error("--template= needs --directory= or --machine=.");
884 return -EINVAL;
885 }
886
887 if (arg_ephemeral && arg_template) {
888 log_error("--ephemeral and --template= may not be combined.");
889 return -EINVAL;
890 }
891
892 if (arg_ephemeral && arg_image) {
893 log_error("--ephemeral and --image= may not be combined.");
894 return -EINVAL;
895 }
896
897 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
898 log_error("--ephemeral and --link-journal= may not be combined.");
899 return -EINVAL;
900 }
901
902 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
903 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
904
905 if (argc > optind) {
906 arg_parameters = strv_copy(argv + optind);
907 if (!arg_parameters)
908 return log_oom();
909
910 arg_settings_mask |= SETTING_BOOT;
911 }
912
913 /* Load all settings from .nspawn files */
914 if (mask_no_settings)
915 arg_settings_mask = 0;
916
917 /* Don't load any settings from .nspawn files */
918 if (mask_all_settings)
919 arg_settings_mask = _SETTINGS_MASK_ALL;
920
921 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
922
923 r = detect_unified_cgroup_hierarchy();
924 if (r < 0)
925 return r;
926
927 return 1;
928 }
929
930 static int verify_arguments(void) {
931
932 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
933 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
934 return -EINVAL;
935 }
936
937 if (arg_expose_ports && !arg_private_network) {
938 log_error("Cannot use --port= without private networking.");
939 return -EINVAL;
940 }
941
942 if (arg_boot && arg_kill_signal <= 0)
943 arg_kill_signal = SIGRTMIN+3;
944
945 return 0;
946 }
947
948 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
949 assert(p);
950
951 if (!arg_userns)
952 return 0;
953
954 if (uid == UID_INVALID && gid == GID_INVALID)
955 return 0;
956
957 if (uid != UID_INVALID) {
958 uid += arg_uid_shift;
959
960 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
961 return -EOVERFLOW;
962 }
963
964 if (gid != GID_INVALID) {
965 gid += (gid_t) arg_uid_shift;
966
967 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
968 return -EOVERFLOW;
969 }
970
971 if (lchown(p, uid, gid) < 0)
972 return -errno;
973
974 return 0;
975 }
976
/* Creates the directory path below root with the given mode and, if it was
 * newly created, hands it to userns_lchown() for the given UID/GID.
 *
 * A directory that exists already is left untouched and counts as success.
 *
 * Returns 0 on success, -errno from mkdir(), or the error of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
989
990 static int setup_timezone(const char *dest) {
991 _cleanup_free_ char *p = NULL, *q = NULL;
992 const char *where, *check, *what;
993 char *z, *y;
994 int r;
995
996 assert(dest);
997
998 /* Fix the timezone, if possible */
999 r = readlink_malloc("/etc/localtime", &p);
1000 if (r < 0) {
1001 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1002 return 0;
1003 }
1004
1005 z = path_startswith(p, "../usr/share/zoneinfo/");
1006 if (!z)
1007 z = path_startswith(p, "/usr/share/zoneinfo/");
1008 if (!z) {
1009 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1010 return 0;
1011 }
1012
1013 where = prefix_roota(dest, "/etc/localtime");
1014 r = readlink_malloc(where, &q);
1015 if (r >= 0) {
1016 y = path_startswith(q, "../usr/share/zoneinfo/");
1017 if (!y)
1018 y = path_startswith(q, "/usr/share/zoneinfo/");
1019
1020 /* Already pointing to the right place? Then do nothing .. */
1021 if (y && streq(y, z))
1022 return 0;
1023 }
1024
1025 check = strjoina("/usr/share/zoneinfo/", z);
1026 check = prefix_root(dest, check);
1027 if (laccess(check, F_OK) < 0) {
1028 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1029 return 0;
1030 }
1031
1032 r = unlink(where);
1033 if (r < 0 && errno != ENOENT) {
1034 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1035 return 0;
1036 }
1037
1038 what = strjoina("../usr/share/zoneinfo/", z);
1039 if (symlink(what, where) < 0) {
1040 log_error_errno(errno, "Failed to correct timezone of container: %m");
1041 return 0;
1042 }
1043
1044 r = userns_lchown(where, 0, 0);
1045 if (r < 0)
1046 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1047
1048 return 0;
1049 }
1050
1051 static int setup_resolv_conf(const char *dest) {
1052 const char *where = NULL;
1053 int r;
1054
1055 assert(dest);
1056
1057 if (arg_private_network)
1058 return 0;
1059
1060 /* Fix resolv.conf, if possible */
1061 where = prefix_roota(dest, "/etc/resolv.conf");
1062
1063 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1064 if (r < 0) {
1065 /* If the file already exists as symlink, let's
1066 * suppress the warning, under the assumption that
1067 * resolved or something similar runs inside and the
1068 * symlink points there.
1069 *
1070 * If the disk image is read-only, there's also no
1071 * point in complaining.
1072 */
1073 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1074 "Failed to copy /etc/resolv.conf to %s: %m", where);
1075 return 0;
1076 }
1077
1078 r = userns_lchown(where, 0, 0);
1079 if (r < 0)
1080 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1081
1082 return 0;
1083 }
1084
1085 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1086 assert(s);
1087
1088 snprintf(s, 37,
1089 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1090 SD_ID128_FORMAT_VAL(id));
1091
1092 return s;
1093 }
1094
1095 static int setup_boot_id(const char *dest) {
1096 const char *from, *to;
1097 sd_id128_t rnd = {};
1098 char as_uuid[37];
1099 int r;
1100
1101 if (arg_share_system)
1102 return 0;
1103
1104 /* Generate a new randomized boot ID, so that each boot-up of
1105 * the container gets a new one */
1106
1107 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1108 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1109
1110 r = sd_id128_randomize(&rnd);
1111 if (r < 0)
1112 return log_error_errno(r, "Failed to generate random boot id: %m");
1113
1114 id128_format_as_uuid(rnd, as_uuid);
1115
1116 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1117 if (r < 0)
1118 return log_error_errno(r, "Failed to write boot id: %m");
1119
1120 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1121 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1122 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1123 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1124
1125 unlink(from);
1126 return r;
1127 }
1128
1129 static int copy_devnodes(const char *dest) {
1130
1131 static const char devnodes[] =
1132 "null\0"
1133 "zero\0"
1134 "full\0"
1135 "random\0"
1136 "urandom\0"
1137 "tty\0"
1138 "net/tun\0";
1139
1140 const char *d;
1141 int r = 0;
1142 _cleanup_umask_ mode_t u;
1143
1144 assert(dest);
1145
1146 u = umask(0000);
1147
1148 /* Create /dev/net, so that we can create /dev/net/tun in it */
1149 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1150 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1151
1152 NULSTR_FOREACH(d, devnodes) {
1153 _cleanup_free_ char *from = NULL, *to = NULL;
1154 struct stat st;
1155
1156 from = strappend("/dev/", d);
1157 to = prefix_root(dest, from);
1158
1159 if (stat(from, &st) < 0) {
1160
1161 if (errno != ENOENT)
1162 return log_error_errno(errno, "Failed to stat %s: %m", from);
1163
1164 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1165
1166 log_error("%s is not a char or block device, cannot copy.", from);
1167 return -EIO;
1168
1169 } else {
1170 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1171 if (errno != EPERM)
1172 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1173
1174 /* Some systems abusively restrict mknod but
1175 * allow bind mounts. */
1176 r = touch(to);
1177 if (r < 0)
1178 return log_error_errno(r, "touch (%s) failed: %m", to);
1179 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1180 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1181 }
1182
1183 r = userns_lchown(to, 0, 0);
1184 if (r < 0)
1185 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1186 }
1187 }
1188
1189 return r;
1190 }
1191
1192 static int setup_pts(const char *dest) {
1193 _cleanup_free_ char *options = NULL;
1194 const char *p;
1195
1196 #ifdef HAVE_SELINUX
1197 if (arg_selinux_apifs_context)
1198 (void) asprintf(&options,
1199 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1200 arg_uid_shift + TTY_GID,
1201 arg_selinux_apifs_context);
1202 else
1203 #endif
1204 (void) asprintf(&options,
1205 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1206 arg_uid_shift + TTY_GID);
1207
1208 if (!options)
1209 return log_oom();
1210
1211 /* Mount /dev/pts itself */
1212 p = prefix_roota(dest, "/dev/pts");
1213 if (mkdir(p, 0755) < 0)
1214 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1215 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1216 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1217 if (userns_lchown(p, 0, 0) < 0)
1218 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1219
1220 /* Create /dev/ptmx symlink */
1221 p = prefix_roota(dest, "/dev/ptmx");
1222 if (symlink("pts/ptmx", p) < 0)
1223 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1224 if (userns_lchown(p, 0, 0) < 0)
1225 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1226
1227 /* And fix /dev/pts/ptmx ownership */
1228 p = prefix_roota(dest, "/dev/pts/ptmx");
1229 if (userns_lchown(p, 0, 0) < 0)
1230 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1231
1232 return 0;
1233 }
1234
1235 static int setup_dev_console(const char *dest, const char *console) {
1236 _cleanup_umask_ mode_t u;
1237 const char *to;
1238 int r;
1239
1240 assert(dest);
1241 assert(console);
1242
1243 u = umask(0000);
1244
1245 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1246 if (r < 0)
1247 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1248
1249 /* We need to bind mount the right tty to /dev/console since
1250 * ptys can only exist on pts file systems. To have something
1251 * to bind mount things on we create a empty regular file. */
1252
1253 to = prefix_roota(dest, "/dev/console");
1254 r = touch(to);
1255 if (r < 0)
1256 return log_error_errno(r, "touch() for /dev/console failed: %m");
1257
1258 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1259 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1260
1261 return 0;
1262 }
1263
1264 static int setup_kmsg(const char *dest, int kmsg_socket) {
1265 const char *from, *to;
1266 _cleanup_umask_ mode_t u;
1267 int fd, r;
1268
1269 assert(kmsg_socket >= 0);
1270
1271 u = umask(0000);
1272
1273 /* We create the kmsg FIFO as /run/kmsg, but immediately
1274 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1275 * on the reading side behave very similar to /proc/kmsg,
1276 * their writing side behaves differently from /dev/kmsg in
1277 * that writing blocks when nothing is reading. In order to
1278 * avoid any problems with containers deadlocking due to this
1279 * we simply make /dev/kmsg unavailable to the container. */
1280 from = prefix_roota(dest, "/run/kmsg");
1281 to = prefix_roota(dest, "/proc/kmsg");
1282
1283 if (mkfifo(from, 0600) < 0)
1284 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1285 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1286 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1287
1288 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1289 if (fd < 0)
1290 return log_error_errno(errno, "Failed to open fifo: %m");
1291
1292 /* Store away the fd in the socket, so that it stays open as
1293 * long as we run the child */
1294 r = send_one_fd(kmsg_socket, fd);
1295 safe_close(fd);
1296
1297 if (r < 0)
1298 return log_error_errno(r, "Failed to send FIFO fd: %m");
1299
1300 /* And now make the FIFO unavailable as /run/kmsg... */
1301 (void) unlink(from);
1302
1303 return 0;
1304 }
1305
1306 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1307 union in_addr_union *exposed = userdata;
1308
1309 assert(rtnl);
1310 assert(m);
1311 assert(exposed);
1312
1313 expose_port_execute(rtnl, arg_expose_ports, exposed);
1314 return 0;
1315 }
1316
1317 static int setup_hostname(void) {
1318
1319 if (arg_share_system)
1320 return 0;
1321
1322 if (sethostname_idempotent(arg_machine) < 0)
1323 return -errno;
1324
1325 return 0;
1326 }
1327
1328 static int setup_journal(const char *directory) {
1329 sd_id128_t machine_id, this_id;
1330 _cleanup_free_ char *b = NULL, *d = NULL;
1331 const char *etc_machine_id, *p, *q;
1332 char *id;
1333 int r;
1334
1335 /* Don't link journals in ephemeral mode */
1336 if (arg_ephemeral)
1337 return 0;
1338
1339 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1340
1341 r = read_one_line_file(etc_machine_id, &b);
1342 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1343 return 0;
1344 else if (r < 0)
1345 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1346
1347 id = strstrip(b);
1348 if (isempty(id) && arg_link_journal == LINK_AUTO)
1349 return 0;
1350
1351 /* Verify validity */
1352 r = sd_id128_from_string(id, &machine_id);
1353 if (r < 0)
1354 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1355
1356 r = sd_id128_get_machine(&this_id);
1357 if (r < 0)
1358 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1359
1360 if (sd_id128_equal(machine_id, this_id)) {
1361 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1362 "Host and machine ids are equal (%s): refusing to link journals", id);
1363 if (arg_link_journal == LINK_AUTO)
1364 return 0;
1365 return -EEXIST;
1366 }
1367
1368 if (arg_link_journal == LINK_NO)
1369 return 0;
1370
1371 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1372 if (r < 0)
1373 return log_error_errno(r, "Failed to create /var: %m");
1374
1375 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1376 if (r < 0)
1377 return log_error_errno(r, "Failed to create /var/log: %m");
1378
1379 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1380 if (r < 0)
1381 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1382
1383 p = strjoina("/var/log/journal/", id);
1384 q = prefix_roota(directory, p);
1385
1386 if (path_is_mount_point(p, 0) > 0) {
1387 if (arg_link_journal != LINK_AUTO) {
1388 log_error("%s: already a mount point, refusing to use for journal", p);
1389 return -EEXIST;
1390 }
1391
1392 return 0;
1393 }
1394
1395 if (path_is_mount_point(q, 0) > 0) {
1396 if (arg_link_journal != LINK_AUTO) {
1397 log_error("%s: already a mount point, refusing to use for journal", q);
1398 return -EEXIST;
1399 }
1400
1401 return 0;
1402 }
1403
1404 r = readlink_and_make_absolute(p, &d);
1405 if (r >= 0) {
1406 if ((arg_link_journal == LINK_GUEST ||
1407 arg_link_journal == LINK_AUTO) &&
1408 path_equal(d, q)) {
1409
1410 r = userns_mkdir(directory, p, 0755, 0, 0);
1411 if (r < 0)
1412 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1413 return 0;
1414 }
1415
1416 if (unlink(p) < 0)
1417 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1418 } else if (r == -EINVAL) {
1419
1420 if (arg_link_journal == LINK_GUEST &&
1421 rmdir(p) < 0) {
1422
1423 if (errno == ENOTDIR) {
1424 log_error("%s already exists and is neither a symlink nor a directory", p);
1425 return r;
1426 } else {
1427 log_error_errno(errno, "Failed to remove %s: %m", p);
1428 return -errno;
1429 }
1430 }
1431 } else if (r != -ENOENT) {
1432 log_error_errno(errno, "readlink(%s) failed: %m", p);
1433 return r;
1434 }
1435
1436 if (arg_link_journal == LINK_GUEST) {
1437
1438 if (symlink(q, p) < 0) {
1439 if (arg_link_journal_try) {
1440 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1441 return 0;
1442 } else {
1443 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1444 return -errno;
1445 }
1446 }
1447
1448 r = userns_mkdir(directory, p, 0755, 0, 0);
1449 if (r < 0)
1450 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1451 return 0;
1452 }
1453
1454 if (arg_link_journal == LINK_HOST) {
1455 /* don't create parents here -- if the host doesn't have
1456 * permanent journal set up, don't force it here */
1457 r = mkdir(p, 0755);
1458 if (r < 0) {
1459 if (arg_link_journal_try) {
1460 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1461 return 0;
1462 } else {
1463 log_error_errno(errno, "Failed to create %s: %m", p);
1464 return r;
1465 }
1466 }
1467
1468 } else if (access(p, F_OK) < 0)
1469 return 0;
1470
1471 if (dir_is_empty(q) == 0)
1472 log_warning("%s is not empty, proceeding anyway.", q);
1473
1474 r = userns_mkdir(directory, p, 0755, 0, 0);
1475 if (r < 0) {
1476 log_error_errno(errno, "Failed to create %s: %m", q);
1477 return r;
1478 }
1479
1480 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1481 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1482
1483 return 0;
1484 }
1485
1486 static int drop_capabilities(void) {
1487 return capability_bounding_set_drop(~arg_retain, false);
1488 }
1489
1490 static int reset_audit_loginuid(void) {
1491 _cleanup_free_ char *p = NULL;
1492 int r;
1493
1494 if (arg_share_system)
1495 return 0;
1496
1497 r = read_one_line_file("/proc/self/loginuid", &p);
1498 if (r == -ENOENT)
1499 return 0;
1500 if (r < 0)
1501 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1502
1503 /* Already reset? */
1504 if (streq(p, "4294967295"))
1505 return 0;
1506
1507 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1508 if (r < 0) {
1509 log_error_errno(r,
1510 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1511 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1512 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1513 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1514 "using systemd-nspawn. Sleeping for 5s... (%m)");
1515
1516 sleep(5);
1517 }
1518
1519 return 0;
1520 }
1521
/* Installs a seccomp filter (only when built with HAVE_SECCOMP; otherwise this is a
 * no-op returning 0). The filter allows everything by default, and then:
 *
 *   - makes each syscall of the table below fail with EPERM, unless the capability
 *     listed next to it is part of arg_retain;
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * A seccomp_load() result of -EINVAL is not treated as failure.
 * Returns a negative errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* Syscalls guarded by a retained capability stay available */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                /* -EFAULT means: unknown syscall, which is silently skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace audit hookup
         * fails when run inside one. We don't care, and simply turn off creation
         * of audit sockets.
         *
         * With this rule socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
         * EAFNOSUPPORT, which audit userspace takes as indication that audit is
         * disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto finish;
        }
        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1616
1617 static int setup_propagate(const char *root) {
1618 const char *p, *q;
1619
1620 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1621 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1622 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1623 (void) mkdir_p(p, 0600);
1624
1625 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1626 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1627
1628 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1629 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1630
1631 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1632 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1633
1634 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1635 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1636 return log_error_errno(errno, "Failed to install propagation bind mount.");
1637
1638 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1639 return log_error_errno(errno, "Failed to make propagation mount read-only");
1640
1641 return 0;
1642 }
1643
1644 static int setup_image(char **device_path, int *loop_nr) {
1645 struct loop_info64 info = {
1646 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1647 };
1648 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1649 _cleanup_free_ char* loopdev = NULL;
1650 struct stat st;
1651 int r, nr;
1652
1653 assert(device_path);
1654 assert(loop_nr);
1655 assert(arg_image);
1656
1657 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1658 if (fd < 0)
1659 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1660
1661 if (fstat(fd, &st) < 0)
1662 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1663
1664 if (S_ISBLK(st.st_mode)) {
1665 char *p;
1666
1667 p = strdup(arg_image);
1668 if (!p)
1669 return log_oom();
1670
1671 *device_path = p;
1672
1673 *loop_nr = -1;
1674
1675 r = fd;
1676 fd = -1;
1677
1678 return r;
1679 }
1680
1681 if (!S_ISREG(st.st_mode)) {
1682 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1683 return -EINVAL;
1684 }
1685
1686 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1687 if (control < 0)
1688 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1689
1690 nr = ioctl(control, LOOP_CTL_GET_FREE);
1691 if (nr < 0)
1692 return log_error_errno(errno, "Failed to allocate loop device: %m");
1693
1694 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1695 return log_oom();
1696
1697 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1698 if (loop < 0)
1699 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1700
1701 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1702 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1703
1704 if (arg_read_only)
1705 info.lo_flags |= LO_FLAGS_READ_ONLY;
1706
1707 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1708 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1709
1710 *device_path = loopdev;
1711 loopdev = NULL;
1712
1713 *loop_nr = nr;
1714
1715 r = loop;
1716 loop = -1;
1717
1718 return r;
1719 }
1720
/* Explanatory text appended to the error messages about unusable partition tables */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Analyzes the partition table of the block device referenced by fd and determines the
 * device nodes of the partitions to use as root, /home and /srv.
 *
 * On GPT the partition type UUIDs decide: native root, secondary root (if defined for
 * this build), home, srv, and generic Linux data. For each typed kind the matching
 * partition with the lowest partition number wins; partitions carrying the "no auto"
 * flag are ignored. On MBR only partitions of type 0x83 with the bootable flag are
 * considered, and they are treated like GPT generic partitions.
 *
 * The root device is picked in this order: native root, secondary root, generic. A
 * generic partition is only used if it is the only one of its kind.
 *
 * On success *root_device (and *home_device, *srv_device if such partitions were
 * found) are set to newly allocated device node paths owned by the caller, the
 * matching *_rw flags tell whether the partition may be mounted writable, and
 * *secondary tells whether the secondary root partition was picked.
 *
 * Returns 0 on success, a negative errno-style value on failure. Without blkid
 * support -EOPNOTSUPP is returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel knows as many partitions as blkid found,
         * giving up after 10 attempts */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The "+ 1" accounts for
                 * the one extra entry the kernel-side enumeration is expected to
                 * contain (presumably the whole-disk device itself, which the
                 * loop further down skips). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the device we were invoked on itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the partition with the lowest number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* Bootable MBR Linux partitions are handled like generic
                         * GPT partitions: they are tracked in "generic", so that
                         * a second candidate is noticed and refused below,
                         * instead of silently replacing the first one. */
                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2100
/* Mounts the block device "what" on "where" (or on the sub-directory "directory"
 * below "where", if directory is non-NULL), after probing its file system type with
 * blkid. The mount is read-only if rw is false or arg_read_only is set. LUKS volumes
 * are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure. Without blkid
 * support -EOPNOTSUPP is returned. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if the
                 * result is ambivalent. Everything else that is non-zero (i.e. -1) is
                 * a real probing error, which is reported with its errno below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2164
2165 static int mount_devices(
2166 const char *where,
2167 const char *root_device, bool root_device_rw,
2168 const char *home_device, bool home_device_rw,
2169 const char *srv_device, bool srv_device_rw) {
2170 int r;
2171
2172 assert(where);
2173
2174 if (root_device) {
2175 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2176 if (r < 0)
2177 return log_error_errno(r, "Failed to mount root directory: %m");
2178 }
2179
2180 if (home_device) {
2181 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2182 if (r < 0)
2183 return log_error_errno(r, "Failed to mount home directory: %m");
2184 }
2185
2186 if (srv_device) {
2187 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2188 if (r < 0)
2189 return log_error_errno(r, "Failed to mount server data directory: %m");
2190 }
2191
2192 return 0;
2193 }
2194
2195 static void loop_remove(int nr, int *image_fd) {
2196 _cleanup_close_ int control = -1;
2197 int r;
2198
2199 if (nr < 0)
2200 return;
2201
2202 if (image_fd && *image_fd >= 0) {
2203 r = ioctl(*image_fd, LOOP_CLR_FD);
2204 if (r < 0)
2205 log_debug_errno(errno, "Failed to close loop image: %m");
2206 *image_fd = safe_close(*image_fd);
2207 }
2208
2209 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2210 if (control < 0) {
2211 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2212 return;
2213 }
2214
2215 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2216 if (r < 0)
2217 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2218 }
2219
2220 /*
2221 * Return values:
2222 * < 0 : wait_for_terminate() failed to get the state of the
2223 * container, the container was terminated by a signal, or
2224 * failed for an unknown reason. No change is made to the
2225 * container argument.
2226 * > 0 : The program executed in the container terminated with an
2227 * error. The exit code of the program executed in the
2228 * container is returned. The container argument has been set
2229 * to CONTAINER_TERMINATED.
2230 * 0 : The container is being rebooted, has been shut down or exited
2231 * successfully. The container argument has been set to either
2232 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2233 *
2234 * That is, success is indicated by a return value of zero, and an
2235 * error is indicated by a non-zero value.
2236 */
2237 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2238 siginfo_t status;
2239 int r;
2240
2241 r = wait_for_terminate(pid, &status);
2242 if (r < 0)
2243 return log_warning_errno(r, "Failed to wait for container: %m");
2244
2245 switch (status.si_code) {
2246
2247 case CLD_EXITED:
2248 if (status.si_status == 0) {
2249 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2250
2251 } else
2252 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2253
2254 *container = CONTAINER_TERMINATED;
2255 return status.si_status;
2256
2257 case CLD_KILLED:
2258 if (status.si_status == SIGINT) {
2259
2260 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2261 *container = CONTAINER_TERMINATED;
2262 return 0;
2263
2264 } else if (status.si_status == SIGHUP) {
2265
2266 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2267 *container = CONTAINER_REBOOTED;
2268 return 0;
2269 }
2270
2271 /* CLD_KILLED fallthrough */
2272
2273 case CLD_DUMPED:
2274 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2275 return -EIO;
2276
2277 default:
2278 log_error("Container %s failed due to unknown reason.", arg_machine);
2279 return -EIO;
2280 }
2281
2282 return r;
2283 }
2284
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
}
2286
2287 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2288 pid_t pid;
2289
2290 pid = PTR_TO_UINT32(userdata);
2291 if (pid > 0) {
2292 if (kill(pid, arg_kill_signal) >= 0) {
2293 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2294 sd_event_source_set_userdata(s, NULL);
2295 return 0;
2296 }
2297 }
2298
2299 sd_event_exit(sd_event_source_get_event(s), 0);
2300 return 0;
2301 }
2302
2303 static int determine_names(void) {
2304 int r;
2305
2306 if (arg_template && !arg_directory && arg_machine) {
2307
2308 /* If --template= was specified then we should not
2309 * search for a machine, but instead create a new one
2310 * in /var/lib/machine. */
2311
2312 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2313 if (!arg_directory)
2314 return log_oom();
2315 }
2316
2317 if (!arg_image && !arg_directory) {
2318 if (arg_machine) {
2319 _cleanup_(image_unrefp) Image *i = NULL;
2320
2321 r = image_find(arg_machine, &i);
2322 if (r < 0)
2323 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2324 else if (r == 0) {
2325 log_error("No image for machine '%s': %m", arg_machine);
2326 return -ENOENT;
2327 }
2328
2329 if (i->type == IMAGE_RAW)
2330 r = set_sanitized_path(&arg_image, i->path);
2331 else
2332 r = set_sanitized_path(&arg_directory, i->path);
2333 if (r < 0)
2334 return log_error_errno(r, "Invalid image directory: %m");
2335
2336 if (!arg_ephemeral)
2337 arg_read_only = arg_read_only || i->read_only;
2338 } else
2339 arg_directory = get_current_dir_name();
2340
2341 if (!arg_directory && !arg_machine) {
2342 log_error("Failed to determine path, please use -D or -i.");
2343 return -EINVAL;
2344 }
2345 }
2346
2347 if (!arg_machine) {
2348 if (arg_directory && path_equal(arg_directory, "/"))
2349 arg_machine = gethostname_malloc();
2350 else
2351 arg_machine = strdup(basename(arg_image ?: arg_directory));
2352
2353 if (!arg_machine)
2354 return log_oom();
2355
2356 hostname_cleanup(arg_machine);
2357 if (!machine_name_is_valid(arg_machine)) {
2358 log_error("Failed to determine machine name automatically, please use -M.");
2359 return -EINVAL;
2360 }
2361
2362 if (arg_ephemeral) {
2363 char *b;
2364
2365 /* Add a random suffix when this is an
2366 * ephemeral machine, so that we can run many
2367 * instances at once without manually having
2368 * to specify -M each time. */
2369
2370 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2371 return log_oom();
2372
2373 free(arg_machine);
2374 arg_machine = b;
2375 }
2376 }
2377
2378 return 0;
2379 }
2380
2381 static int determine_uid_shift(const char *directory) {
2382 int r;
2383
2384 if (!arg_userns) {
2385 arg_uid_shift = 0;
2386 return 0;
2387 }
2388
2389 if (arg_uid_shift == UID_INVALID) {
2390 struct stat st;
2391
2392 r = stat(directory, &st);
2393 if (r < 0)
2394 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2395
2396 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2397
2398 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2399 log_error("UID and GID base of %s don't match.", directory);
2400 return -EINVAL;
2401 }
2402
2403 arg_uid_range = UINT32_C(0x10000);
2404 }
2405
2406 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2407 log_error("UID base too high for UID range.");
2408 return -EINVAL;
2409 }
2410
2411 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2412 return 0;
2413 }
2414
2415 static int inner_child(
2416 Barrier *barrier,
2417 const char *directory,
2418 bool secondary,
2419 int kmsg_socket,
2420 int rtnl_socket,
2421 FDSet *fds) {
2422
2423 _cleanup_free_ char *home = NULL;
2424 unsigned n_env = 2;
2425 const char *envp[] = {
2426 "PATH=" DEFAULT_PATH_SPLIT_USR,
2427 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2428 NULL, /* TERM */
2429 NULL, /* HOME */
2430 NULL, /* USER */
2431 NULL, /* LOGNAME */
2432 NULL, /* container_uuid */
2433 NULL, /* LISTEN_FDS */
2434 NULL, /* LISTEN_PID */
2435 NULL
2436 };
2437
2438 _cleanup_strv_free_ char **env_use = NULL;
2439 int r;
2440
2441 assert(barrier);
2442 assert(directory);
2443 assert(kmsg_socket >= 0);
2444
2445 cg_unified_flush();
2446
2447 if (arg_userns) {
2448 /* Tell the parent, that it now can write the UID map. */
2449 (void) barrier_place(barrier); /* #1 */
2450
2451 /* Wait until the parent wrote the UID map */
2452 if (!barrier_place_and_sync(barrier)) { /* #2 */
2453 log_error("Parent died too early");
2454 return -ESRCH;
2455 }
2456 }
2457
2458 r = mount_all(NULL, true, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2459 if (r < 0)
2460 return r;
2461
2462 /* Wait until we are cgroup-ified, so that we
2463 * can mount the right cgroup path writable */
2464 if (!barrier_place_and_sync(barrier)) { /* #3 */
2465 log_error("Parent died too early");
2466 return -ESRCH;
2467 }
2468
2469 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2470 if (r < 0)
2471 return r;
2472
2473 r = reset_uid_gid();
2474 if (r < 0)
2475 return log_error_errno(r, "Couldn't become new root: %m");
2476
2477 r = setup_boot_id(NULL);
2478 if (r < 0)
2479 return r;
2480
2481 r = setup_kmsg(NULL, kmsg_socket);
2482 if (r < 0)
2483 return r;
2484 kmsg_socket = safe_close(kmsg_socket);
2485
2486 umask(0022);
2487
2488 if (setsid() < 0)
2489 return log_error_errno(errno, "setsid() failed: %m");
2490
2491 if (arg_private_network)
2492 loopback_setup();
2493
2494 if (arg_expose_ports) {
2495 r = expose_port_send_rtnl(rtnl_socket);
2496 if (r < 0)
2497 return r;
2498 rtnl_socket = safe_close(rtnl_socket);
2499 }
2500
2501 if (drop_capabilities() < 0)
2502 return log_error_errno(errno, "drop_capabilities() failed: %m");
2503
2504 setup_hostname();
2505
2506 if (arg_personality != PERSONALITY_INVALID) {
2507 if (personality(arg_personality) < 0)
2508 return log_error_errno(errno, "personality() failed: %m");
2509 } else if (secondary) {
2510 if (personality(PER_LINUX32) < 0)
2511 return log_error_errno(errno, "personality() failed: %m");
2512 }
2513
2514 #ifdef HAVE_SELINUX
2515 if (arg_selinux_context)
2516 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2517 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2518 #endif
2519
2520 r = change_uid_gid(arg_user, &home);
2521 if (r < 0)
2522 return r;
2523
2524 envp[n_env] = strv_find_prefix(environ, "TERM=");
2525 if (envp[n_env])
2526 n_env ++;
2527
2528 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2529 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2530 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2531 return log_oom();
2532
2533 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2534 char as_uuid[37];
2535
2536 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2537 return log_oom();
2538 }
2539
2540 if (fdset_size(fds) > 0) {
2541 r = fdset_cloexec(fds, false);
2542 if (r < 0)
2543 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2544
2545 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2546 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2547 return log_oom();
2548 }
2549
2550 env_use = strv_env_merge(2, envp, arg_setenv);
2551 if (!env_use)
2552 return log_oom();
2553
2554 /* Let the parent know that we are ready and
2555 * wait until the parent is ready with the
2556 * setup, too... */
2557 if (!barrier_place_and_sync(barrier)) { /* #4 */
2558 log_error("Parent died too early");
2559 return -ESRCH;
2560 }
2561
2562 /* Now, explicitly close the log, so that we
2563 * then can close all remaining fds. Closing
2564 * the log explicitly first has the benefit
2565 * that the logging subsystem knows about it,
2566 * and is thus ready to be reopened should we
2567 * need it again. Note that the other fds
2568 * closed here are at least the locking and
2569 * barrier fds. */
2570 log_close();
2571 (void) fdset_close_others(fds);
2572
2573 if (arg_boot) {
2574 char **a;
2575 size_t m;
2576
2577 /* Automatically search for the init system */
2578
2579 m = 1 + strv_length(arg_parameters);
2580 a = newa(char*, m + 1);
2581 if (strv_isempty(arg_parameters))
2582 a[1] = NULL;
2583 else
2584 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2585
2586 a[0] = (char*) "/usr/lib/systemd/systemd";
2587 execve(a[0], a, env_use);
2588
2589 a[0] = (char*) "/lib/systemd/systemd";
2590 execve(a[0], a, env_use);
2591
2592 a[0] = (char*) "/sbin/init";
2593 execve(a[0], a, env_use);
2594 } else if (!strv_isempty(arg_parameters))
2595 execvpe(arg_parameters[0], arg_parameters, env_use);
2596 else {
2597 chdir(home ?: "/root");
2598 execle("/bin/bash", "-bash", NULL, env_use);
2599 execle("/bin/sh", "-sh", NULL, env_use);
2600 }
2601
2602 (void) log_open();
2603 return log_error_errno(errno, "execv() failed: %m");
2604 }
2605
2606 static int outer_child(
2607 Barrier *barrier,
2608 const char *directory,
2609 const char *console,
2610 const char *root_device, bool root_device_rw,
2611 const char *home_device, bool home_device_rw,
2612 const char *srv_device, bool srv_device_rw,
2613 bool interactive,
2614 bool secondary,
2615 int pid_socket,
2616 int kmsg_socket,
2617 int rtnl_socket,
2618 int uid_shift_socket,
2619 FDSet *fds) {
2620
2621 pid_t pid;
2622 ssize_t l;
2623 int r;
2624
2625 assert(barrier);
2626 assert(directory);
2627 assert(console);
2628 assert(pid_socket >= 0);
2629 assert(kmsg_socket >= 0);
2630
2631 cg_unified_flush();
2632
2633 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2634 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2635
2636 if (interactive) {
2637 close_nointr(STDIN_FILENO);
2638 close_nointr(STDOUT_FILENO);
2639 close_nointr(STDERR_FILENO);
2640
2641 r = open_terminal(console, O_RDWR);
2642 if (r != STDIN_FILENO) {
2643 if (r >= 0) {
2644 safe_close(r);
2645 r = -EINVAL;
2646 }
2647
2648 return log_error_errno(r, "Failed to open console: %m");
2649 }
2650
2651 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2652 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2653 return log_error_errno(errno, "Failed to duplicate console: %m");
2654 }
2655
2656 r = reset_audit_loginuid();
2657 if (r < 0)
2658 return r;
2659
2660 /* Mark everything as slave, so that we still
2661 * receive mounts from the real root, but don't
2662 * propagate mounts to the real root. */
2663 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2664 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2665
2666 r = mount_devices(directory,
2667 root_device, root_device_rw,
2668 home_device, home_device_rw,
2669 srv_device, srv_device_rw);
2670 if (r < 0)
2671 return r;
2672
2673 r = determine_uid_shift(directory);
2674 if (r < 0)
2675 return r;
2676
2677 if (arg_userns) {
2678 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2679 if (l < 0)
2680 return log_error_errno(errno, "Failed to send UID shift: %m");
2681 if (l != sizeof(arg_uid_shift)) {
2682 log_error("Short write while sending UID shift.");
2683 return -EIO;
2684 }
2685 }
2686
2687 /* Turn directory into bind mount */
2688 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2689 return log_error_errno(errno, "Failed to make bind mount: %m");
2690
2691 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2692 if (r < 0)
2693 return r;
2694
2695 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2696 if (r < 0)
2697 return r;
2698
2699 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2700 if (r < 0)
2701 return r;
2702
2703 if (arg_read_only) {
2704 r = bind_remount_recursive(directory, true);
2705 if (r < 0)
2706 return log_error_errno(r, "Failed to make tree read-only: %m");
2707 }
2708
2709 r = mount_all(directory, false, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2710 if (r < 0)
2711 return r;
2712
2713 r = copy_devnodes(directory);
2714 if (r < 0)
2715 return r;
2716
2717 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2718
2719 r = setup_pts(directory);
2720 if (r < 0)
2721 return r;
2722
2723 r = setup_propagate(directory);
2724 if (r < 0)
2725 return r;
2726
2727 r = setup_dev_console(directory, console);
2728 if (r < 0)
2729 return r;
2730
2731 r = setup_seccomp();
2732 if (r < 0)
2733 return r;
2734
2735 r = setup_timezone(directory);
2736 if (r < 0)
2737 return r;
2738
2739 r = setup_resolv_conf(directory);
2740 if (r < 0)
2741 return r;
2742
2743 r = setup_journal(directory);
2744 if (r < 0)
2745 return r;
2746
2747 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2748 if (r < 0)
2749 return r;
2750
2751 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2752 if (r < 0)
2753 return r;
2754
2755 r = mount_move_root(directory);
2756 if (r < 0)
2757 return log_error_errno(r, "Failed to move root directory: %m");
2758
2759 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2760 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2761 (arg_private_network ? CLONE_NEWNET : 0) |
2762 (arg_userns ? CLONE_NEWUSER : 0),
2763 NULL);
2764 if (pid < 0)
2765 return log_error_errno(errno, "Failed to fork inner child: %m");
2766 if (pid == 0) {
2767 pid_socket = safe_close(pid_socket);
2768 uid_shift_socket = safe_close(uid_shift_socket);
2769
2770 /* The inner child has all namespaces that are
2771 * requested, so that we all are owned by the user if
2772 * user namespaces are turned on. */
2773
2774 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2775 if (r < 0)
2776 _exit(EXIT_FAILURE);
2777
2778 _exit(EXIT_SUCCESS);
2779 }
2780
2781 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2782 if (l < 0)
2783 return log_error_errno(errno, "Failed to send PID: %m");
2784 if (l != sizeof(pid)) {
2785 log_error("Short write while sending PID.");
2786 return -EIO;
2787 }
2788
2789 pid_socket = safe_close(pid_socket);
2790 kmsg_socket = safe_close(kmsg_socket);
2791 rtnl_socket = safe_close(rtnl_socket);
2792
2793 return 0;
2794 }
2795
2796 static int setup_uid_map(pid_t pid) {
2797 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2798 int r;
2799
2800 assert(pid > 1);
2801
2802 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2803 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2804 r = write_string_file(uid_map, line, 0);
2805 if (r < 0)
2806 return log_error_errno(r, "Failed to write UID map: %m");
2807
2808 /* We always assign the same UID and GID ranges */
2809 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2810 r = write_string_file(uid_map, line, 0);
2811 if (r < 0)
2812 return log_error_errno(r, "Failed to write GID map: %m");
2813
2814 return 0;
2815 }
2816
2817 static int load_settings(void) {
2818 _cleanup_(settings_freep) Settings *settings = NULL;
2819 _cleanup_fclose_ FILE *f = NULL;
2820 _cleanup_free_ char *p = NULL;
2821 const char *fn, *i;
2822 int r;
2823
2824 /* If all settings are masked, there's no point in looking for
2825 * the settings file */
2826 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2827 return 0;
2828
2829 fn = strjoina(arg_machine, ".nspawn");
2830
2831 /* We first look in the admin's directories in /etc and /run */
2832 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2833 _cleanup_free_ char *j = NULL;
2834
2835 j = strjoin(i, "/", fn, NULL);
2836 if (!j)
2837 return log_oom();
2838
2839 f = fopen(j, "re");
2840 if (f) {
2841 p = j;
2842 j = NULL;
2843
2844 /* By default we trust configuration from /etc and /run */
2845 if (arg_settings_trusted < 0)
2846 arg_settings_trusted = true;
2847
2848 break;
2849 }
2850
2851 if (errno != ENOENT)
2852 return log_error_errno(errno, "Failed to open %s: %m", j);
2853 }
2854
2855 if (!f) {
2856 /* After that, let's look for a file next to the
2857 * actual image we shall boot. */
2858
2859 if (arg_image) {
2860 p = file_in_same_dir(arg_image, fn);
2861 if (!p)
2862 return log_oom();
2863 } else if (arg_directory) {
2864 p = file_in_same_dir(arg_directory, fn);
2865 if (!p)
2866 return log_oom();
2867 }
2868
2869 if (p) {
2870 f = fopen(p, "re");
2871 if (!f && errno != ENOENT)
2872 return log_error_errno(errno, "Failed to open %s: %m", p);
2873
2874 /* By default we do not trust configuration from /var/lib/machines */
2875 if (arg_settings_trusted < 0)
2876 arg_settings_trusted = false;
2877 }
2878 }
2879
2880 if (!f)
2881 return 0;
2882
2883 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2884
2885 r = settings_load(f, p, &settings);
2886 if (r < 0)
2887 return r;
2888
2889 /* Copy over bits from the settings, unless they have been
2890 * explicitly masked by command line switches. */
2891
2892 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2893 settings->boot >= 0) {
2894 arg_boot = settings->boot;
2895
2896 strv_free(arg_parameters);
2897 arg_parameters = settings->parameters;
2898 settings->parameters = NULL;
2899 }
2900
2901 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2902 settings->environment) {
2903 strv_free(arg_setenv);
2904 arg_setenv = settings->environment;
2905 settings->environment = NULL;
2906 }
2907
2908 if ((arg_settings_mask & SETTING_USER) == 0 &&
2909 settings->user) {
2910 free(arg_user);
2911 arg_user = settings->user;
2912 settings->user = NULL;
2913 }
2914
2915 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2916
2917 if (!arg_settings_trusted && settings->capability != 0)
2918 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2919 else
2920 arg_retain |= settings->capability;
2921
2922 arg_retain &= ~settings->drop_capability;
2923 }
2924
2925 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2926 settings->kill_signal > 0)
2927 arg_kill_signal = settings->kill_signal;
2928
2929 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2930 settings->personality != PERSONALITY_INVALID)
2931 arg_personality = settings->personality;
2932
2933 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2934 !sd_id128_is_null(settings->machine_id)) {
2935
2936 if (!arg_settings_trusted)
2937 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2938 else
2939 arg_uuid = settings->machine_id;
2940 }
2941
2942 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2943 settings->read_only >= 0)
2944 arg_read_only = settings->read_only;
2945
2946 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2947 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2948 arg_volatile_mode = settings->volatile_mode;
2949
2950 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2951 settings->n_custom_mounts > 0) {
2952
2953 if (!arg_settings_trusted)
2954 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2955 else {
2956 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2957 arg_custom_mounts = settings->custom_mounts;
2958 arg_n_custom_mounts = settings->n_custom_mounts;
2959
2960 settings->custom_mounts = NULL;
2961 settings->n_custom_mounts = 0;
2962 }
2963 }
2964
2965 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2966 (settings->private_network >= 0 ||
2967 settings->network_veth >= 0 ||
2968 settings->network_bridge ||
2969 settings->network_interfaces ||
2970 settings->network_macvlan ||
2971 settings->network_ipvlan)) {
2972
2973 if (!arg_settings_trusted)
2974 log_warning("Ignoring network settings, file %s is not trusted.", p);
2975 else {
2976 strv_free(arg_network_interfaces);
2977 arg_network_interfaces = settings->network_interfaces;
2978 settings->network_interfaces = NULL;
2979
2980 strv_free(arg_network_macvlan);
2981 arg_network_macvlan = settings->network_macvlan;
2982 settings->network_macvlan = NULL;
2983
2984 strv_free(arg_network_ipvlan);
2985 arg_network_ipvlan = settings->network_ipvlan;
2986 settings->network_ipvlan = NULL;
2987
2988 free(arg_network_bridge);
2989 arg_network_bridge = settings->network_bridge;
2990 settings->network_bridge = NULL;
2991
2992 arg_network_veth = settings->network_veth > 0 || settings->network_bridge;
2993
2994 arg_private_network = true; /* all these settings imply private networking */
2995 }
2996 }
2997
2998 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2999 settings->expose_ports) {
3000
3001 if (!arg_settings_trusted)
3002 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3003 else {
3004 expose_port_free_all(arg_expose_ports);
3005 arg_expose_ports = settings->expose_ports;
3006 settings->expose_ports = NULL;
3007 }
3008 }
3009
3010 return 0;
3011 }
3012
3013 int main(int argc, char *argv[]) {
3014
3015 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3016 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3017 _cleanup_close_ int master = -1, image_fd = -1;
3018 _cleanup_fdset_free_ FDSet *fds = NULL;
3019 int r, n_fd_passed, loop_nr = -1;
3020 char veth_name[IFNAMSIZ];
3021 bool secondary = false, remove_subvol = false;
3022 sigset_t mask_chld;
3023 pid_t pid = 0;
3024 int ret = EXIT_SUCCESS;
3025 union in_addr_union exposed = {};
3026 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3027 bool interactive;
3028
3029 log_parse_environment();
3030 log_open();
3031
3032 r = parse_argv(argc, argv);
3033 if (r <= 0)
3034 goto finish;
3035
3036 if (geteuid() != 0) {
3037 log_error("Need to be root.");
3038 r = -EPERM;
3039 goto finish;
3040 }
3041 r = determine_names();
3042 if (r < 0)
3043 goto finish;
3044
3045 r = load_settings();
3046 if (r < 0)
3047 goto finish;
3048
3049 r = verify_arguments();
3050 if (r < 0)
3051 goto finish;
3052
3053 n_fd_passed = sd_listen_fds(false);
3054 if (n_fd_passed > 0) {
3055 r = fdset_new_listen_fds(&fds, false);
3056 if (r < 0) {
3057 log_error_errno(r, "Failed to collect file descriptors: %m");
3058 goto finish;
3059 }
3060 }
3061
3062 if (arg_directory) {
3063 assert(!arg_image);
3064
3065 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3066 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3067 r = -EINVAL;
3068 goto finish;
3069 }
3070
3071 if (arg_ephemeral) {
3072 _cleanup_free_ char *np = NULL;
3073
3074 /* If the specified path is a mount point we
3075 * generate the new snapshot immediately
3076 * inside it under a random name. However if
3077 * the specified is not a mount point we
3078 * create the new snapshot in the parent
3079 * directory, just next to it. */
3080 r = path_is_mount_point(arg_directory, 0);
3081 if (r < 0) {
3082 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3083 goto finish;
3084 }
3085 if (r > 0)
3086 r = tempfn_random_child(arg_directory, "machine.", &np);
3087 else
3088 r = tempfn_random(arg_directory, "machine.", &np);
3089 if (r < 0) {
3090 log_error_errno(r, "Failed to generate name for snapshot: %m");
3091 goto finish;
3092 }
3093
3094 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3095 if (r < 0) {
3096 log_error_errno(r, "Failed to lock %s: %m", np);
3097 goto finish;
3098 }
3099
3100 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3101 if (r < 0) {
3102 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3103 goto finish;
3104 }
3105
3106 free(arg_directory);
3107 arg_directory = np;
3108 np = NULL;
3109
3110 remove_subvol = true;
3111
3112 } else {
3113 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3114 if (r == -EBUSY) {
3115 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3116 goto finish;
3117 }
3118 if (r < 0) {
3119 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3120 return r;
3121 }
3122
3123 if (arg_template) {
3124 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3125 if (r == -EEXIST) {
3126 if (!arg_quiet)
3127 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3128 } else if (r < 0) {
3129 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3130 goto finish;
3131 } else {
3132 if (!arg_quiet)
3133 log_info("Populated %s from template %s.", arg_directory, arg_template);
3134 }
3135 }
3136 }
3137
3138 if (arg_boot) {
3139 if (path_is_os_tree(arg_directory) <= 0) {
3140 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3141 r = -EINVAL;
3142 goto finish;
3143 }
3144 } else {
3145 const char *p;
3146
3147 p = strjoina(arg_directory,
3148 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3149 if (access(p, F_OK) < 0) {
3150 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3151 r = -EINVAL;
3152 goto finish;
3153 }
3154 }
3155
3156 } else {
3157 char template[] = "/tmp/nspawn-root-XXXXXX";
3158
3159 assert(arg_image);
3160 assert(!arg_template);
3161
3162 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3163 if (r == -EBUSY) {
3164 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3165 goto finish;
3166 }
3167 if (r < 0) {
3168 r = log_error_errno(r, "Failed to create image lock: %m");
3169 goto finish;
3170 }
3171
3172 if (!mkdtemp(template)) {
3173 log_error_errno(errno, "Failed to create temporary directory: %m");
3174 r = -errno;
3175 goto finish;
3176 }
3177
3178 arg_directory = strdup(template);
3179 if (!arg_directory) {
3180 r = log_oom();
3181 goto finish;
3182 }
3183
3184 image_fd = setup_image(&device_path, &loop_nr);
3185 if (image_fd < 0) {
3186 r = image_fd;
3187 goto finish;
3188 }
3189
3190 r = dissect_image(image_fd,
3191 &root_device, &root_device_rw,
3192 &home_device, &home_device_rw,
3193 &srv_device, &srv_device_rw,
3194 &secondary);
3195 if (r < 0)
3196 goto finish;
3197 }
3198
3199 r = custom_mounts_prepare();
3200 if (r < 0)
3201 goto finish;
3202
3203 interactive =
3204 isatty(STDIN_FILENO) > 0 &&
3205 isatty(STDOUT_FILENO) > 0;
3206
3207 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3208 if (master < 0) {
3209 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3210 goto finish;
3211 }
3212
3213 r = ptsname_malloc(master, &console);
3214 if (r < 0) {
3215 r = log_error_errno(r, "Failed to determine tty name: %m");
3216 goto finish;
3217 }
3218
3219 if (unlockpt(master) < 0) {
3220 r = log_error_errno(errno, "Failed to unlock tty: %m");
3221 goto finish;
3222 }
3223
3224 if (!arg_quiet)
3225 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3226 arg_machine, arg_image ?: arg_directory);
3227
3228 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3229
3230 assert_se(sigemptyset(&mask_chld) == 0);
3231 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3232
3233 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3234 r = log_error_errno(errno, "Failed to become subreaper: %m");
3235 goto finish;
3236 }
3237
3238 for (;;) {
3239 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3240 uid_shift_socket_pair[2] = { -1, -1 };
3241 ContainerStatus container_status;
3242 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3243 static const struct sigaction sa = {
3244 .sa_handler = nop_handler,
3245 .sa_flags = SA_NOCLDSTOP,
3246 };
3247 int ifi = 0;
3248 ssize_t l;
3249 _cleanup_event_unref_ sd_event *event = NULL;
3250 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3251 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3252 char last_char = 0;
3253
3254 r = barrier_create(&barrier);
3255 if (r < 0) {
3256 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3257 goto finish;
3258 }
3259
3260 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3261 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3262 goto finish;
3263 }
3264
3265 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3266 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3267 goto finish;
3268 }
3269
3270 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3271 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3272 goto finish;
3273 }
3274
3275 if (arg_userns)
3276 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3277 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3278 goto finish;
3279 }
3280
3281 /* Child can be killed before execv(), so handle SIGCHLD
3282 * in order to interrupt parent's blocking calls and
3283 * give it a chance to call wait() and terminate. */
3284 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3285 if (r < 0) {
3286 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3287 goto finish;
3288 }
3289
3290 r = sigaction(SIGCHLD, &sa, NULL);
3291 if (r < 0) {
3292 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3293 goto finish;
3294 }
3295
3296 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3297 if (pid < 0) {
3298 if (errno == EINVAL)
3299 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3300 else
3301 r = log_error_errno(errno, "clone() failed: %m");
3302
3303 goto finish;
3304 }
3305
3306 if (pid == 0) {
3307 /* The outer child only has a file system namespace. */
3308 barrier_set_role(&barrier, BARRIER_CHILD);
3309
3310 master = safe_close(master);
3311
3312 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3313 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3314 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3315 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3316
3317 (void) reset_all_signal_handlers();
3318 (void) reset_signal_mask();
3319
3320 r = outer_child(&barrier,
3321 arg_directory,
3322 console,
3323 root_device, root_device_rw,
3324 home_device, home_device_rw,
3325 srv_device, srv_device_rw,
3326 interactive,
3327 secondary,
3328 pid_socket_pair[1],
3329 kmsg_socket_pair[1],
3330 rtnl_socket_pair[1],
3331 uid_shift_socket_pair[1],
3332 fds);
3333 if (r < 0)
3334 _exit(EXIT_FAILURE);
3335
3336 _exit(EXIT_SUCCESS);
3337 }
3338
3339 barrier_set_role(&barrier, BARRIER_PARENT);
3340
3341 fdset_free(fds);
3342 fds = NULL;
3343
3344 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3345 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3346 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3347 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3348
3349 /* Wait for the outer child. */
3350 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3351 if (r < 0)
3352 goto finish;
3353 if (r != 0) {
3354 r = -EIO;
3355 goto finish;
3356 }
3357 pid = 0;
3358
3359 /* And now retrieve the PID of the inner child. */
3360 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3361 if (l < 0) {
3362 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3363 goto finish;
3364 }
3365 if (l != sizeof(pid)) {
3366 log_error("Short read while reading inner child PID.");
3367 r = EIO;
3368 goto finish;
3369 }
3370
3371 log_debug("Init process invoked as PID " PID_FMT, pid);
3372
3373 if (arg_userns) {
3374 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3375 log_error("Child died too early.");
3376 r = -ESRCH;
3377 goto finish;
3378 }
3379
3380 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3381 if (l < 0) {
3382 r = log_error_errno(errno, "Failed to read UID shift: %m");
3383 goto finish;
3384 }
3385 if (l != sizeof(arg_uid_shift)) {
3386 log_error("Short read while reading UID shift.");
3387 r = EIO;
3388 goto finish;
3389 }
3390
3391 r = setup_uid_map(pid);
3392 if (r < 0)
3393 goto finish;
3394
3395 (void) barrier_place(&barrier); /* #2 */
3396 }
3397
3398 if (arg_private_network) {
3399
3400 r = move_network_interfaces(pid, arg_network_interfaces);
3401 if (r < 0)
3402 goto finish;
3403
3404 if (arg_network_veth) {
3405 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3406 if (r < 0)
3407 goto finish;
3408 else if (r > 0)
3409 ifi = r;
3410
3411 if (arg_network_bridge) {
3412 r = setup_bridge(veth_name, arg_network_bridge);
3413 if (r < 0)
3414 goto finish;
3415 if (r > 0)
3416 ifi = r;
3417 }
3418 }
3419
3420 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3421 if (r < 0)
3422 goto finish;
3423
3424 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3425 if (r < 0)
3426 goto finish;
3427 }
3428
3429 if (arg_register) {
3430 r = register_machine(
3431 arg_machine,
3432 pid,
3433 arg_directory,
3434 arg_uuid,
3435 ifi,
3436 arg_slice,
3437 arg_custom_mounts, arg_n_custom_mounts,
3438 arg_kill_signal,
3439 arg_property,
3440 arg_keep_unit);
3441 if (r < 0)
3442 goto finish;
3443 }
3444
3445 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3446 if (r < 0)
3447 goto finish;
3448
3449 if (arg_keep_unit) {
3450 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3451 if (r < 0)
3452 goto finish;
3453 }
3454
3455 r = chown_cgroup(pid, arg_uid_shift);
3456 if (r < 0)
3457 goto finish;
3458
3459 /* Notify the child that the parent is ready with all
3460 * its setup (including cgroup-ification), and that
3461 * the child can now hand over control to the code to
3462 * run inside the container. */
3463 (void) barrier_place(&barrier); /* #3 */
3464
3465 /* Block SIGCHLD here, before notifying child.
3466 * process_pty() will handle it with the other signals. */
3467 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3468
3469 /* Reset signal to default */
3470 r = default_signals(SIGCHLD, -1);
3471 if (r < 0) {
3472 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3473 goto finish;
3474 }
3475
3476 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3477 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3478 log_error("Child died too early.");
3479 r = -ESRCH;
3480 goto finish;
3481 }
3482
3483 sd_notifyf(false,
3484 "READY=1\n"
3485 "STATUS=Container running.\n"
3486 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3487
3488 r = sd_event_new(&event);
3489 if (r < 0) {
3490 log_error_errno(r, "Failed to get default event source: %m");
3491 goto finish;
3492 }
3493
3494 if (arg_kill_signal > 0) {
3495 /* Try to kill the init system on SIGINT or SIGTERM */
3496 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3497 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3498 } else {
3499 /* Immediately exit */
3500 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3501 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3502 }
3503
3504 /* simply exit on sigchld */
3505 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3506
3507 if (arg_expose_ports) {
3508 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3509 if (r < 0)
3510 goto finish;
3511
3512 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3513 }
3514
3515 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3516
3517 r = pty_forward_new(event, master, true, !interactive, &forward);
3518 if (r < 0) {
3519 log_error_errno(r, "Failed to create PTY forwarder: %m");
3520 goto finish;
3521 }
3522
3523 r = sd_event_loop(event);
3524 if (r < 0) {
3525 log_error_errno(r, "Failed to run event loop: %m");
3526 goto finish;
3527 }
3528
3529 pty_forward_get_last_char(forward, &last_char);
3530
3531 forward = pty_forward_free(forward);
3532
3533 if (!arg_quiet && last_char != '\n')
3534 putc('\n', stdout);
3535
3536 /* Kill if it is not dead yet anyway */
3537 if (arg_register && !arg_keep_unit)
3538 terminate_machine(pid);
3539
3540 /* Normally redundant, but better safe than sorry */
3541 kill(pid, SIGKILL);
3542
3543 r = wait_for_container(pid, &container_status);
3544 pid = 0;
3545
3546 if (r < 0)
3547 /* We failed to wait for the container, or the
3548 * container exited abnormally */
3549 goto finish;
3550 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3551 /* The container exited with a non-zero
3552 * status, or with zero status and no reboot
3553 * was requested. */
3554 ret = r;
3555 break;
3556 }
3557
3558 /* CONTAINER_REBOOTED, loop again */
3559
3560 if (arg_keep_unit) {
3561 /* Special handling if we are running as a
3562 * service: instead of simply restarting the
3563 * machine we want to restart the entire
3564 * service, so let's inform systemd about this
3565 * with the special exit code 133. The service
3566 * file uses RestartForceExitStatus=133 so
3567 * that this results in a full nspawn
3568 * restart. This is necessary since we might
3569 * have cgroup parameters set we want to have
3570 * flushed out. */
3571 ret = 133;
3572 r = 0;
3573 break;
3574 }
3575
3576 expose_port_flush(arg_expose_ports, &exposed);
3577 }
3578
3579 finish:
3580 sd_notify(false,
3581 "STOPPING=1\n"
3582 "STATUS=Terminating...");
3583
3584 if (pid > 0)
3585 kill(pid, SIGKILL);
3586
3587 /* Try to flush whatever is still queued in the pty */
3588 if (master >= 0)
3589 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3590
3591 loop_remove(loop_nr, &image_fd);
3592
3593 if (remove_subvol && arg_directory) {
3594 int k;
3595
3596 k = btrfs_subvol_remove(arg_directory, true);
3597 if (k < 0)
3598 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3599 }
3600
3601 if (arg_machine) {
3602 const char *p;
3603
3604 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3605 (void) rm_rf(p, REMOVE_ROOT);
3606 }
3607
3608 expose_port_flush(arg_expose_ports, &exposed);
3609
3610 free(arg_directory);
3611 free(arg_template);
3612 free(arg_image);
3613 free(arg_machine);
3614 free(arg_user);
3615 strv_free(arg_setenv);
3616 free(arg_network_bridge);
3617 strv_free(arg_network_interfaces);
3618 strv_free(arg_network_macvlan);
3619 strv_free(arg_network_ipvlan);
3620 strv_free(arg_parameters);
3621 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3622 expose_port_free_all(arg_expose_ports);
3623
3624 return r < 0 ? EXIT_FAILURE : ret;
3625 }