]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #1417 from poettering/nspawn-and-more
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "cap-list.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
56 #include "copy.h"
57 #include "dev-setup.h"
58 #include "env-util.h"
59 #include "event-util.h"
60 #include "fdset.h"
61 #include "fileio.h"
62 #include "formats-util.h"
63 #include "gpt.h"
64 #include "hostname-util.h"
65 #include "log.h"
66 #include "loopback-setup.h"
67 #include "machine-image.h"
68 #include "macro.h"
69 #include "missing.h"
70 #include "mkdir.h"
71 #include "netlink-util.h"
72 #include "path-util.h"
73 #include "process-util.h"
74 #include "ptyfwd.h"
75 #include "random-util.h"
76 #include "rm-rf.h"
77 #ifdef HAVE_SECCOMP
78 #include "seccomp-util.h"
79 #endif
80 #include "signal-util.h"
81 #include "strv.h"
82 #include "terminal-util.h"
83 #include "udev-util.h"
84 #include "util.h"
85
86 #include "nspawn-cgroup.h"
87 #include "nspawn-expose-ports.h"
88 #include "nspawn-mount.h"
89 #include "nspawn-network.h"
90 #include "nspawn-register.h"
91 #include "nspawn-settings.h"
92 #include "nspawn-setuid.h"
93
/* Ways a container run can end, going by the enumerator names: it terminated,
 * or it was rebooted. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
98
/* Modes selectable through --link-journal= (see parse_argv()); the "try-"
 * variants of guest/host are tracked separately in arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,
        LINK_AUTO,
        LINK_HOST,
        LINK_GUEST
} LinkJournal;
105
106 static char *arg_directory = NULL;
107 static char *arg_template = NULL;
108 static char *arg_user = NULL;
109 static sd_id128_t arg_uuid = {};
110 static char *arg_machine = NULL;
111 static const char *arg_selinux_context = NULL;
112 static const char *arg_selinux_apifs_context = NULL;
113 static const char *arg_slice = NULL;
114 static bool arg_private_network = false;
115 static bool arg_read_only = false;
116 static bool arg_boot = false;
117 static bool arg_ephemeral = false;
118 static LinkJournal arg_link_journal = LINK_AUTO;
119 static bool arg_link_journal_try = false;
120 static uint64_t arg_retain =
121 (1ULL << CAP_CHOWN) |
122 (1ULL << CAP_DAC_OVERRIDE) |
123 (1ULL << CAP_DAC_READ_SEARCH) |
124 (1ULL << CAP_FOWNER) |
125 (1ULL << CAP_FSETID) |
126 (1ULL << CAP_IPC_OWNER) |
127 (1ULL << CAP_KILL) |
128 (1ULL << CAP_LEASE) |
129 (1ULL << CAP_LINUX_IMMUTABLE) |
130 (1ULL << CAP_NET_BIND_SERVICE) |
131 (1ULL << CAP_NET_BROADCAST) |
132 (1ULL << CAP_NET_RAW) |
133 (1ULL << CAP_SETGID) |
134 (1ULL << CAP_SETFCAP) |
135 (1ULL << CAP_SETPCAP) |
136 (1ULL << CAP_SETUID) |
137 (1ULL << CAP_SYS_ADMIN) |
138 (1ULL << CAP_SYS_CHROOT) |
139 (1ULL << CAP_SYS_NICE) |
140 (1ULL << CAP_SYS_PTRACE) |
141 (1ULL << CAP_SYS_TTY_CONFIG) |
142 (1ULL << CAP_SYS_RESOURCE) |
143 (1ULL << CAP_SYS_BOOT) |
144 (1ULL << CAP_AUDIT_WRITE) |
145 (1ULL << CAP_AUDIT_CONTROL) |
146 (1ULL << CAP_MKNOD);
147 static CustomMount *arg_custom_mounts = NULL;
148 static unsigned arg_n_custom_mounts = 0;
149 static char **arg_setenv = NULL;
150 static bool arg_quiet = false;
151 static bool arg_share_system = false;
152 static bool arg_register = true;
153 static bool arg_keep_unit = false;
154 static char **arg_network_interfaces = NULL;
155 static char **arg_network_macvlan = NULL;
156 static char **arg_network_ipvlan = NULL;
157 static bool arg_network_veth = false;
158 static char *arg_network_bridge = NULL;
159 static unsigned long arg_personality = PERSONALITY_INVALID;
160 static char *arg_image = NULL;
161 static VolatileMode arg_volatile_mode = VOLATILE_NO;
162 static ExposePort *arg_expose_ports = NULL;
163 static char **arg_property = NULL;
164 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
165 static bool arg_userns = false;
166 static int arg_kill_signal = 0;
167 static bool arg_unified_cgroup_hierarchy = false;
168 static SettingsMask arg_settings_mask = 0;
169 static int arg_settings_trusted = -1;
170 static char **arg_parameters = NULL;
171
172 static void help(void) {
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
175 " -h --help Show this help\n"
176 " --version Print version string\n"
177 " -q --quiet Do not show status information\n"
178 " -D --directory=PATH Root directory for the container\n"
179 " --template=PATH Initialize root directory from template directory,\n"
180 " if missing\n"
181 " -x --ephemeral Run container with snapshot of root directory, and\n"
182 " remove it after exit\n"
183 " -i --image=PATH File system device or disk image for the container\n"
184 " -b --boot Boot up full system (i.e. invoke init)\n"
185 " -u --user=USER Run the command under specified user or uid\n"
186 " -M --machine=NAME Set the machine name for the container\n"
187 " --uuid=UUID Set a specific machine UUID for the container\n"
188 " -S --slice=SLICE Place the container in the specified slice\n"
189 " --property=NAME=VALUE Set scope unit property\n"
190 " --private-users[=UIDBASE[:NUIDS]]\n"
191 " Run within user namespace\n"
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
195 " container\n"
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
199 " --network-ipvlan=INTERFACE\n"
200 " Create a ipvlan network interface based on an\n"
201 " existing network interface to the container\n"
202 " -n --network-veth Add a virtual ethernet connection between host\n"
203 " and container\n"
204 " --network-bridge=INTERFACE\n"
205 " Add a virtual ethernet connection between host\n"
206 " and container and add it to an existing bridge on\n"
207 " the host\n"
208 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
209 " Expose a container IP port on the host\n"
210 " -Z --selinux-context=SECLABEL\n"
211 " Set the SELinux security context to be used by\n"
212 " processes in the container\n"
213 " -L --selinux-apifs-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " API/tmpfs file systems in the container\n"
216 " --capability=CAP In addition to the default, retain specified\n"
217 " capability\n"
218 " --drop-capability=CAP Drop the specified capability from the default set\n"
219 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
220 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
221 " try-guest, try-host\n"
222 " -j Equivalent to --link-journal=try-guest\n"
223 " --read-only Mount the root directory read-only\n"
224 " --bind=PATH[:PATH[:OPTIONS]]\n"
225 " Bind mount a file or directory from the host into\n"
226 " the container\n"
227 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
228 " Similar, but creates a read-only bind mount\n"
229 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
230 " --overlay=PATH[:PATH...]:PATH\n"
231 " Create an overlay mount from the host to \n"
232 " the container\n"
233 " --overlay-ro=PATH[:PATH...]:PATH\n"
234 " Similar, but creates a read-only overlay mount\n"
235 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
236 " --share-system Share system namespaces with host\n"
237 " --register=BOOLEAN Register container as machine\n"
238 " --keep-unit Do not register a scope for the machine, reuse\n"
239 " the service unit nspawn is running in\n"
240 " --volatile[=MODE] Run the system in volatile mode\n"
241 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
242 , program_invocation_short_name);
243 }
244
245
246 static int custom_mounts_prepare(void) {
247 unsigned i;
248 int r;
249
250 /* Ensure the mounts are applied prefix first. */
251 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
252
253 /* Allocate working directories for the overlay file systems that need it */
254 for (i = 0; i < arg_n_custom_mounts; i++) {
255 CustomMount *m = &arg_custom_mounts[i];
256
257 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
258 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
259 return -EINVAL;
260 }
261
262 if (m->type != CUSTOM_MOUNT_OVERLAY)
263 continue;
264
265 if (m->work_dir)
266 continue;
267
268 if (m->read_only)
269 continue;
270
271 r = tempfn_random(m->source, NULL, &m->work_dir);
272 if (r < 0)
273 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
274 }
275
276 return 0;
277 }
278
/* Replaces *b with a cleaned-up copy of path.
 *
 * The path is canonicalized if possible; if it does not exist (ENOENT) it is
 * merely made absolute relative to the current working directory. The
 * previous value of *b is freed.
 *
 * Returns 0 on success, -errno from canonicalize_file_name() for any failure
 * other than ENOENT, or -ENOMEM. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Path does not exist (yet): fall back to a purely lexical absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
299
300 static int detect_unified_cgroup_hierarchy(void) {
301 const char *e;
302 int r;
303
304 /* Allow the user to control whether the unified hierarchy is used */
305 e = getenv("UNIFIED_CGROUP_HIERARCHY");
306 if (e) {
307 r = parse_boolean(e);
308 if (r < 0)
309 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
310
311 arg_unified_cgroup_hierarchy = r;
312 return 0;
313 }
314
315 /* Otherwise inherit the default from the host system */
316 r = cg_unified();
317 if (r < 0)
318 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
319
320 arg_unified_cgroup_hierarchy = r;
321 return 0;
322 }
323
324 static int parse_argv(int argc, char *argv[]) {
325
326 enum {
327 ARG_VERSION = 0x100,
328 ARG_PRIVATE_NETWORK,
329 ARG_UUID,
330 ARG_READ_ONLY,
331 ARG_CAPABILITY,
332 ARG_DROP_CAPABILITY,
333 ARG_LINK_JOURNAL,
334 ARG_BIND,
335 ARG_BIND_RO,
336 ARG_TMPFS,
337 ARG_OVERLAY,
338 ARG_OVERLAY_RO,
339 ARG_SETENV,
340 ARG_SHARE_SYSTEM,
341 ARG_REGISTER,
342 ARG_KEEP_UNIT,
343 ARG_NETWORK_INTERFACE,
344 ARG_NETWORK_MACVLAN,
345 ARG_NETWORK_IPVLAN,
346 ARG_NETWORK_BRIDGE,
347 ARG_PERSONALITY,
348 ARG_VOLATILE,
349 ARG_TEMPLATE,
350 ARG_PROPERTY,
351 ARG_PRIVATE_USERS,
352 ARG_KILL_SIGNAL,
353 ARG_SETTINGS,
354 };
355
356 static const struct option options[] = {
357 { "help", no_argument, NULL, 'h' },
358 { "version", no_argument, NULL, ARG_VERSION },
359 { "directory", required_argument, NULL, 'D' },
360 { "template", required_argument, NULL, ARG_TEMPLATE },
361 { "ephemeral", no_argument, NULL, 'x' },
362 { "user", required_argument, NULL, 'u' },
363 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
364 { "boot", no_argument, NULL, 'b' },
365 { "uuid", required_argument, NULL, ARG_UUID },
366 { "read-only", no_argument, NULL, ARG_READ_ONLY },
367 { "capability", required_argument, NULL, ARG_CAPABILITY },
368 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
369 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
370 { "bind", required_argument, NULL, ARG_BIND },
371 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
372 { "tmpfs", required_argument, NULL, ARG_TMPFS },
373 { "overlay", required_argument, NULL, ARG_OVERLAY },
374 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
375 { "machine", required_argument, NULL, 'M' },
376 { "slice", required_argument, NULL, 'S' },
377 { "setenv", required_argument, NULL, ARG_SETENV },
378 { "selinux-context", required_argument, NULL, 'Z' },
379 { "selinux-apifs-context", required_argument, NULL, 'L' },
380 { "quiet", no_argument, NULL, 'q' },
381 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
382 { "register", required_argument, NULL, ARG_REGISTER },
383 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
384 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
385 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
386 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
387 { "network-veth", no_argument, NULL, 'n' },
388 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
389 { "personality", required_argument, NULL, ARG_PERSONALITY },
390 { "image", required_argument, NULL, 'i' },
391 { "volatile", optional_argument, NULL, ARG_VOLATILE },
392 { "port", required_argument, NULL, 'p' },
393 { "property", required_argument, NULL, ARG_PROPERTY },
394 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
395 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
396 { "settings", required_argument, NULL, ARG_SETTINGS },
397 {}
398 };
399
400 int c, r;
401 uint64_t plus = 0, minus = 0;
402 bool mask_all_settings = false, mask_no_settings = false;
403
404 assert(argc >= 0);
405 assert(argv);
406
407 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
408
409 switch (c) {
410
411 case 'h':
412 help();
413 return 0;
414
415 case ARG_VERSION:
416 return version();
417
418 case 'D':
419 r = set_sanitized_path(&arg_directory, optarg);
420 if (r < 0)
421 return log_error_errno(r, "Invalid root directory: %m");
422
423 break;
424
425 case ARG_TEMPLATE:
426 r = set_sanitized_path(&arg_template, optarg);
427 if (r < 0)
428 return log_error_errno(r, "Invalid template directory: %m");
429
430 break;
431
432 case 'i':
433 r = set_sanitized_path(&arg_image, optarg);
434 if (r < 0)
435 return log_error_errno(r, "Invalid image path: %m");
436
437 break;
438
439 case 'x':
440 arg_ephemeral = true;
441 break;
442
443 case 'u':
444 r = free_and_strdup(&arg_user, optarg);
445 if (r < 0)
446 return log_oom();
447
448 arg_settings_mask |= SETTING_USER;
449 break;
450
451 case ARG_NETWORK_BRIDGE:
452 r = free_and_strdup(&arg_network_bridge, optarg);
453 if (r < 0)
454 return log_oom();
455
456 /* fall through */
457
458 case 'n':
459 arg_network_veth = true;
460 arg_private_network = true;
461 arg_settings_mask |= SETTING_NETWORK;
462 break;
463
464 case ARG_NETWORK_INTERFACE:
465 if (strv_extend(&arg_network_interfaces, optarg) < 0)
466 return log_oom();
467
468 arg_private_network = true;
469 arg_settings_mask |= SETTING_NETWORK;
470 break;
471
472 case ARG_NETWORK_MACVLAN:
473 if (strv_extend(&arg_network_macvlan, optarg) < 0)
474 return log_oom();
475
476 arg_private_network = true;
477 arg_settings_mask |= SETTING_NETWORK;
478 break;
479
480 case ARG_NETWORK_IPVLAN:
481 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
482 return log_oom();
483
484 /* fall through */
485
486 case ARG_PRIVATE_NETWORK:
487 arg_private_network = true;
488 arg_settings_mask |= SETTING_NETWORK;
489 break;
490
491 case 'b':
492 arg_boot = true;
493 arg_settings_mask |= SETTING_BOOT;
494 break;
495
496 case ARG_UUID:
497 r = sd_id128_from_string(optarg, &arg_uuid);
498 if (r < 0) {
499 log_error("Invalid UUID: %s", optarg);
500 return r;
501 }
502
503 arg_settings_mask |= SETTING_MACHINE_ID;
504 break;
505
506 case 'S':
507 arg_slice = optarg;
508 break;
509
510 case 'M':
511 if (isempty(optarg))
512 arg_machine = mfree(arg_machine);
513 else {
514 if (!machine_name_is_valid(optarg)) {
515 log_error("Invalid machine name: %s", optarg);
516 return -EINVAL;
517 }
518
519 r = free_and_strdup(&arg_machine, optarg);
520 if (r < 0)
521 return log_oom();
522
523 break;
524 }
525
526 case 'Z':
527 arg_selinux_context = optarg;
528 break;
529
530 case 'L':
531 arg_selinux_apifs_context = optarg;
532 break;
533
534 case ARG_READ_ONLY:
535 arg_read_only = true;
536 arg_settings_mask |= SETTING_READ_ONLY;
537 break;
538
539 case ARG_CAPABILITY:
540 case ARG_DROP_CAPABILITY: {
541 const char *state, *word;
542 size_t length;
543
544 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
545 _cleanup_free_ char *t;
546
547 t = strndup(word, length);
548 if (!t)
549 return log_oom();
550
551 if (streq(t, "all")) {
552 if (c == ARG_CAPABILITY)
553 plus = (uint64_t) -1;
554 else
555 minus = (uint64_t) -1;
556 } else {
557 int cap;
558
559 cap = capability_from_name(t);
560 if (cap < 0) {
561 log_error("Failed to parse capability %s.", t);
562 return -EINVAL;
563 }
564
565 if (c == ARG_CAPABILITY)
566 plus |= 1ULL << (uint64_t) cap;
567 else
568 minus |= 1ULL << (uint64_t) cap;
569 }
570 }
571
572 arg_settings_mask |= SETTING_CAPABILITY;
573 break;
574 }
575
576 case 'j':
577 arg_link_journal = LINK_GUEST;
578 arg_link_journal_try = true;
579 break;
580
581 case ARG_LINK_JOURNAL:
582 if (streq(optarg, "auto")) {
583 arg_link_journal = LINK_AUTO;
584 arg_link_journal_try = false;
585 } else if (streq(optarg, "no")) {
586 arg_link_journal = LINK_NO;
587 arg_link_journal_try = false;
588 } else if (streq(optarg, "guest")) {
589 arg_link_journal = LINK_GUEST;
590 arg_link_journal_try = false;
591 } else if (streq(optarg, "host")) {
592 arg_link_journal = LINK_HOST;
593 arg_link_journal_try = false;
594 } else if (streq(optarg, "try-guest")) {
595 arg_link_journal = LINK_GUEST;
596 arg_link_journal_try = true;
597 } else if (streq(optarg, "try-host")) {
598 arg_link_journal = LINK_HOST;
599 arg_link_journal_try = true;
600 } else {
601 log_error("Failed to parse link journal mode %s", optarg);
602 return -EINVAL;
603 }
604
605 break;
606
607 case ARG_BIND:
608 case ARG_BIND_RO:
609 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
610 if (r < 0)
611 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
612
613 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
614 break;
615
616 case ARG_TMPFS:
617 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
618 if (r < 0)
619 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
620
621 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
622 break;
623
624 case ARG_OVERLAY:
625 case ARG_OVERLAY_RO: {
626 _cleanup_free_ char *upper = NULL, *destination = NULL;
627 _cleanup_strv_free_ char **lower = NULL;
628 CustomMount *m;
629 unsigned n = 0;
630 char **i;
631
632 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
633 if (r == -ENOMEM)
634 return log_oom();
635 else if (r < 0) {
636 log_error("Invalid overlay specification: %s", optarg);
637 return r;
638 }
639
640 STRV_FOREACH(i, lower) {
641 if (!path_is_absolute(*i)) {
642 log_error("Overlay path %s is not absolute.", *i);
643 return -EINVAL;
644 }
645
646 n++;
647 }
648
649 if (n < 2) {
650 log_error("--overlay= needs at least two colon-separated directories specified.");
651 return -EINVAL;
652 }
653
654 if (n == 2) {
655 /* If two parameters are specified,
656 * the first one is the lower, the
657 * second one the upper directory. And
658 * we'll also define the destination
659 * mount point the same as the upper. */
660 upper = lower[1];
661 lower[1] = NULL;
662
663 destination = strdup(upper);
664 if (!destination)
665 return log_oom();
666
667 } else {
668 upper = lower[n - 2];
669 destination = lower[n - 1];
670 lower[n - 2] = NULL;
671 }
672
673 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
674 if (!m)
675 return log_oom();
676
677 m->destination = destination;
678 m->source = upper;
679 m->lower = lower;
680 m->read_only = c == ARG_OVERLAY_RO;
681
682 upper = destination = NULL;
683 lower = NULL;
684
685 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
686 break;
687 }
688
689 case ARG_SETENV: {
690 char **n;
691
692 if (!env_assignment_is_valid(optarg)) {
693 log_error("Environment variable assignment '%s' is not valid.", optarg);
694 return -EINVAL;
695 }
696
697 n = strv_env_set(arg_setenv, optarg);
698 if (!n)
699 return log_oom();
700
701 strv_free(arg_setenv);
702 arg_setenv = n;
703
704 arg_settings_mask |= SETTING_ENVIRONMENT;
705 break;
706 }
707
708 case 'q':
709 arg_quiet = true;
710 break;
711
712 case ARG_SHARE_SYSTEM:
713 arg_share_system = true;
714 break;
715
716 case ARG_REGISTER:
717 r = parse_boolean(optarg);
718 if (r < 0) {
719 log_error("Failed to parse --register= argument: %s", optarg);
720 return r;
721 }
722
723 arg_register = r;
724 break;
725
726 case ARG_KEEP_UNIT:
727 arg_keep_unit = true;
728 break;
729
730 case ARG_PERSONALITY:
731
732 arg_personality = personality_from_string(optarg);
733 if (arg_personality == PERSONALITY_INVALID) {
734 log_error("Unknown or unsupported personality '%s'.", optarg);
735 return -EINVAL;
736 }
737
738 arg_settings_mask |= SETTING_PERSONALITY;
739 break;
740
741 case ARG_VOLATILE:
742
743 if (!optarg)
744 arg_volatile_mode = VOLATILE_YES;
745 else {
746 VolatileMode m;
747
748 m = volatile_mode_from_string(optarg);
749 if (m < 0) {
750 log_error("Failed to parse --volatile= argument: %s", optarg);
751 return -EINVAL;
752 } else
753 arg_volatile_mode = m;
754 }
755
756 arg_settings_mask |= SETTING_VOLATILE_MODE;
757 break;
758
759 case 'p':
760 r = expose_port_parse(&arg_expose_ports, optarg);
761 if (r == -EEXIST)
762 return log_error_errno(r, "Duplicate port specification: %s", optarg);
763 if (r < 0)
764 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
765
766 arg_settings_mask |= SETTING_EXPOSE_PORTS;
767 break;
768
769 case ARG_PROPERTY:
770 if (strv_extend(&arg_property, optarg) < 0)
771 return log_oom();
772
773 break;
774
775 case ARG_PRIVATE_USERS:
776 if (optarg) {
777 _cleanup_free_ char *buffer = NULL;
778 const char *range, *shift;
779
780 range = strchr(optarg, ':');
781 if (range) {
782 buffer = strndup(optarg, range - optarg);
783 if (!buffer)
784 return log_oom();
785 shift = buffer;
786
787 range++;
788 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
789 log_error("Failed to parse UID range: %s", range);
790 return -EINVAL;
791 }
792 } else
793 shift = optarg;
794
795 if (parse_uid(shift, &arg_uid_shift) < 0) {
796 log_error("Failed to parse UID: %s", optarg);
797 return -EINVAL;
798 }
799 }
800
801 arg_userns = true;
802 break;
803
804 case ARG_KILL_SIGNAL:
805 arg_kill_signal = signal_from_string_try_harder(optarg);
806 if (arg_kill_signal < 0) {
807 log_error("Cannot parse signal: %s", optarg);
808 return -EINVAL;
809 }
810
811 arg_settings_mask |= SETTING_KILL_SIGNAL;
812 break;
813
814 case ARG_SETTINGS:
815
816 /* no → do not read files
817 * yes → read files, do not override cmdline, trust only subset
818 * override → read files, override cmdline, trust only subset
819 * trusted → read files, do not override cmdline, trust all
820 */
821
822 r = parse_boolean(optarg);
823 if (r < 0) {
824 if (streq(optarg, "trusted")) {
825 mask_all_settings = false;
826 mask_no_settings = false;
827 arg_settings_trusted = true;
828
829 } else if (streq(optarg, "override")) {
830 mask_all_settings = false;
831 mask_no_settings = true;
832 arg_settings_trusted = -1;
833 } else
834 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
835 } else if (r > 0) {
836 /* yes */
837 mask_all_settings = false;
838 mask_no_settings = false;
839 arg_settings_trusted = -1;
840 } else {
841 /* no */
842 mask_all_settings = true;
843 mask_no_settings = false;
844 arg_settings_trusted = false;
845 }
846
847 break;
848
849 case '?':
850 return -EINVAL;
851
852 default:
853 assert_not_reached("Unhandled option");
854 }
855
856 if (arg_share_system)
857 arg_register = false;
858
859 if (arg_boot && arg_share_system) {
860 log_error("--boot and --share-system may not be combined.");
861 return -EINVAL;
862 }
863
864 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
865 log_error("--keep-unit may not be used when invoked from a user session.");
866 return -EINVAL;
867 }
868
869 if (arg_directory && arg_image) {
870 log_error("--directory= and --image= may not be combined.");
871 return -EINVAL;
872 }
873
874 if (arg_template && arg_image) {
875 log_error("--template= and --image= may not be combined.");
876 return -EINVAL;
877 }
878
879 if (arg_template && !(arg_directory || arg_machine)) {
880 log_error("--template= needs --directory= or --machine=.");
881 return -EINVAL;
882 }
883
884 if (arg_ephemeral && arg_template) {
885 log_error("--ephemeral and --template= may not be combined.");
886 return -EINVAL;
887 }
888
889 if (arg_ephemeral && arg_image) {
890 log_error("--ephemeral and --image= may not be combined.");
891 return -EINVAL;
892 }
893
894 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
895 log_error("--ephemeral and --link-journal= may not be combined.");
896 return -EINVAL;
897 }
898
899 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
900 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
901
902 if (argc > optind) {
903 arg_parameters = strv_copy(argv + optind);
904 if (!arg_parameters)
905 return log_oom();
906
907 arg_settings_mask |= SETTING_BOOT;
908 }
909
910 /* Load all settings from .nspawn files */
911 if (mask_no_settings)
912 arg_settings_mask = 0;
913
914 /* Don't load any settings from .nspawn files */
915 if (mask_all_settings)
916 arg_settings_mask = _SETTINGS_MASK_ALL;
917
918 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
919
920 r = detect_unified_cgroup_hierarchy();
921 if (r < 0)
922 return r;
923
924 return 1;
925 }
926
927 static int verify_arguments(void) {
928
929 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
930 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
931 return -EINVAL;
932 }
933
934 if (arg_expose_ports && !arg_private_network) {
935 log_error("Cannot use --port= without private networking.");
936 return -EINVAL;
937 }
938
939 if (arg_boot && arg_kill_signal <= 0)
940 arg_kill_signal = SIGRTMIN+3;
941
942 return 0;
943 }
944
945 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
946 assert(p);
947
948 if (!arg_userns)
949 return 0;
950
951 if (uid == UID_INVALID && gid == GID_INVALID)
952 return 0;
953
954 if (uid != UID_INVALID) {
955 uid += arg_uid_shift;
956
957 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
958 return -EOVERFLOW;
959 }
960
961 if (gid != GID_INVALID) {
962 gid += (gid_t) arg_uid_shift;
963
964 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
965 return -EOVERFLOW;
966 }
967
968 if (lchown(p, uid, gid) < 0)
969 return -errno;
970
971 return 0;
972 }
973
/* Creates directory path below root with the given mode and, if it was newly
 * created, hands it to uid/gid via userns_lchown().
 *
 * An already existing directory is left untouched and counts as success.
 * Returns 0 on success or a negative errno-style value. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
986
987 static int setup_timezone(const char *dest) {
988 _cleanup_free_ char *p = NULL, *q = NULL;
989 const char *where, *check, *what;
990 char *z, *y;
991 int r;
992
993 assert(dest);
994
995 /* Fix the timezone, if possible */
996 r = readlink_malloc("/etc/localtime", &p);
997 if (r < 0) {
998 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
999 return 0;
1000 }
1001
1002 z = path_startswith(p, "../usr/share/zoneinfo/");
1003 if (!z)
1004 z = path_startswith(p, "/usr/share/zoneinfo/");
1005 if (!z) {
1006 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1007 return 0;
1008 }
1009
1010 where = prefix_roota(dest, "/etc/localtime");
1011 r = readlink_malloc(where, &q);
1012 if (r >= 0) {
1013 y = path_startswith(q, "../usr/share/zoneinfo/");
1014 if (!y)
1015 y = path_startswith(q, "/usr/share/zoneinfo/");
1016
1017 /* Already pointing to the right place? Then do nothing .. */
1018 if (y && streq(y, z))
1019 return 0;
1020 }
1021
1022 check = strjoina("/usr/share/zoneinfo/", z);
1023 check = prefix_root(dest, check);
1024 if (laccess(check, F_OK) < 0) {
1025 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1026 return 0;
1027 }
1028
1029 r = unlink(where);
1030 if (r < 0 && errno != ENOENT) {
1031 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1032 return 0;
1033 }
1034
1035 what = strjoina("../usr/share/zoneinfo/", z);
1036 if (symlink(what, where) < 0) {
1037 log_error_errno(errno, "Failed to correct timezone of container: %m");
1038 return 0;
1039 }
1040
1041 r = userns_lchown(where, 0, 0);
1042 if (r < 0)
1043 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1044
1045 return 0;
1046 }
1047
1048 static int setup_resolv_conf(const char *dest) {
1049 const char *where = NULL;
1050 int r;
1051
1052 assert(dest);
1053
1054 if (arg_private_network)
1055 return 0;
1056
1057 /* Fix resolv.conf, if possible */
1058 where = prefix_roota(dest, "/etc/resolv.conf");
1059
1060 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1061 if (r < 0) {
1062 /* If the file already exists as symlink, let's
1063 * suppress the warning, under the assumption that
1064 * resolved or something similar runs inside and the
1065 * symlink points there.
1066 *
1067 * If the disk image is read-only, there's also no
1068 * point in complaining.
1069 */
1070 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1071 "Failed to copy /etc/resolv.conf to %s: %m", where);
1072 return 0;
1073 }
1074
1075 r = userns_lchown(where, 0, 0);
1076 if (r < 0)
1077 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1078
1079 return 0;
1080 }
1081
1082 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1083 assert(s);
1084
1085 snprintf(s, 37,
1086 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1087 SD_ID128_FORMAT_VAL(id));
1088
1089 return s;
1090 }
1091
1092 static int setup_boot_id(const char *dest) {
1093 const char *from, *to;
1094 sd_id128_t rnd = {};
1095 char as_uuid[37];
1096 int r;
1097
1098 if (arg_share_system)
1099 return 0;
1100
1101 /* Generate a new randomized boot ID, so that each boot-up of
1102 * the container gets a new one */
1103
1104 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1105 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1106
1107 r = sd_id128_randomize(&rnd);
1108 if (r < 0)
1109 return log_error_errno(r, "Failed to generate random boot id: %m");
1110
1111 id128_format_as_uuid(rnd, as_uuid);
1112
1113 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1114 if (r < 0)
1115 return log_error_errno(r, "Failed to write boot id: %m");
1116
1117 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1118 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1119 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1120 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1121
1122 unlink(from);
1123 return r;
1124 }
1125
1126 static int copy_devnodes(const char *dest) {
1127
1128 static const char devnodes[] =
1129 "null\0"
1130 "zero\0"
1131 "full\0"
1132 "random\0"
1133 "urandom\0"
1134 "tty\0"
1135 "net/tun\0";
1136
1137 const char *d;
1138 int r = 0;
1139 _cleanup_umask_ mode_t u;
1140
1141 assert(dest);
1142
1143 u = umask(0000);
1144
1145 /* Create /dev/net, so that we can create /dev/net/tun in it */
1146 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1147 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1148
1149 NULSTR_FOREACH(d, devnodes) {
1150 _cleanup_free_ char *from = NULL, *to = NULL;
1151 struct stat st;
1152
1153 from = strappend("/dev/", d);
1154 to = prefix_root(dest, from);
1155
1156 if (stat(from, &st) < 0) {
1157
1158 if (errno != ENOENT)
1159 return log_error_errno(errno, "Failed to stat %s: %m", from);
1160
1161 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1162
1163 log_error("%s is not a char or block device, cannot copy.", from);
1164 return -EIO;
1165
1166 } else {
1167 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1168 if (errno != EPERM)
1169 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1170
1171 /* Some systems abusively restrict mknod but
1172 * allow bind mounts. */
1173 r = touch(to);
1174 if (r < 0)
1175 return log_error_errno(r, "touch (%s) failed: %m", to);
1176 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1177 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1178 }
1179
1180 r = userns_lchown(to, 0, 0);
1181 if (r < 0)
1182 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1183 }
1184 }
1185
1186 return r;
1187 }
1188
1189 static int setup_pts(const char *dest) {
1190 _cleanup_free_ char *options = NULL;
1191 const char *p;
1192
1193 #ifdef HAVE_SELINUX
1194 if (arg_selinux_apifs_context)
1195 (void) asprintf(&options,
1196 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1197 arg_uid_shift + TTY_GID,
1198 arg_selinux_apifs_context);
1199 else
1200 #endif
1201 (void) asprintf(&options,
1202 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1203 arg_uid_shift + TTY_GID);
1204
1205 if (!options)
1206 return log_oom();
1207
1208 /* Mount /dev/pts itself */
1209 p = prefix_roota(dest, "/dev/pts");
1210 if (mkdir(p, 0755) < 0)
1211 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1212 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1213 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1214 if (userns_lchown(p, 0, 0) < 0)
1215 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1216
1217 /* Create /dev/ptmx symlink */
1218 p = prefix_roota(dest, "/dev/ptmx");
1219 if (symlink("pts/ptmx", p) < 0)
1220 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1221 if (userns_lchown(p, 0, 0) < 0)
1222 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1223
1224 /* And fix /dev/pts/ptmx ownership */
1225 p = prefix_roota(dest, "/dev/pts/ptmx");
1226 if (userns_lchown(p, 0, 0) < 0)
1227 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1228
1229 return 0;
1230 }
1231
1232 static int setup_dev_console(const char *dest, const char *console) {
1233 _cleanup_umask_ mode_t u;
1234 const char *to;
1235 int r;
1236
1237 assert(dest);
1238 assert(console);
1239
1240 u = umask(0000);
1241
1242 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1243 if (r < 0)
1244 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1245
1246 /* We need to bind mount the right tty to /dev/console since
1247 * ptys can only exist on pts file systems. To have something
1248 * to bind mount things on we create a empty regular file. */
1249
1250 to = prefix_roota(dest, "/dev/console");
1251 r = touch(to);
1252 if (r < 0)
1253 return log_error_errno(r, "touch() for /dev/console failed: %m");
1254
1255 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1256 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1257
1258 return 0;
1259 }
1260
1261 static int setup_kmsg(const char *dest, int kmsg_socket) {
1262 const char *from, *to;
1263 _cleanup_umask_ mode_t u;
1264 int fd, r;
1265
1266 assert(kmsg_socket >= 0);
1267
1268 u = umask(0000);
1269
1270 /* We create the kmsg FIFO as /run/kmsg, but immediately
1271 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1272 * on the reading side behave very similar to /proc/kmsg,
1273 * their writing side behaves differently from /dev/kmsg in
1274 * that writing blocks when nothing is reading. In order to
1275 * avoid any problems with containers deadlocking due to this
1276 * we simply make /dev/kmsg unavailable to the container. */
1277 from = prefix_roota(dest, "/run/kmsg");
1278 to = prefix_roota(dest, "/proc/kmsg");
1279
1280 if (mkfifo(from, 0600) < 0)
1281 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1282 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1283 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1284
1285 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1286 if (fd < 0)
1287 return log_error_errno(errno, "Failed to open fifo: %m");
1288
1289 /* Store away the fd in the socket, so that it stays open as
1290 * long as we run the child */
1291 r = send_one_fd(kmsg_socket, fd, 0);
1292 safe_close(fd);
1293
1294 if (r < 0)
1295 return log_error_errno(r, "Failed to send FIFO fd: %m");
1296
1297 /* And now make the FIFO unavailable as /run/kmsg... */
1298 (void) unlink(from);
1299
1300 return 0;
1301 }
1302
1303 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1304 union in_addr_union *exposed = userdata;
1305
1306 assert(rtnl);
1307 assert(m);
1308 assert(exposed);
1309
1310 expose_port_execute(rtnl, arg_expose_ports, exposed);
1311 return 0;
1312 }
1313
1314 static int setup_hostname(void) {
1315
1316 if (arg_share_system)
1317 return 0;
1318
1319 if (sethostname_idempotent(arg_machine) < 0)
1320 return -errno;
1321
1322 return 0;
1323 }
1324
1325 static int setup_journal(const char *directory) {
1326 sd_id128_t machine_id, this_id;
1327 _cleanup_free_ char *b = NULL, *d = NULL;
1328 const char *etc_machine_id, *p, *q;
1329 char *id;
1330 int r;
1331
1332 /* Don't link journals in ephemeral mode */
1333 if (arg_ephemeral)
1334 return 0;
1335
1336 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1337
1338 r = read_one_line_file(etc_machine_id, &b);
1339 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1340 return 0;
1341 else if (r < 0)
1342 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1343
1344 id = strstrip(b);
1345 if (isempty(id) && arg_link_journal == LINK_AUTO)
1346 return 0;
1347
1348 /* Verify validity */
1349 r = sd_id128_from_string(id, &machine_id);
1350 if (r < 0)
1351 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1352
1353 r = sd_id128_get_machine(&this_id);
1354 if (r < 0)
1355 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1356
1357 if (sd_id128_equal(machine_id, this_id)) {
1358 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1359 "Host and machine ids are equal (%s): refusing to link journals", id);
1360 if (arg_link_journal == LINK_AUTO)
1361 return 0;
1362 return -EEXIST;
1363 }
1364
1365 if (arg_link_journal == LINK_NO)
1366 return 0;
1367
1368 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1369 if (r < 0)
1370 return log_error_errno(r, "Failed to create /var: %m");
1371
1372 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1373 if (r < 0)
1374 return log_error_errno(r, "Failed to create /var/log: %m");
1375
1376 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1377 if (r < 0)
1378 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1379
1380 p = strjoina("/var/log/journal/", id);
1381 q = prefix_roota(directory, p);
1382
1383 if (path_is_mount_point(p, 0) > 0) {
1384 if (arg_link_journal != LINK_AUTO) {
1385 log_error("%s: already a mount point, refusing to use for journal", p);
1386 return -EEXIST;
1387 }
1388
1389 return 0;
1390 }
1391
1392 if (path_is_mount_point(q, 0) > 0) {
1393 if (arg_link_journal != LINK_AUTO) {
1394 log_error("%s: already a mount point, refusing to use for journal", q);
1395 return -EEXIST;
1396 }
1397
1398 return 0;
1399 }
1400
1401 r = readlink_and_make_absolute(p, &d);
1402 if (r >= 0) {
1403 if ((arg_link_journal == LINK_GUEST ||
1404 arg_link_journal == LINK_AUTO) &&
1405 path_equal(d, q)) {
1406
1407 r = userns_mkdir(directory, p, 0755, 0, 0);
1408 if (r < 0)
1409 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1410 return 0;
1411 }
1412
1413 if (unlink(p) < 0)
1414 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1415 } else if (r == -EINVAL) {
1416
1417 if (arg_link_journal == LINK_GUEST &&
1418 rmdir(p) < 0) {
1419
1420 if (errno == ENOTDIR) {
1421 log_error("%s already exists and is neither a symlink nor a directory", p);
1422 return r;
1423 } else {
1424 log_error_errno(errno, "Failed to remove %s: %m", p);
1425 return -errno;
1426 }
1427 }
1428 } else if (r != -ENOENT) {
1429 log_error_errno(errno, "readlink(%s) failed: %m", p);
1430 return r;
1431 }
1432
1433 if (arg_link_journal == LINK_GUEST) {
1434
1435 if (symlink(q, p) < 0) {
1436 if (arg_link_journal_try) {
1437 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1438 return 0;
1439 } else {
1440 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1441 return -errno;
1442 }
1443 }
1444
1445 r = userns_mkdir(directory, p, 0755, 0, 0);
1446 if (r < 0)
1447 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1448 return 0;
1449 }
1450
1451 if (arg_link_journal == LINK_HOST) {
1452 /* don't create parents here -- if the host doesn't have
1453 * permanent journal set up, don't force it here */
1454 r = mkdir(p, 0755);
1455 if (r < 0) {
1456 if (arg_link_journal_try) {
1457 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1458 return 0;
1459 } else {
1460 log_error_errno(errno, "Failed to create %s: %m", p);
1461 return r;
1462 }
1463 }
1464
1465 } else if (access(p, F_OK) < 0)
1466 return 0;
1467
1468 if (dir_is_empty(q) == 0)
1469 log_warning("%s is not empty, proceeding anyway.", q);
1470
1471 r = userns_mkdir(directory, p, 0755, 0, 0);
1472 if (r < 0) {
1473 log_error_errno(errno, "Failed to create %s: %m", q);
1474 return r;
1475 }
1476
1477 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1478 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1479
1480 return 0;
1481 }
1482
1483 static int drop_capabilities(void) {
1484 return capability_bounding_set_drop(~arg_retain, false);
1485 }
1486
1487 static int reset_audit_loginuid(void) {
1488 _cleanup_free_ char *p = NULL;
1489 int r;
1490
1491 if (arg_share_system)
1492 return 0;
1493
1494 r = read_one_line_file("/proc/self/loginuid", &p);
1495 if (r == -ENOENT)
1496 return 0;
1497 if (r < 0)
1498 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1499
1500 /* Already reset? */
1501 if (streq(p, "4294967295"))
1502 return 0;
1503
1504 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1505 if (r < 0) {
1506 log_error_errno(r,
1507 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1508 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1509 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1510 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1511 "using systemd-nspawn. Sleeping for 5s... (%m)");
1512
1513 sleep(5);
1514 }
1515
1516 return 0;
1517 }
1518
/* Installs the seccomp filter for the container payload. The filter allows
 * everything by default, and then makes:
 *
 *   - the system calls listed in the blacklist fail with EPERM, unless the
 *     capability listed next to them is part of arg_retain
 *   - socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT
 *
 * If loading the filter fails with -EINVAL this is logged at debug level
 * only and treated as success. Returns 0 on success (always, if compiled
 * without seccomp support), a negative error value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* Leave the system call alone if the capability guarding
                 * it is retained */
                if ((arg_retain & (1ULL << blacklist[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace audit
         * hookup fails when run inside of one. We don't care, and simply
         * turn off creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1613
1614 static int setup_propagate(const char *root) {
1615 const char *p, *q;
1616
1617 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1618 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1619 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1620 (void) mkdir_p(p, 0600);
1621
1622 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1623 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1624
1625 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1626 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1627
1628 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1629 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1630
1631 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1632 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1633 return log_error_errno(errno, "Failed to install propagation bind mount.");
1634
1635 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1636 return log_error_errno(errno, "Failed to make propagation mount read-only");
1637
1638 return 0;
1639 }
1640
1641 static int setup_image(char **device_path, int *loop_nr) {
1642 struct loop_info64 info = {
1643 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1644 };
1645 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1646 _cleanup_free_ char* loopdev = NULL;
1647 struct stat st;
1648 int r, nr;
1649
1650 assert(device_path);
1651 assert(loop_nr);
1652 assert(arg_image);
1653
1654 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1655 if (fd < 0)
1656 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1657
1658 if (fstat(fd, &st) < 0)
1659 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1660
1661 if (S_ISBLK(st.st_mode)) {
1662 char *p;
1663
1664 p = strdup(arg_image);
1665 if (!p)
1666 return log_oom();
1667
1668 *device_path = p;
1669
1670 *loop_nr = -1;
1671
1672 r = fd;
1673 fd = -1;
1674
1675 return r;
1676 }
1677
1678 if (!S_ISREG(st.st_mode)) {
1679 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1680 return -EINVAL;
1681 }
1682
1683 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1684 if (control < 0)
1685 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1686
1687 nr = ioctl(control, LOOP_CTL_GET_FREE);
1688 if (nr < 0)
1689 return log_error_errno(errno, "Failed to allocate loop device: %m");
1690
1691 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1692 return log_oom();
1693
1694 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1695 if (loop < 0)
1696 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1697
1698 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1699 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1700
1701 if (arg_read_only)
1702 info.lo_flags |= LO_FLAGS_READ_ONLY;
1703
1704 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1705 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1706
1707 *device_path = loopdev;
1708 loopdev = NULL;
1709
1710 *loop_nr = nr;
1711
1712 r = loop;
1713 loop = -1;
1714
1715 return r;
1716 }
1717
/* Explanatory text that is appended to the error messages logged when the
 * partition table of a disk image cannot be used. One source line per line
 * of output. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1724
/* Analyzes the partition table of the block device open as fd, and picks the
 * partitions to mount for the container.
 *
 * On GPT tables partitions flagged GPT_FLAG_NO_AUTO are ignored. Among the
 * others the home, srv, native root and secondary root partitions are
 * identified by their type UUIDs; if several of one type exist the one with
 * the lowest partition number wins. Partitions of the generic Linux type are
 * considered as root only if no dedicated root partition was found. On MBR
 * tables only partitions of type 0x83 whose flags equal 0x80 (bootable) are
 * considered, and treated as generic Linux partitions. Generic partitions
 * are only acceptable if there is exactly one of them.
 *
 * Before the partitions are matched the function waits until the kernel has
 * created device nodes for all partitions blkid has found, asking for a
 * re-read of the partition table where necessary.
 *
 * On success 0 is returned and *root_device, *root_device_rw and *secondary
 * are set; *home_device/*home_device_rw and *srv_device/*srv_device_rw are
 * only set if such a partition was found. The device strings are newly
 * allocated and owned by the caller. On failure a negative error value is
 * returned. Without blkid support -EOPNOTSUPP is returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* libblkid does not always set errno, hence clear it first and
         * treat an unset errno as OOM */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel knows as many partitions as blkid found,
         * giving up after 10 attempts */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The kernel
                 * side is expected to list one device more than blkid
                 * lists partitions. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Of multiple candidates the one with
                                 * the lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Remember the node as generic partition
                                 * (and not as root), so that a second
                                 * candidate is noticed by the check above
                                 * and refused further down */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2097
/* Mounts the block device what on where, or on the sub-directory directory
 * below where if directory is non-NULL. The file system type is probed with
 * blkid; LUKS volumes are refused. The mount is read-only if rw is false or
 * arg_read_only is set. Returns 0 on success, a negative error value on
 * failure, and -EOPNOTSUPP if compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *fstype, *target;
        unsigned long mount_flags;
        int r;

        assert(what);
        assert(where);

        /* A globally requested read-only mode overrides the per-device setting */
        if (arg_read_only)
                rw = false;

        if (!directory)
                target = where;
        else
                target = strjoina(where, directory);

        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -1 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        }
        if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        mount_flags = MS_NODEV|(rw ? 0 : MS_RDONLY);

        if (mount(what, target, fstype, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2161
2162 static int mount_devices(
2163 const char *where,
2164 const char *root_device, bool root_device_rw,
2165 const char *home_device, bool home_device_rw,
2166 const char *srv_device, bool srv_device_rw) {
2167 int r;
2168
2169 assert(where);
2170
2171 if (root_device) {
2172 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2173 if (r < 0)
2174 return log_error_errno(r, "Failed to mount root directory: %m");
2175 }
2176
2177 if (home_device) {
2178 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2179 if (r < 0)
2180 return log_error_errno(r, "Failed to mount home directory: %m");
2181 }
2182
2183 if (srv_device) {
2184 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2185 if (r < 0)
2186 return log_error_errno(r, "Failed to mount server data directory: %m");
2187 }
2188
2189 return 0;
2190 }
2191
2192 static void loop_remove(int nr, int *image_fd) {
2193 _cleanup_close_ int control = -1;
2194 int r;
2195
2196 if (nr < 0)
2197 return;
2198
2199 if (image_fd && *image_fd >= 0) {
2200 r = ioctl(*image_fd, LOOP_CLR_FD);
2201 if (r < 0)
2202 log_debug_errno(errno, "Failed to close loop image: %m");
2203 *image_fd = safe_close(*image_fd);
2204 }
2205
2206 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2207 if (control < 0) {
2208 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2209 return;
2210 }
2211
2212 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2213 if (r < 0)
2214 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2215 }
2216
2217 /*
2218 * Return values:
2219 * < 0 : wait_for_terminate() failed to get the state of the
2220 * container, the container was terminated by a signal, or
2221 * failed for an unknown reason. No change is made to the
2222 * container argument.
2223 * > 0 : The program executed in the container terminated with an
2224 * error. The exit code of the program executed in the
2225 * container is returned. The container argument has been set
2226 * to CONTAINER_TERMINATED.
2227 * 0 : The container is being rebooted, has been shut down or exited
2228 * successfully. The container argument has been set to either
2229 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2230 *
2231 * That is, success is indicated by a return value of zero, and an
2232 * error is indicated by a non-zero value.
2233 */
2234 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2235 siginfo_t status;
2236 int r;
2237
2238 r = wait_for_terminate(pid, &status);
2239 if (r < 0)
2240 return log_warning_errno(r, "Failed to wait for container: %m");
2241
2242 switch (status.si_code) {
2243
2244 case CLD_EXITED:
2245 if (status.si_status == 0) {
2246 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2247
2248 } else
2249 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2250
2251 *container = CONTAINER_TERMINATED;
2252 return status.si_status;
2253
2254 case CLD_KILLED:
2255 if (status.si_status == SIGINT) {
2256
2257 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2258 *container = CONTAINER_TERMINATED;
2259 return 0;
2260
2261 } else if (status.si_status == SIGHUP) {
2262
2263 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2264 *container = CONTAINER_REBOOTED;
2265 return 0;
2266 }
2267
2268 /* CLD_KILLED fallthrough */
2269
2270 case CLD_DUMPED:
2271 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2272 return -EIO;
2273
2274 default:
2275 log_error("Container %s failed due to unknown reason.", arg_machine);
2276 return -EIO;
2277 }
2278
2279 return r;
2280 }
2281
2282 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2283 pid_t pid;
2284
2285 pid = PTR_TO_UINT32(userdata);
2286 if (pid > 0) {
2287 if (kill(pid, arg_kill_signal) >= 0) {
2288 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2289 sd_event_source_set_userdata(s, NULL);
2290 return 0;
2291 }
2292 }
2293
2294 sd_event_exit(sd_event_source_get_event(s), 0);
2295 return 0;
2296 }
2297
2298 static int determine_names(void) {
2299 int r;
2300
2301 if (arg_template && !arg_directory && arg_machine) {
2302
2303 /* If --template= was specified then we should not
2304 * search for a machine, but instead create a new one
2305 * in /var/lib/machine. */
2306
2307 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2308 if (!arg_directory)
2309 return log_oom();
2310 }
2311
2312 if (!arg_image && !arg_directory) {
2313 if (arg_machine) {
2314 _cleanup_(image_unrefp) Image *i = NULL;
2315
2316 r = image_find(arg_machine, &i);
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2319 else if (r == 0) {
2320 log_error("No image for machine '%s': %m", arg_machine);
2321 return -ENOENT;
2322 }
2323
2324 if (i->type == IMAGE_RAW)
2325 r = set_sanitized_path(&arg_image, i->path);
2326 else
2327 r = set_sanitized_path(&arg_directory, i->path);
2328 if (r < 0)
2329 return log_error_errno(r, "Invalid image directory: %m");
2330
2331 if (!arg_ephemeral)
2332 arg_read_only = arg_read_only || i->read_only;
2333 } else
2334 arg_directory = get_current_dir_name();
2335
2336 if (!arg_directory && !arg_machine) {
2337 log_error("Failed to determine path, please use -D or -i.");
2338 return -EINVAL;
2339 }
2340 }
2341
2342 if (!arg_machine) {
2343 if (arg_directory && path_equal(arg_directory, "/"))
2344 arg_machine = gethostname_malloc();
2345 else
2346 arg_machine = strdup(basename(arg_image ?: arg_directory));
2347
2348 if (!arg_machine)
2349 return log_oom();
2350
2351 hostname_cleanup(arg_machine);
2352 if (!machine_name_is_valid(arg_machine)) {
2353 log_error("Failed to determine machine name automatically, please use -M.");
2354 return -EINVAL;
2355 }
2356
2357 if (arg_ephemeral) {
2358 char *b;
2359
2360 /* Add a random suffix when this is an
2361 * ephemeral machine, so that we can run many
2362 * instances at once without manually having
2363 * to specify -M each time. */
2364
2365 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2366 return log_oom();
2367
2368 free(arg_machine);
2369 arg_machine = b;
2370 }
2371 }
2372
2373 return 0;
2374 }
2375
2376 static int determine_uid_shift(const char *directory) {
2377 int r;
2378
2379 if (!arg_userns) {
2380 arg_uid_shift = 0;
2381 return 0;
2382 }
2383
2384 if (arg_uid_shift == UID_INVALID) {
2385 struct stat st;
2386
2387 r = stat(directory, &st);
2388 if (r < 0)
2389 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2390
2391 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2392
2393 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2394 log_error("UID and GID base of %s don't match.", directory);
2395 return -EINVAL;
2396 }
2397
2398 arg_uid_range = UINT32_C(0x10000);
2399 }
2400
2401 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2402 log_error("UID base too high for UID range.");
2403 return -EINVAL;
2404 }
2405
2406 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2407 return 0;
2408 }
2409
2410 static int inner_child(
2411 Barrier *barrier,
2412 const char *directory,
2413 bool secondary,
2414 int kmsg_socket,
2415 int rtnl_socket,
2416 FDSet *fds) {
2417
2418 _cleanup_free_ char *home = NULL;
2419 unsigned n_env = 2;
2420 const char *envp[] = {
2421 "PATH=" DEFAULT_PATH_SPLIT_USR,
2422 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2423 NULL, /* TERM */
2424 NULL, /* HOME */
2425 NULL, /* USER */
2426 NULL, /* LOGNAME */
2427 NULL, /* container_uuid */
2428 NULL, /* LISTEN_FDS */
2429 NULL, /* LISTEN_PID */
2430 NULL
2431 };
2432
2433 _cleanup_strv_free_ char **env_use = NULL;
2434 int r;
2435
2436 assert(barrier);
2437 assert(directory);
2438 assert(kmsg_socket >= 0);
2439
2440 cg_unified_flush();
2441
2442 if (arg_userns) {
2443 /* Tell the parent, that it now can write the UID map. */
2444 (void) barrier_place(barrier); /* #1 */
2445
2446 /* Wait until the parent wrote the UID map */
2447 if (!barrier_place_and_sync(barrier)) { /* #2 */
2448 log_error("Parent died too early");
2449 return -ESRCH;
2450 }
2451 }
2452
2453 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2454 if (r < 0)
2455 return r;
2456
2457 r = mount_sysfs(NULL);
2458 if (r < 0)
2459 return r;
2460
2461 /* Wait until we are cgroup-ified, so that we
2462 * can mount the right cgroup path writable */
2463 if (!barrier_place_and_sync(barrier)) { /* #3 */
2464 log_error("Parent died too early");
2465 return -ESRCH;
2466 }
2467
2468 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2469 if (r < 0)
2470 return r;
2471
2472 r = reset_uid_gid();
2473 if (r < 0)
2474 return log_error_errno(r, "Couldn't become new root: %m");
2475
2476 r = setup_boot_id(NULL);
2477 if (r < 0)
2478 return r;
2479
2480 r = setup_kmsg(NULL, kmsg_socket);
2481 if (r < 0)
2482 return r;
2483 kmsg_socket = safe_close(kmsg_socket);
2484
2485 umask(0022);
2486
2487 if (setsid() < 0)
2488 return log_error_errno(errno, "setsid() failed: %m");
2489
2490 if (arg_private_network)
2491 loopback_setup();
2492
2493 if (arg_expose_ports) {
2494 r = expose_port_send_rtnl(rtnl_socket);
2495 if (r < 0)
2496 return r;
2497 rtnl_socket = safe_close(rtnl_socket);
2498 }
2499
2500 if (drop_capabilities() < 0)
2501 return log_error_errno(errno, "drop_capabilities() failed: %m");
2502
2503 setup_hostname();
2504
2505 if (arg_personality != PERSONALITY_INVALID) {
2506 if (personality(arg_personality) < 0)
2507 return log_error_errno(errno, "personality() failed: %m");
2508 } else if (secondary) {
2509 if (personality(PER_LINUX32) < 0)
2510 return log_error_errno(errno, "personality() failed: %m");
2511 }
2512
2513 #ifdef HAVE_SELINUX
2514 if (arg_selinux_context)
2515 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2516 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2517 #endif
2518
2519 r = change_uid_gid(arg_user, &home);
2520 if (r < 0)
2521 return r;
2522
2523 envp[n_env] = strv_find_prefix(environ, "TERM=");
2524 if (envp[n_env])
2525 n_env ++;
2526
2527 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2528 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2529 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2530 return log_oom();
2531
2532 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2533 char as_uuid[37];
2534
2535 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2536 return log_oom();
2537 }
2538
2539 if (fdset_size(fds) > 0) {
2540 r = fdset_cloexec(fds, false);
2541 if (r < 0)
2542 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2543
2544 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2545 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2546 return log_oom();
2547 }
2548
2549 env_use = strv_env_merge(2, envp, arg_setenv);
2550 if (!env_use)
2551 return log_oom();
2552
2553 /* Let the parent know that we are ready and
2554 * wait until the parent is ready with the
2555 * setup, too... */
2556 if (!barrier_place_and_sync(barrier)) { /* #4 */
2557 log_error("Parent died too early");
2558 return -ESRCH;
2559 }
2560
2561 /* Now, explicitly close the log, so that we
2562 * then can close all remaining fds. Closing
2563 * the log explicitly first has the benefit
2564 * that the logging subsystem knows about it,
2565 * and is thus ready to be reopened should we
2566 * need it again. Note that the other fds
2567 * closed here are at least the locking and
2568 * barrier fds. */
2569 log_close();
2570 (void) fdset_close_others(fds);
2571
2572 if (arg_boot) {
2573 char **a;
2574 size_t m;
2575
2576 /* Automatically search for the init system */
2577
2578 m = 1 + strv_length(arg_parameters);
2579 a = newa(char*, m + 1);
2580 if (strv_isempty(arg_parameters))
2581 a[1] = NULL;
2582 else
2583 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2584
2585 a[0] = (char*) "/usr/lib/systemd/systemd";
2586 execve(a[0], a, env_use);
2587
2588 a[0] = (char*) "/lib/systemd/systemd";
2589 execve(a[0], a, env_use);
2590
2591 a[0] = (char*) "/sbin/init";
2592 execve(a[0], a, env_use);
2593 } else if (!strv_isempty(arg_parameters))
2594 execvpe(arg_parameters[0], arg_parameters, env_use);
2595 else {
2596 chdir(home ?: "/root");
2597 execle("/bin/bash", "-bash", NULL, env_use);
2598 execle("/bin/sh", "-sh", NULL, env_use);
2599 }
2600
2601 (void) log_open();
2602 return log_error_errno(errno, "execv() failed: %m");
2603 }
2604
2605 static int outer_child(
2606 Barrier *barrier,
2607 const char *directory,
2608 const char *console,
2609 const char *root_device, bool root_device_rw,
2610 const char *home_device, bool home_device_rw,
2611 const char *srv_device, bool srv_device_rw,
2612 bool interactive,
2613 bool secondary,
2614 int pid_socket,
2615 int kmsg_socket,
2616 int rtnl_socket,
2617 int uid_shift_socket,
2618 FDSet *fds) {
2619
2620 pid_t pid;
2621 ssize_t l;
2622 int r;
2623
2624 assert(barrier);
2625 assert(directory);
2626 assert(console);
2627 assert(pid_socket >= 0);
2628 assert(kmsg_socket >= 0);
2629
2630 cg_unified_flush();
2631
2632 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2633 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2634
2635 if (interactive) {
2636 close_nointr(STDIN_FILENO);
2637 close_nointr(STDOUT_FILENO);
2638 close_nointr(STDERR_FILENO);
2639
2640 r = open_terminal(console, O_RDWR);
2641 if (r != STDIN_FILENO) {
2642 if (r >= 0) {
2643 safe_close(r);
2644 r = -EINVAL;
2645 }
2646
2647 return log_error_errno(r, "Failed to open console: %m");
2648 }
2649
2650 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2651 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2652 return log_error_errno(errno, "Failed to duplicate console: %m");
2653 }
2654
2655 r = reset_audit_loginuid();
2656 if (r < 0)
2657 return r;
2658
2659 /* Mark everything as slave, so that we still
2660 * receive mounts from the real root, but don't
2661 * propagate mounts to the real root. */
2662 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2663 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2664
2665 r = mount_devices(directory,
2666 root_device, root_device_rw,
2667 home_device, home_device_rw,
2668 srv_device, srv_device_rw);
2669 if (r < 0)
2670 return r;
2671
2672 r = determine_uid_shift(directory);
2673 if (r < 0)
2674 return r;
2675
2676 if (arg_userns) {
2677 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2678 if (l < 0)
2679 return log_error_errno(errno, "Failed to send UID shift: %m");
2680 if (l != sizeof(arg_uid_shift)) {
2681 log_error("Short write while sending UID shift.");
2682 return -EIO;
2683 }
2684 }
2685
2686 /* Turn directory into bind mount */
2687 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2688 return log_error_errno(errno, "Failed to make bind mount: %m");
2689
2690 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2691 if (r < 0)
2692 return r;
2693
2694 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2695 if (r < 0)
2696 return r;
2697
2698 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2699 if (r < 0)
2700 return r;
2701
2702 if (arg_read_only) {
2703 r = bind_remount_recursive(directory, true);
2704 if (r < 0)
2705 return log_error_errno(r, "Failed to make tree read-only: %m");
2706 }
2707
2708 r = mount_all(directory, arg_userns, false, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2709 if (r < 0)
2710 return r;
2711
2712 r = copy_devnodes(directory);
2713 if (r < 0)
2714 return r;
2715
2716 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2717
2718 r = setup_pts(directory);
2719 if (r < 0)
2720 return r;
2721
2722 r = setup_propagate(directory);
2723 if (r < 0)
2724 return r;
2725
2726 r = setup_dev_console(directory, console);
2727 if (r < 0)
2728 return r;
2729
2730 r = setup_seccomp();
2731 if (r < 0)
2732 return r;
2733
2734 r = setup_timezone(directory);
2735 if (r < 0)
2736 return r;
2737
2738 r = setup_resolv_conf(directory);
2739 if (r < 0)
2740 return r;
2741
2742 r = setup_journal(directory);
2743 if (r < 0)
2744 return r;
2745
2746 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2747 if (r < 0)
2748 return r;
2749
2750 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2751 if (r < 0)
2752 return r;
2753
2754 r = mount_move_root(directory);
2755 if (r < 0)
2756 return log_error_errno(r, "Failed to move root directory: %m");
2757
2758 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2759 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2760 (arg_private_network ? CLONE_NEWNET : 0) |
2761 (arg_userns ? CLONE_NEWUSER : 0),
2762 NULL);
2763 if (pid < 0)
2764 return log_error_errno(errno, "Failed to fork inner child: %m");
2765 if (pid == 0) {
2766 pid_socket = safe_close(pid_socket);
2767 uid_shift_socket = safe_close(uid_shift_socket);
2768
2769 /* The inner child has all namespaces that are
2770 * requested, so that we all are owned by the user if
2771 * user namespaces are turned on. */
2772
2773 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2774 if (r < 0)
2775 _exit(EXIT_FAILURE);
2776
2777 _exit(EXIT_SUCCESS);
2778 }
2779
2780 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2781 if (l < 0)
2782 return log_error_errno(errno, "Failed to send PID: %m");
2783 if (l != sizeof(pid)) {
2784 log_error("Short write while sending PID.");
2785 return -EIO;
2786 }
2787
2788 pid_socket = safe_close(pid_socket);
2789 kmsg_socket = safe_close(kmsg_socket);
2790 rtnl_socket = safe_close(rtnl_socket);
2791
2792 return 0;
2793 }
2794
2795 static int setup_uid_map(pid_t pid) {
2796 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2797 int r;
2798
2799 assert(pid > 1);
2800
2801 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2802 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2803 r = write_string_file(uid_map, line, 0);
2804 if (r < 0)
2805 return log_error_errno(r, "Failed to write UID map: %m");
2806
2807 /* We always assign the same UID and GID ranges */
2808 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2809 r = write_string_file(uid_map, line, 0);
2810 if (r < 0)
2811 return log_error_errno(r, "Failed to write GID map: %m");
2812
2813 return 0;
2814 }
2815
2816 static int load_settings(void) {
2817 _cleanup_(settings_freep) Settings *settings = NULL;
2818 _cleanup_fclose_ FILE *f = NULL;
2819 _cleanup_free_ char *p = NULL;
2820 const char *fn, *i;
2821 int r;
2822
2823 /* If all settings are masked, there's no point in looking for
2824 * the settings file */
2825 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2826 return 0;
2827
2828 fn = strjoina(arg_machine, ".nspawn");
2829
2830 /* We first look in the admin's directories in /etc and /run */
2831 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2832 _cleanup_free_ char *j = NULL;
2833
2834 j = strjoin(i, "/", fn, NULL);
2835 if (!j)
2836 return log_oom();
2837
2838 f = fopen(j, "re");
2839 if (f) {
2840 p = j;
2841 j = NULL;
2842
2843 /* By default we trust configuration from /etc and /run */
2844 if (arg_settings_trusted < 0)
2845 arg_settings_trusted = true;
2846
2847 break;
2848 }
2849
2850 if (errno != ENOENT)
2851 return log_error_errno(errno, "Failed to open %s: %m", j);
2852 }
2853
2854 if (!f) {
2855 /* After that, let's look for a file next to the
2856 * actual image we shall boot. */
2857
2858 if (arg_image) {
2859 p = file_in_same_dir(arg_image, fn);
2860 if (!p)
2861 return log_oom();
2862 } else if (arg_directory) {
2863 p = file_in_same_dir(arg_directory, fn);
2864 if (!p)
2865 return log_oom();
2866 }
2867
2868 if (p) {
2869 f = fopen(p, "re");
2870 if (!f && errno != ENOENT)
2871 return log_error_errno(errno, "Failed to open %s: %m", p);
2872
2873 /* By default we do not trust configuration from /var/lib/machines */
2874 if (arg_settings_trusted < 0)
2875 arg_settings_trusted = false;
2876 }
2877 }
2878
2879 if (!f)
2880 return 0;
2881
2882 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2883
2884 r = settings_load(f, p, &settings);
2885 if (r < 0)
2886 return r;
2887
2888 /* Copy over bits from the settings, unless they have been
2889 * explicitly masked by command line switches. */
2890
2891 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2892 settings->boot >= 0) {
2893 arg_boot = settings->boot;
2894
2895 strv_free(arg_parameters);
2896 arg_parameters = settings->parameters;
2897 settings->parameters = NULL;
2898 }
2899
2900 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2901 settings->environment) {
2902 strv_free(arg_setenv);
2903 arg_setenv = settings->environment;
2904 settings->environment = NULL;
2905 }
2906
2907 if ((arg_settings_mask & SETTING_USER) == 0 &&
2908 settings->user) {
2909 free(arg_user);
2910 arg_user = settings->user;
2911 settings->user = NULL;
2912 }
2913
2914 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2915
2916 if (!arg_settings_trusted && settings->capability != 0)
2917 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2918 else
2919 arg_retain |= settings->capability;
2920
2921 arg_retain &= ~settings->drop_capability;
2922 }
2923
2924 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2925 settings->kill_signal > 0)
2926 arg_kill_signal = settings->kill_signal;
2927
2928 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2929 settings->personality != PERSONALITY_INVALID)
2930 arg_personality = settings->personality;
2931
2932 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2933 !sd_id128_is_null(settings->machine_id)) {
2934
2935 if (!arg_settings_trusted)
2936 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2937 else
2938 arg_uuid = settings->machine_id;
2939 }
2940
2941 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2942 settings->read_only >= 0)
2943 arg_read_only = settings->read_only;
2944
2945 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2946 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2947 arg_volatile_mode = settings->volatile_mode;
2948
2949 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2950 settings->n_custom_mounts > 0) {
2951
2952 if (!arg_settings_trusted)
2953 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2954 else {
2955 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2956 arg_custom_mounts = settings->custom_mounts;
2957 arg_n_custom_mounts = settings->n_custom_mounts;
2958
2959 settings->custom_mounts = NULL;
2960 settings->n_custom_mounts = 0;
2961 }
2962 }
2963
2964 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2965 (settings->private_network >= 0 ||
2966 settings->network_veth >= 0 ||
2967 settings->network_bridge ||
2968 settings->network_interfaces ||
2969 settings->network_macvlan ||
2970 settings->network_ipvlan)) {
2971
2972 if (!arg_settings_trusted)
2973 log_warning("Ignoring network settings, file %s is not trusted.", p);
2974 else {
2975 strv_free(arg_network_interfaces);
2976 arg_network_interfaces = settings->network_interfaces;
2977 settings->network_interfaces = NULL;
2978
2979 strv_free(arg_network_macvlan);
2980 arg_network_macvlan = settings->network_macvlan;
2981 settings->network_macvlan = NULL;
2982
2983 strv_free(arg_network_ipvlan);
2984 arg_network_ipvlan = settings->network_ipvlan;
2985 settings->network_ipvlan = NULL;
2986
2987 free(arg_network_bridge);
2988 arg_network_bridge = settings->network_bridge;
2989 settings->network_bridge = NULL;
2990
2991 arg_network_veth = settings->network_veth > 0 || settings->network_bridge;
2992
2993 arg_private_network = true; /* all these settings imply private networking */
2994 }
2995 }
2996
2997 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2998 settings->expose_ports) {
2999
3000 if (!arg_settings_trusted)
3001 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3002 else {
3003 expose_port_free_all(arg_expose_ports);
3004 arg_expose_ports = settings->expose_ports;
3005 settings->expose_ports = NULL;
3006 }
3007 }
3008
3009 return 0;
3010 }
3011
3012 int main(int argc, char *argv[]) {
3013
3014 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3015 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3016 _cleanup_close_ int master = -1, image_fd = -1;
3017 _cleanup_fdset_free_ FDSet *fds = NULL;
3018 int r, n_fd_passed, loop_nr = -1;
3019 char veth_name[IFNAMSIZ];
3020 bool secondary = false, remove_subvol = false;
3021 sigset_t mask_chld;
3022 pid_t pid = 0;
3023 int ret = EXIT_SUCCESS;
3024 union in_addr_union exposed = {};
3025 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3026 bool interactive;
3027
3028 log_parse_environment();
3029 log_open();
3030
3031 r = parse_argv(argc, argv);
3032 if (r <= 0)
3033 goto finish;
3034
3035 if (geteuid() != 0) {
3036 log_error("Need to be root.");
3037 r = -EPERM;
3038 goto finish;
3039 }
3040 r = determine_names();
3041 if (r < 0)
3042 goto finish;
3043
3044 r = load_settings();
3045 if (r < 0)
3046 goto finish;
3047
3048 r = verify_arguments();
3049 if (r < 0)
3050 goto finish;
3051
3052 n_fd_passed = sd_listen_fds(false);
3053 if (n_fd_passed > 0) {
3054 r = fdset_new_listen_fds(&fds, false);
3055 if (r < 0) {
3056 log_error_errno(r, "Failed to collect file descriptors: %m");
3057 goto finish;
3058 }
3059 }
3060
3061 if (arg_directory) {
3062 assert(!arg_image);
3063
3064 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3065 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3066 r = -EINVAL;
3067 goto finish;
3068 }
3069
3070 if (arg_ephemeral) {
3071 _cleanup_free_ char *np = NULL;
3072
3073 /* If the specified path is a mount point we
3074 * generate the new snapshot immediately
3075 * inside it under a random name. However if
3076 * the specified is not a mount point we
3077 * create the new snapshot in the parent
3078 * directory, just next to it. */
3079 r = path_is_mount_point(arg_directory, 0);
3080 if (r < 0) {
3081 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3082 goto finish;
3083 }
3084 if (r > 0)
3085 r = tempfn_random_child(arg_directory, "machine.", &np);
3086 else
3087 r = tempfn_random(arg_directory, "machine.", &np);
3088 if (r < 0) {
3089 log_error_errno(r, "Failed to generate name for snapshot: %m");
3090 goto finish;
3091 }
3092
3093 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3094 if (r < 0) {
3095 log_error_errno(r, "Failed to lock %s: %m", np);
3096 goto finish;
3097 }
3098
3099 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3100 if (r < 0) {
3101 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3102 goto finish;
3103 }
3104
3105 free(arg_directory);
3106 arg_directory = np;
3107 np = NULL;
3108
3109 remove_subvol = true;
3110
3111 } else {
3112 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3113 if (r == -EBUSY) {
3114 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3115 goto finish;
3116 }
3117 if (r < 0) {
3118 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3119 return r;
3120 }
3121
3122 if (arg_template) {
3123 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3124 if (r == -EEXIST) {
3125 if (!arg_quiet)
3126 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3127 } else if (r < 0) {
3128 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3129 goto finish;
3130 } else {
3131 if (!arg_quiet)
3132 log_info("Populated %s from template %s.", arg_directory, arg_template);
3133 }
3134 }
3135 }
3136
3137 if (arg_boot) {
3138 if (path_is_os_tree(arg_directory) <= 0) {
3139 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3140 r = -EINVAL;
3141 goto finish;
3142 }
3143 } else {
3144 const char *p;
3145
3146 p = strjoina(arg_directory,
3147 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3148 if (access(p, F_OK) < 0) {
3149 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3150 r = -EINVAL;
3151 goto finish;
3152 }
3153 }
3154
3155 } else {
3156 char template[] = "/tmp/nspawn-root-XXXXXX";
3157
3158 assert(arg_image);
3159 assert(!arg_template);
3160
3161 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3162 if (r == -EBUSY) {
3163 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3164 goto finish;
3165 }
3166 if (r < 0) {
3167 r = log_error_errno(r, "Failed to create image lock: %m");
3168 goto finish;
3169 }
3170
3171 if (!mkdtemp(template)) {
3172 log_error_errno(errno, "Failed to create temporary directory: %m");
3173 r = -errno;
3174 goto finish;
3175 }
3176
3177 arg_directory = strdup(template);
3178 if (!arg_directory) {
3179 r = log_oom();
3180 goto finish;
3181 }
3182
3183 image_fd = setup_image(&device_path, &loop_nr);
3184 if (image_fd < 0) {
3185 r = image_fd;
3186 goto finish;
3187 }
3188
3189 r = dissect_image(image_fd,
3190 &root_device, &root_device_rw,
3191 &home_device, &home_device_rw,
3192 &srv_device, &srv_device_rw,
3193 &secondary);
3194 if (r < 0)
3195 goto finish;
3196 }
3197
3198 r = custom_mounts_prepare();
3199 if (r < 0)
3200 goto finish;
3201
3202 interactive =
3203 isatty(STDIN_FILENO) > 0 &&
3204 isatty(STDOUT_FILENO) > 0;
3205
3206 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3207 if (master < 0) {
3208 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3209 goto finish;
3210 }
3211
3212 r = ptsname_malloc(master, &console);
3213 if (r < 0) {
3214 r = log_error_errno(r, "Failed to determine tty name: %m");
3215 goto finish;
3216 }
3217
3218 if (unlockpt(master) < 0) {
3219 r = log_error_errno(errno, "Failed to unlock tty: %m");
3220 goto finish;
3221 }
3222
3223 if (!arg_quiet)
3224 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3225 arg_machine, arg_image ?: arg_directory);
3226
3227 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3228
3229 assert_se(sigemptyset(&mask_chld) == 0);
3230 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3231
3232 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3233 r = log_error_errno(errno, "Failed to become subreaper: %m");
3234 goto finish;
3235 }
3236
3237 for (;;) {
3238 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3239 uid_shift_socket_pair[2] = { -1, -1 };
3240 ContainerStatus container_status;
3241 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3242 static const struct sigaction sa = {
3243 .sa_handler = nop_signal_handler,
3244 .sa_flags = SA_NOCLDSTOP,
3245 };
3246 int ifi = 0;
3247 ssize_t l;
3248 _cleanup_event_unref_ sd_event *event = NULL;
3249 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3250 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3251 char last_char = 0;
3252
3253 r = barrier_create(&barrier);
3254 if (r < 0) {
3255 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3256 goto finish;
3257 }
3258
3259 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3260 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3261 goto finish;
3262 }
3263
3264 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3265 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3266 goto finish;
3267 }
3268
3269 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3270 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3271 goto finish;
3272 }
3273
3274 if (arg_userns)
3275 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3276 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3277 goto finish;
3278 }
3279
3280 /* Child can be killed before execv(), so handle SIGCHLD
3281 * in order to interrupt parent's blocking calls and
3282 * give it a chance to call wait() and terminate. */
3283 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3284 if (r < 0) {
3285 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3286 goto finish;
3287 }
3288
3289 r = sigaction(SIGCHLD, &sa, NULL);
3290 if (r < 0) {
3291 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3292 goto finish;
3293 }
3294
3295 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3296 if (pid < 0) {
3297 if (errno == EINVAL)
3298 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3299 else
3300 r = log_error_errno(errno, "clone() failed: %m");
3301
3302 goto finish;
3303 }
3304
3305 if (pid == 0) {
3306 /* The outer child only has a file system namespace. */
3307 barrier_set_role(&barrier, BARRIER_CHILD);
3308
3309 master = safe_close(master);
3310
3311 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3312 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3313 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3314 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3315
3316 (void) reset_all_signal_handlers();
3317 (void) reset_signal_mask();
3318
3319 r = outer_child(&barrier,
3320 arg_directory,
3321 console,
3322 root_device, root_device_rw,
3323 home_device, home_device_rw,
3324 srv_device, srv_device_rw,
3325 interactive,
3326 secondary,
3327 pid_socket_pair[1],
3328 kmsg_socket_pair[1],
3329 rtnl_socket_pair[1],
3330 uid_shift_socket_pair[1],
3331 fds);
3332 if (r < 0)
3333 _exit(EXIT_FAILURE);
3334
3335 _exit(EXIT_SUCCESS);
3336 }
3337
3338 barrier_set_role(&barrier, BARRIER_PARENT);
3339
3340 fds = fdset_free(fds);
3341
3342 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3343 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3344 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3345 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3346
3347 /* Wait for the outer child. */
3348 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3349 if (r < 0)
3350 goto finish;
3351 if (r != 0) {
3352 r = -EIO;
3353 goto finish;
3354 }
3355 pid = 0;
3356
3357 /* And now retrieve the PID of the inner child. */
3358 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3359 if (l < 0) {
3360 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3361 goto finish;
3362 }
3363 if (l != sizeof(pid)) {
3364 log_error("Short read while reading inner child PID.");
3365 r = EIO;
3366 goto finish;
3367 }
3368
3369 log_debug("Init process invoked as PID " PID_FMT, pid);
3370
3371 if (arg_userns) {
3372 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3373 log_error("Child died too early.");
3374 r = -ESRCH;
3375 goto finish;
3376 }
3377
3378 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3379 if (l < 0) {
3380 r = log_error_errno(errno, "Failed to read UID shift: %m");
3381 goto finish;
3382 }
3383 if (l != sizeof(arg_uid_shift)) {
3384 log_error("Short read while reading UID shift.");
3385 r = EIO;
3386 goto finish;
3387 }
3388
3389 r = setup_uid_map(pid);
3390 if (r < 0)
3391 goto finish;
3392
3393 (void) barrier_place(&barrier); /* #2 */
3394 }
3395
3396 if (arg_private_network) {
3397
3398 r = move_network_interfaces(pid, arg_network_interfaces);
3399 if (r < 0)
3400 goto finish;
3401
3402 if (arg_network_veth) {
3403 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3404 if (r < 0)
3405 goto finish;
3406 else if (r > 0)
3407 ifi = r;
3408
3409 if (arg_network_bridge) {
3410 r = setup_bridge(veth_name, arg_network_bridge);
3411 if (r < 0)
3412 goto finish;
3413 if (r > 0)
3414 ifi = r;
3415 }
3416 }
3417
3418 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3419 if (r < 0)
3420 goto finish;
3421
3422 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3423 if (r < 0)
3424 goto finish;
3425 }
3426
3427 if (arg_register) {
3428 r = register_machine(
3429 arg_machine,
3430 pid,
3431 arg_directory,
3432 arg_uuid,
3433 ifi,
3434 arg_slice,
3435 arg_custom_mounts, arg_n_custom_mounts,
3436 arg_kill_signal,
3437 arg_property,
3438 arg_keep_unit);
3439 if (r < 0)
3440 goto finish;
3441 }
3442
3443 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3444 if (r < 0)
3445 goto finish;
3446
3447 if (arg_keep_unit) {
3448 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3449 if (r < 0)
3450 goto finish;
3451 }
3452
3453 r = chown_cgroup(pid, arg_uid_shift);
3454 if (r < 0)
3455 goto finish;
3456
3457 /* Notify the child that the parent is ready with all
3458 * its setup (including cgroup-ification), and that
3459 * the child can now hand over control to the code to
3460 * run inside the container. */
3461 (void) barrier_place(&barrier); /* #3 */
3462
3463 /* Block SIGCHLD here, before notifying child.
3464 * process_pty() will handle it with the other signals. */
3465 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3466
3467 /* Reset signal to default */
3468 r = default_signals(SIGCHLD, -1);
3469 if (r < 0) {
3470 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3471 goto finish;
3472 }
3473
3474 /* Let the child know that we are ready and wait that the child is completely ready now. */
3475 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3476 log_error("Child died too early.");
3477 r = -ESRCH;
3478 goto finish;
3479 }
3480
3481 sd_notifyf(false,
3482 "READY=1\n"
3483 "STATUS=Container running.\n"
3484 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3485
3486 r = sd_event_new(&event);
3487 if (r < 0) {
3488 log_error_errno(r, "Failed to get default event source: %m");
3489 goto finish;
3490 }
3491
3492 if (arg_kill_signal > 0) {
3493 /* Try to kill the init system on SIGINT or SIGTERM */
3494 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3495 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3496 } else {
3497 /* Immediately exit */
3498 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3499 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3500 }
3501
3502 /* simply exit on sigchld */
3503 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3504
3505 if (arg_expose_ports) {
3506 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3507 if (r < 0)
3508 goto finish;
3509
3510 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3511 }
3512
3513 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3514
3515 r = pty_forward_new(event, master, true, !interactive, &forward);
3516 if (r < 0) {
3517 log_error_errno(r, "Failed to create PTY forwarder: %m");
3518 goto finish;
3519 }
3520
3521 r = sd_event_loop(event);
3522 if (r < 0) {
3523 log_error_errno(r, "Failed to run event loop: %m");
3524 goto finish;
3525 }
3526
3527 pty_forward_get_last_char(forward, &last_char);
3528
3529 forward = pty_forward_free(forward);
3530
3531 if (!arg_quiet && last_char != '\n')
3532 putc('\n', stdout);
3533
3534 /* Kill if it is not dead yet anyway */
3535 if (arg_register && !arg_keep_unit)
3536 terminate_machine(pid);
3537
3538 /* Normally redundant, but better safe than sorry */
3539 kill(pid, SIGKILL);
3540
3541 r = wait_for_container(pid, &container_status);
3542 pid = 0;
3543
3544 if (r < 0)
3545 /* We failed to wait for the container, or the
3546 * container exited abnormally */
3547 goto finish;
3548 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3549 /* The container exited with a non-zero
3550 * status, or with zero status and no reboot
3551 * was requested. */
3552 ret = r;
3553 break;
3554 }
3555
3556 /* CONTAINER_REBOOTED, loop again */
3557
3558 if (arg_keep_unit) {
3559 /* Special handling if we are running as a
3560 * service: instead of simply restarting the
3561 * machine we want to restart the entire
3562 * service, so let's inform systemd about this
3563 * with the special exit code 133. The service
3564 * file uses RestartForceExitStatus=133 so
3565 * that this results in a full nspawn
3566 * restart. This is necessary since we might
3567 * have cgroup parameters set we want to have
3568 * flushed out. */
3569 ret = 133;
3570 r = 0;
3571 break;
3572 }
3573
3574 expose_port_flush(arg_expose_ports, &exposed);
3575 }
3576
3577 finish:
3578 sd_notify(false,
3579 "STOPPING=1\n"
3580 "STATUS=Terminating...");
3581
3582 if (pid > 0)
3583 kill(pid, SIGKILL);
3584
3585 /* Try to flush whatever is still queued in the pty */
3586 if (master >= 0)
3587 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3588
3589 loop_remove(loop_nr, &image_fd);
3590
3591 if (remove_subvol && arg_directory) {
3592 int k;
3593
3594 k = btrfs_subvol_remove(arg_directory, true);
3595 if (k < 0)
3596 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3597 }
3598
3599 if (arg_machine) {
3600 const char *p;
3601
3602 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3603 (void) rm_rf(p, REMOVE_ROOT);
3604 }
3605
3606 expose_port_flush(arg_expose_ports, &exposed);
3607
3608 free(arg_directory);
3609 free(arg_template);
3610 free(arg_image);
3611 free(arg_machine);
3612 free(arg_user);
3613 strv_free(arg_setenv);
3614 free(arg_network_bridge);
3615 strv_free(arg_network_interfaces);
3616 strv_free(arg_network_macvlan);
3617 strv_free(arg_network_ipvlan);
3618 strv_free(arg_parameters);
3619 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3620 expose_port_free_all(arg_expose_ports);
3621
3622 return r < 0 ? EXIT_FAILURE : ret;
3623 }