]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #1195 from poettering/nspawn-fixes
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "build.h"
54 #include "cap-list.h"
55 #include "capability.h"
56 #include "cgroup-util.h"
57 #include "copy.h"
58 #include "dev-setup.h"
59 #include "env-util.h"
60 #include "event-util.h"
61 #include "fdset.h"
62 #include "fileio.h"
63 #include "formats-util.h"
64 #include "gpt.h"
65 #include "hostname-util.h"
66 #include "log.h"
67 #include "loopback-setup.h"
68 #include "machine-image.h"
69 #include "macro.h"
70 #include "missing.h"
71 #include "mkdir.h"
72 #include "netlink-util.h"
73 #include "path-util.h"
74 #include "process-util.h"
75 #include "ptyfwd.h"
76 #include "random-util.h"
77 #include "rm-rf.h"
78 #ifdef HAVE_SECCOMP
79 #include "seccomp-util.h"
80 #endif
81 #include "signal-util.h"
82 #include "strv.h"
83 #include "terminal-util.h"
84 #include "udev-util.h"
85 #include "util.h"
86
87 #include "nspawn-settings.h"
88 #include "nspawn-mount.h"
89 #include "nspawn-network.h"
90 #include "nspawn-expose-ports.h"
91 #include "nspawn-cgroup.h"
92 #include "nspawn-register.h"
93 #include "nspawn-setuid.h"
94
/* How a container run ended. The code consuming these values is not part of
 * this section; judging by the names, TERMINATED means the container exited
 * and REBOOTED means it asked to be restarted -- confirm against the caller. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
99
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants accepted on the command line map onto LINK_GUEST/LINK_HOST plus a
 * separate boolean, so they have no enumerator of their own. */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO = 1,  /* --link-journal=auto, the compiled-in default */
        LINK_HOST = 2,  /* --link-journal=host / try-host */
        LINK_GUEST = 3  /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
106
107 static char *arg_directory = NULL;
108 static char *arg_template = NULL;
109 static char *arg_user = NULL;
110 static sd_id128_t arg_uuid = {};
111 static char *arg_machine = NULL;
112 static const char *arg_selinux_context = NULL;
113 static const char *arg_selinux_apifs_context = NULL;
114 static const char *arg_slice = NULL;
115 static bool arg_private_network = false;
116 static bool arg_read_only = false;
117 static bool arg_boot = false;
118 static bool arg_ephemeral = false;
119 static LinkJournal arg_link_journal = LINK_AUTO;
120 static bool arg_link_journal_try = false;
121 static uint64_t arg_retain =
122 (1ULL << CAP_CHOWN) |
123 (1ULL << CAP_DAC_OVERRIDE) |
124 (1ULL << CAP_DAC_READ_SEARCH) |
125 (1ULL << CAP_FOWNER) |
126 (1ULL << CAP_FSETID) |
127 (1ULL << CAP_IPC_OWNER) |
128 (1ULL << CAP_KILL) |
129 (1ULL << CAP_LEASE) |
130 (1ULL << CAP_LINUX_IMMUTABLE) |
131 (1ULL << CAP_NET_BIND_SERVICE) |
132 (1ULL << CAP_NET_BROADCAST) |
133 (1ULL << CAP_NET_RAW) |
134 (1ULL << CAP_SETGID) |
135 (1ULL << CAP_SETFCAP) |
136 (1ULL << CAP_SETPCAP) |
137 (1ULL << CAP_SETUID) |
138 (1ULL << CAP_SYS_ADMIN) |
139 (1ULL << CAP_SYS_CHROOT) |
140 (1ULL << CAP_SYS_NICE) |
141 (1ULL << CAP_SYS_PTRACE) |
142 (1ULL << CAP_SYS_TTY_CONFIG) |
143 (1ULL << CAP_SYS_RESOURCE) |
144 (1ULL << CAP_SYS_BOOT) |
145 (1ULL << CAP_AUDIT_WRITE) |
146 (1ULL << CAP_AUDIT_CONTROL) |
147 (1ULL << CAP_MKNOD);
148 static CustomMount *arg_custom_mounts = NULL;
149 static unsigned arg_n_custom_mounts = 0;
150 static char **arg_setenv = NULL;
151 static bool arg_quiet = false;
152 static bool arg_share_system = false;
153 static bool arg_register = true;
154 static bool arg_keep_unit = false;
155 static char **arg_network_interfaces = NULL;
156 static char **arg_network_macvlan = NULL;
157 static char **arg_network_ipvlan = NULL;
158 static bool arg_network_veth = false;
159 static char *arg_network_bridge = NULL;
160 static unsigned long arg_personality = PERSONALITY_INVALID;
161 static char *arg_image = NULL;
162 static VolatileMode arg_volatile_mode = VOLATILE_NO;
163 static ExposePort *arg_expose_ports = NULL;
164 static char **arg_property = NULL;
165 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
166 static bool arg_userns = false;
167 static int arg_kill_signal = 0;
168 static bool arg_unified_cgroup_hierarchy = false;
169 static SettingsMask arg_settings_mask = 0;
170 static int arg_settings_trusted = -1;
171 static char **arg_parameters = NULL;
172
173 static void help(void) {
174 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
175 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
176 " -h --help Show this help\n"
177 " --version Print version string\n"
178 " -q --quiet Do not show status information\n"
179 " -D --directory=PATH Root directory for the container\n"
180 " --template=PATH Initialize root directory from template directory,\n"
181 " if missing\n"
182 " -x --ephemeral Run container with snapshot of root directory, and\n"
183 " remove it after exit\n"
184 " -i --image=PATH File system device or disk image for the container\n"
185 " -b --boot Boot up full system (i.e. invoke init)\n"
186 " -u --user=USER Run the command under specified user or uid\n"
187 " -M --machine=NAME Set the machine name for the container\n"
188 " --uuid=UUID Set a specific machine UUID for the container\n"
189 " -S --slice=SLICE Place the container in the specified slice\n"
190 " --property=NAME=VALUE Set scope unit property\n"
191 " --private-users[=UIDBASE[:NUIDS]]\n"
192 " Run within user namespace\n"
193 " --private-network Disable network in container\n"
194 " --network-interface=INTERFACE\n"
195 " Assign an existing network interface to the\n"
196 " container\n"
197 " --network-macvlan=INTERFACE\n"
198 " Create a macvlan network interface based on an\n"
199 " existing network interface to the container\n"
200 " --network-ipvlan=INTERFACE\n"
201 " Create a ipvlan network interface based on an\n"
202 " existing network interface to the container\n"
203 " -n --network-veth Add a virtual ethernet connection between host\n"
204 " and container\n"
205 " --network-bridge=INTERFACE\n"
206 " Add a virtual ethernet connection between host\n"
207 " and container and add it to an existing bridge on\n"
208 " the host\n"
209 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
210 " Expose a container IP port on the host\n"
211 " -Z --selinux-context=SECLABEL\n"
212 " Set the SELinux security context to be used by\n"
213 " processes in the container\n"
214 " -L --selinux-apifs-context=SECLABEL\n"
215 " Set the SELinux security context to be used by\n"
216 " API/tmpfs file systems in the container\n"
217 " --capability=CAP In addition to the default, retain specified\n"
218 " capability\n"
219 " --drop-capability=CAP Drop the specified capability from the default set\n"
220 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
221 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
222 " try-guest, try-host\n"
223 " -j Equivalent to --link-journal=try-guest\n"
224 " --read-only Mount the root directory read-only\n"
225 " --bind=PATH[:PATH[:OPTIONS]]\n"
226 " Bind mount a file or directory from the host into\n"
227 " the container\n"
228 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
229 " Similar, but creates a read-only bind mount\n"
230 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
231 " --overlay=PATH[:PATH...]:PATH\n"
232 " Create an overlay mount from the host to \n"
233 " the container\n"
234 " --overlay-ro=PATH[:PATH...]:PATH\n"
235 " Similar, but creates a read-only overlay mount\n"
236 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
237 " --share-system Share system namespaces with host\n"
238 " --register=BOOLEAN Register container as machine\n"
239 " --keep-unit Do not register a scope for the machine, reuse\n"
240 " the service unit nspawn is running in\n"
241 " --volatile[=MODE] Run the system in volatile mode\n"
242 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
243 , program_invocation_short_name);
244 }
245
246
247 static int custom_mounts_prepare(void) {
248 unsigned i;
249 int r;
250
251 /* Ensure the mounts are applied prefix first. */
252 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
253
254 /* Allocate working directories for the overlay file systems that need it */
255 for (i = 0; i < arg_n_custom_mounts; i++) {
256 CustomMount *m = &arg_custom_mounts[i];
257
258 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
259 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
260 return -EINVAL;
261 }
262
263 if (m->type != CUSTOM_MOUNT_OVERLAY)
264 continue;
265
266 if (m->work_dir)
267 continue;
268
269 if (m->read_only)
270 continue;
271
272 r = tempfn_random(m->source, NULL, &m->work_dir);
273 if (r < 0)
274 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
275 }
276
277 return 0;
278 }
279
/* Replaces *b with a cleaned-up, absolute version of path.
 *
 * The path is canonicalized if it exists. If it does not exist (ENOENT) it is
 * only made absolute relative to the current working directory. In both cases
 * the result is passed through path_kill_slashes() before it is stored. The
 * previous value of *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for a reason other
 * than ENOENT, -ENOMEM if the fallback allocation fails. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Does not exist (yet), the best we can do is make it absolute */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
300
301 static int detect_unified_cgroup_hierarchy(void) {
302 const char *e;
303 int r;
304
305 /* Allow the user to control whether the unified hierarchy is used */
306 e = getenv("UNIFIED_CGROUP_HIERARCHY");
307 if (e) {
308 r = parse_boolean(e);
309 if (r < 0)
310 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
311
312 arg_unified_cgroup_hierarchy = r;
313 return 0;
314 }
315
316 /* Otherwise inherit the default from the host system */
317 r = cg_unified();
318 if (r < 0)
319 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
320
321 arg_unified_cgroup_hierarchy = r;
322 return 0;
323 }
324
325 static int parse_argv(int argc, char *argv[]) {
326
327 enum {
328 ARG_VERSION = 0x100,
329 ARG_PRIVATE_NETWORK,
330 ARG_UUID,
331 ARG_READ_ONLY,
332 ARG_CAPABILITY,
333 ARG_DROP_CAPABILITY,
334 ARG_LINK_JOURNAL,
335 ARG_BIND,
336 ARG_BIND_RO,
337 ARG_TMPFS,
338 ARG_OVERLAY,
339 ARG_OVERLAY_RO,
340 ARG_SETENV,
341 ARG_SHARE_SYSTEM,
342 ARG_REGISTER,
343 ARG_KEEP_UNIT,
344 ARG_NETWORK_INTERFACE,
345 ARG_NETWORK_MACVLAN,
346 ARG_NETWORK_IPVLAN,
347 ARG_NETWORK_BRIDGE,
348 ARG_PERSONALITY,
349 ARG_VOLATILE,
350 ARG_TEMPLATE,
351 ARG_PROPERTY,
352 ARG_PRIVATE_USERS,
353 ARG_KILL_SIGNAL,
354 ARG_SETTINGS,
355 };
356
357 static const struct option options[] = {
358 { "help", no_argument, NULL, 'h' },
359 { "version", no_argument, NULL, ARG_VERSION },
360 { "directory", required_argument, NULL, 'D' },
361 { "template", required_argument, NULL, ARG_TEMPLATE },
362 { "ephemeral", no_argument, NULL, 'x' },
363 { "user", required_argument, NULL, 'u' },
364 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
365 { "boot", no_argument, NULL, 'b' },
366 { "uuid", required_argument, NULL, ARG_UUID },
367 { "read-only", no_argument, NULL, ARG_READ_ONLY },
368 { "capability", required_argument, NULL, ARG_CAPABILITY },
369 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
370 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
371 { "bind", required_argument, NULL, ARG_BIND },
372 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
373 { "tmpfs", required_argument, NULL, ARG_TMPFS },
374 { "overlay", required_argument, NULL, ARG_OVERLAY },
375 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
376 { "machine", required_argument, NULL, 'M' },
377 { "slice", required_argument, NULL, 'S' },
378 { "setenv", required_argument, NULL, ARG_SETENV },
379 { "selinux-context", required_argument, NULL, 'Z' },
380 { "selinux-apifs-context", required_argument, NULL, 'L' },
381 { "quiet", no_argument, NULL, 'q' },
382 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
383 { "register", required_argument, NULL, ARG_REGISTER },
384 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
385 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
386 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
387 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
388 { "network-veth", no_argument, NULL, 'n' },
389 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
390 { "personality", required_argument, NULL, ARG_PERSONALITY },
391 { "image", required_argument, NULL, 'i' },
392 { "volatile", optional_argument, NULL, ARG_VOLATILE },
393 { "port", required_argument, NULL, 'p' },
394 { "property", required_argument, NULL, ARG_PROPERTY },
395 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
396 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
397 { "settings", required_argument, NULL, ARG_SETTINGS },
398 {}
399 };
400
401 int c, r;
402 uint64_t plus = 0, minus = 0;
403 bool mask_all_settings = false, mask_no_settings = false;
404
405 assert(argc >= 0);
406 assert(argv);
407
408 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
409
410 switch (c) {
411
412 case 'h':
413 help();
414 return 0;
415
416 case ARG_VERSION:
417 puts(PACKAGE_STRING);
418 puts(SYSTEMD_FEATURES);
419 return 0;
420
421 case 'D':
422 r = set_sanitized_path(&arg_directory, optarg);
423 if (r < 0)
424 return log_error_errno(r, "Invalid root directory: %m");
425
426 break;
427
428 case ARG_TEMPLATE:
429 r = set_sanitized_path(&arg_template, optarg);
430 if (r < 0)
431 return log_error_errno(r, "Invalid template directory: %m");
432
433 break;
434
435 case 'i':
436 r = set_sanitized_path(&arg_image, optarg);
437 if (r < 0)
438 return log_error_errno(r, "Invalid image path: %m");
439
440 break;
441
442 case 'x':
443 arg_ephemeral = true;
444 break;
445
446 case 'u':
447 r = free_and_strdup(&arg_user, optarg);
448 if (r < 0)
449 return log_oom();
450
451 arg_settings_mask |= SETTING_USER;
452 break;
453
454 case ARG_NETWORK_BRIDGE:
455 r = free_and_strdup(&arg_network_bridge, optarg);
456 if (r < 0)
457 return log_oom();
458
459 /* fall through */
460
461 case 'n':
462 arg_network_veth = true;
463 arg_private_network = true;
464 arg_settings_mask |= SETTING_NETWORK;
465 break;
466
467 case ARG_NETWORK_INTERFACE:
468 if (strv_extend(&arg_network_interfaces, optarg) < 0)
469 return log_oom();
470
471 arg_private_network = true;
472 arg_settings_mask |= SETTING_NETWORK;
473 break;
474
475 case ARG_NETWORK_MACVLAN:
476 if (strv_extend(&arg_network_macvlan, optarg) < 0)
477 return log_oom();
478
479 arg_private_network = true;
480 arg_settings_mask |= SETTING_NETWORK;
481 break;
482
483 case ARG_NETWORK_IPVLAN:
484 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
485 return log_oom();
486
487 /* fall through */
488
489 case ARG_PRIVATE_NETWORK:
490 arg_private_network = true;
491 arg_settings_mask |= SETTING_NETWORK;
492 break;
493
494 case 'b':
495 arg_boot = true;
496 arg_settings_mask |= SETTING_BOOT;
497 break;
498
499 case ARG_UUID:
500 r = sd_id128_from_string(optarg, &arg_uuid);
501 if (r < 0) {
502 log_error("Invalid UUID: %s", optarg);
503 return r;
504 }
505
506 arg_settings_mask |= SETTING_MACHINE_ID;
507 break;
508
509 case 'S':
510 arg_slice = optarg;
511 break;
512
513 case 'M':
514 if (isempty(optarg))
515 arg_machine = mfree(arg_machine);
516 else {
517 if (!machine_name_is_valid(optarg)) {
518 log_error("Invalid machine name: %s", optarg);
519 return -EINVAL;
520 }
521
522 r = free_and_strdup(&arg_machine, optarg);
523 if (r < 0)
524 return log_oom();
525
526 break;
527 }
528
529 case 'Z':
530 arg_selinux_context = optarg;
531 break;
532
533 case 'L':
534 arg_selinux_apifs_context = optarg;
535 break;
536
537 case ARG_READ_ONLY:
538 arg_read_only = true;
539 arg_settings_mask |= SETTING_READ_ONLY;
540 break;
541
542 case ARG_CAPABILITY:
543 case ARG_DROP_CAPABILITY: {
544 const char *state, *word;
545 size_t length;
546
547 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
548 _cleanup_free_ char *t;
549
550 t = strndup(word, length);
551 if (!t)
552 return log_oom();
553
554 if (streq(t, "all")) {
555 if (c == ARG_CAPABILITY)
556 plus = (uint64_t) -1;
557 else
558 minus = (uint64_t) -1;
559 } else {
560 int cap;
561
562 cap = capability_from_name(t);
563 if (cap < 0) {
564 log_error("Failed to parse capability %s.", t);
565 return -EINVAL;
566 }
567
568 if (c == ARG_CAPABILITY)
569 plus |= 1ULL << (uint64_t) cap;
570 else
571 minus |= 1ULL << (uint64_t) cap;
572 }
573 }
574
575 arg_settings_mask |= SETTING_CAPABILITY;
576 break;
577 }
578
579 case 'j':
580 arg_link_journal = LINK_GUEST;
581 arg_link_journal_try = true;
582 break;
583
584 case ARG_LINK_JOURNAL:
585 if (streq(optarg, "auto")) {
586 arg_link_journal = LINK_AUTO;
587 arg_link_journal_try = false;
588 } else if (streq(optarg, "no")) {
589 arg_link_journal = LINK_NO;
590 arg_link_journal_try = false;
591 } else if (streq(optarg, "guest")) {
592 arg_link_journal = LINK_GUEST;
593 arg_link_journal_try = false;
594 } else if (streq(optarg, "host")) {
595 arg_link_journal = LINK_HOST;
596 arg_link_journal_try = false;
597 } else if (streq(optarg, "try-guest")) {
598 arg_link_journal = LINK_GUEST;
599 arg_link_journal_try = true;
600 } else if (streq(optarg, "try-host")) {
601 arg_link_journal = LINK_HOST;
602 arg_link_journal_try = true;
603 } else {
604 log_error("Failed to parse link journal mode %s", optarg);
605 return -EINVAL;
606 }
607
608 break;
609
610 case ARG_BIND:
611 case ARG_BIND_RO:
612 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
613 if (r < 0)
614 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
615
616 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
617 break;
618
619 case ARG_TMPFS:
620 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
623
624 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
625 break;
626
627 case ARG_OVERLAY:
628 case ARG_OVERLAY_RO: {
629 _cleanup_free_ char *upper = NULL, *destination = NULL;
630 _cleanup_strv_free_ char **lower = NULL;
631 CustomMount *m;
632 unsigned n = 0;
633 char **i;
634
635 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
636 if (r == -ENOMEM)
637 return log_oom();
638 else if (r < 0) {
639 log_error("Invalid overlay specification: %s", optarg);
640 return r;
641 }
642
643 STRV_FOREACH(i, lower) {
644 if (!path_is_absolute(*i)) {
645 log_error("Overlay path %s is not absolute.", *i);
646 return -EINVAL;
647 }
648
649 n++;
650 }
651
652 if (n < 2) {
653 log_error("--overlay= needs at least two colon-separated directories specified.");
654 return -EINVAL;
655 }
656
657 if (n == 2) {
658 /* If two parameters are specified,
659 * the first one is the lower, the
660 * second one the upper directory. And
661 * we'll also define the destination
662 * mount point the same as the upper. */
663 upper = lower[1];
664 lower[1] = NULL;
665
666 destination = strdup(upper);
667 if (!destination)
668 return log_oom();
669
670 } else {
671 upper = lower[n - 2];
672 destination = lower[n - 1];
673 lower[n - 2] = NULL;
674 }
675
676 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
677 if (!m)
678 return log_oom();
679
680 m->destination = destination;
681 m->source = upper;
682 m->lower = lower;
683 m->read_only = c == ARG_OVERLAY_RO;
684
685 upper = destination = NULL;
686 lower = NULL;
687
688 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
689 break;
690 }
691
692 case ARG_SETENV: {
693 char **n;
694
695 if (!env_assignment_is_valid(optarg)) {
696 log_error("Environment variable assignment '%s' is not valid.", optarg);
697 return -EINVAL;
698 }
699
700 n = strv_env_set(arg_setenv, optarg);
701 if (!n)
702 return log_oom();
703
704 strv_free(arg_setenv);
705 arg_setenv = n;
706
707 arg_settings_mask |= SETTING_ENVIRONMENT;
708 break;
709 }
710
711 case 'q':
712 arg_quiet = true;
713 break;
714
715 case ARG_SHARE_SYSTEM:
716 arg_share_system = true;
717 break;
718
719 case ARG_REGISTER:
720 r = parse_boolean(optarg);
721 if (r < 0) {
722 log_error("Failed to parse --register= argument: %s", optarg);
723 return r;
724 }
725
726 arg_register = r;
727 break;
728
729 case ARG_KEEP_UNIT:
730 arg_keep_unit = true;
731 break;
732
733 case ARG_PERSONALITY:
734
735 arg_personality = personality_from_string(optarg);
736 if (arg_personality == PERSONALITY_INVALID) {
737 log_error("Unknown or unsupported personality '%s'.", optarg);
738 return -EINVAL;
739 }
740
741 arg_settings_mask |= SETTING_PERSONALITY;
742 break;
743
744 case ARG_VOLATILE:
745
746 if (!optarg)
747 arg_volatile_mode = VOLATILE_YES;
748 else {
749 VolatileMode m;
750
751 m = volatile_mode_from_string(optarg);
752 if (m < 0) {
753 log_error("Failed to parse --volatile= argument: %s", optarg);
754 return -EINVAL;
755 } else
756 arg_volatile_mode = m;
757 }
758
759 arg_settings_mask |= SETTING_VOLATILE_MODE;
760 break;
761
762 case 'p':
763 r = expose_port_parse(&arg_expose_ports, optarg);
764 if (r == -EEXIST)
765 return log_error_errno(r, "Duplicate port specification: %s", optarg);
766 if (r < 0)
767 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
768
769 arg_settings_mask |= SETTING_EXPOSE_PORTS;
770 break;
771
772 case ARG_PROPERTY:
773 if (strv_extend(&arg_property, optarg) < 0)
774 return log_oom();
775
776 break;
777
778 case ARG_PRIVATE_USERS:
779 if (optarg) {
780 _cleanup_free_ char *buffer = NULL;
781 const char *range, *shift;
782
783 range = strchr(optarg, ':');
784 if (range) {
785 buffer = strndup(optarg, range - optarg);
786 if (!buffer)
787 return log_oom();
788 shift = buffer;
789
790 range++;
791 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
792 log_error("Failed to parse UID range: %s", range);
793 return -EINVAL;
794 }
795 } else
796 shift = optarg;
797
798 if (parse_uid(shift, &arg_uid_shift) < 0) {
799 log_error("Failed to parse UID: %s", optarg);
800 return -EINVAL;
801 }
802 }
803
804 arg_userns = true;
805 break;
806
807 case ARG_KILL_SIGNAL:
808 arg_kill_signal = signal_from_string_try_harder(optarg);
809 if (arg_kill_signal < 0) {
810 log_error("Cannot parse signal: %s", optarg);
811 return -EINVAL;
812 }
813
814 arg_settings_mask |= SETTING_KILL_SIGNAL;
815 break;
816
817 case ARG_SETTINGS:
818
819 /* no → do not read files
820 * yes → read files, do not override cmdline, trust only subset
821 * override → read files, override cmdline, trust only subset
822 * trusted → read files, do not override cmdline, trust all
823 */
824
825 r = parse_boolean(optarg);
826 if (r < 0) {
827 if (streq(optarg, "trusted")) {
828 mask_all_settings = false;
829 mask_no_settings = false;
830 arg_settings_trusted = true;
831
832 } else if (streq(optarg, "override")) {
833 mask_all_settings = false;
834 mask_no_settings = true;
835 arg_settings_trusted = -1;
836 } else
837 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
838 } else if (r > 0) {
839 /* yes */
840 mask_all_settings = false;
841 mask_no_settings = false;
842 arg_settings_trusted = -1;
843 } else {
844 /* no */
845 mask_all_settings = true;
846 mask_no_settings = false;
847 arg_settings_trusted = false;
848 }
849
850 break;
851
852 case '?':
853 return -EINVAL;
854
855 default:
856 assert_not_reached("Unhandled option");
857 }
858
859 if (arg_share_system)
860 arg_register = false;
861
862 if (arg_boot && arg_share_system) {
863 log_error("--boot and --share-system may not be combined.");
864 return -EINVAL;
865 }
866
867 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
868 log_error("--keep-unit may not be used when invoked from a user session.");
869 return -EINVAL;
870 }
871
872 if (arg_directory && arg_image) {
873 log_error("--directory= and --image= may not be combined.");
874 return -EINVAL;
875 }
876
877 if (arg_template && arg_image) {
878 log_error("--template= and --image= may not be combined.");
879 return -EINVAL;
880 }
881
882 if (arg_template && !(arg_directory || arg_machine)) {
883 log_error("--template= needs --directory= or --machine=.");
884 return -EINVAL;
885 }
886
887 if (arg_ephemeral && arg_template) {
888 log_error("--ephemeral and --template= may not be combined.");
889 return -EINVAL;
890 }
891
892 if (arg_ephemeral && arg_image) {
893 log_error("--ephemeral and --image= may not be combined.");
894 return -EINVAL;
895 }
896
897 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
898 log_error("--ephemeral and --link-journal= may not be combined.");
899 return -EINVAL;
900 }
901
902 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
903 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
904
905 if (argc > optind) {
906 arg_parameters = strv_copy(argv + optind);
907 if (!arg_parameters)
908 return log_oom();
909
910 arg_settings_mask |= SETTING_BOOT;
911 }
912
913 /* Load all settings from .nspawn files */
914 if (mask_no_settings)
915 arg_settings_mask = 0;
916
917 /* Don't load any settings from .nspawn files */
918 if (mask_all_settings)
919 arg_settings_mask = _SETTINGS_MASK_ALL;
920
921 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
922
923 r = detect_unified_cgroup_hierarchy();
924 if (r < 0)
925 return r;
926
927 return 1;
928 }
929
930 static int verify_arguments(void) {
931
932 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
933 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
934 return -EINVAL;
935 }
936
937 if (arg_expose_ports && !arg_private_network) {
938 log_error("Cannot use --port= without private networking.");
939 return -EINVAL;
940 }
941
942 if (arg_boot && arg_kill_signal <= 0)
943 arg_kill_signal = SIGRTMIN+3;
944
945 return 0;
946 }
947
948 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
949 assert(p);
950
951 if (!arg_userns)
952 return 0;
953
954 if (uid == UID_INVALID && gid == GID_INVALID)
955 return 0;
956
957 if (uid != UID_INVALID) {
958 uid += arg_uid_shift;
959
960 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
961 return -EOVERFLOW;
962 }
963
964 if (gid != GID_INVALID) {
965 gid += (gid_t) arg_uid_shift;
966
967 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
968 return -EOVERFLOW;
969 }
970
971 if (lchown(p, uid, gid) < 0)
972 return -errno;
973
974 return 0;
975 }
976
/* Creates the directory path below root with the given mode and hands it to
 * the given container UID/GID via userns_lchown().
 *
 * A directory that already exists is left alone, including its ownership,
 * and counts as success.
 *
 * Returns 0 on success, -errno if mkdir() fails for any reason other than
 * EEXIST, or the error of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
989
990 static int setup_timezone(const char *dest) {
991 _cleanup_free_ char *p = NULL, *q = NULL;
992 const char *where, *check, *what;
993 char *z, *y;
994 int r;
995
996 assert(dest);
997
998 /* Fix the timezone, if possible */
999 r = readlink_malloc("/etc/localtime", &p);
1000 if (r < 0) {
1001 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1002 return 0;
1003 }
1004
1005 z = path_startswith(p, "../usr/share/zoneinfo/");
1006 if (!z)
1007 z = path_startswith(p, "/usr/share/zoneinfo/");
1008 if (!z) {
1009 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1010 return 0;
1011 }
1012
1013 where = prefix_roota(dest, "/etc/localtime");
1014 r = readlink_malloc(where, &q);
1015 if (r >= 0) {
1016 y = path_startswith(q, "../usr/share/zoneinfo/");
1017 if (!y)
1018 y = path_startswith(q, "/usr/share/zoneinfo/");
1019
1020 /* Already pointing to the right place? Then do nothing .. */
1021 if (y && streq(y, z))
1022 return 0;
1023 }
1024
1025 check = strjoina("/usr/share/zoneinfo/", z);
1026 check = prefix_root(dest, check);
1027 if (laccess(check, F_OK) < 0) {
1028 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1029 return 0;
1030 }
1031
1032 r = unlink(where);
1033 if (r < 0 && errno != ENOENT) {
1034 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1035 return 0;
1036 }
1037
1038 what = strjoina("../usr/share/zoneinfo/", z);
1039 if (symlink(what, where) < 0) {
1040 log_error_errno(errno, "Failed to correct timezone of container: %m");
1041 return 0;
1042 }
1043
1044 r = userns_lchown(where, 0, 0);
1045 if (r < 0)
1046 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1047
1048 return 0;
1049 }
1050
1051 static int setup_resolv_conf(const char *dest) {
1052 const char *where = NULL;
1053 int r;
1054
1055 assert(dest);
1056
1057 if (arg_private_network)
1058 return 0;
1059
1060 /* Fix resolv.conf, if possible */
1061 where = prefix_roota(dest, "/etc/resolv.conf");
1062
1063 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1064 if (r < 0) {
1065 /* If the file already exists as symlink, let's
1066 * suppress the warning, under the assumption that
1067 * resolved or something similar runs inside and the
1068 * symlink points there.
1069 *
1070 * If the disk image is read-only, there's also no
1071 * point in complaining.
1072 */
1073 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1074 "Failed to copy /etc/resolv.conf to %s: %m", where);
1075 return 0;
1076 }
1077
1078 r = userns_lchown(where, 0, 0);
1079 if (r < 0)
1080 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1081
1082 return 0;
1083 }
1084
1085 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1086 assert(s);
1087
1088 snprintf(s, 37,
1089 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1090 SD_ID128_FORMAT_VAL(id));
1091
1092 return s;
1093 }
1094
1095 static int setup_boot_id(const char *dest) {
1096 const char *from, *to;
1097 sd_id128_t rnd = {};
1098 char as_uuid[37];
1099 int r;
1100
1101 if (arg_share_system)
1102 return 0;
1103
1104 /* Generate a new randomized boot ID, so that each boot-up of
1105 * the container gets a new one */
1106
1107 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1108 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1109
1110 r = sd_id128_randomize(&rnd);
1111 if (r < 0)
1112 return log_error_errno(r, "Failed to generate random boot id: %m");
1113
1114 id128_format_as_uuid(rnd, as_uuid);
1115
1116 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1117 if (r < 0)
1118 return log_error_errno(r, "Failed to write boot id: %m");
1119
1120 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1121 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1122 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1123 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1124
1125 unlink(from);
1126 return r;
1127 }
1128
1129 static int copy_devnodes(const char *dest) {
1130
1131 static const char devnodes[] =
1132 "null\0"
1133 "zero\0"
1134 "full\0"
1135 "random\0"
1136 "urandom\0"
1137 "tty\0"
1138 "net/tun\0";
1139
1140 const char *d;
1141 int r = 0;
1142 _cleanup_umask_ mode_t u;
1143
1144 assert(dest);
1145
1146 u = umask(0000);
1147
1148 /* Create /dev/net, so that we can create /dev/net/tun in it */
1149 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1150 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1151
1152 NULSTR_FOREACH(d, devnodes) {
1153 _cleanup_free_ char *from = NULL, *to = NULL;
1154 struct stat st;
1155
1156 from = strappend("/dev/", d);
1157 to = prefix_root(dest, from);
1158
1159 if (stat(from, &st) < 0) {
1160
1161 if (errno != ENOENT)
1162 return log_error_errno(errno, "Failed to stat %s: %m", from);
1163
1164 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1165
1166 log_error("%s is not a char or block device, cannot copy.", from);
1167 return -EIO;
1168
1169 } else {
1170 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1171 if (errno != EPERM)
1172 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1173
1174 /* Some systems abusively restrict mknod but
1175 * allow bind mounts. */
1176 r = touch(to);
1177 if (r < 0)
1178 return log_error_errno(r, "touch (%s) failed: %m", to);
1179 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1180 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1181 }
1182
1183 r = userns_lchown(to, 0, 0);
1184 if (r < 0)
1185 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1186 }
1187 }
1188
1189 return r;
1190 }
1191
1192 static int setup_pts(const char *dest) {
1193 _cleanup_free_ char *options = NULL;
1194 const char *p;
1195
1196 #ifdef HAVE_SELINUX
1197 if (arg_selinux_apifs_context)
1198 (void) asprintf(&options,
1199 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1200 arg_uid_shift + TTY_GID,
1201 arg_selinux_apifs_context);
1202 else
1203 #endif
1204 (void) asprintf(&options,
1205 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1206 arg_uid_shift + TTY_GID);
1207
1208 if (!options)
1209 return log_oom();
1210
1211 /* Mount /dev/pts itself */
1212 p = prefix_roota(dest, "/dev/pts");
1213 if (mkdir(p, 0755) < 0)
1214 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1215 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1216 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1217 if (userns_lchown(p, 0, 0) < 0)
1218 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1219
1220 /* Create /dev/ptmx symlink */
1221 p = prefix_roota(dest, "/dev/ptmx");
1222 if (symlink("pts/ptmx", p) < 0)
1223 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1224 if (userns_lchown(p, 0, 0) < 0)
1225 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1226
1227 /* And fix /dev/pts/ptmx ownership */
1228 p = prefix_roota(dest, "/dev/pts/ptmx");
1229 if (userns_lchown(p, 0, 0) < 0)
1230 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1231
1232 return 0;
1233 }
1234
1235 static int setup_dev_console(const char *dest, const char *console) {
1236 _cleanup_umask_ mode_t u;
1237 const char *to;
1238 int r;
1239
1240 assert(dest);
1241 assert(console);
1242
1243 u = umask(0000);
1244
1245 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1246 if (r < 0)
1247 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1248
1249 /* We need to bind mount the right tty to /dev/console since
1250 * ptys can only exist on pts file systems. To have something
1251 * to bind mount things on we create a empty regular file. */
1252
1253 to = prefix_roota(dest, "/dev/console");
1254 r = touch(to);
1255 if (r < 0)
1256 return log_error_errno(r, "touch() for /dev/console failed: %m");
1257
1258 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1259 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1260
1261 return 0;
1262 }
1263
1264 static int setup_kmsg(const char *dest, int kmsg_socket) {
1265 const char *from, *to;
1266 _cleanup_umask_ mode_t u;
1267 int fd, k;
1268 union {
1269 struct cmsghdr cmsghdr;
1270 uint8_t buf[CMSG_SPACE(sizeof(int))];
1271 } control = {};
1272 struct msghdr mh = {
1273 .msg_control = &control,
1274 .msg_controllen = sizeof(control),
1275 };
1276 struct cmsghdr *cmsg;
1277
1278 assert(kmsg_socket >= 0);
1279
1280 u = umask(0000);
1281
1282 /* We create the kmsg FIFO as /run/kmsg, but immediately
1283 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1284 * on the reading side behave very similar to /proc/kmsg,
1285 * their writing side behaves differently from /dev/kmsg in
1286 * that writing blocks when nothing is reading. In order to
1287 * avoid any problems with containers deadlocking due to this
1288 * we simply make /dev/kmsg unavailable to the container. */
1289 from = prefix_roota(dest, "/run/kmsg");
1290 to = prefix_roota(dest, "/proc/kmsg");
1291
1292 if (mkfifo(from, 0600) < 0)
1293 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1294 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1295 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1296
1297 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1298 if (fd < 0)
1299 return log_error_errno(errno, "Failed to open fifo: %m");
1300
1301 cmsg = CMSG_FIRSTHDR(&mh);
1302 cmsg->cmsg_level = SOL_SOCKET;
1303 cmsg->cmsg_type = SCM_RIGHTS;
1304 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1305 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1306
1307 mh.msg_controllen = cmsg->cmsg_len;
1308
1309 /* Store away the fd in the socket, so that it stays open as
1310 * long as we run the child */
1311 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1312 safe_close(fd);
1313
1314 if (k < 0)
1315 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1316
1317 /* And now make the FIFO unavailable as /run/kmsg... */
1318 (void) unlink(from);
1319
1320 return 0;
1321 }
1322
1323 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1324 union in_addr_union *exposed = userdata;
1325
1326 assert(rtnl);
1327 assert(m);
1328 assert(exposed);
1329
1330 expose_port_execute(rtnl, arg_expose_ports, exposed);
1331 return 0;
1332 }
1333
1334 static int setup_hostname(void) {
1335
1336 if (arg_share_system)
1337 return 0;
1338
1339 if (sethostname_idempotent(arg_machine) < 0)
1340 return -errno;
1341
1342 return 0;
1343 }
1344
1345 static int setup_journal(const char *directory) {
1346 sd_id128_t machine_id, this_id;
1347 _cleanup_free_ char *b = NULL, *d = NULL;
1348 const char *etc_machine_id, *p, *q;
1349 char *id;
1350 int r;
1351
1352 /* Don't link journals in ephemeral mode */
1353 if (arg_ephemeral)
1354 return 0;
1355
1356 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1357
1358 r = read_one_line_file(etc_machine_id, &b);
1359 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1360 return 0;
1361 else if (r < 0)
1362 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1363
1364 id = strstrip(b);
1365 if (isempty(id) && arg_link_journal == LINK_AUTO)
1366 return 0;
1367
1368 /* Verify validity */
1369 r = sd_id128_from_string(id, &machine_id);
1370 if (r < 0)
1371 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1372
1373 r = sd_id128_get_machine(&this_id);
1374 if (r < 0)
1375 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1376
1377 if (sd_id128_equal(machine_id, this_id)) {
1378 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1379 "Host and machine ids are equal (%s): refusing to link journals", id);
1380 if (arg_link_journal == LINK_AUTO)
1381 return 0;
1382 return -EEXIST;
1383 }
1384
1385 if (arg_link_journal == LINK_NO)
1386 return 0;
1387
1388 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1389 if (r < 0)
1390 return log_error_errno(r, "Failed to create /var: %m");
1391
1392 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1393 if (r < 0)
1394 return log_error_errno(r, "Failed to create /var/log: %m");
1395
1396 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1397 if (r < 0)
1398 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1399
1400 p = strjoina("/var/log/journal/", id);
1401 q = prefix_roota(directory, p);
1402
1403 if (path_is_mount_point(p, 0) > 0) {
1404 if (arg_link_journal != LINK_AUTO) {
1405 log_error("%s: already a mount point, refusing to use for journal", p);
1406 return -EEXIST;
1407 }
1408
1409 return 0;
1410 }
1411
1412 if (path_is_mount_point(q, 0) > 0) {
1413 if (arg_link_journal != LINK_AUTO) {
1414 log_error("%s: already a mount point, refusing to use for journal", q);
1415 return -EEXIST;
1416 }
1417
1418 return 0;
1419 }
1420
1421 r = readlink_and_make_absolute(p, &d);
1422 if (r >= 0) {
1423 if ((arg_link_journal == LINK_GUEST ||
1424 arg_link_journal == LINK_AUTO) &&
1425 path_equal(d, q)) {
1426
1427 r = userns_mkdir(directory, p, 0755, 0, 0);
1428 if (r < 0)
1429 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1430 return 0;
1431 }
1432
1433 if (unlink(p) < 0)
1434 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1435 } else if (r == -EINVAL) {
1436
1437 if (arg_link_journal == LINK_GUEST &&
1438 rmdir(p) < 0) {
1439
1440 if (errno == ENOTDIR) {
1441 log_error("%s already exists and is neither a symlink nor a directory", p);
1442 return r;
1443 } else {
1444 log_error_errno(errno, "Failed to remove %s: %m", p);
1445 return -errno;
1446 }
1447 }
1448 } else if (r != -ENOENT) {
1449 log_error_errno(errno, "readlink(%s) failed: %m", p);
1450 return r;
1451 }
1452
1453 if (arg_link_journal == LINK_GUEST) {
1454
1455 if (symlink(q, p) < 0) {
1456 if (arg_link_journal_try) {
1457 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1458 return 0;
1459 } else {
1460 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1461 return -errno;
1462 }
1463 }
1464
1465 r = userns_mkdir(directory, p, 0755, 0, 0);
1466 if (r < 0)
1467 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1468 return 0;
1469 }
1470
1471 if (arg_link_journal == LINK_HOST) {
1472 /* don't create parents here -- if the host doesn't have
1473 * permanent journal set up, don't force it here */
1474 r = mkdir(p, 0755);
1475 if (r < 0) {
1476 if (arg_link_journal_try) {
1477 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1478 return 0;
1479 } else {
1480 log_error_errno(errno, "Failed to create %s: %m", p);
1481 return r;
1482 }
1483 }
1484
1485 } else if (access(p, F_OK) < 0)
1486 return 0;
1487
1488 if (dir_is_empty(q) == 0)
1489 log_warning("%s is not empty, proceeding anyway.", q);
1490
1491 r = userns_mkdir(directory, p, 0755, 0, 0);
1492 if (r < 0) {
1493 log_error_errno(errno, "Failed to create %s: %m", q);
1494 return r;
1495 }
1496
1497 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1498 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1499
1500 return 0;
1501 }
1502
1503 static int drop_capabilities(void) {
1504 return capability_bounding_set_drop(~arg_retain, false);
1505 }
1506
1507 static int reset_audit_loginuid(void) {
1508 _cleanup_free_ char *p = NULL;
1509 int r;
1510
1511 if (arg_share_system)
1512 return 0;
1513
1514 r = read_one_line_file("/proc/self/loginuid", &p);
1515 if (r == -ENOENT)
1516 return 0;
1517 if (r < 0)
1518 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1519
1520 /* Already reset? */
1521 if (streq(p, "4294967295"))
1522 return 0;
1523
1524 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1525 if (r < 0) {
1526 log_error_errno(r,
1527 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1528 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1529 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1530 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1531 "using systemd-nspawn. Sleeping for 5s... (%m)");
1532
1533 sleep(5);
1534 }
1535
1536 return 0;
1537 }
1538
/* Installs a seccomp filter for the container payload. System calls
 * belonging to capabilities that are not retained (see arg_retain)
 * fail with EPERM, and creation of NETLINK_AUDIT sockets fails with
 * EAFNOSUPPORT. If the filter cannot be loaded because of EINVAL this
 * is only logged at debug level and treated as success.
 *
 * Without HAVE_SECCOMP this is a no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* Leave the call alone if its capability is retained */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace
         * audit hookup fails when run inside of one. We don't care
         * and simply turn off creation of audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) will hence fail with
         * EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1633
1634 static int setup_propagate(const char *root) {
1635 const char *p, *q;
1636
1637 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1638 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1639 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1640 (void) mkdir_p(p, 0600);
1641
1642 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1643 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1644
1645 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1646 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1647
1648 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1649 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1650
1651 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1652 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1653 return log_error_errno(errno, "Failed to install propagation bind mount.");
1654
1655 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1656 return log_error_errno(errno, "Failed to make propagation mount read-only");
1657
1658 return 0;
1659 }
1660
1661 static int setup_image(char **device_path, int *loop_nr) {
1662 struct loop_info64 info = {
1663 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1664 };
1665 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1666 _cleanup_free_ char* loopdev = NULL;
1667 struct stat st;
1668 int r, nr;
1669
1670 assert(device_path);
1671 assert(loop_nr);
1672 assert(arg_image);
1673
1674 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1675 if (fd < 0)
1676 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1677
1678 if (fstat(fd, &st) < 0)
1679 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1680
1681 if (S_ISBLK(st.st_mode)) {
1682 char *p;
1683
1684 p = strdup(arg_image);
1685 if (!p)
1686 return log_oom();
1687
1688 *device_path = p;
1689
1690 *loop_nr = -1;
1691
1692 r = fd;
1693 fd = -1;
1694
1695 return r;
1696 }
1697
1698 if (!S_ISREG(st.st_mode)) {
1699 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1700 return -EINVAL;
1701 }
1702
1703 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1704 if (control < 0)
1705 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1706
1707 nr = ioctl(control, LOOP_CTL_GET_FREE);
1708 if (nr < 0)
1709 return log_error_errno(errno, "Failed to allocate loop device: %m");
1710
1711 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1712 return log_oom();
1713
1714 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1715 if (loop < 0)
1716 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1717
1718 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1719 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1720
1721 if (arg_read_only)
1722 info.lo_flags |= LO_FLAGS_READ_ONLY;
1723
1724 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1725 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1726
1727 *device_path = loopdev;
1728 loopdev = NULL;
1729
1730 *loop_nr = nr;
1731
1732 r = loop;
1733 loop = -1;
1734
1735 return r;
1736 }
1737
/* Explanatory text appended to the error messages about disk images
 * whose partition table cannot be used. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, "                           \
        "or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1744
/* Analyzes the partition table of the block device referred to by fd
 * and picks the partitions to mount as root, /home and /srv.
 *
 * On GPT disks partitions are selected by type UUID (native root,
 * secondary architecture root, /home, /srv, generic Linux data),
 * ignoring partitions flagged "no-auto". If several partitions of the
 * same type exist the one with the lowest partition number wins. On
 * MBR disks only partitions of type 0x83 with the bootable flag are
 * considered, and they are treated like generic Linux partitions. A
 * generic partition is only used as root if no dedicated root
 * partition exists and if it is the only generic one.
 *
 * Before the partitions are looked at, the function waits until the
 * kernel exposes as many partitions as blkid found, triggering a
 * re-read of the partition table if necessary.
 *
 * On success 0 is returned, *root_device (and *home_device and
 * *srv_device, if such partitions were found) are set to newly
 * allocated device node paths, the matching *_rw booleans tell
 * whether the partition may be mounted writable, and *secondary is
 * set if the secondary architecture root was picked. On failure a
 * negative errno-style error is returned.
 *
 * Without HAVE_BLKID this always fails with -EOPNOTSUPP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambiguous result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel's view of the partition table matches
         * what blkid found. On success the loop is left with e
         * referring to the matching enumeration. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The
                 * enumeration is expected to list one entry more than
                 * blkid reports partitions. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the partition with the lowest number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* MBR has no dedicated root partition type,
                         * hence track the partition as generic one, so
                         * that a second candidate is detected as
                         * ambiguity instead of silently replacing the
                         * first one. */
                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2117
/* Mounts the block device "what" on the directory "where", or on the
 * sub-directory "directory" of it if that is non-NULL. The file
 * system type is probed with blkid; LUKS volumes are refused. The
 * mount is made read-only if rw is false or arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style error otherwise.
 * Without HAVE_BLKID this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected. Real
                 * probing errors (-1) are handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2181
2182 static int mount_devices(
2183 const char *where,
2184 const char *root_device, bool root_device_rw,
2185 const char *home_device, bool home_device_rw,
2186 const char *srv_device, bool srv_device_rw) {
2187 int r;
2188
2189 assert(where);
2190
2191 if (root_device) {
2192 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2193 if (r < 0)
2194 return log_error_errno(r, "Failed to mount root directory: %m");
2195 }
2196
2197 if (home_device) {
2198 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2199 if (r < 0)
2200 return log_error_errno(r, "Failed to mount home directory: %m");
2201 }
2202
2203 if (srv_device) {
2204 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2205 if (r < 0)
2206 return log_error_errno(r, "Failed to mount server data directory: %m");
2207 }
2208
2209 return 0;
2210 }
2211
2212 static void loop_remove(int nr, int *image_fd) {
2213 _cleanup_close_ int control = -1;
2214 int r;
2215
2216 if (nr < 0)
2217 return;
2218
2219 if (image_fd && *image_fd >= 0) {
2220 r = ioctl(*image_fd, LOOP_CLR_FD);
2221 if (r < 0)
2222 log_debug_errno(errno, "Failed to close loop image: %m");
2223 *image_fd = safe_close(*image_fd);
2224 }
2225
2226 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2227 if (control < 0) {
2228 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2229 return;
2230 }
2231
2232 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2233 if (r < 0)
2234 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2235 }
2236
2237 /*
2238 * Return values:
2239 * < 0 : wait_for_terminate() failed to get the state of the
2240 * container, the container was terminated by a signal, or
2241 * failed for an unknown reason. No change is made to the
2242 * container argument.
2243 * > 0 : The program executed in the container terminated with an
2244 * error. The exit code of the program executed in the
2245 * container is returned. The container argument has been set
2246 * to CONTAINER_TERMINATED.
2247 * 0 : The container is being rebooted, has been shut down or exited
2248 * successfully. The container argument has been set to either
2249 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2250 *
2251 * That is, success is indicated by a return value of zero, and an
2252 * error is indicated by a non-zero value.
2253 */
2254 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2255 siginfo_t status;
2256 int r;
2257
2258 r = wait_for_terminate(pid, &status);
2259 if (r < 0)
2260 return log_warning_errno(r, "Failed to wait for container: %m");
2261
2262 switch (status.si_code) {
2263
2264 case CLD_EXITED:
2265 if (status.si_status == 0) {
2266 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2267
2268 } else
2269 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2270
2271 *container = CONTAINER_TERMINATED;
2272 return status.si_status;
2273
2274 case CLD_KILLED:
2275 if (status.si_status == SIGINT) {
2276
2277 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2278 *container = CONTAINER_TERMINATED;
2279 return 0;
2280
2281 } else if (status.si_status == SIGHUP) {
2282
2283 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2284 *container = CONTAINER_REBOOTED;
2285 return 0;
2286 }
2287
2288 /* CLD_KILLED fallthrough */
2289
2290 case CLD_DUMPED:
2291 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2292 return -EIO;
2293
2294 default:
2295 log_error("Container %s failed due to unknown reason.", arg_machine);
2296 return -EIO;
2297 }
2298
2299 return r;
2300 }
2301
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
}
2303
2304 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2305 pid_t pid;
2306
2307 pid = PTR_TO_UINT32(userdata);
2308 if (pid > 0) {
2309 if (kill(pid, arg_kill_signal) >= 0) {
2310 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2311 sd_event_source_set_userdata(s, NULL);
2312 return 0;
2313 }
2314 }
2315
2316 sd_event_exit(sd_event_source_get_event(s), 0);
2317 return 0;
2318 }
2319
2320 static int determine_names(void) {
2321 int r;
2322
2323 if (arg_template && !arg_directory && arg_machine) {
2324
2325 /* If --template= was specified then we should not
2326 * search for a machine, but instead create a new one
2327 * in /var/lib/machine. */
2328
2329 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2330 if (!arg_directory)
2331 return log_oom();
2332 }
2333
2334 if (!arg_image && !arg_directory) {
2335 if (arg_machine) {
2336 _cleanup_(image_unrefp) Image *i = NULL;
2337
2338 r = image_find(arg_machine, &i);
2339 if (r < 0)
2340 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2341 else if (r == 0) {
2342 log_error("No image for machine '%s': %m", arg_machine);
2343 return -ENOENT;
2344 }
2345
2346 if (i->type == IMAGE_RAW)
2347 r = set_sanitized_path(&arg_image, i->path);
2348 else
2349 r = set_sanitized_path(&arg_directory, i->path);
2350 if (r < 0)
2351 return log_error_errno(r, "Invalid image directory: %m");
2352
2353 if (!arg_ephemeral)
2354 arg_read_only = arg_read_only || i->read_only;
2355 } else
2356 arg_directory = get_current_dir_name();
2357
2358 if (!arg_directory && !arg_machine) {
2359 log_error("Failed to determine path, please use -D or -i.");
2360 return -EINVAL;
2361 }
2362 }
2363
2364 if (!arg_machine) {
2365 if (arg_directory && path_equal(arg_directory, "/"))
2366 arg_machine = gethostname_malloc();
2367 else
2368 arg_machine = strdup(basename(arg_image ?: arg_directory));
2369
2370 if (!arg_machine)
2371 return log_oom();
2372
2373 hostname_cleanup(arg_machine);
2374 if (!machine_name_is_valid(arg_machine)) {
2375 log_error("Failed to determine machine name automatically, please use -M.");
2376 return -EINVAL;
2377 }
2378
2379 if (arg_ephemeral) {
2380 char *b;
2381
2382 /* Add a random suffix when this is an
2383 * ephemeral machine, so that we can run many
2384 * instances at once without manually having
2385 * to specify -M each time. */
2386
2387 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2388 return log_oom();
2389
2390 free(arg_machine);
2391 arg_machine = b;
2392 }
2393 }
2394
2395 return 0;
2396 }
2397
2398 static int determine_uid_shift(const char *directory) {
2399 int r;
2400
2401 if (!arg_userns) {
2402 arg_uid_shift = 0;
2403 return 0;
2404 }
2405
2406 if (arg_uid_shift == UID_INVALID) {
2407 struct stat st;
2408
2409 r = stat(directory, &st);
2410 if (r < 0)
2411 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2412
2413 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2414
2415 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2416 log_error("UID and GID base of %s don't match.", directory);
2417 return -EINVAL;
2418 }
2419
2420 arg_uid_range = UINT32_C(0x10000);
2421 }
2422
2423 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2424 log_error("UID base too high for UID range.");
2425 return -EINVAL;
2426 }
2427
2428 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2429 return 0;
2430 }
2431
2432 static int inner_child(
2433 Barrier *barrier,
2434 const char *directory,
2435 bool secondary,
2436 int kmsg_socket,
2437 int rtnl_socket,
2438 FDSet *fds) {
2439
2440 _cleanup_free_ char *home = NULL;
2441 unsigned n_env = 2;
2442 const char *envp[] = {
2443 "PATH=" DEFAULT_PATH_SPLIT_USR,
2444 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2445 NULL, /* TERM */
2446 NULL, /* HOME */
2447 NULL, /* USER */
2448 NULL, /* LOGNAME */
2449 NULL, /* container_uuid */
2450 NULL, /* LISTEN_FDS */
2451 NULL, /* LISTEN_PID */
2452 NULL
2453 };
2454
2455 _cleanup_strv_free_ char **env_use = NULL;
2456 int r;
2457
2458 assert(barrier);
2459 assert(directory);
2460 assert(kmsg_socket >= 0);
2461
2462 cg_unified_flush();
2463
2464 if (arg_userns) {
2465 /* Tell the parent, that it now can write the UID map. */
2466 (void) barrier_place(barrier); /* #1 */
2467
2468 /* Wait until the parent wrote the UID map */
2469 if (!barrier_place_and_sync(barrier)) { /* #2 */
2470 log_error("Parent died too early");
2471 return -ESRCH;
2472 }
2473 }
2474
2475 r = mount_all(NULL, true, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2476 if (r < 0)
2477 return r;
2478
2479 /* Wait until we are cgroup-ified, so that we
2480 * can mount the right cgroup path writable */
2481 if (!barrier_place_and_sync(barrier)) { /* #3 */
2482 log_error("Parent died too early");
2483 return -ESRCH;
2484 }
2485
2486 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2487 if (r < 0)
2488 return r;
2489
2490 r = reset_uid_gid();
2491 if (r < 0)
2492 return log_error_errno(r, "Couldn't become new root: %m");
2493
2494 r = setup_boot_id(NULL);
2495 if (r < 0)
2496 return r;
2497
2498 r = setup_kmsg(NULL, kmsg_socket);
2499 if (r < 0)
2500 return r;
2501 kmsg_socket = safe_close(kmsg_socket);
2502
2503 umask(0022);
2504
2505 if (setsid() < 0)
2506 return log_error_errno(errno, "setsid() failed: %m");
2507
2508 if (arg_private_network)
2509 loopback_setup();
2510
2511 if (arg_expose_ports) {
2512 r = expose_port_send_rtnl(rtnl_socket);
2513 if (r < 0)
2514 return r;
2515 rtnl_socket = safe_close(rtnl_socket);
2516 }
2517
2518 if (drop_capabilities() < 0)
2519 return log_error_errno(errno, "drop_capabilities() failed: %m");
2520
2521 setup_hostname();
2522
2523 if (arg_personality != PERSONALITY_INVALID) {
2524 if (personality(arg_personality) < 0)
2525 return log_error_errno(errno, "personality() failed: %m");
2526 } else if (secondary) {
2527 if (personality(PER_LINUX32) < 0)
2528 return log_error_errno(errno, "personality() failed: %m");
2529 }
2530
2531 #ifdef HAVE_SELINUX
2532 if (arg_selinux_context)
2533 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2534 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2535 #endif
2536
2537 r = change_uid_gid(arg_user, &home);
2538 if (r < 0)
2539 return r;
2540
2541 envp[n_env] = strv_find_prefix(environ, "TERM=");
2542 if (envp[n_env])
2543 n_env ++;
2544
2545 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2546 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2547 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2548 return log_oom();
2549
2550 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2551 char as_uuid[37];
2552
2553 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2554 return log_oom();
2555 }
2556
2557 if (fdset_size(fds) > 0) {
2558 r = fdset_cloexec(fds, false);
2559 if (r < 0)
2560 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2561
2562 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2563 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2564 return log_oom();
2565 }
2566
2567 env_use = strv_env_merge(2, envp, arg_setenv);
2568 if (!env_use)
2569 return log_oom();
2570
2571 /* Let the parent know that we are ready and
2572 * wait until the parent is ready with the
2573 * setup, too... */
2574 if (!barrier_place_and_sync(barrier)) { /* #4 */
2575 log_error("Parent died too early");
2576 return -ESRCH;
2577 }
2578
2579 /* Now, explicitly close the log, so that we
2580 * then can close all remaining fds. Closing
2581 * the log explicitly first has the benefit
2582 * that the logging subsystem knows about it,
2583 * and is thus ready to be reopened should we
2584 * need it again. Note that the other fds
2585 * closed here are at least the locking and
2586 * barrier fds. */
2587 log_close();
2588 (void) fdset_close_others(fds);
2589
2590 if (arg_boot) {
2591 char **a;
2592 size_t m;
2593
2594 /* Automatically search for the init system */
2595
2596 m = 1 + strv_length(arg_parameters);
2597 a = newa(char*, m + 1);
2598 if (strv_isempty(arg_parameters))
2599 a[1] = NULL;
2600 else
2601 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2602
2603 a[0] = (char*) "/usr/lib/systemd/systemd";
2604 execve(a[0], a, env_use);
2605
2606 a[0] = (char*) "/lib/systemd/systemd";
2607 execve(a[0], a, env_use);
2608
2609 a[0] = (char*) "/sbin/init";
2610 execve(a[0], a, env_use);
2611 } else if (!strv_isempty(arg_parameters))
2612 execvpe(arg_parameters[0], arg_parameters, env_use);
2613 else {
2614 chdir(home ?: "/root");
2615 execle("/bin/bash", "-bash", NULL, env_use);
2616 execle("/bin/sh", "-sh", NULL, env_use);
2617 }
2618
2619 (void) log_open();
2620 return log_error_errno(errno, "execv() failed: %m");
2621 }
2622
2623 static int outer_child(
2624 Barrier *barrier,
2625 const char *directory,
2626 const char *console,
2627 const char *root_device, bool root_device_rw,
2628 const char *home_device, bool home_device_rw,
2629 const char *srv_device, bool srv_device_rw,
2630 bool interactive,
2631 bool secondary,
2632 int pid_socket,
2633 int kmsg_socket,
2634 int rtnl_socket,
2635 int uid_shift_socket,
2636 FDSet *fds) {
2637
2638 pid_t pid;
2639 ssize_t l;
2640 int r;
2641
2642 assert(barrier);
2643 assert(directory);
2644 assert(console);
2645 assert(pid_socket >= 0);
2646 assert(kmsg_socket >= 0);
2647
2648 cg_unified_flush();
2649
2650 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2651 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2652
2653 if (interactive) {
2654 close_nointr(STDIN_FILENO);
2655 close_nointr(STDOUT_FILENO);
2656 close_nointr(STDERR_FILENO);
2657
2658 r = open_terminal(console, O_RDWR);
2659 if (r != STDIN_FILENO) {
2660 if (r >= 0) {
2661 safe_close(r);
2662 r = -EINVAL;
2663 }
2664
2665 return log_error_errno(r, "Failed to open console: %m");
2666 }
2667
2668 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2669 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2670 return log_error_errno(errno, "Failed to duplicate console: %m");
2671 }
2672
2673 r = reset_audit_loginuid();
2674 if (r < 0)
2675 return r;
2676
2677 /* Mark everything as slave, so that we still
2678 * receive mounts from the real root, but don't
2679 * propagate mounts to the real root. */
2680 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2681 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2682
2683 r = mount_devices(directory,
2684 root_device, root_device_rw,
2685 home_device, home_device_rw,
2686 srv_device, srv_device_rw);
2687 if (r < 0)
2688 return r;
2689
2690 r = determine_uid_shift(directory);
2691 if (r < 0)
2692 return r;
2693
2694 if (arg_userns) {
2695 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2696 if (l < 0)
2697 return log_error_errno(errno, "Failed to send UID shift: %m");
2698 if (l != sizeof(arg_uid_shift)) {
2699 log_error("Short write while sending UID shift.");
2700 return -EIO;
2701 }
2702 }
2703
2704 /* Turn directory into bind mount */
2705 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2706 return log_error_errno(errno, "Failed to make bind mount: %m");
2707
2708 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2709 if (r < 0)
2710 return r;
2711
2712 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2713 if (r < 0)
2714 return r;
2715
2716 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2717 if (r < 0)
2718 return r;
2719
2720 if (arg_read_only) {
2721 r = bind_remount_recursive(directory, true);
2722 if (r < 0)
2723 return log_error_errno(r, "Failed to make tree read-only: %m");
2724 }
2725
2726 r = mount_all(directory, false, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2727 if (r < 0)
2728 return r;
2729
2730 r = copy_devnodes(directory);
2731 if (r < 0)
2732 return r;
2733
2734 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2735
2736 r = setup_pts(directory);
2737 if (r < 0)
2738 return r;
2739
2740 r = setup_propagate(directory);
2741 if (r < 0)
2742 return r;
2743
2744 r = setup_dev_console(directory, console);
2745 if (r < 0)
2746 return r;
2747
2748 r = setup_seccomp();
2749 if (r < 0)
2750 return r;
2751
2752 r = setup_timezone(directory);
2753 if (r < 0)
2754 return r;
2755
2756 r = setup_resolv_conf(directory);
2757 if (r < 0)
2758 return r;
2759
2760 r = setup_journal(directory);
2761 if (r < 0)
2762 return r;
2763
2764 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2765 if (r < 0)
2766 return r;
2767
2768 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2769 if (r < 0)
2770 return r;
2771
2772 r = mount_move_root(directory);
2773 if (r < 0)
2774 return log_error_errno(r, "Failed to move root directory: %m");
2775
2776 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2777 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2778 (arg_private_network ? CLONE_NEWNET : 0) |
2779 (arg_userns ? CLONE_NEWUSER : 0),
2780 NULL);
2781 if (pid < 0)
2782 return log_error_errno(errno, "Failed to fork inner child: %m");
2783 if (pid == 0) {
2784 pid_socket = safe_close(pid_socket);
2785 uid_shift_socket = safe_close(uid_shift_socket);
2786
2787 /* The inner child has all namespaces that are
2788 * requested, so that we all are owned by the user if
2789 * user namespaces are turned on. */
2790
2791 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2792 if (r < 0)
2793 _exit(EXIT_FAILURE);
2794
2795 _exit(EXIT_SUCCESS);
2796 }
2797
2798 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2799 if (l < 0)
2800 return log_error_errno(errno, "Failed to send PID: %m");
2801 if (l != sizeof(pid)) {
2802 log_error("Short write while sending PID.");
2803 return -EIO;
2804 }
2805
2806 pid_socket = safe_close(pid_socket);
2807
2808 return 0;
2809 }
2810
2811 static int setup_uid_map(pid_t pid) {
2812 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2813 int r;
2814
2815 assert(pid > 1);
2816
2817 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2818 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2819 r = write_string_file(uid_map, line, 0);
2820 if (r < 0)
2821 return log_error_errno(r, "Failed to write UID map: %m");
2822
2823 /* We always assign the same UID and GID ranges */
2824 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2825 r = write_string_file(uid_map, line, 0);
2826 if (r < 0)
2827 return log_error_errno(r, "Failed to write GID map: %m");
2828
2829 return 0;
2830 }
2831
2832 static int load_settings(void) {
2833 _cleanup_(settings_freep) Settings *settings = NULL;
2834 _cleanup_fclose_ FILE *f = NULL;
2835 _cleanup_free_ char *p = NULL;
2836 const char *fn, *i;
2837 int r;
2838
2839 /* If all settings are masked, there's no point in looking for
2840 * the settings file */
2841 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2842 return 0;
2843
2844 fn = strjoina(arg_machine, ".nspawn");
2845
2846 /* We first look in the admin's directories in /etc and /run */
2847 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2848 _cleanup_free_ char *j = NULL;
2849
2850 j = strjoin(i, "/", fn, NULL);
2851 if (!j)
2852 return log_oom();
2853
2854 f = fopen(j, "re");
2855 if (f) {
2856 p = j;
2857 j = NULL;
2858
2859 /* By default we trust configuration from /etc and /run */
2860 if (arg_settings_trusted < 0)
2861 arg_settings_trusted = true;
2862
2863 break;
2864 }
2865
2866 if (errno != ENOENT)
2867 return log_error_errno(errno, "Failed to open %s: %m", j);
2868 }
2869
2870 if (!f) {
2871 /* After that, let's look for a file next to the
2872 * actual image we shall boot. */
2873
2874 if (arg_image) {
2875 p = file_in_same_dir(arg_image, fn);
2876 if (!p)
2877 return log_oom();
2878 } else if (arg_directory) {
2879 p = file_in_same_dir(arg_directory, fn);
2880 if (!p)
2881 return log_oom();
2882 }
2883
2884 if (p) {
2885 f = fopen(p, "re");
2886 if (!f && errno != ENOENT)
2887 return log_error_errno(errno, "Failed to open %s: %m", p);
2888
2889 /* By default we do not trust configuration from /var/lib/machines */
2890 if (arg_settings_trusted < 0)
2891 arg_settings_trusted = false;
2892 }
2893 }
2894
2895 if (!f)
2896 return 0;
2897
2898 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2899
2900 r = settings_load(f, p, &settings);
2901 if (r < 0)
2902 return r;
2903
2904 /* Copy over bits from the settings, unless they have been
2905 * explicitly masked by command line switches. */
2906
2907 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2908 settings->boot >= 0) {
2909 arg_boot = settings->boot;
2910
2911 strv_free(arg_parameters);
2912 arg_parameters = settings->parameters;
2913 settings->parameters = NULL;
2914 }
2915
2916 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2917 settings->environment) {
2918 strv_free(arg_setenv);
2919 arg_setenv = settings->environment;
2920 settings->environment = NULL;
2921 }
2922
2923 if ((arg_settings_mask & SETTING_USER) == 0 &&
2924 settings->user) {
2925 free(arg_user);
2926 arg_user = settings->user;
2927 settings->user = NULL;
2928 }
2929
2930 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2931
2932 if (!arg_settings_trusted && settings->capability != 0)
2933 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2934 else
2935 arg_retain |= settings->capability;
2936
2937 arg_retain &= ~settings->drop_capability;
2938 }
2939
2940 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2941 settings->kill_signal > 0)
2942 arg_kill_signal = settings->kill_signal;
2943
2944 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2945 settings->personality != PERSONALITY_INVALID)
2946 arg_personality = settings->personality;
2947
2948 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2949 !sd_id128_is_null(settings->machine_id)) {
2950
2951 if (!arg_settings_trusted)
2952 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2953 else
2954 arg_uuid = settings->machine_id;
2955 }
2956
2957 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2958 settings->read_only >= 0)
2959 arg_read_only = settings->read_only;
2960
2961 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2962 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2963 arg_volatile_mode = settings->volatile_mode;
2964
2965 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2966 settings->n_custom_mounts > 0) {
2967
2968 if (!arg_settings_trusted)
2969 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2970 else {
2971 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2972 arg_custom_mounts = settings->custom_mounts;
2973 arg_n_custom_mounts = settings->n_custom_mounts;
2974
2975 settings->custom_mounts = NULL;
2976 settings->n_custom_mounts = 0;
2977 }
2978 }
2979
2980 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2981 (settings->private_network >= 0 ||
2982 settings->network_veth >= 0 ||
2983 settings->network_bridge ||
2984 settings->network_interfaces ||
2985 settings->network_macvlan ||
2986 settings->network_ipvlan)) {
2987
2988 if (!arg_settings_trusted)
2989 log_warning("Ignoring network settings, file %s is not trusted.", p);
2990 else {
2991 strv_free(arg_network_interfaces);
2992 arg_network_interfaces = settings->network_interfaces;
2993 settings->network_interfaces = NULL;
2994
2995 strv_free(arg_network_macvlan);
2996 arg_network_macvlan = settings->network_macvlan;
2997 settings->network_macvlan = NULL;
2998
2999 strv_free(arg_network_ipvlan);
3000 arg_network_ipvlan = settings->network_ipvlan;
3001 settings->network_ipvlan = NULL;
3002
3003 free(arg_network_bridge);
3004 arg_network_bridge = settings->network_bridge;
3005 settings->network_bridge = NULL;
3006
3007 arg_network_veth = settings->network_veth > 0 || settings->network_bridge;
3008
3009 arg_private_network = true; /* all these settings imply private networking */
3010 }
3011 }
3012
3013 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3014 settings->expose_ports) {
3015
3016 if (!arg_settings_trusted)
3017 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3018 else {
3019 expose_port_free_all(arg_expose_ports);
3020 arg_expose_ports = settings->expose_ports;
3021 settings->expose_ports = NULL;
3022 }
3023 }
3024
3025 return 0;
3026 }
3027
3028 int main(int argc, char *argv[]) {
3029
3030 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3031 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3032 _cleanup_close_ int master = -1, image_fd = -1;
3033 _cleanup_fdset_free_ FDSet *fds = NULL;
3034 int r, n_fd_passed, loop_nr = -1;
3035 char veth_name[IFNAMSIZ];
3036 bool secondary = false, remove_subvol = false;
3037 sigset_t mask_chld;
3038 pid_t pid = 0;
3039 int ret = EXIT_SUCCESS;
3040 union in_addr_union exposed = {};
3041 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3042 bool interactive;
3043
3044 log_parse_environment();
3045 log_open();
3046
3047 r = parse_argv(argc, argv);
3048 if (r <= 0)
3049 goto finish;
3050
3051 if (geteuid() != 0) {
3052 log_error("Need to be root.");
3053 r = -EPERM;
3054 goto finish;
3055 }
3056 r = determine_names();
3057 if (r < 0)
3058 goto finish;
3059
3060 r = load_settings();
3061 if (r < 0)
3062 goto finish;
3063
3064 r = verify_arguments();
3065 if (r < 0)
3066 goto finish;
3067
3068 n_fd_passed = sd_listen_fds(false);
3069 if (n_fd_passed > 0) {
3070 r = fdset_new_listen_fds(&fds, false);
3071 if (r < 0) {
3072 log_error_errno(r, "Failed to collect file descriptors: %m");
3073 goto finish;
3074 }
3075 }
3076
3077 if (arg_directory) {
3078 assert(!arg_image);
3079
3080 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3081 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3082 r = -EINVAL;
3083 goto finish;
3084 }
3085
3086 if (arg_ephemeral) {
3087 _cleanup_free_ char *np = NULL;
3088
3089 /* If the specified path is a mount point we
3090 * generate the new snapshot immediately
3091 * inside it under a random name. However if
3092 * the specified is not a mount point we
3093 * create the new snapshot in the parent
3094 * directory, just next to it. */
3095 r = path_is_mount_point(arg_directory, 0);
3096 if (r < 0) {
3097 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3098 goto finish;
3099 }
3100 if (r > 0)
3101 r = tempfn_random_child(arg_directory, "machine.", &np);
3102 else
3103 r = tempfn_random(arg_directory, "machine.", &np);
3104 if (r < 0) {
3105 log_error_errno(r, "Failed to generate name for snapshot: %m");
3106 goto finish;
3107 }
3108
3109 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3110 if (r < 0) {
3111 log_error_errno(r, "Failed to lock %s: %m", np);
3112 goto finish;
3113 }
3114
3115 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3116 if (r < 0) {
3117 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3118 goto finish;
3119 }
3120
3121 free(arg_directory);
3122 arg_directory = np;
3123 np = NULL;
3124
3125 remove_subvol = true;
3126
3127 } else {
3128 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3129 if (r == -EBUSY) {
3130 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3131 goto finish;
3132 }
3133 if (r < 0) {
3134 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3135 return r;
3136 }
3137
3138 if (arg_template) {
3139 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3140 if (r == -EEXIST) {
3141 if (!arg_quiet)
3142 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3143 } else if (r < 0) {
3144 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3145 goto finish;
3146 } else {
3147 if (!arg_quiet)
3148 log_info("Populated %s from template %s.", arg_directory, arg_template);
3149 }
3150 }
3151 }
3152
3153 if (arg_boot) {
3154 if (path_is_os_tree(arg_directory) <= 0) {
3155 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3156 r = -EINVAL;
3157 goto finish;
3158 }
3159 } else {
3160 const char *p;
3161
3162 p = strjoina(arg_directory,
3163 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3164 if (access(p, F_OK) < 0) {
3165 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3166 r = -EINVAL;
3167 goto finish;
3168 }
3169 }
3170
3171 } else {
3172 char template[] = "/tmp/nspawn-root-XXXXXX";
3173
3174 assert(arg_image);
3175 assert(!arg_template);
3176
3177 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3178 if (r == -EBUSY) {
3179 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3180 goto finish;
3181 }
3182 if (r < 0) {
3183 r = log_error_errno(r, "Failed to create image lock: %m");
3184 goto finish;
3185 }
3186
3187 if (!mkdtemp(template)) {
3188 log_error_errno(errno, "Failed to create temporary directory: %m");
3189 r = -errno;
3190 goto finish;
3191 }
3192
3193 arg_directory = strdup(template);
3194 if (!arg_directory) {
3195 r = log_oom();
3196 goto finish;
3197 }
3198
3199 image_fd = setup_image(&device_path, &loop_nr);
3200 if (image_fd < 0) {
3201 r = image_fd;
3202 goto finish;
3203 }
3204
3205 r = dissect_image(image_fd,
3206 &root_device, &root_device_rw,
3207 &home_device, &home_device_rw,
3208 &srv_device, &srv_device_rw,
3209 &secondary);
3210 if (r < 0)
3211 goto finish;
3212 }
3213
3214 r = custom_mounts_prepare();
3215 if (r < 0)
3216 goto finish;
3217
3218 interactive =
3219 isatty(STDIN_FILENO) > 0 &&
3220 isatty(STDOUT_FILENO) > 0;
3221
3222 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3223 if (master < 0) {
3224 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3225 goto finish;
3226 }
3227
3228 r = ptsname_malloc(master, &console);
3229 if (r < 0) {
3230 r = log_error_errno(r, "Failed to determine tty name: %m");
3231 goto finish;
3232 }
3233
3234 if (unlockpt(master) < 0) {
3235 r = log_error_errno(errno, "Failed to unlock tty: %m");
3236 goto finish;
3237 }
3238
3239 if (!arg_quiet)
3240 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3241 arg_machine, arg_image ?: arg_directory);
3242
3243 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3244
3245 assert_se(sigemptyset(&mask_chld) == 0);
3246 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3247
3248 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3249 r = log_error_errno(errno, "Failed to become subreaper: %m");
3250 goto finish;
3251 }
3252
3253 for (;;) {
3254 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3255 uid_shift_socket_pair[2] = { -1, -1 };
3256 ContainerStatus container_status;
3257 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3258 static const struct sigaction sa = {
3259 .sa_handler = nop_handler,
3260 .sa_flags = SA_NOCLDSTOP,
3261 };
3262 int ifi = 0;
3263 ssize_t l;
3264 _cleanup_event_unref_ sd_event *event = NULL;
3265 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3266 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3267 char last_char = 0;
3268
3269 r = barrier_create(&barrier);
3270 if (r < 0) {
3271 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3272 goto finish;
3273 }
3274
3275 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3276 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3277 goto finish;
3278 }
3279
3280 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3281 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3282 goto finish;
3283 }
3284
3285 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3286 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3287 goto finish;
3288 }
3289
3290 if (arg_userns)
3291 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3292 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3293 goto finish;
3294 }
3295
3296 /* Child can be killed before execv(), so handle SIGCHLD
3297 * in order to interrupt parent's blocking calls and
3298 * give it a chance to call wait() and terminate. */
3299 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3300 if (r < 0) {
3301 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3302 goto finish;
3303 }
3304
3305 r = sigaction(SIGCHLD, &sa, NULL);
3306 if (r < 0) {
3307 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3308 goto finish;
3309 }
3310
3311 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3312 if (pid < 0) {
3313 if (errno == EINVAL)
3314 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3315 else
3316 r = log_error_errno(errno, "clone() failed: %m");
3317
3318 goto finish;
3319 }
3320
3321 if (pid == 0) {
3322 /* The outer child only has a file system namespace. */
3323 barrier_set_role(&barrier, BARRIER_CHILD);
3324
3325 master = safe_close(master);
3326
3327 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3328 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3329 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3330 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3331
3332 (void) reset_all_signal_handlers();
3333 (void) reset_signal_mask();
3334
3335 r = outer_child(&barrier,
3336 arg_directory,
3337 console,
3338 root_device, root_device_rw,
3339 home_device, home_device_rw,
3340 srv_device, srv_device_rw,
3341 interactive,
3342 secondary,
3343 pid_socket_pair[1],
3344 kmsg_socket_pair[1],
3345 rtnl_socket_pair[1],
3346 uid_shift_socket_pair[1],
3347 fds);
3348 if (r < 0)
3349 _exit(EXIT_FAILURE);
3350
3351 _exit(EXIT_SUCCESS);
3352 }
3353
3354 barrier_set_role(&barrier, BARRIER_PARENT);
3355
3356 fdset_free(fds);
3357 fds = NULL;
3358
3359 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3360 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3361 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3362 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3363
3364 /* Wait for the outer child. */
3365 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3366 if (r < 0)
3367 goto finish;
3368 if (r != 0) {
3369 r = -EIO;
3370 goto finish;
3371 }
3372 pid = 0;
3373
3374 /* And now retrieve the PID of the inner child. */
3375 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3376 if (l < 0) {
3377 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3378 goto finish;
3379 }
3380 if (l != sizeof(pid)) {
3381 log_error("Short read while reading inner child PID.");
3382 r = EIO;
3383 goto finish;
3384 }
3385
3386 log_debug("Init process invoked as PID " PID_FMT, pid);
3387
3388 if (arg_userns) {
3389 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3390 log_error("Child died too early.");
3391 r = -ESRCH;
3392 goto finish;
3393 }
3394
3395 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3396 if (l < 0) {
3397 r = log_error_errno(errno, "Failed to read UID shift: %m");
3398 goto finish;
3399 }
3400 if (l != sizeof(arg_uid_shift)) {
3401 log_error("Short read while reading UID shift.");
3402 r = EIO;
3403 goto finish;
3404 }
3405
3406 r = setup_uid_map(pid);
3407 if (r < 0)
3408 goto finish;
3409
3410 (void) barrier_place(&barrier); /* #2 */
3411 }
3412
3413 if (arg_private_network) {
3414
3415 r = move_network_interfaces(pid, arg_network_interfaces);
3416 if (r < 0)
3417 goto finish;
3418
3419 if (arg_network_veth) {
3420 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3421 if (r < 0)
3422 goto finish;
3423 else if (r > 0)
3424 ifi = r;
3425
3426 if (arg_network_bridge) {
3427 r = setup_bridge(veth_name, arg_network_bridge);
3428 if (r < 0)
3429 goto finish;
3430 if (r > 0)
3431 ifi = r;
3432 }
3433 }
3434
3435 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3436 if (r < 0)
3437 goto finish;
3438
3439 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3440 if (r < 0)
3441 goto finish;
3442 }
3443
3444 if (arg_register) {
3445 r = register_machine(
3446 arg_machine,
3447 pid,
3448 arg_directory,
3449 arg_uuid,
3450 ifi,
3451 arg_slice,
3452 arg_custom_mounts, arg_n_custom_mounts,
3453 arg_kill_signal,
3454 arg_property,
3455 arg_keep_unit);
3456 if (r < 0)
3457 goto finish;
3458 }
3459
3460 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3461 if (r < 0)
3462 goto finish;
3463
3464 if (arg_keep_unit) {
3465 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3466 if (r < 0)
3467 goto finish;
3468 }
3469
3470 r = chown_cgroup(pid, arg_uid_shift);
3471 if (r < 0)
3472 goto finish;
3473
3474 /* Notify the child that the parent is ready with all
3475 * its setup (including cgroup-ification), and that
3476 * the child can now hand over control to the code to
3477 * run inside the container. */
3478 (void) barrier_place(&barrier); /* #3 */
3479
3480 /* Block SIGCHLD here, before notifying child.
3481 * process_pty() will handle it with the other signals. */
3482 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3483
3484 /* Reset signal to default */
3485 r = default_signals(SIGCHLD, -1);
3486 if (r < 0) {
3487 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3488 goto finish;
3489 }
3490
3491 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3492 if (!barrier_place_and_sync(&barrier)) { /* #5 */
3493 log_error("Client died too early.");
3494 r = -ESRCH;
3495 goto finish;
3496 }
3497
3498 sd_notifyf(false,
3499 "READY=1\n"
3500 "STATUS=Container running.\n"
3501 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3502
3503 r = sd_event_new(&event);
3504 if (r < 0) {
3505 log_error_errno(r, "Failed to get default event source: %m");
3506 goto finish;
3507 }
3508
3509 if (arg_kill_signal > 0) {
3510 /* Try to kill the init system on SIGINT or SIGTERM */
3511 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3512 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3513 } else {
3514 /* Immediately exit */
3515 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3516 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3517 }
3518
3519 /* simply exit on sigchld */
3520 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3521
3522 if (arg_expose_ports) {
3523 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3524 if (r < 0)
3525 goto finish;
3526
3527 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3528 }
3529
3530 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3531
3532 r = pty_forward_new(event, master, true, !interactive, &forward);
3533 if (r < 0) {
3534 log_error_errno(r, "Failed to create PTY forwarder: %m");
3535 goto finish;
3536 }
3537
3538 r = sd_event_loop(event);
3539 if (r < 0) {
3540 log_error_errno(r, "Failed to run event loop: %m");
3541 goto finish;
3542 }
3543
3544 pty_forward_get_last_char(forward, &last_char);
3545
3546 forward = pty_forward_free(forward);
3547
3548 if (!arg_quiet && last_char != '\n')
3549 putc('\n', stdout);
3550
3551 /* Kill if it is not dead yet anyway */
3552 if (arg_register && !arg_keep_unit)
3553 terminate_machine(pid);
3554
3555 /* Normally redundant, but better safe than sorry */
3556 kill(pid, SIGKILL);
3557
3558 r = wait_for_container(pid, &container_status);
3559 pid = 0;
3560
3561 if (r < 0)
3562 /* We failed to wait for the container, or the
3563 * container exited abnormally */
3564 goto finish;
3565 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3566 /* The container exited with a non-zero
3567 * status, or with zero status and no reboot
3568 * was requested. */
3569 ret = r;
3570 break;
3571 }
3572
3573 /* CONTAINER_REBOOTED, loop again */
3574
3575 if (arg_keep_unit) {
3576 /* Special handling if we are running as a
3577 * service: instead of simply restarting the
3578 * machine we want to restart the entire
3579 * service, so let's inform systemd about this
3580 * with the special exit code 133. The service
3581 * file uses RestartForceExitStatus=133 so
3582 * that this results in a full nspawn
3583 * restart. This is necessary since we might
3584 * have cgroup parameters set we want to have
3585 * flushed out. */
3586 ret = 133;
3587 r = 0;
3588 break;
3589 }
3590
3591 expose_port_flush(arg_expose_ports, &exposed);
3592 }
3593
3594 finish:
3595 sd_notify(false,
3596 "STOPPING=1\n"
3597 "STATUS=Terminating...");
3598
3599 if (pid > 0)
3600 kill(pid, SIGKILL);
3601
3602 /* Try to flush whatever is still queued in the pty */
3603 if (master >= 0)
3604 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
3605
3606 loop_remove(loop_nr, &image_fd);
3607
3608 if (remove_subvol && arg_directory) {
3609 int k;
3610
3611 k = btrfs_subvol_remove(arg_directory, true);
3612 if (k < 0)
3613 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3614 }
3615
3616 if (arg_machine) {
3617 const char *p;
3618
3619 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3620 (void) rm_rf(p, REMOVE_ROOT);
3621 }
3622
3623 expose_port_flush(arg_expose_ports, &exposed);
3624
3625 free(arg_directory);
3626 free(arg_template);
3627 free(arg_image);
3628 free(arg_machine);
3629 free(arg_user);
3630 strv_free(arg_setenv);
3631 free(arg_network_bridge);
3632 strv_free(arg_network_interfaces);
3633 strv_free(arg_network_macvlan);
3634 strv_free(arg_network_ipvlan);
3635 strv_free(arg_parameters);
3636 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3637 expose_port_free_all(arg_expose_ports);
3638
3639 return r < 0 ? EXIT_FAILURE : ret;
3640 }