]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #2603 from poettering/drop-compat-libs
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #ifdef HAVE_BLKID
21 #include <blkid/blkid.h>
22 #endif
23 #include <errno.h>
24 #include <getopt.h>
25 #include <linux/loop.h>
26 #include <sched.h>
27 #ifdef HAVE_SECCOMP
28 #include <seccomp.h>
29 #endif
30 #ifdef HAVE_SELINUX
31 #include <selinux/selinux.h>
32 #endif
33 #include <signal.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <sys/file.h>
38 #include <sys/mount.h>
39 #include <sys/personality.h>
40 #include <sys/prctl.h>
41 #include <sys/types.h>
42 #include <unistd.h>
43
44 #include "sd-daemon.h"
45 #include "sd-id128.h"
46
47 #include "alloc-util.h"
48 #include "barrier.h"
49 #include "base-filesystem.h"
50 #include "blkid-util.h"
51 #include "btrfs-util.h"
52 #include "cap-list.h"
53 #include "capability-util.h"
54 #include "cgroup-util.h"
55 #include "copy.h"
56 #include "dev-setup.h"
57 #include "env-util.h"
58 #include "fd-util.h"
59 #include "fdset.h"
60 #include "fileio.h"
61 #include "formats-util.h"
62 #include "fs-util.h"
63 #include "gpt.h"
64 #include "hostname-util.h"
65 #include "log.h"
66 #include "loopback-setup.h"
67 #include "machine-image.h"
68 #include "macro.h"
69 #include "missing.h"
70 #include "mkdir.h"
71 #include "mount-util.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-register.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "nspawn-stub-pid1.h"
81 #include "parse-util.h"
82 #include "path-util.h"
83 #include "process-util.h"
84 #include "ptyfwd.h"
85 #include "random-util.h"
86 #include "rm-rf.h"
87 #ifdef HAVE_SECCOMP
88 #include "seccomp-util.h"
89 #endif
90 #include "signal-util.h"
91 #include "socket-util.h"
92 #include "stat-util.h"
93 #include "stdio-util.h"
94 #include "string-util.h"
95 #include "strv.h"
96 #include "terminal-util.h"
97 #include "udev-util.h"
98 #include "umask-util.h"
99 #include "user-util.h"
100 #include "util.h"
101
/* Final state of a container run.
 * NOTE(review): the meaning of each value is inferred from the enumerator names only — confirm against
 * the code that produces and consumes them. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1,
} ContainerStatus;
106
/* Journal linking mode, selected with --link-journal= ("no", "auto", "host", "guest"). The "try-host" and
 * "try-guest" spellings select LINK_HOST/LINK_GUEST too and additionally set arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO = 0,
        LINK_AUTO = 1,
        LINK_HOST = 2,
        LINK_GUEST = 3,
} LinkJournal;
113
114 static char *arg_directory = NULL;
115 static char *arg_template = NULL;
116 static char *arg_chdir = NULL;
117 static char *arg_user = NULL;
118 static sd_id128_t arg_uuid = {};
119 static char *arg_machine = NULL;
120 static const char *arg_selinux_context = NULL;
121 static const char *arg_selinux_apifs_context = NULL;
122 static const char *arg_slice = NULL;
123 static bool arg_private_network = false;
124 static bool arg_read_only = false;
125 static StartMode arg_start_mode = START_PID1;
126 static bool arg_ephemeral = false;
127 static LinkJournal arg_link_journal = LINK_AUTO;
128 static bool arg_link_journal_try = false;
129 static uint64_t arg_retain =
130 (1ULL << CAP_CHOWN) |
131 (1ULL << CAP_DAC_OVERRIDE) |
132 (1ULL << CAP_DAC_READ_SEARCH) |
133 (1ULL << CAP_FOWNER) |
134 (1ULL << CAP_FSETID) |
135 (1ULL << CAP_IPC_OWNER) |
136 (1ULL << CAP_KILL) |
137 (1ULL << CAP_LEASE) |
138 (1ULL << CAP_LINUX_IMMUTABLE) |
139 (1ULL << CAP_NET_BIND_SERVICE) |
140 (1ULL << CAP_NET_BROADCAST) |
141 (1ULL << CAP_NET_RAW) |
142 (1ULL << CAP_SETGID) |
143 (1ULL << CAP_SETFCAP) |
144 (1ULL << CAP_SETPCAP) |
145 (1ULL << CAP_SETUID) |
146 (1ULL << CAP_SYS_ADMIN) |
147 (1ULL << CAP_SYS_CHROOT) |
148 (1ULL << CAP_SYS_NICE) |
149 (1ULL << CAP_SYS_PTRACE) |
150 (1ULL << CAP_SYS_TTY_CONFIG) |
151 (1ULL << CAP_SYS_RESOURCE) |
152 (1ULL << CAP_SYS_BOOT) |
153 (1ULL << CAP_AUDIT_WRITE) |
154 (1ULL << CAP_AUDIT_CONTROL) |
155 (1ULL << CAP_MKNOD);
156 static CustomMount *arg_custom_mounts = NULL;
157 static unsigned arg_n_custom_mounts = 0;
158 static char **arg_setenv = NULL;
159 static bool arg_quiet = false;
160 static bool arg_share_system = false;
161 static bool arg_register = true;
162 static bool arg_keep_unit = false;
163 static char **arg_network_interfaces = NULL;
164 static char **arg_network_macvlan = NULL;
165 static char **arg_network_ipvlan = NULL;
166 static bool arg_network_veth = false;
167 static char **arg_network_veth_extra = NULL;
168 static char *arg_network_bridge = NULL;
169 static unsigned long arg_personality = PERSONALITY_INVALID;
170 static char *arg_image = NULL;
171 static VolatileMode arg_volatile_mode = VOLATILE_NO;
172 static ExposePort *arg_expose_ports = NULL;
173 static char **arg_property = NULL;
174 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
175 static bool arg_userns = false;
176 static int arg_kill_signal = 0;
177 static bool arg_unified_cgroup_hierarchy = false;
178 static SettingsMask arg_settings_mask = 0;
179 static int arg_settings_trusted = -1;
180 static char **arg_parameters = NULL;
181 static const char *arg_container_service_name = "systemd-nspawn";
182
183 static void help(void) {
184 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
185 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
186 " -h --help Show this help\n"
187 " --version Print version string\n"
188 " -q --quiet Do not show status information\n"
189 " -D --directory=PATH Root directory for the container\n"
190 " --template=PATH Initialize root directory from template directory,\n"
191 " if missing\n"
192 " -x --ephemeral Run container with snapshot of root directory, and\n"
193 " remove it after exit\n"
194 " -i --image=PATH File system device or disk image for the container\n"
195 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
196 " -b --boot Boot up full system (i.e. invoke init)\n"
197 " --chdir=PATH Set working directory in the container\n"
198 " -u --user=USER Run the command under specified user or uid\n"
199 " -M --machine=NAME Set the machine name for the container\n"
200 " --uuid=UUID Set a specific machine UUID for the container\n"
201 " -S --slice=SLICE Place the container in the specified slice\n"
202 " --property=NAME=VALUE Set scope unit property\n"
203 " --private-users[=UIDBASE[:NUIDS]]\n"
204 " Run within user namespace\n"
205 " --private-network Disable network in container\n"
206 " --network-interface=INTERFACE\n"
207 " Assign an existing network interface to the\n"
208 " container\n"
209 " --network-macvlan=INTERFACE\n"
210 " Create a macvlan network interface based on an\n"
211 " existing network interface to the container\n"
212 " --network-ipvlan=INTERFACE\n"
213 " Create a ipvlan network interface based on an\n"
214 " existing network interface to the container\n"
215 " -n --network-veth Add a virtual Ethernet connection between host\n"
216 " and container\n"
217 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
218 " Add an additional virtual Ethernet link between\n"
219 " host and container\n"
220 " --network-bridge=INTERFACE\n"
221 " Add a virtual Ethernet connection between host\n"
222 " and container and add it to an existing bridge on\n"
223 " the host\n"
224 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225 " Expose a container IP port on the host\n"
226 " -Z --selinux-context=SECLABEL\n"
227 " Set the SELinux security context to be used by\n"
228 " processes in the container\n"
229 " -L --selinux-apifs-context=SECLABEL\n"
230 " Set the SELinux security context to be used by\n"
231 " API/tmpfs file systems in the container\n"
232 " --capability=CAP In addition to the default, retain specified\n"
233 " capability\n"
234 " --drop-capability=CAP Drop the specified capability from the default set\n"
235 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
236 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
237 " host, try-guest, try-host\n"
238 " -j Equivalent to --link-journal=try-guest\n"
239 " --read-only Mount the root directory read-only\n"
240 " --bind=PATH[:PATH[:OPTIONS]]\n"
241 " Bind mount a file or directory from the host into\n"
242 " the container\n"
243 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
244 " Similar, but creates a read-only bind mount\n"
245 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
246 " --overlay=PATH[:PATH...]:PATH\n"
247 " Create an overlay mount from the host to \n"
248 " the container\n"
249 " --overlay-ro=PATH[:PATH...]:PATH\n"
250 " Similar, but creates a read-only overlay mount\n"
251 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
252 " --share-system Share system namespaces with host\n"
253 " --register=BOOLEAN Register container as machine\n"
254 " --keep-unit Do not register a scope for the machine, reuse\n"
255 " the service unit nspawn is running in\n"
256 " --volatile[=MODE] Run the system in volatile mode\n"
257 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
258 , program_invocation_short_name);
259 }
260
261
262 static int custom_mounts_prepare(void) {
263 unsigned i;
264 int r;
265
266 /* Ensure the mounts are applied prefix first. */
267 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
268
269 /* Allocate working directories for the overlay file systems that need it */
270 for (i = 0; i < arg_n_custom_mounts; i++) {
271 CustomMount *m = &arg_custom_mounts[i];
272
273 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
274 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
275 return -EINVAL;
276 }
277
278 if (m->type != CUSTOM_MOUNT_OVERLAY)
279 continue;
280
281 if (m->work_dir)
282 continue;
283
284 if (m->read_only)
285 continue;
286
287 r = tempfn_random(m->source, NULL, &m->work_dir);
288 if (r < 0)
289 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
290 }
291
292 return 0;
293 }
294
295 static int detect_unified_cgroup_hierarchy(void) {
296 const char *e;
297 int r;
298
299 /* Allow the user to control whether the unified hierarchy is used */
300 e = getenv("UNIFIED_CGROUP_HIERARCHY");
301 if (e) {
302 r = parse_boolean(e);
303 if (r < 0)
304 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
305
306 arg_unified_cgroup_hierarchy = r;
307 return 0;
308 }
309
310 /* Otherwise inherit the default from the host system */
311 r = cg_unified();
312 if (r < 0)
313 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
314
315 arg_unified_cgroup_hierarchy = r;
316 return 0;
317 }
318
319 static int parse_argv(int argc, char *argv[]) {
320
321 enum {
322 ARG_VERSION = 0x100,
323 ARG_PRIVATE_NETWORK,
324 ARG_UUID,
325 ARG_READ_ONLY,
326 ARG_CAPABILITY,
327 ARG_DROP_CAPABILITY,
328 ARG_LINK_JOURNAL,
329 ARG_BIND,
330 ARG_BIND_RO,
331 ARG_TMPFS,
332 ARG_OVERLAY,
333 ARG_OVERLAY_RO,
334 ARG_SETENV,
335 ARG_SHARE_SYSTEM,
336 ARG_REGISTER,
337 ARG_KEEP_UNIT,
338 ARG_NETWORK_INTERFACE,
339 ARG_NETWORK_MACVLAN,
340 ARG_NETWORK_IPVLAN,
341 ARG_NETWORK_BRIDGE,
342 ARG_NETWORK_VETH_EXTRA,
343 ARG_PERSONALITY,
344 ARG_VOLATILE,
345 ARG_TEMPLATE,
346 ARG_PROPERTY,
347 ARG_PRIVATE_USERS,
348 ARG_KILL_SIGNAL,
349 ARG_SETTINGS,
350 ARG_CHDIR,
351 };
352
353 static const struct option options[] = {
354 { "help", no_argument, NULL, 'h' },
355 { "version", no_argument, NULL, ARG_VERSION },
356 { "directory", required_argument, NULL, 'D' },
357 { "template", required_argument, NULL, ARG_TEMPLATE },
358 { "ephemeral", no_argument, NULL, 'x' },
359 { "user", required_argument, NULL, 'u' },
360 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
361 { "as-pid2", no_argument, NULL, 'a' },
362 { "boot", no_argument, NULL, 'b' },
363 { "uuid", required_argument, NULL, ARG_UUID },
364 { "read-only", no_argument, NULL, ARG_READ_ONLY },
365 { "capability", required_argument, NULL, ARG_CAPABILITY },
366 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
367 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
368 { "bind", required_argument, NULL, ARG_BIND },
369 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
370 { "tmpfs", required_argument, NULL, ARG_TMPFS },
371 { "overlay", required_argument, NULL, ARG_OVERLAY },
372 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
373 { "machine", required_argument, NULL, 'M' },
374 { "slice", required_argument, NULL, 'S' },
375 { "setenv", required_argument, NULL, ARG_SETENV },
376 { "selinux-context", required_argument, NULL, 'Z' },
377 { "selinux-apifs-context", required_argument, NULL, 'L' },
378 { "quiet", no_argument, NULL, 'q' },
379 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
380 { "register", required_argument, NULL, ARG_REGISTER },
381 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
382 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
383 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
384 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
385 { "network-veth", no_argument, NULL, 'n' },
386 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
387 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
388 { "personality", required_argument, NULL, ARG_PERSONALITY },
389 { "image", required_argument, NULL, 'i' },
390 { "volatile", optional_argument, NULL, ARG_VOLATILE },
391 { "port", required_argument, NULL, 'p' },
392 { "property", required_argument, NULL, ARG_PROPERTY },
393 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
394 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
395 { "settings", required_argument, NULL, ARG_SETTINGS },
396 { "chdir", required_argument, NULL, ARG_CHDIR },
397 {}
398 };
399
400 int c, r;
401 const char *p, *e;
402 uint64_t plus = 0, minus = 0;
403 bool mask_all_settings = false, mask_no_settings = false;
404
405 assert(argc >= 0);
406 assert(argv);
407
408 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
409
410 switch (c) {
411
412 case 'h':
413 help();
414 return 0;
415
416 case ARG_VERSION:
417 return version();
418
419 case 'D':
420 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
421 if (r < 0)
422 return r;
423 break;
424
425 case ARG_TEMPLATE:
426 r = parse_path_argument_and_warn(optarg, false, &arg_template);
427 if (r < 0)
428 return r;
429 break;
430
431 case 'i':
432 r = parse_path_argument_and_warn(optarg, false, &arg_image);
433 if (r < 0)
434 return r;
435 break;
436
437 case 'x':
438 arg_ephemeral = true;
439 break;
440
441 case 'u':
442 r = free_and_strdup(&arg_user, optarg);
443 if (r < 0)
444 return log_oom();
445
446 arg_settings_mask |= SETTING_USER;
447 break;
448
449 case ARG_NETWORK_BRIDGE:
450 r = free_and_strdup(&arg_network_bridge, optarg);
451 if (r < 0)
452 return log_oom();
453
454 /* fall through */
455
456 case 'n':
457 arg_network_veth = true;
458 arg_private_network = true;
459 arg_settings_mask |= SETTING_NETWORK;
460 break;
461
462 case ARG_NETWORK_VETH_EXTRA:
463 r = veth_extra_parse(&arg_network_veth_extra, optarg);
464 if (r < 0)
465 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
466
467 arg_private_network = true;
468 arg_settings_mask |= SETTING_NETWORK;
469 break;
470
471 case ARG_NETWORK_INTERFACE:
472 if (strv_extend(&arg_network_interfaces, optarg) < 0)
473 return log_oom();
474
475 arg_private_network = true;
476 arg_settings_mask |= SETTING_NETWORK;
477 break;
478
479 case ARG_NETWORK_MACVLAN:
480 if (strv_extend(&arg_network_macvlan, optarg) < 0)
481 return log_oom();
482
483 arg_private_network = true;
484 arg_settings_mask |= SETTING_NETWORK;
485 break;
486
487 case ARG_NETWORK_IPVLAN:
488 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
489 return log_oom();
490
491 /* fall through */
492
493 case ARG_PRIVATE_NETWORK:
494 arg_private_network = true;
495 arg_settings_mask |= SETTING_NETWORK;
496 break;
497
498 case 'b':
499 if (arg_start_mode == START_PID2) {
500 log_error("--boot and --as-pid2 may not be combined.");
501 return -EINVAL;
502 }
503
504 arg_start_mode = START_BOOT;
505 arg_settings_mask |= SETTING_START_MODE;
506 break;
507
508 case 'a':
509 if (arg_start_mode == START_BOOT) {
510 log_error("--boot and --as-pid2 may not be combined.");
511 return -EINVAL;
512 }
513
514 arg_start_mode = START_PID2;
515 arg_settings_mask |= SETTING_START_MODE;
516 break;
517
518 case ARG_UUID:
519 r = sd_id128_from_string(optarg, &arg_uuid);
520 if (r < 0) {
521 log_error("Invalid UUID: %s", optarg);
522 return r;
523 }
524
525 arg_settings_mask |= SETTING_MACHINE_ID;
526 break;
527
528 case 'S':
529 arg_slice = optarg;
530 break;
531
532 case 'M':
533 if (isempty(optarg))
534 arg_machine = mfree(arg_machine);
535 else {
536 if (!machine_name_is_valid(optarg)) {
537 log_error("Invalid machine name: %s", optarg);
538 return -EINVAL;
539 }
540
541 r = free_and_strdup(&arg_machine, optarg);
542 if (r < 0)
543 return log_oom();
544
545 break;
546 }
547
548 case 'Z':
549 arg_selinux_context = optarg;
550 break;
551
552 case 'L':
553 arg_selinux_apifs_context = optarg;
554 break;
555
556 case ARG_READ_ONLY:
557 arg_read_only = true;
558 arg_settings_mask |= SETTING_READ_ONLY;
559 break;
560
561 case ARG_CAPABILITY:
562 case ARG_DROP_CAPABILITY: {
563 p = optarg;
564 for(;;) {
565 _cleanup_free_ char *t = NULL;
566
567 r = extract_first_word(&p, &t, ",", 0);
568 if (r < 0)
569 return log_error_errno(r, "Failed to parse capability %s.", t);
570
571 if (r == 0)
572 break;
573
574 if (streq(t, "all")) {
575 if (c == ARG_CAPABILITY)
576 plus = (uint64_t) -1;
577 else
578 minus = (uint64_t) -1;
579 } else {
580 int cap;
581
582 cap = capability_from_name(t);
583 if (cap < 0) {
584 log_error("Failed to parse capability %s.", t);
585 return -EINVAL;
586 }
587
588 if (c == ARG_CAPABILITY)
589 plus |= 1ULL << (uint64_t) cap;
590 else
591 minus |= 1ULL << (uint64_t) cap;
592 }
593 }
594
595 arg_settings_mask |= SETTING_CAPABILITY;
596 break;
597 }
598
599 case 'j':
600 arg_link_journal = LINK_GUEST;
601 arg_link_journal_try = true;
602 break;
603
604 case ARG_LINK_JOURNAL:
605 if (streq(optarg, "auto")) {
606 arg_link_journal = LINK_AUTO;
607 arg_link_journal_try = false;
608 } else if (streq(optarg, "no")) {
609 arg_link_journal = LINK_NO;
610 arg_link_journal_try = false;
611 } else if (streq(optarg, "guest")) {
612 arg_link_journal = LINK_GUEST;
613 arg_link_journal_try = false;
614 } else if (streq(optarg, "host")) {
615 arg_link_journal = LINK_HOST;
616 arg_link_journal_try = false;
617 } else if (streq(optarg, "try-guest")) {
618 arg_link_journal = LINK_GUEST;
619 arg_link_journal_try = true;
620 } else if (streq(optarg, "try-host")) {
621 arg_link_journal = LINK_HOST;
622 arg_link_journal_try = true;
623 } else {
624 log_error("Failed to parse link journal mode %s", optarg);
625 return -EINVAL;
626 }
627
628 break;
629
630 case ARG_BIND:
631 case ARG_BIND_RO:
632 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
633 if (r < 0)
634 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
635
636 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
637 break;
638
639 case ARG_TMPFS:
640 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
641 if (r < 0)
642 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
643
644 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
645 break;
646
647 case ARG_OVERLAY:
648 case ARG_OVERLAY_RO: {
649 _cleanup_free_ char *upper = NULL, *destination = NULL;
650 _cleanup_strv_free_ char **lower = NULL;
651 CustomMount *m;
652 unsigned n = 0;
653 char **i;
654
655 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
656 if (r == -ENOMEM)
657 return log_oom();
658 else if (r < 0) {
659 log_error("Invalid overlay specification: %s", optarg);
660 return r;
661 }
662
663 STRV_FOREACH(i, lower) {
664 if (!path_is_absolute(*i)) {
665 log_error("Overlay path %s is not absolute.", *i);
666 return -EINVAL;
667 }
668
669 n++;
670 }
671
672 if (n < 2) {
673 log_error("--overlay= needs at least two colon-separated directories specified.");
674 return -EINVAL;
675 }
676
677 if (n == 2) {
678 /* If two parameters are specified,
679 * the first one is the lower, the
680 * second one the upper directory. And
681 * we'll also define the destination
682 * mount point the same as the upper. */
683 upper = lower[1];
684 lower[1] = NULL;
685
686 destination = strdup(upper);
687 if (!destination)
688 return log_oom();
689
690 } else {
691 upper = lower[n - 2];
692 destination = lower[n - 1];
693 lower[n - 2] = NULL;
694 }
695
696 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
697 if (!m)
698 return log_oom();
699
700 m->destination = destination;
701 m->source = upper;
702 m->lower = lower;
703 m->read_only = c == ARG_OVERLAY_RO;
704
705 upper = destination = NULL;
706 lower = NULL;
707
708 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
709 break;
710 }
711
712 case ARG_SETENV: {
713 char **n;
714
715 if (!env_assignment_is_valid(optarg)) {
716 log_error("Environment variable assignment '%s' is not valid.", optarg);
717 return -EINVAL;
718 }
719
720 n = strv_env_set(arg_setenv, optarg);
721 if (!n)
722 return log_oom();
723
724 strv_free(arg_setenv);
725 arg_setenv = n;
726
727 arg_settings_mask |= SETTING_ENVIRONMENT;
728 break;
729 }
730
731 case 'q':
732 arg_quiet = true;
733 break;
734
735 case ARG_SHARE_SYSTEM:
736 arg_share_system = true;
737 break;
738
739 case ARG_REGISTER:
740 r = parse_boolean(optarg);
741 if (r < 0) {
742 log_error("Failed to parse --register= argument: %s", optarg);
743 return r;
744 }
745
746 arg_register = r;
747 break;
748
749 case ARG_KEEP_UNIT:
750 arg_keep_unit = true;
751 break;
752
753 case ARG_PERSONALITY:
754
755 arg_personality = personality_from_string(optarg);
756 if (arg_personality == PERSONALITY_INVALID) {
757 log_error("Unknown or unsupported personality '%s'.", optarg);
758 return -EINVAL;
759 }
760
761 arg_settings_mask |= SETTING_PERSONALITY;
762 break;
763
764 case ARG_VOLATILE:
765
766 if (!optarg)
767 arg_volatile_mode = VOLATILE_YES;
768 else {
769 VolatileMode m;
770
771 m = volatile_mode_from_string(optarg);
772 if (m < 0) {
773 log_error("Failed to parse --volatile= argument: %s", optarg);
774 return -EINVAL;
775 } else
776 arg_volatile_mode = m;
777 }
778
779 arg_settings_mask |= SETTING_VOLATILE_MODE;
780 break;
781
782 case 'p':
783 r = expose_port_parse(&arg_expose_ports, optarg);
784 if (r == -EEXIST)
785 return log_error_errno(r, "Duplicate port specification: %s", optarg);
786 if (r < 0)
787 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
788
789 arg_settings_mask |= SETTING_EXPOSE_PORTS;
790 break;
791
792 case ARG_PROPERTY:
793 if (strv_extend(&arg_property, optarg) < 0)
794 return log_oom();
795
796 break;
797
798 case ARG_PRIVATE_USERS:
799 if (optarg) {
800 _cleanup_free_ char *buffer = NULL;
801 const char *range, *shift;
802
803 range = strchr(optarg, ':');
804 if (range) {
805 buffer = strndup(optarg, range - optarg);
806 if (!buffer)
807 return log_oom();
808 shift = buffer;
809
810 range++;
811 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
812 log_error("Failed to parse UID range: %s", range);
813 return -EINVAL;
814 }
815 } else
816 shift = optarg;
817
818 if (parse_uid(shift, &arg_uid_shift) < 0) {
819 log_error("Failed to parse UID: %s", optarg);
820 return -EINVAL;
821 }
822 }
823
824 arg_userns = true;
825 break;
826
827 case ARG_KILL_SIGNAL:
828 arg_kill_signal = signal_from_string_try_harder(optarg);
829 if (arg_kill_signal < 0) {
830 log_error("Cannot parse signal: %s", optarg);
831 return -EINVAL;
832 }
833
834 arg_settings_mask |= SETTING_KILL_SIGNAL;
835 break;
836
837 case ARG_SETTINGS:
838
839 /* no → do not read files
840 * yes → read files, do not override cmdline, trust only subset
841 * override → read files, override cmdline, trust only subset
842 * trusted → read files, do not override cmdline, trust all
843 */
844
845 r = parse_boolean(optarg);
846 if (r < 0) {
847 if (streq(optarg, "trusted")) {
848 mask_all_settings = false;
849 mask_no_settings = false;
850 arg_settings_trusted = true;
851
852 } else if (streq(optarg, "override")) {
853 mask_all_settings = false;
854 mask_no_settings = true;
855 arg_settings_trusted = -1;
856 } else
857 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
858 } else if (r > 0) {
859 /* yes */
860 mask_all_settings = false;
861 mask_no_settings = false;
862 arg_settings_trusted = -1;
863 } else {
864 /* no */
865 mask_all_settings = true;
866 mask_no_settings = false;
867 arg_settings_trusted = false;
868 }
869
870 break;
871
872 case ARG_CHDIR:
873 if (!path_is_absolute(optarg)) {
874 log_error("Working directory %s is not an absolute path.", optarg);
875 return -EINVAL;
876 }
877
878 r = free_and_strdup(&arg_chdir, optarg);
879 if (r < 0)
880 return log_oom();
881
882 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
883 break;
884
885 case '?':
886 return -EINVAL;
887
888 default:
889 assert_not_reached("Unhandled option");
890 }
891
892 if (arg_share_system)
893 arg_register = false;
894
895 if (arg_start_mode != START_PID1 && arg_share_system) {
896 log_error("--boot and --share-system may not be combined.");
897 return -EINVAL;
898 }
899
900 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
901 log_error("--keep-unit may not be used when invoked from a user session.");
902 return -EINVAL;
903 }
904
905 if (arg_directory && arg_image) {
906 log_error("--directory= and --image= may not be combined.");
907 return -EINVAL;
908 }
909
910 if (arg_template && arg_image) {
911 log_error("--template= and --image= may not be combined.");
912 return -EINVAL;
913 }
914
915 if (arg_template && !(arg_directory || arg_machine)) {
916 log_error("--template= needs --directory= or --machine=.");
917 return -EINVAL;
918 }
919
920 if (arg_ephemeral && arg_template) {
921 log_error("--ephemeral and --template= may not be combined.");
922 return -EINVAL;
923 }
924
925 if (arg_ephemeral && arg_image) {
926 log_error("--ephemeral and --image= may not be combined.");
927 return -EINVAL;
928 }
929
930 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
931 log_error("--ephemeral and --link-journal= may not be combined.");
932 return -EINVAL;
933 }
934
935 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
936 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
937
938 if (argc > optind) {
939 arg_parameters = strv_copy(argv + optind);
940 if (!arg_parameters)
941 return log_oom();
942
943 arg_settings_mask |= SETTING_START_MODE;
944 }
945
946 /* Load all settings from .nspawn files */
947 if (mask_no_settings)
948 arg_settings_mask = 0;
949
950 /* Don't load any settings from .nspawn files */
951 if (mask_all_settings)
952 arg_settings_mask = _SETTINGS_MASK_ALL;
953
954 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
955
956 r = detect_unified_cgroup_hierarchy();
957 if (r < 0)
958 return r;
959
960 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
961 if (e)
962 arg_container_service_name = e;
963
964 return 1;
965 }
966
967 static int verify_arguments(void) {
968
969 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
970 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
971 return -EINVAL;
972 }
973
974 if (arg_expose_ports && !arg_private_network) {
975 log_error("Cannot use --port= without private networking.");
976 return -EINVAL;
977 }
978
979 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
980 arg_kill_signal = SIGRTMIN+3;
981
982 return 0;
983 }
984
985 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
986 assert(p);
987
988 if (!arg_userns)
989 return 0;
990
991 if (uid == UID_INVALID && gid == GID_INVALID)
992 return 0;
993
994 if (uid != UID_INVALID) {
995 uid += arg_uid_shift;
996
997 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
998 return -EOVERFLOW;
999 }
1000
1001 if (gid != GID_INVALID) {
1002 gid += (gid_t) arg_uid_shift;
1003
1004 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1005 return -EOVERFLOW;
1006 }
1007
1008 if (lchown(p, uid, gid) < 0)
1009 return -errno;
1010
1011 return 0;
1012 }
1013
/* Creates the directory path below root with the given mode and passes it to userns_lchown().
 *
 * If the directory already exists, 0 is returned and its ownership is left untouched.
 *
 * Returns 0 on success, -errno from mkdir(), or the error of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1026
1027 static int setup_timezone(const char *dest) {
1028 _cleanup_free_ char *p = NULL, *q = NULL;
1029 const char *where, *check, *what;
1030 char *z, *y;
1031 int r;
1032
1033 assert(dest);
1034
1035 /* Fix the timezone, if possible */
1036 r = readlink_malloc("/etc/localtime", &p);
1037 if (r < 0) {
1038 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1039 return 0;
1040 }
1041
1042 z = path_startswith(p, "../usr/share/zoneinfo/");
1043 if (!z)
1044 z = path_startswith(p, "/usr/share/zoneinfo/");
1045 if (!z) {
1046 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1047 return 0;
1048 }
1049
1050 where = prefix_roota(dest, "/etc/localtime");
1051 r = readlink_malloc(where, &q);
1052 if (r >= 0) {
1053 y = path_startswith(q, "../usr/share/zoneinfo/");
1054 if (!y)
1055 y = path_startswith(q, "/usr/share/zoneinfo/");
1056
1057 /* Already pointing to the right place? Then do nothing .. */
1058 if (y && streq(y, z))
1059 return 0;
1060 }
1061
1062 check = strjoina("/usr/share/zoneinfo/", z);
1063 check = prefix_roota(dest, check);
1064 if (laccess(check, F_OK) < 0) {
1065 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1066 return 0;
1067 }
1068
1069 r = unlink(where);
1070 if (r < 0 && errno != ENOENT) {
1071 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1072 return 0;
1073 }
1074
1075 what = strjoina("../usr/share/zoneinfo/", z);
1076 if (symlink(what, where) < 0) {
1077 log_error_errno(errno, "Failed to correct timezone of container: %m");
1078 return 0;
1079 }
1080
1081 r = userns_lchown(where, 0, 0);
1082 if (r < 0)
1083 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1084
1085 return 0;
1086 }
1087
1088 static int setup_resolv_conf(const char *dest) {
1089 const char *where = NULL;
1090 int r;
1091
1092 assert(dest);
1093
1094 if (arg_private_network)
1095 return 0;
1096
1097 /* Fix resolv.conf, if possible */
1098 where = prefix_roota(dest, "/etc/resolv.conf");
1099
1100 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1101 if (r < 0) {
1102 /* If the file already exists as symlink, let's
1103 * suppress the warning, under the assumption that
1104 * resolved or something similar runs inside and the
1105 * symlink points there.
1106 *
1107 * If the disk image is read-only, there's also no
1108 * point in complaining.
1109 */
1110 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1111 "Failed to copy /etc/resolv.conf to %s: %m", where);
1112 return 0;
1113 }
1114
1115 r = userns_lchown(where, 0, 0);
1116 if (r < 0)
1117 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1118
1119 return 0;
1120 }
1121
1122 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1123 assert(s);
1124
1125 snprintf(s, 37,
1126 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1127 SD_ID128_FORMAT_VAL(id));
1128
1129 return s;
1130 }
1131
1132 static int setup_boot_id(const char *dest) {
1133 const char *from, *to;
1134 sd_id128_t rnd = {};
1135 char as_uuid[37];
1136 int r;
1137
1138 if (arg_share_system)
1139 return 0;
1140
1141 /* Generate a new randomized boot ID, so that each boot-up of
1142 * the container gets a new one */
1143
1144 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1145 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1146
1147 r = sd_id128_randomize(&rnd);
1148 if (r < 0)
1149 return log_error_errno(r, "Failed to generate random boot id: %m");
1150
1151 id128_format_as_uuid(rnd, as_uuid);
1152
1153 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1154 if (r < 0)
1155 return log_error_errno(r, "Failed to write boot id: %m");
1156
1157 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1158 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1159 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1160 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1161
1162 unlink(from);
1163 return r;
1164 }
1165
1166 static int copy_devnodes(const char *dest) {
1167
1168 static const char devnodes[] =
1169 "null\0"
1170 "zero\0"
1171 "full\0"
1172 "random\0"
1173 "urandom\0"
1174 "tty\0"
1175 "net/tun\0";
1176
1177 const char *d;
1178 int r = 0;
1179 _cleanup_umask_ mode_t u;
1180
1181 assert(dest);
1182
1183 u = umask(0000);
1184
1185 /* Create /dev/net, so that we can create /dev/net/tun in it */
1186 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1187 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1188
1189 NULSTR_FOREACH(d, devnodes) {
1190 _cleanup_free_ char *from = NULL, *to = NULL;
1191 struct stat st;
1192
1193 from = strappend("/dev/", d);
1194 to = prefix_root(dest, from);
1195
1196 if (stat(from, &st) < 0) {
1197
1198 if (errno != ENOENT)
1199 return log_error_errno(errno, "Failed to stat %s: %m", from);
1200
1201 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1202
1203 log_error("%s is not a char or block device, cannot copy.", from);
1204 return -EIO;
1205
1206 } else {
1207 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1208 if (errno != EPERM)
1209 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1210
1211 /* Some systems abusively restrict mknod but
1212 * allow bind mounts. */
1213 r = touch(to);
1214 if (r < 0)
1215 return log_error_errno(r, "touch (%s) failed: %m", to);
1216 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1217 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1218 }
1219
1220 r = userns_lchown(to, 0, 0);
1221 if (r < 0)
1222 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1223 }
1224 }
1225
1226 return r;
1227 }
1228
1229 static int setup_pts(const char *dest) {
1230 _cleanup_free_ char *options = NULL;
1231 const char *p;
1232 int r;
1233
1234 #ifdef HAVE_SELINUX
1235 if (arg_selinux_apifs_context)
1236 (void) asprintf(&options,
1237 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1238 arg_uid_shift + TTY_GID,
1239 arg_selinux_apifs_context);
1240 else
1241 #endif
1242 (void) asprintf(&options,
1243 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1244 arg_uid_shift + TTY_GID);
1245
1246 if (!options)
1247 return log_oom();
1248
1249 /* Mount /dev/pts itself */
1250 p = prefix_roota(dest, "/dev/pts");
1251 if (mkdir(p, 0755) < 0)
1252 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1253 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1254 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1255 r = userns_lchown(p, 0, 0);
1256 if (r < 0)
1257 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1258
1259 /* Create /dev/ptmx symlink */
1260 p = prefix_roota(dest, "/dev/ptmx");
1261 if (symlink("pts/ptmx", p) < 0)
1262 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1263 r = userns_lchown(p, 0, 0);
1264 if (r < 0)
1265 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1266
1267 /* And fix /dev/pts/ptmx ownership */
1268 p = prefix_roota(dest, "/dev/pts/ptmx");
1269 r = userns_lchown(p, 0, 0);
1270 if (r < 0)
1271 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1272
1273 return 0;
1274 }
1275
1276 static int setup_dev_console(const char *dest, const char *console) {
1277 _cleanup_umask_ mode_t u;
1278 const char *to;
1279 int r;
1280
1281 assert(dest);
1282 assert(console);
1283
1284 u = umask(0000);
1285
1286 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1287 if (r < 0)
1288 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1289
1290 /* We need to bind mount the right tty to /dev/console since
1291 * ptys can only exist on pts file systems. To have something
1292 * to bind mount things on we create a empty regular file. */
1293
1294 to = prefix_roota(dest, "/dev/console");
1295 r = touch(to);
1296 if (r < 0)
1297 return log_error_errno(r, "touch() for /dev/console failed: %m");
1298
1299 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1300 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1301
1302 return 0;
1303 }
1304
1305 static int setup_kmsg(const char *dest, int kmsg_socket) {
1306 const char *from, *to;
1307 _cleanup_umask_ mode_t u;
1308 int fd, r;
1309
1310 assert(kmsg_socket >= 0);
1311
1312 u = umask(0000);
1313
1314 /* We create the kmsg FIFO as /run/kmsg, but immediately
1315 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1316 * on the reading side behave very similar to /proc/kmsg,
1317 * their writing side behaves differently from /dev/kmsg in
1318 * that writing blocks when nothing is reading. In order to
1319 * avoid any problems with containers deadlocking due to this
1320 * we simply make /dev/kmsg unavailable to the container. */
1321 from = prefix_roota(dest, "/run/kmsg");
1322 to = prefix_roota(dest, "/proc/kmsg");
1323
1324 if (mkfifo(from, 0600) < 0)
1325 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1326 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1327 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1328
1329 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1330 if (fd < 0)
1331 return log_error_errno(errno, "Failed to open fifo: %m");
1332
1333 /* Store away the fd in the socket, so that it stays open as
1334 * long as we run the child */
1335 r = send_one_fd(kmsg_socket, fd, 0);
1336 safe_close(fd);
1337
1338 if (r < 0)
1339 return log_error_errno(r, "Failed to send FIFO fd: %m");
1340
1341 /* And now make the FIFO unavailable as /run/kmsg... */
1342 (void) unlink(from);
1343
1344 return 0;
1345 }
1346
1347 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1348 union in_addr_union *exposed = userdata;
1349
1350 assert(rtnl);
1351 assert(m);
1352 assert(exposed);
1353
1354 expose_port_execute(rtnl, arg_expose_ports, exposed);
1355 return 0;
1356 }
1357
1358 static int setup_hostname(void) {
1359
1360 if (arg_share_system)
1361 return 0;
1362
1363 if (sethostname_idempotent(arg_machine) < 0)
1364 return -errno;
1365
1366 return 0;
1367 }
1368
1369 static int setup_journal(const char *directory) {
1370 sd_id128_t machine_id, this_id;
1371 _cleanup_free_ char *b = NULL, *d = NULL;
1372 const char *etc_machine_id, *p, *q;
1373 bool try;
1374 char *id;
1375 int r;
1376
1377 /* Don't link journals in ephemeral mode */
1378 if (arg_ephemeral)
1379 return 0;
1380
1381 if (arg_link_journal == LINK_NO)
1382 return 0;
1383
1384 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1385
1386 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1387
1388 r = read_one_line_file(etc_machine_id, &b);
1389 if (r == -ENOENT && try)
1390 return 0;
1391 else if (r < 0)
1392 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1393
1394 id = strstrip(b);
1395 if (isempty(id) && try)
1396 return 0;
1397
1398 /* Verify validity */
1399 r = sd_id128_from_string(id, &machine_id);
1400 if (r < 0)
1401 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1402
1403 r = sd_id128_get_machine(&this_id);
1404 if (r < 0)
1405 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1406
1407 if (sd_id128_equal(machine_id, this_id)) {
1408 log_full(try ? LOG_WARNING : LOG_ERR,
1409 "Host and machine ids are equal (%s): refusing to link journals", id);
1410 if (try)
1411 return 0;
1412 return -EEXIST;
1413 }
1414
1415 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1416 if (r < 0)
1417 return log_error_errno(r, "Failed to create /var: %m");
1418
1419 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1420 if (r < 0)
1421 return log_error_errno(r, "Failed to create /var/log: %m");
1422
1423 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1424 if (r < 0)
1425 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1426
1427 p = strjoina("/var/log/journal/", id);
1428 q = prefix_roota(directory, p);
1429
1430 if (path_is_mount_point(p, 0) > 0) {
1431 if (try)
1432 return 0;
1433
1434 log_error("%s: already a mount point, refusing to use for journal", p);
1435 return -EEXIST;
1436 }
1437
1438 if (path_is_mount_point(q, 0) > 0) {
1439 if (try)
1440 return 0;
1441
1442 log_error("%s: already a mount point, refusing to use for journal", q);
1443 return -EEXIST;
1444 }
1445
1446 r = readlink_and_make_absolute(p, &d);
1447 if (r >= 0) {
1448 if ((arg_link_journal == LINK_GUEST ||
1449 arg_link_journal == LINK_AUTO) &&
1450 path_equal(d, q)) {
1451
1452 r = userns_mkdir(directory, p, 0755, 0, 0);
1453 if (r < 0)
1454 log_warning_errno(r, "Failed to create directory %s: %m", q);
1455 return 0;
1456 }
1457
1458 if (unlink(p) < 0)
1459 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1460 } else if (r == -EINVAL) {
1461
1462 if (arg_link_journal == LINK_GUEST &&
1463 rmdir(p) < 0) {
1464
1465 if (errno == ENOTDIR) {
1466 log_error("%s already exists and is neither a symlink nor a directory", p);
1467 return r;
1468 } else
1469 return log_error_errno(errno, "Failed to remove %s: %m", p);
1470 }
1471 } else if (r != -ENOENT)
1472 return log_error_errno(r, "readlink(%s) failed: %m", p);
1473
1474 if (arg_link_journal == LINK_GUEST) {
1475
1476 if (symlink(q, p) < 0) {
1477 if (try) {
1478 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1479 return 0;
1480 } else
1481 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1482 }
1483
1484 r = userns_mkdir(directory, p, 0755, 0, 0);
1485 if (r < 0)
1486 log_warning_errno(r, "Failed to create directory %s: %m", q);
1487 return 0;
1488 }
1489
1490 if (arg_link_journal == LINK_HOST) {
1491 /* don't create parents here -- if the host doesn't have
1492 * permanent journal set up, don't force it here */
1493
1494 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
1495 if (try) {
1496 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1497 return 0;
1498 } else
1499 return log_error_errno(errno, "Failed to create %s: %m", p);
1500 }
1501
1502 } else if (access(p, F_OK) < 0)
1503 return 0;
1504
1505 if (dir_is_empty(q) == 0)
1506 log_warning("%s is not empty, proceeding anyway.", q);
1507
1508 r = userns_mkdir(directory, p, 0755, 0, 0);
1509 if (r < 0)
1510 return log_error_errno(r, "Failed to create %s: %m", q);
1511
1512 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1513 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1514
1515 return 0;
1516 }
1517
1518 static int drop_capabilities(void) {
1519 return capability_bounding_set_drop(arg_retain, false);
1520 }
1521
1522 static int reset_audit_loginuid(void) {
1523 _cleanup_free_ char *p = NULL;
1524 int r;
1525
1526 if (arg_share_system)
1527 return 0;
1528
1529 r = read_one_line_file("/proc/self/loginuid", &p);
1530 if (r == -ENOENT)
1531 return 0;
1532 if (r < 0)
1533 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1534
1535 /* Already reset? */
1536 if (streq(p, "4294967295"))
1537 return 0;
1538
1539 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1540 if (r < 0) {
1541 log_error_errno(r,
1542 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1543 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1544 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1545 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1546 "using systemd-nspawn. Sleeping for 5s... (%m)");
1547
1548 sleep(5);
1549 }
1550
1551 return 0;
1552 }
1553
/* Installs the container's seccomp filter (a no-op returning 0 when built
 * without HAVE_SECCOMP). The filter allows everything by default, but
 *
 *   - makes a fixed list of system calls fail with EPERM, each unless the
 *     capability associated with it is part of arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * seccomp_load() failing with EINVAL is not treated as an error.
 * Returns 0 on success, a negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } filtered[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(filtered); k++) {
                uint64_t cap_mask = 1ULL << filtered[k].capability;

                /* The capability is retained, hence leave the call alone */
                if (arg_retain & cap_mask)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), filtered[k].syscall_num, 0);

                /* -EFAULT means: unknown syscall, which is fine */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /* Audit is broken in containers, much of the userspace audit
         * hookup will fail if running inside a container. We don't
         * care and just turn off creation of audit sockets.
         *
         * This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
         * with EAFNOSUPPORT which audit userspace uses as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1648
1649 static int setup_propagate(const char *root) {
1650 const char *p, *q;
1651 int r;
1652
1653 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1654 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1655 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1656 (void) mkdir_p(p, 0600);
1657
1658 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1659 if (r < 0)
1660 return log_error_errno(r, "Failed to create /run/systemd: %m");
1661
1662 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1663 if (r < 0)
1664 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1665
1666 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1667 if (r < 0)
1668 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1669
1670 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1671 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1672 return log_error_errno(errno, "Failed to install propagation bind mount.");
1673
1674 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1675 return log_error_errno(errno, "Failed to make propagation mount read-only");
1676
1677 return 0;
1678 }
1679
1680 static int setup_image(char **device_path, int *loop_nr) {
1681 struct loop_info64 info = {
1682 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1683 };
1684 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1685 _cleanup_free_ char* loopdev = NULL;
1686 struct stat st;
1687 int r, nr;
1688
1689 assert(device_path);
1690 assert(loop_nr);
1691 assert(arg_image);
1692
1693 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1694 if (fd < 0)
1695 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1696
1697 if (fstat(fd, &st) < 0)
1698 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1699
1700 if (S_ISBLK(st.st_mode)) {
1701 char *p;
1702
1703 p = strdup(arg_image);
1704 if (!p)
1705 return log_oom();
1706
1707 *device_path = p;
1708
1709 *loop_nr = -1;
1710
1711 r = fd;
1712 fd = -1;
1713
1714 return r;
1715 }
1716
1717 if (!S_ISREG(st.st_mode)) {
1718 log_error("%s is not a regular file or block device.", arg_image);
1719 return -EINVAL;
1720 }
1721
1722 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1723 if (control < 0)
1724 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1725
1726 nr = ioctl(control, LOOP_CTL_GET_FREE);
1727 if (nr < 0)
1728 return log_error_errno(errno, "Failed to allocate loop device: %m");
1729
1730 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1731 return log_oom();
1732
1733 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1734 if (loop < 0)
1735 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1736
1737 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1738 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1739
1740 if (arg_read_only)
1741 info.lo_flags |= LO_FLAGS_READ_ONLY;
1742
1743 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1744 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1745
1746 *device_path = loopdev;
1747 loopdev = NULL;
1748
1749 *loop_nr = nr;
1750
1751 r = loop;
1752 loop = -1;
1753
1754 return r;
1755 }
1756
/* Explanatory text appended to error messages about disk images whose
 * partition table cannot be used. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"     \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                 \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                      \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"         \
        "to be bootable with systemd-nspawn."
1763
/* Analyzes the partition table of the block device open as "fd" and
 * determines which partitions to use as root, /home and /srv file systems.
 *
 * GPT: partitions are picked by type UUID; for each of home, srv, native
 * root and secondary-architecture root the matching partition with the
 * lowest partition number wins. Partitions flagged "no auto" are ignored.
 * If there is no dedicated root partition, a single generic Linux
 * partition is accepted as root.
 *
 * MBR: a single partition of type 0x83 that carries exactly the bootable
 * flag is accepted as (generic) root.
 *
 * More than one generic candidate is an error if a generic partition has
 * to be used as root.
 *
 * On success 0 is returned, *root_device is set, and *home_device and
 * *srv_device are set if such partitions were found. The strings are
 * newly allocated and owned by the caller. The matching *_rw flags tell
 * whether the partition may be mounted writable, and *secondary whether
 * the root partition is the one of the secondary architecture. On failure
 * a negative errno-style value is returned. Without blkid support
 * -EOPNOTSUPP is returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has created block devices for all the
         * partitions blkid has found. On leaving the loop "e" holds the
         * enumeration of the whole device plus all its partitions. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The kernel
                 * list is expected to have one entry more than that. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the partition with the lowest number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* Track MBR candidates as generic partitions, so
                         * that a second bootable Linux partition is
                         * detected and refused below, instead of
                         * silently replacing the first one. */
                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Pick the root partition: native root first, then secondary
         * architecture root, then a generic Linux partition. Ownership of
         * the strings is passed to the caller. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2133
/* Mounts the block device "what" on the directory "where", or on the
 * sub-directory "directory" of it if that is non-NULL. The file system
 * type is determined by probing the device with blkid. The mount is
 * read-only if "rw" is false or arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style value on failure: -EINVAL
 * if the file system type cannot be determined, -EOPNOTSUPP for LUKS
 * volumes and when compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns -2 for an ambivalent
                 * result and 1 if nothing was detected; -1 is a real
                 * probing error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2195
2196 static int mount_devices(
2197 const char *where,
2198 const char *root_device, bool root_device_rw,
2199 const char *home_device, bool home_device_rw,
2200 const char *srv_device, bool srv_device_rw) {
2201 int r;
2202
2203 assert(where);
2204
2205 if (root_device) {
2206 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2207 if (r < 0)
2208 return log_error_errno(r, "Failed to mount root directory: %m");
2209 }
2210
2211 if (home_device) {
2212 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2213 if (r < 0)
2214 return log_error_errno(r, "Failed to mount home directory: %m");
2215 }
2216
2217 if (srv_device) {
2218 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2219 if (r < 0)
2220 return log_error_errno(r, "Failed to mount server data directory: %m");
2221 }
2222
2223 return 0;
2224 }
2225
2226 static void loop_remove(int nr, int *image_fd) {
2227 _cleanup_close_ int control = -1;
2228 int r;
2229
2230 if (nr < 0)
2231 return;
2232
2233 if (image_fd && *image_fd >= 0) {
2234 r = ioctl(*image_fd, LOOP_CLR_FD);
2235 if (r < 0)
2236 log_debug_errno(errno, "Failed to close loop image: %m");
2237 *image_fd = safe_close(*image_fd);
2238 }
2239
2240 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2241 if (control < 0) {
2242 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2243 return;
2244 }
2245
2246 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2247 if (r < 0)
2248 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2249 }
2250
2251 /*
2252 * Return values:
2253 * < 0 : wait_for_terminate() failed to get the state of the
2254 * container, the container was terminated by a signal, or
2255 * failed for an unknown reason. No change is made to the
2256 * container argument.
2257 * > 0 : The program executed in the container terminated with an
2258 * error. The exit code of the program executed in the
2259 * container is returned. The container argument has been set
2260 * to CONTAINER_TERMINATED.
2261 * 0 : The container is being rebooted, has been shut down or exited
2262 * successfully. The container argument has been set to either
2263 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2264 *
2265 * That is, success is indicated by a return value of zero, and an
2266 * error is indicated by a non-zero value.
2267 */
2268 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2269 siginfo_t status;
2270 int r;
2271
2272 r = wait_for_terminate(pid, &status);
2273 if (r < 0)
2274 return log_warning_errno(r, "Failed to wait for container: %m");
2275
2276 switch (status.si_code) {
2277
2278 case CLD_EXITED:
2279 if (status.si_status == 0) {
2280 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2281
2282 } else
2283 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2284
2285 *container = CONTAINER_TERMINATED;
2286 return status.si_status;
2287
2288 case CLD_KILLED:
2289 if (status.si_status == SIGINT) {
2290
2291 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2292 *container = CONTAINER_TERMINATED;
2293 return 0;
2294
2295 } else if (status.si_status == SIGHUP) {
2296
2297 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2298 *container = CONTAINER_REBOOTED;
2299 return 0;
2300 }
2301
2302 /* CLD_KILLED fallthrough */
2303
2304 case CLD_DUMPED:
2305 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2306 return -EIO;
2307
2308 default:
2309 log_error("Container %s failed due to unknown reason.", arg_machine);
2310 return -EIO;
2311 }
2312
2313 return r;
2314 }
2315
2316 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2317 pid_t pid;
2318
2319 pid = PTR_TO_PID(userdata);
2320 if (pid > 0) {
2321 if (kill(pid, arg_kill_signal) >= 0) {
2322 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2323 sd_event_source_set_userdata(s, NULL);
2324 return 0;
2325 }
2326 }
2327
2328 sd_event_exit(sd_event_source_get_event(s), 0);
2329 return 0;
2330 }
2331
2332 static int determine_names(void) {
2333 int r;
2334
2335 if (arg_template && !arg_directory && arg_machine) {
2336
2337 /* If --template= was specified then we should not
2338 * search for a machine, but instead create a new one
2339 * in /var/lib/machine. */
2340
2341 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2342 if (!arg_directory)
2343 return log_oom();
2344 }
2345
2346 if (!arg_image && !arg_directory) {
2347 if (arg_machine) {
2348 _cleanup_(image_unrefp) Image *i = NULL;
2349
2350 r = image_find(arg_machine, &i);
2351 if (r < 0)
2352 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2353 else if (r == 0) {
2354 log_error("No image for machine '%s': %m", arg_machine);
2355 return -ENOENT;
2356 }
2357
2358 if (i->type == IMAGE_RAW)
2359 r = free_and_strdup(&arg_image, i->path);
2360 else
2361 r = free_and_strdup(&arg_directory, i->path);
2362 if (r < 0)
2363 return log_error_errno(r, "Invalid image directory: %m");
2364
2365 if (!arg_ephemeral)
2366 arg_read_only = arg_read_only || i->read_only;
2367 } else
2368 arg_directory = get_current_dir_name();
2369
2370 if (!arg_directory && !arg_machine) {
2371 log_error("Failed to determine path, please use -D or -i.");
2372 return -EINVAL;
2373 }
2374 }
2375
2376 if (!arg_machine) {
2377 if (arg_directory && path_equal(arg_directory, "/"))
2378 arg_machine = gethostname_malloc();
2379 else
2380 arg_machine = strdup(basename(arg_image ?: arg_directory));
2381
2382 if (!arg_machine)
2383 return log_oom();
2384
2385 hostname_cleanup(arg_machine);
2386 if (!machine_name_is_valid(arg_machine)) {
2387 log_error("Failed to determine machine name automatically, please use -M.");
2388 return -EINVAL;
2389 }
2390
2391 if (arg_ephemeral) {
2392 char *b;
2393
2394 /* Add a random suffix when this is an
2395 * ephemeral machine, so that we can run many
2396 * instances at once without manually having
2397 * to specify -M each time. */
2398
2399 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2400 return log_oom();
2401
2402 free(arg_machine);
2403 arg_machine = b;
2404 }
2405 }
2406
2407 return 0;
2408 }
2409
2410 static int determine_uid_shift(const char *directory) {
2411 int r;
2412
2413 if (!arg_userns) {
2414 arg_uid_shift = 0;
2415 return 0;
2416 }
2417
2418 if (arg_uid_shift == UID_INVALID) {
2419 struct stat st;
2420
2421 r = stat(directory, &st);
2422 if (r < 0)
2423 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2424
2425 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2426
2427 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2428 log_error("UID and GID base of %s don't match.", directory);
2429 return -EINVAL;
2430 }
2431
2432 arg_uid_range = UINT32_C(0x10000);
2433 }
2434
2435 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2436 log_error("UID base too high for UID range.");
2437 return -EINVAL;
2438 }
2439
2440 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2441 return 0;
2442 }
2443
2444 static int inner_child(
2445 Barrier *barrier,
2446 const char *directory,
2447 bool secondary,
2448 int kmsg_socket,
2449 int rtnl_socket,
2450 FDSet *fds) {
2451
2452 _cleanup_free_ char *home = NULL;
2453 unsigned n_env = 1;
2454 const char *envp[] = {
2455 "PATH=" DEFAULT_PATH_SPLIT_USR,
2456 NULL, /* container */
2457 NULL, /* TERM */
2458 NULL, /* HOME */
2459 NULL, /* USER */
2460 NULL, /* LOGNAME */
2461 NULL, /* container_uuid */
2462 NULL, /* LISTEN_FDS */
2463 NULL, /* LISTEN_PID */
2464 NULL
2465 };
2466
2467 _cleanup_strv_free_ char **env_use = NULL;
2468 int r;
2469
2470 assert(barrier);
2471 assert(directory);
2472 assert(kmsg_socket >= 0);
2473
2474 cg_unified_flush();
2475
2476 if (arg_userns) {
2477 /* Tell the parent, that it now can write the UID map. */
2478 (void) barrier_place(barrier); /* #1 */
2479
2480 /* Wait until the parent wrote the UID map */
2481 if (!barrier_place_and_sync(barrier)) { /* #2 */
2482 log_error("Parent died too early");
2483 return -ESRCH;
2484 }
2485 }
2486
2487 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2488 if (r < 0)
2489 return r;
2490
2491 r = mount_sysfs(NULL);
2492 if (r < 0)
2493 return r;
2494
2495 /* Wait until we are cgroup-ified, so that we
2496 * can mount the right cgroup path writable */
2497 if (!barrier_place_and_sync(barrier)) { /* #3 */
2498 log_error("Parent died too early");
2499 return -ESRCH;
2500 }
2501
2502 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2503 if (r < 0)
2504 return r;
2505
2506 r = reset_uid_gid();
2507 if (r < 0)
2508 return log_error_errno(r, "Couldn't become new root: %m");
2509
2510 r = setup_boot_id(NULL);
2511 if (r < 0)
2512 return r;
2513
2514 r = setup_kmsg(NULL, kmsg_socket);
2515 if (r < 0)
2516 return r;
2517 kmsg_socket = safe_close(kmsg_socket);
2518
2519 umask(0022);
2520
2521 if (setsid() < 0)
2522 return log_error_errno(errno, "setsid() failed: %m");
2523
2524 if (arg_private_network)
2525 loopback_setup();
2526
2527 if (arg_expose_ports) {
2528 r = expose_port_send_rtnl(rtnl_socket);
2529 if (r < 0)
2530 return r;
2531 rtnl_socket = safe_close(rtnl_socket);
2532 }
2533
2534 r = drop_capabilities();
2535 if (r < 0)
2536 return log_error_errno(r, "drop_capabilities() failed: %m");
2537
2538 setup_hostname();
2539
2540 if (arg_personality != PERSONALITY_INVALID) {
2541 if (personality(arg_personality) < 0)
2542 return log_error_errno(errno, "personality() failed: %m");
2543 } else if (secondary) {
2544 if (personality(PER_LINUX32) < 0)
2545 return log_error_errno(errno, "personality() failed: %m");
2546 }
2547
2548 #ifdef HAVE_SELINUX
2549 if (arg_selinux_context)
2550 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2551 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2552 #endif
2553
2554 r = change_uid_gid(arg_user, &home);
2555 if (r < 0)
2556 return r;
2557
2558 /* LXC sets container=lxc, so follow the scheme here */
2559 envp[n_env++] = strjoina("container=", arg_container_service_name);
2560
2561 envp[n_env] = strv_find_prefix(environ, "TERM=");
2562 if (envp[n_env])
2563 n_env ++;
2564
2565 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2566 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2567 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2568 return log_oom();
2569
2570 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2571 char as_uuid[37];
2572
2573 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2574 return log_oom();
2575 }
2576
2577 if (fdset_size(fds) > 0) {
2578 r = fdset_cloexec(fds, false);
2579 if (r < 0)
2580 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2581
2582 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2583 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2584 return log_oom();
2585 }
2586
2587 env_use = strv_env_merge(2, envp, arg_setenv);
2588 if (!env_use)
2589 return log_oom();
2590
2591 /* Let the parent know that we are ready and
2592 * wait until the parent is ready with the
2593 * setup, too... */
2594 if (!barrier_place_and_sync(barrier)) { /* #4 */
2595 log_error("Parent died too early");
2596 return -ESRCH;
2597 }
2598
2599 if (arg_chdir)
2600 if (chdir(arg_chdir) < 0)
2601 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2602
2603 if (arg_start_mode == START_PID2) {
2604 r = stub_pid1();
2605 if (r < 0)
2606 return r;
2607 }
2608
2609 /* Now, explicitly close the log, so that we
2610 * then can close all remaining fds. Closing
2611 * the log explicitly first has the benefit
2612 * that the logging subsystem knows about it,
2613 * and is thus ready to be reopened should we
2614 * need it again. Note that the other fds
2615 * closed here are at least the locking and
2616 * barrier fds. */
2617 log_close();
2618 (void) fdset_close_others(fds);
2619
2620 if (arg_start_mode == START_BOOT) {
2621 char **a;
2622 size_t m;
2623
2624 /* Automatically search for the init system */
2625
2626 m = strv_length(arg_parameters);
2627 a = newa(char*, m + 2);
2628 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2629 a[1 + m] = NULL;
2630
2631 a[0] = (char*) "/usr/lib/systemd/systemd";
2632 execve(a[0], a, env_use);
2633
2634 a[0] = (char*) "/lib/systemd/systemd";
2635 execve(a[0], a, env_use);
2636
2637 a[0] = (char*) "/sbin/init";
2638 execve(a[0], a, env_use);
2639 } else if (!strv_isempty(arg_parameters))
2640 execvpe(arg_parameters[0], arg_parameters, env_use);
2641 else {
2642 if (!arg_chdir)
2643 chdir(home ?: "/root");
2644
2645 execle("/bin/bash", "-bash", NULL, env_use);
2646 execle("/bin/sh", "-sh", NULL, env_use);
2647 }
2648
2649 r = -errno;
2650 (void) log_open();
2651 return log_error_errno(r, "execv() failed: %m");
2652 }
2653
2654 static int outer_child(
2655 Barrier *barrier,
2656 const char *directory,
2657 const char *console,
2658 const char *root_device, bool root_device_rw,
2659 const char *home_device, bool home_device_rw,
2660 const char *srv_device, bool srv_device_rw,
2661 bool interactive,
2662 bool secondary,
2663 int pid_socket,
2664 int kmsg_socket,
2665 int rtnl_socket,
2666 int uid_shift_socket,
2667 FDSet *fds) {
2668
2669 pid_t pid;
2670 ssize_t l;
2671 int r;
2672
2673 assert(barrier);
2674 assert(directory);
2675 assert(console);
2676 assert(pid_socket >= 0);
2677 assert(kmsg_socket >= 0);
2678
2679 cg_unified_flush();
2680
2681 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2682 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2683
2684 if (interactive) {
2685 close_nointr(STDIN_FILENO);
2686 close_nointr(STDOUT_FILENO);
2687 close_nointr(STDERR_FILENO);
2688
2689 r = open_terminal(console, O_RDWR);
2690 if (r != STDIN_FILENO) {
2691 if (r >= 0) {
2692 safe_close(r);
2693 r = -EINVAL;
2694 }
2695
2696 return log_error_errno(r, "Failed to open console: %m");
2697 }
2698
2699 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2700 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2701 return log_error_errno(errno, "Failed to duplicate console: %m");
2702 }
2703
2704 r = reset_audit_loginuid();
2705 if (r < 0)
2706 return r;
2707
2708 /* Mark everything as slave, so that we still
2709 * receive mounts from the real root, but don't
2710 * propagate mounts to the real root. */
2711 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2712 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2713
2714 r = mount_devices(directory,
2715 root_device, root_device_rw,
2716 home_device, home_device_rw,
2717 srv_device, srv_device_rw);
2718 if (r < 0)
2719 return r;
2720
2721 r = determine_uid_shift(directory);
2722 if (r < 0)
2723 return r;
2724
2725 if (arg_userns) {
2726 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2727 if (l < 0)
2728 return log_error_errno(errno, "Failed to send UID shift: %m");
2729 if (l != sizeof(arg_uid_shift)) {
2730 log_error("Short write while sending UID shift.");
2731 return -EIO;
2732 }
2733 }
2734
2735 /* Turn directory into bind mount */
2736 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2737 return log_error_errno(errno, "Failed to make bind mount: %m");
2738
2739 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2740 if (r < 0)
2741 return r;
2742
2743 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2744 if (r < 0)
2745 return r;
2746
2747 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2748 if (r < 0)
2749 return r;
2750
2751 if (arg_read_only) {
2752 r = bind_remount_recursive(directory, true);
2753 if (r < 0)
2754 return log_error_errno(r, "Failed to make tree read-only: %m");
2755 }
2756
2757 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2758 if (r < 0)
2759 return r;
2760
2761 r = copy_devnodes(directory);
2762 if (r < 0)
2763 return r;
2764
2765 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2766
2767 r = setup_pts(directory);
2768 if (r < 0)
2769 return r;
2770
2771 r = setup_propagate(directory);
2772 if (r < 0)
2773 return r;
2774
2775 r = setup_dev_console(directory, console);
2776 if (r < 0)
2777 return r;
2778
2779 r = setup_seccomp();
2780 if (r < 0)
2781 return r;
2782
2783 r = setup_timezone(directory);
2784 if (r < 0)
2785 return r;
2786
2787 r = setup_resolv_conf(directory);
2788 if (r < 0)
2789 return r;
2790
2791 r = setup_journal(directory);
2792 if (r < 0)
2793 return r;
2794
2795 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2796 if (r < 0)
2797 return r;
2798
2799 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2800 if (r < 0)
2801 return r;
2802
2803 r = mount_move_root(directory);
2804 if (r < 0)
2805 return log_error_errno(r, "Failed to move root directory: %m");
2806
2807 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2808 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2809 (arg_private_network ? CLONE_NEWNET : 0) |
2810 (arg_userns ? CLONE_NEWUSER : 0),
2811 NULL);
2812 if (pid < 0)
2813 return log_error_errno(errno, "Failed to fork inner child: %m");
2814 if (pid == 0) {
2815 pid_socket = safe_close(pid_socket);
2816 uid_shift_socket = safe_close(uid_shift_socket);
2817
2818 /* The inner child has all namespaces that are
2819 * requested, so that we all are owned by the user if
2820 * user namespaces are turned on. */
2821
2822 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2823 if (r < 0)
2824 _exit(EXIT_FAILURE);
2825
2826 _exit(EXIT_SUCCESS);
2827 }
2828
2829 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2830 if (l < 0)
2831 return log_error_errno(errno, "Failed to send PID: %m");
2832 if (l != sizeof(pid)) {
2833 log_error("Short write while sending PID.");
2834 return -EIO;
2835 }
2836
2837 pid_socket = safe_close(pid_socket);
2838 kmsg_socket = safe_close(kmsg_socket);
2839 rtnl_socket = safe_close(rtnl_socket);
2840
2841 return 0;
2842 }
2843
2844 static int setup_uid_map(pid_t pid) {
2845 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2846 int r;
2847
2848 assert(pid > 1);
2849
2850 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2851 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2852 r = write_string_file(uid_map, line, 0);
2853 if (r < 0)
2854 return log_error_errno(r, "Failed to write UID map: %m");
2855
2856 /* We always assign the same UID and GID ranges */
2857 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2858 r = write_string_file(uid_map, line, 0);
2859 if (r < 0)
2860 return log_error_errno(r, "Failed to write GID map: %m");
2861
2862 return 0;
2863 }
2864
2865 static int load_settings(void) {
2866 _cleanup_(settings_freep) Settings *settings = NULL;
2867 _cleanup_fclose_ FILE *f = NULL;
2868 _cleanup_free_ char *p = NULL;
2869 const char *fn, *i;
2870 int r;
2871
2872 /* If all settings are masked, there's no point in looking for
2873 * the settings file */
2874 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2875 return 0;
2876
2877 fn = strjoina(arg_machine, ".nspawn");
2878
2879 /* We first look in the admin's directories in /etc and /run */
2880 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2881 _cleanup_free_ char *j = NULL;
2882
2883 j = strjoin(i, "/", fn, NULL);
2884 if (!j)
2885 return log_oom();
2886
2887 f = fopen(j, "re");
2888 if (f) {
2889 p = j;
2890 j = NULL;
2891
2892 /* By default, we trust configuration from /etc and /run */
2893 if (arg_settings_trusted < 0)
2894 arg_settings_trusted = true;
2895
2896 break;
2897 }
2898
2899 if (errno != ENOENT)
2900 return log_error_errno(errno, "Failed to open %s: %m", j);
2901 }
2902
2903 if (!f) {
2904 /* After that, let's look for a file next to the
2905 * actual image we shall boot. */
2906
2907 if (arg_image) {
2908 p = file_in_same_dir(arg_image, fn);
2909 if (!p)
2910 return log_oom();
2911 } else if (arg_directory) {
2912 p = file_in_same_dir(arg_directory, fn);
2913 if (!p)
2914 return log_oom();
2915 }
2916
2917 if (p) {
2918 f = fopen(p, "re");
2919 if (!f && errno != ENOENT)
2920 return log_error_errno(errno, "Failed to open %s: %m", p);
2921
2922 /* By default, we do not trust configuration from /var/lib/machines */
2923 if (arg_settings_trusted < 0)
2924 arg_settings_trusted = false;
2925 }
2926 }
2927
2928 if (!f)
2929 return 0;
2930
2931 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2932
2933 r = settings_load(f, p, &settings);
2934 if (r < 0)
2935 return r;
2936
2937 /* Copy over bits from the settings, unless they have been
2938 * explicitly masked by command line switches. */
2939
2940 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
2941 settings->start_mode >= 0) {
2942 arg_start_mode = settings->start_mode;
2943
2944 strv_free(arg_parameters);
2945 arg_parameters = settings->parameters;
2946 settings->parameters = NULL;
2947 }
2948
2949 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
2950 settings->working_directory) {
2951 free(arg_chdir);
2952 arg_chdir = settings->working_directory;
2953 settings->working_directory = NULL;
2954 }
2955
2956 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2957 settings->environment) {
2958 strv_free(arg_setenv);
2959 arg_setenv = settings->environment;
2960 settings->environment = NULL;
2961 }
2962
2963 if ((arg_settings_mask & SETTING_USER) == 0 &&
2964 settings->user) {
2965 free(arg_user);
2966 arg_user = settings->user;
2967 settings->user = NULL;
2968 }
2969
2970 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2971 uint64_t plus;
2972
2973 plus = settings->capability;
2974 if (settings_private_network(settings))
2975 plus |= (1ULL << CAP_NET_ADMIN);
2976
2977 if (!arg_settings_trusted && plus != 0) {
2978 if (settings->capability != 0)
2979 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2980 } else
2981 arg_retain |= plus;
2982
2983 arg_retain &= ~settings->drop_capability;
2984 }
2985
2986 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2987 settings->kill_signal > 0)
2988 arg_kill_signal = settings->kill_signal;
2989
2990 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2991 settings->personality != PERSONALITY_INVALID)
2992 arg_personality = settings->personality;
2993
2994 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2995 !sd_id128_is_null(settings->machine_id)) {
2996
2997 if (!arg_settings_trusted)
2998 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2999 else
3000 arg_uuid = settings->machine_id;
3001 }
3002
3003 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3004 settings->read_only >= 0)
3005 arg_read_only = settings->read_only;
3006
3007 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3008 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3009 arg_volatile_mode = settings->volatile_mode;
3010
3011 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3012 settings->n_custom_mounts > 0) {
3013
3014 if (!arg_settings_trusted)
3015 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3016 else {
3017 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3018 arg_custom_mounts = settings->custom_mounts;
3019 arg_n_custom_mounts = settings->n_custom_mounts;
3020
3021 settings->custom_mounts = NULL;
3022 settings->n_custom_mounts = 0;
3023 }
3024 }
3025
3026 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3027 (settings->private_network >= 0 ||
3028 settings->network_veth >= 0 ||
3029 settings->network_bridge ||
3030 settings->network_interfaces ||
3031 settings->network_macvlan ||
3032 settings->network_ipvlan ||
3033 settings->network_veth_extra)) {
3034
3035 if (!arg_settings_trusted)
3036 log_warning("Ignoring network settings, file %s is not trusted.", p);
3037 else {
3038 arg_network_veth = settings_network_veth(settings);
3039 arg_private_network = settings_private_network(settings);
3040
3041 strv_free(arg_network_interfaces);
3042 arg_network_interfaces = settings->network_interfaces;
3043 settings->network_interfaces = NULL;
3044
3045 strv_free(arg_network_macvlan);
3046 arg_network_macvlan = settings->network_macvlan;
3047 settings->network_macvlan = NULL;
3048
3049 strv_free(arg_network_ipvlan);
3050 arg_network_ipvlan = settings->network_ipvlan;
3051 settings->network_ipvlan = NULL;
3052
3053 strv_free(arg_network_veth_extra);
3054 arg_network_veth_extra = settings->network_veth_extra;
3055 settings->network_veth_extra = NULL;
3056
3057 free(arg_network_bridge);
3058 arg_network_bridge = settings->network_bridge;
3059 settings->network_bridge = NULL;
3060 }
3061 }
3062
3063 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3064 settings->expose_ports) {
3065
3066 if (!arg_settings_trusted)
3067 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3068 else {
3069 expose_port_free_all(arg_expose_ports);
3070 arg_expose_ports = settings->expose_ports;
3071 settings->expose_ports = NULL;
3072 }
3073 }
3074
3075 return 0;
3076 }
3077
3078 int main(int argc, char *argv[]) {
3079
3080 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3081 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3082 _cleanup_close_ int master = -1, image_fd = -1;
3083 _cleanup_fdset_free_ FDSet *fds = NULL;
3084 int r, n_fd_passed, loop_nr = -1;
3085 char veth_name[IFNAMSIZ];
3086 bool secondary = false, remove_subvol = false;
3087 sigset_t mask_chld;
3088 pid_t pid = 0;
3089 int ret = EXIT_SUCCESS;
3090 union in_addr_union exposed = {};
3091 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3092 bool interactive;
3093
3094 log_parse_environment();
3095 log_open();
3096
3097 /* Make sure rename_process() in the stub init process can work */
3098 saved_argv = argv;
3099 saved_argc = argc;
3100
3101 r = parse_argv(argc, argv);
3102 if (r <= 0)
3103 goto finish;
3104
3105 if (geteuid() != 0) {
3106 log_error("Need to be root.");
3107 r = -EPERM;
3108 goto finish;
3109 }
3110 r = determine_names();
3111 if (r < 0)
3112 goto finish;
3113
3114 r = load_settings();
3115 if (r < 0)
3116 goto finish;
3117
3118 r = verify_arguments();
3119 if (r < 0)
3120 goto finish;
3121
3122 n_fd_passed = sd_listen_fds(false);
3123 if (n_fd_passed > 0) {
3124 r = fdset_new_listen_fds(&fds, false);
3125 if (r < 0) {
3126 log_error_errno(r, "Failed to collect file descriptors: %m");
3127 goto finish;
3128 }
3129 }
3130
3131 if (arg_directory) {
3132 assert(!arg_image);
3133
3134 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3135 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3136 r = -EINVAL;
3137 goto finish;
3138 }
3139
3140 if (arg_ephemeral) {
3141 _cleanup_free_ char *np = NULL;
3142
3143 /* If the specified path is a mount point we
3144 * generate the new snapshot immediately
3145 * inside it under a random name. However if
3146 * the specified is not a mount point we
3147 * create the new snapshot in the parent
3148 * directory, just next to it. */
3149 r = path_is_mount_point(arg_directory, 0);
3150 if (r < 0) {
3151 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3152 goto finish;
3153 }
3154 if (r > 0)
3155 r = tempfn_random_child(arg_directory, "machine.", &np);
3156 else
3157 r = tempfn_random(arg_directory, "machine.", &np);
3158 if (r < 0) {
3159 log_error_errno(r, "Failed to generate name for snapshot: %m");
3160 goto finish;
3161 }
3162
3163 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3164 if (r < 0) {
3165 log_error_errno(r, "Failed to lock %s: %m", np);
3166 goto finish;
3167 }
3168
3169 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3170 if (r < 0) {
3171 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3172 goto finish;
3173 }
3174
3175 free(arg_directory);
3176 arg_directory = np;
3177 np = NULL;
3178
3179 remove_subvol = true;
3180
3181 } else {
3182 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3183 if (r == -EBUSY) {
3184 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3185 goto finish;
3186 }
3187 if (r < 0) {
3188 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3189 return r;
3190 }
3191
3192 if (arg_template) {
3193 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3194 if (r == -EEXIST) {
3195 if (!arg_quiet)
3196 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3197 } else if (r < 0) {
3198 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3199 goto finish;
3200 } else {
3201 if (!arg_quiet)
3202 log_info("Populated %s from template %s.", arg_directory, arg_template);
3203 }
3204 }
3205 }
3206
3207 if (arg_start_mode == START_BOOT) {
3208 if (path_is_os_tree(arg_directory) <= 0) {
3209 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3210 r = -EINVAL;
3211 goto finish;
3212 }
3213 } else {
3214 const char *p;
3215
3216 p = strjoina(arg_directory, "/usr/");
3217 if (laccess(p, F_OK) < 0) {
3218 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3219 r = -EINVAL;
3220 goto finish;
3221 }
3222 }
3223
3224 } else {
3225 char template[] = "/tmp/nspawn-root-XXXXXX";
3226
3227 assert(arg_image);
3228 assert(!arg_template);
3229
3230 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3231 if (r == -EBUSY) {
3232 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3233 goto finish;
3234 }
3235 if (r < 0) {
3236 r = log_error_errno(r, "Failed to create image lock: %m");
3237 goto finish;
3238 }
3239
3240 if (!mkdtemp(template)) {
3241 log_error_errno(errno, "Failed to create temporary directory: %m");
3242 r = -errno;
3243 goto finish;
3244 }
3245
3246 arg_directory = strdup(template);
3247 if (!arg_directory) {
3248 r = log_oom();
3249 goto finish;
3250 }
3251
3252 image_fd = setup_image(&device_path, &loop_nr);
3253 if (image_fd < 0) {
3254 r = image_fd;
3255 goto finish;
3256 }
3257
3258 r = dissect_image(image_fd,
3259 &root_device, &root_device_rw,
3260 &home_device, &home_device_rw,
3261 &srv_device, &srv_device_rw,
3262 &secondary);
3263 if (r < 0)
3264 goto finish;
3265 }
3266
3267 r = custom_mounts_prepare();
3268 if (r < 0)
3269 goto finish;
3270
3271 interactive =
3272 isatty(STDIN_FILENO) > 0 &&
3273 isatty(STDOUT_FILENO) > 0;
3274
3275 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3276 if (master < 0) {
3277 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3278 goto finish;
3279 }
3280
3281 r = ptsname_malloc(master, &console);
3282 if (r < 0) {
3283 r = log_error_errno(r, "Failed to determine tty name: %m");
3284 goto finish;
3285 }
3286
3287 if (unlockpt(master) < 0) {
3288 r = log_error_errno(errno, "Failed to unlock tty: %m");
3289 goto finish;
3290 }
3291
3292 if (!arg_quiet)
3293 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3294 arg_machine, arg_image ?: arg_directory);
3295
3296 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3297
3298 assert_se(sigemptyset(&mask_chld) == 0);
3299 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3300
3301 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3302 r = log_error_errno(errno, "Failed to become subreaper: %m");
3303 goto finish;
3304 }
3305
3306 for (;;) {
3307 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
3308 ContainerStatus container_status;
3309 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3310 static const struct sigaction sa = {
3311 .sa_handler = nop_signal_handler,
3312 .sa_flags = SA_NOCLDSTOP,
3313 };
3314 int ifi = 0;
3315 ssize_t l;
3316 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3317 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3318 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3319 char last_char = 0;
3320
3321 r = barrier_create(&barrier);
3322 if (r < 0) {
3323 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3324 goto finish;
3325 }
3326
3327 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3328 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3329 goto finish;
3330 }
3331
3332 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3333 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3334 goto finish;
3335 }
3336
3337 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3338 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3339 goto finish;
3340 }
3341
3342 if (arg_userns)
3343 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3344 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3345 goto finish;
3346 }
3347
3348 /* Child can be killed before execv(), so handle SIGCHLD
3349 * in order to interrupt parent's blocking calls and
3350 * give it a chance to call wait() and terminate. */
3351 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3352 if (r < 0) {
3353 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3354 goto finish;
3355 }
3356
3357 r = sigaction(SIGCHLD, &sa, NULL);
3358 if (r < 0) {
3359 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3360 goto finish;
3361 }
3362
3363 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3364 if (pid < 0) {
3365 if (errno == EINVAL)
3366 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3367 else
3368 r = log_error_errno(errno, "clone() failed: %m");
3369
3370 goto finish;
3371 }
3372
3373 if (pid == 0) {
3374 /* The outer child only has a file system namespace. */
3375 barrier_set_role(&barrier, BARRIER_CHILD);
3376
3377 master = safe_close(master);
3378
3379 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3380 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3381 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3382 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3383
3384 (void) reset_all_signal_handlers();
3385 (void) reset_signal_mask();
3386
3387 r = outer_child(&barrier,
3388 arg_directory,
3389 console,
3390 root_device, root_device_rw,
3391 home_device, home_device_rw,
3392 srv_device, srv_device_rw,
3393 interactive,
3394 secondary,
3395 pid_socket_pair[1],
3396 kmsg_socket_pair[1],
3397 rtnl_socket_pair[1],
3398 uid_shift_socket_pair[1],
3399 fds);
3400 if (r < 0)
3401 _exit(EXIT_FAILURE);
3402
3403 _exit(EXIT_SUCCESS);
3404 }
3405
3406 barrier_set_role(&barrier, BARRIER_PARENT);
3407
3408 fds = fdset_free(fds);
3409
3410 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3411 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3412 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3413 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3414
3415 /* Wait for the outer child. */
3416 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3417 if (r < 0)
3418 goto finish;
3419 if (r != 0) {
3420 r = -EIO;
3421 goto finish;
3422 }
3423 pid = 0;
3424
3425 /* And now retrieve the PID of the inner child. */
3426 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3427 if (l < 0) {
3428 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3429 goto finish;
3430 }
3431 if (l != sizeof(pid)) {
3432 log_error("Short read while reading inner child PID.");
3433 r = EIO;
3434 goto finish;
3435 }
3436
3437 log_debug("Init process invoked as PID " PID_FMT, pid);
3438
3439 if (arg_userns) {
3440 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3441 log_error("Child died too early.");
3442 r = -ESRCH;
3443 goto finish;
3444 }
3445
3446 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3447 if (l < 0) {
3448 r = log_error_errno(errno, "Failed to read UID shift: %m");
3449 goto finish;
3450 }
3451 if (l != sizeof(arg_uid_shift)) {
3452 log_error("Short read while reading UID shift.");
3453 r = EIO;
3454 goto finish;
3455 }
3456
3457 r = setup_uid_map(pid);
3458 if (r < 0)
3459 goto finish;
3460
3461 (void) barrier_place(&barrier); /* #2 */
3462 }
3463
3464 if (arg_private_network) {
3465
3466 r = move_network_interfaces(pid, arg_network_interfaces);
3467 if (r < 0)
3468 goto finish;
3469
3470 if (arg_network_veth) {
3471 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3472 if (r < 0)
3473 goto finish;
3474 else if (r > 0)
3475 ifi = r;
3476
3477 if (arg_network_bridge) {
3478 r = setup_bridge(veth_name, arg_network_bridge);
3479 if (r < 0)
3480 goto finish;
3481 if (r > 0)
3482 ifi = r;
3483 }
3484 }
3485
3486 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3487 if (r < 0)
3488 goto finish;
3489
3490 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3491 if (r < 0)
3492 goto finish;
3493
3494 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3495 if (r < 0)
3496 goto finish;
3497 }
3498
3499 if (arg_register) {
3500 r = register_machine(
3501 arg_machine,
3502 pid,
3503 arg_directory,
3504 arg_uuid,
3505 ifi,
3506 arg_slice,
3507 arg_custom_mounts, arg_n_custom_mounts,
3508 arg_kill_signal,
3509 arg_property,
3510 arg_keep_unit,
3511 arg_container_service_name);
3512 if (r < 0)
3513 goto finish;
3514 }
3515
3516 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3517 if (r < 0)
3518 goto finish;
3519
3520 if (arg_keep_unit) {
3521 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3522 if (r < 0)
3523 goto finish;
3524 }
3525
3526 r = chown_cgroup(pid, arg_uid_shift);
3527 if (r < 0)
3528 goto finish;
3529
3530 /* Notify the child that the parent is ready with all
3531 * its setup (including cgroup-ification), and that
3532 * the child can now hand over control to the code to
3533 * run inside the container. */
3534 (void) barrier_place(&barrier); /* #3 */
3535
3536 /* Block SIGCHLD here, before notifying child.
3537 * process_pty() will handle it with the other signals. */
3538 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3539
3540 /* Reset signal to default */
3541 r = default_signals(SIGCHLD, -1);
3542 if (r < 0) {
3543 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3544 goto finish;
3545 }
3546
3547 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3548 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3549 log_error("Child died too early.");
3550 r = -ESRCH;
3551 goto finish;
3552 }
3553
3554 sd_notifyf(false,
3555 "READY=1\n"
3556 "STATUS=Container running.\n"
3557 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3558
3559 r = sd_event_new(&event);
3560 if (r < 0) {
3561 log_error_errno(r, "Failed to get default event source: %m");
3562 goto finish;
3563 }
3564
3565 if (arg_kill_signal > 0) {
3566 /* Try to kill the init system on SIGINT or SIGTERM */
3567 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3568 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
3569 } else {
3570 /* Immediately exit */
3571 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3572 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3573 }
3574
3575 /* simply exit on sigchld */
3576 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3577
3578 if (arg_expose_ports) {
3579 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3580 if (r < 0)
3581 goto finish;
3582
3583 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3584 }
3585
3586 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3587
3588 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3589 if (r < 0) {
3590 log_error_errno(r, "Failed to create PTY forwarder: %m");
3591 goto finish;
3592 }
3593
3594 r = sd_event_loop(event);
3595 if (r < 0) {
3596 log_error_errno(r, "Failed to run event loop: %m");
3597 goto finish;
3598 }
3599
3600 pty_forward_get_last_char(forward, &last_char);
3601
3602 forward = pty_forward_free(forward);
3603
3604 if (!arg_quiet && last_char != '\n')
3605 putc('\n', stdout);
3606
3607 /* Kill if it is not dead yet anyway */
3608 if (arg_register && !arg_keep_unit)
3609 terminate_machine(pid);
3610
3611 /* Normally redundant, but better safe than sorry */
3612 kill(pid, SIGKILL);
3613
3614 r = wait_for_container(pid, &container_status);
3615 pid = 0;
3616
3617 if (r < 0)
3618 /* We failed to wait for the container, or the
3619 * container exited abnormally */
3620 goto finish;
3621 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3622 /* The container exited with a non-zero
3623 * status, or with zero status and no reboot
3624 * was requested. */
3625 ret = r;
3626 break;
3627 }
3628
3629 /* CONTAINER_REBOOTED, loop again */
3630
3631 if (arg_keep_unit) {
3632 /* Special handling if we are running as a
3633 * service: instead of simply restarting the
3634 * machine we want to restart the entire
3635 * service, so let's inform systemd about this
3636 * with the special exit code 133. The service
3637 * file uses RestartForceExitStatus=133 so
3638 * that this results in a full nspawn
3639 * restart. This is necessary since we might
3640 * have cgroup parameters set we want to have
3641 * flushed out. */
3642 ret = 133;
3643 r = 0;
3644 break;
3645 }
3646
3647 expose_port_flush(arg_expose_ports, &exposed);
3648 }
3649
3650 finish:
3651 sd_notify(false,
3652 "STOPPING=1\n"
3653 "STATUS=Terminating...");
3654
3655 if (pid > 0)
3656 kill(pid, SIGKILL);
3657
3658 /* Try to flush whatever is still queued in the pty */
3659 if (master >= 0)
3660 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3661
3662 loop_remove(loop_nr, &image_fd);
3663
3664 if (remove_subvol && arg_directory) {
3665 int k;
3666
3667 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3668 if (k < 0)
3669 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3670 }
3671
3672 if (arg_machine) {
3673 const char *p;
3674
3675 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3676 (void) rm_rf(p, REMOVE_ROOT);
3677 }
3678
3679 expose_port_flush(arg_expose_ports, &exposed);
3680
3681 free(arg_directory);
3682 free(arg_template);
3683 free(arg_image);
3684 free(arg_machine);
3685 free(arg_user);
3686 free(arg_chdir);
3687 strv_free(arg_setenv);
3688 free(arg_network_bridge);
3689 strv_free(arg_network_interfaces);
3690 strv_free(arg_network_macvlan);
3691 strv_free(arg_network_ipvlan);
3692 strv_free(arg_network_veth_extra);
3693 strv_free(arg_parameters);
3694 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3695 expose_port_free_all(arg_expose_ports);
3696
3697 return r < 0 ? EXIT_FAILURE : ret;
3698 }