]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: always setup machine id
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #ifdef HAVE_BLKID
21 #include <blkid/blkid.h>
22 #endif
23 #include <errno.h>
24 #include <getopt.h>
25 #include <linux/loop.h>
26 #include <sched.h>
27 #ifdef HAVE_SECCOMP
28 #include <seccomp.h>
29 #endif
30 #ifdef HAVE_SELINUX
31 #include <selinux/selinux.h>
32 #endif
33 #include <signal.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <sys/file.h>
38 #include <sys/mount.h>
39 #include <sys/personality.h>
40 #include <sys/prctl.h>
41 #include <sys/types.h>
42 #include <unistd.h>
43
44 #include "sd-daemon.h"
45 #include "sd-id128.h"
46
47 #include "alloc-util.h"
48 #include "barrier.h"
49 #include "base-filesystem.h"
50 #include "blkid-util.h"
51 #include "btrfs-util.h"
52 #include "cap-list.h"
53 #include "capability-util.h"
54 #include "cgroup-util.h"
55 #include "copy.h"
56 #include "dev-setup.h"
57 #include "env-util.h"
58 #include "fd-util.h"
59 #include "fdset.h"
60 #include "fileio.h"
61 #include "formats-util.h"
62 #include "fs-util.h"
63 #include "gpt.h"
64 #include "hostname-util.h"
65 #include "log.h"
66 #include "loopback-setup.h"
67 #include "machine-id-setup.h"
68 #include "machine-image.h"
69 #include "macro.h"
70 #include "missing.h"
71 #include "mkdir.h"
72 #include "mount-util.h"
73 #include "netlink-util.h"
74 #include "nspawn-cgroup.h"
75 #include "nspawn-expose-ports.h"
76 #include "nspawn-mount.h"
77 #include "nspawn-network.h"
78 #include "nspawn-register.h"
79 #include "nspawn-settings.h"
80 #include "nspawn-setuid.h"
81 #include "nspawn-stub-pid1.h"
82 #include "parse-util.h"
83 #include "path-util.h"
84 #include "process-util.h"
85 #include "ptyfwd.h"
86 #include "random-util.h"
87 #include "rm-rf.h"
88 #ifdef HAVE_SECCOMP
89 #include "seccomp-util.h"
90 #endif
91 #include "selinux-util.h"
92 #include "signal-util.h"
93 #include "socket-util.h"
94 #include "stat-util.h"
95 #include "stdio-util.h"
96 #include "string-util.h"
97 #include "strv.h"
98 #include "terminal-util.h"
99 #include "udev-util.h"
100 #include "umask-util.h"
101 #include "user-util.h"
102 #include "util.h"
103
/* Possible container status values. The numeric values are spelled out
 * explicitly; they match the implicit numbering of the enumerators. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
108
/* Journal linking modes, as selected with --link-journal= ("no", "auto",
 * "host", "guest"; the "try-" variants map to the same values and are
 * tracked by a separate flag). The numeric values are spelled out
 * explicitly; they match the implicit numbering of the enumerators. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
115
/* Command line configuration of nspawn. parse_argv() fills these variables in
 * from the options given on the command line. */
116 static char *arg_directory = NULL;
117 static char *arg_template = NULL;
118 static char *arg_chdir = NULL;
119 static char *arg_user = NULL;
120 static sd_id128_t arg_uuid = {};
121 static char *arg_machine = NULL;
/* The following three point directly at the option argument (no copy is made). */
122 static const char *arg_selinux_context = NULL;
123 static const char *arg_selinux_apifs_context = NULL;
124 static const char *arg_slice = NULL;
125 static bool arg_private_network = false;
126 static bool arg_read_only = false;
127 static StartMode arg_start_mode = START_PID1;
128 static bool arg_ephemeral = false;
/* --link-journal= mode; arg_link_journal_try is set for -j and the "try-" variants. */
129 static LinkJournal arg_link_journal = LINK_AUTO;
130 static bool arg_link_journal_try = false;
/* Capability bit mask, one bit per CAP_* number. parse_argv() ORs in the
 * capabilities given with --capability= (plus CAP_NET_ADMIN when private
 * networking is enabled) and then clears those given with --drop-capability=. */
131 static uint64_t arg_retain =
132 (1ULL << CAP_CHOWN) |
133 (1ULL << CAP_DAC_OVERRIDE) |
134 (1ULL << CAP_DAC_READ_SEARCH) |
135 (1ULL << CAP_FOWNER) |
136 (1ULL << CAP_FSETID) |
137 (1ULL << CAP_IPC_OWNER) |
138 (1ULL << CAP_KILL) |
139 (1ULL << CAP_LEASE) |
140 (1ULL << CAP_LINUX_IMMUTABLE) |
141 (1ULL << CAP_NET_BIND_SERVICE) |
142 (1ULL << CAP_NET_BROADCAST) |
143 (1ULL << CAP_NET_RAW) |
144 (1ULL << CAP_SETGID) |
145 (1ULL << CAP_SETFCAP) |
146 (1ULL << CAP_SETPCAP) |
147 (1ULL << CAP_SETUID) |
148 (1ULL << CAP_SYS_ADMIN) |
149 (1ULL << CAP_SYS_CHROOT) |
150 (1ULL << CAP_SYS_NICE) |
151 (1ULL << CAP_SYS_PTRACE) |
152 (1ULL << CAP_SYS_TTY_CONFIG) |
153 (1ULL << CAP_SYS_RESOURCE) |
154 (1ULL << CAP_SYS_BOOT) |
155 (1ULL << CAP_AUDIT_WRITE) |
156 (1ULL << CAP_AUDIT_CONTROL) |
157 (1ULL << CAP_MKNOD);
/* Mounts requested with --bind=, --bind-ro=, --tmpfs=, --overlay= and --overlay-ro=. */
158 static CustomMount *arg_custom_mounts = NULL;
159 static unsigned arg_n_custom_mounts = 0;
160 static char **arg_setenv = NULL;
161 static bool arg_quiet = false;
162 static bool arg_share_system = false;
163 static bool arg_register = true;
164 static bool arg_keep_unit = false;
165 static char **arg_network_interfaces = NULL;
166 static char **arg_network_macvlan = NULL;
167 static char **arg_network_ipvlan = NULL;
168 static bool arg_network_veth = false;
169 static char **arg_network_veth_extra = NULL;
170 static char *arg_network_bridge = NULL;
171 static unsigned long arg_personality = PERSONALITY_INVALID;
172 static char *arg_image = NULL;
173 static VolatileMode arg_volatile_mode = VOLATILE_NO;
174 static ExposePort *arg_expose_ports = NULL;
175 static char **arg_property = NULL;
/* --private-users= parameters. A shift of UID_INVALID is what
 * custom_mounts_prepare() calls the "automatic UID shift". */
176 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
177 static bool arg_userns = false;
178 static int arg_kill_signal = 0;
179 static bool arg_unified_cgroup_hierarchy = false;
/* parse_argv() sets a SETTING_* bit here for each setting that was specified
 * on the command line; --settings= may reset the mask to 0 or to
 * _SETTINGS_MASK_ALL. */
180 static SettingsMask arg_settings_mask = 0;
/* Tri-state set by --settings=: true for "trusted", false for "no", -1 otherwise. */
181 static int arg_settings_trusted = -1;
/* Copy of the non-option arguments that follow the options on the command line. */
182 static char **arg_parameters = NULL;
/* May be overridden through $SYSTEMD_NSPAWN_CONTAINER_SERVICE. */
183 static const char *arg_container_service_name = "systemd-nspawn";
184
185 static void help(void) {
186 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
187 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
188 " -h --help Show this help\n"
189 " --version Print version string\n"
190 " -q --quiet Do not show status information\n"
191 " -D --directory=PATH Root directory for the container\n"
192 " --template=PATH Initialize root directory from template directory,\n"
193 " if missing\n"
194 " -x --ephemeral Run container with snapshot of root directory, and\n"
195 " remove it after exit\n"
196 " -i --image=PATH File system device or disk image for the container\n"
197 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
198 " -b --boot Boot up full system (i.e. invoke init)\n"
199 " --chdir=PATH Set working directory in the container\n"
200 " -u --user=USER Run the command under specified user or uid\n"
201 " -M --machine=NAME Set the machine name for the container\n"
202 " --uuid=UUID Set a specific machine UUID for the container\n"
203 " -S --slice=SLICE Place the container in the specified slice\n"
204 " --property=NAME=VALUE Set scope unit property\n"
205 " --private-users[=UIDBASE[:NUIDS]]\n"
206 " Run within user namespace\n"
207 " --private-network Disable network in container\n"
208 " --network-interface=INTERFACE\n"
209 " Assign an existing network interface to the\n"
210 " container\n"
211 " --network-macvlan=INTERFACE\n"
212 " Create a macvlan network interface based on an\n"
213 " existing network interface to the container\n"
214 " --network-ipvlan=INTERFACE\n"
215 " Create a ipvlan network interface based on an\n"
216 " existing network interface to the container\n"
217 " -n --network-veth Add a virtual Ethernet connection between host\n"
218 " and container\n"
219 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
220 " Add an additional virtual Ethernet link between\n"
221 " host and container\n"
222 " --network-bridge=INTERFACE\n"
223 " Add a virtual Ethernet connection between host\n"
224 " and container and add it to an existing bridge on\n"
225 " the host\n"
226 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
227 " Expose a container IP port on the host\n"
228 " -Z --selinux-context=SECLABEL\n"
229 " Set the SELinux security context to be used by\n"
230 " processes in the container\n"
231 " -L --selinux-apifs-context=SECLABEL\n"
232 " Set the SELinux security context to be used by\n"
233 " API/tmpfs file systems in the container\n"
234 " --capability=CAP In addition to the default, retain specified\n"
235 " capability\n"
236 " --drop-capability=CAP Drop the specified capability from the default set\n"
237 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
238 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
239 " host, try-guest, try-host\n"
240 " -j Equivalent to --link-journal=try-guest\n"
241 " --read-only Mount the root directory read-only\n"
242 " --bind=PATH[:PATH[:OPTIONS]]\n"
243 " Bind mount a file or directory from the host into\n"
244 " the container\n"
245 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
246 " Similar, but creates a read-only bind mount\n"
247 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
248 " --overlay=PATH[:PATH...]:PATH\n"
249 " Create an overlay mount from the host to \n"
250 " the container\n"
251 " --overlay-ro=PATH[:PATH...]:PATH\n"
252 " Similar, but creates a read-only overlay mount\n"
253 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
254 " --share-system Share system namespaces with host\n"
255 " --register=BOOLEAN Register container as machine\n"
256 " --keep-unit Do not register a scope for the machine, reuse\n"
257 " the service unit nspawn is running in\n"
258 " --volatile[=MODE] Run the system in volatile mode\n"
259 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
260 , program_invocation_short_name);
261 }
262
263
264 static int custom_mounts_prepare(void) {
265 unsigned i;
266 int r;
267
268 /* Ensure the mounts are applied prefix first. */
269 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
270
271 /* Allocate working directories for the overlay file systems that need it */
272 for (i = 0; i < arg_n_custom_mounts; i++) {
273 CustomMount *m = &arg_custom_mounts[i];
274
275 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
276 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
277 return -EINVAL;
278 }
279
280 if (m->type != CUSTOM_MOUNT_OVERLAY)
281 continue;
282
283 if (m->work_dir)
284 continue;
285
286 if (m->read_only)
287 continue;
288
289 r = tempfn_random(m->source, NULL, &m->work_dir);
290 if (r < 0)
291 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
292 }
293
294 return 0;
295 }
296
297 static int detect_unified_cgroup_hierarchy(void) {
298 const char *e;
299 int r;
300
301 /* Allow the user to control whether the unified hierarchy is used */
302 e = getenv("UNIFIED_CGROUP_HIERARCHY");
303 if (e) {
304 r = parse_boolean(e);
305 if (r < 0)
306 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
307
308 arg_unified_cgroup_hierarchy = r;
309 return 0;
310 }
311
312 /* Otherwise inherit the default from the host system */
313 r = cg_unified();
314 if (r < 0)
315 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
316
317 arg_unified_cgroup_hierarchy = r;
318 return 0;
319 }
320
321 static int parse_argv(int argc, char *argv[]) {
322
323 enum {
324 ARG_VERSION = 0x100,
325 ARG_PRIVATE_NETWORK,
326 ARG_UUID,
327 ARG_READ_ONLY,
328 ARG_CAPABILITY,
329 ARG_DROP_CAPABILITY,
330 ARG_LINK_JOURNAL,
331 ARG_BIND,
332 ARG_BIND_RO,
333 ARG_TMPFS,
334 ARG_OVERLAY,
335 ARG_OVERLAY_RO,
336 ARG_SETENV,
337 ARG_SHARE_SYSTEM,
338 ARG_REGISTER,
339 ARG_KEEP_UNIT,
340 ARG_NETWORK_INTERFACE,
341 ARG_NETWORK_MACVLAN,
342 ARG_NETWORK_IPVLAN,
343 ARG_NETWORK_BRIDGE,
344 ARG_NETWORK_VETH_EXTRA,
345 ARG_PERSONALITY,
346 ARG_VOLATILE,
347 ARG_TEMPLATE,
348 ARG_PROPERTY,
349 ARG_PRIVATE_USERS,
350 ARG_KILL_SIGNAL,
351 ARG_SETTINGS,
352 ARG_CHDIR,
353 };
354
355 static const struct option options[] = {
356 { "help", no_argument, NULL, 'h' },
357 { "version", no_argument, NULL, ARG_VERSION },
358 { "directory", required_argument, NULL, 'D' },
359 { "template", required_argument, NULL, ARG_TEMPLATE },
360 { "ephemeral", no_argument, NULL, 'x' },
361 { "user", required_argument, NULL, 'u' },
362 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
363 { "as-pid2", no_argument, NULL, 'a' },
364 { "boot", no_argument, NULL, 'b' },
365 { "uuid", required_argument, NULL, ARG_UUID },
366 { "read-only", no_argument, NULL, ARG_READ_ONLY },
367 { "capability", required_argument, NULL, ARG_CAPABILITY },
368 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
369 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
370 { "bind", required_argument, NULL, ARG_BIND },
371 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
372 { "tmpfs", required_argument, NULL, ARG_TMPFS },
373 { "overlay", required_argument, NULL, ARG_OVERLAY },
374 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
375 { "machine", required_argument, NULL, 'M' },
376 { "slice", required_argument, NULL, 'S' },
377 { "setenv", required_argument, NULL, ARG_SETENV },
378 { "selinux-context", required_argument, NULL, 'Z' },
379 { "selinux-apifs-context", required_argument, NULL, 'L' },
380 { "quiet", no_argument, NULL, 'q' },
381 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
382 { "register", required_argument, NULL, ARG_REGISTER },
383 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
384 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
385 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
386 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
387 { "network-veth", no_argument, NULL, 'n' },
388 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
389 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
390 { "personality", required_argument, NULL, ARG_PERSONALITY },
391 { "image", required_argument, NULL, 'i' },
392 { "volatile", optional_argument, NULL, ARG_VOLATILE },
393 { "port", required_argument, NULL, 'p' },
394 { "property", required_argument, NULL, ARG_PROPERTY },
395 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
396 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
397 { "settings", required_argument, NULL, ARG_SETTINGS },
398 { "chdir", required_argument, NULL, ARG_CHDIR },
399 {}
400 };
401
402 int c, r;
403 const char *p, *e;
404 uint64_t plus = 0, minus = 0;
405 bool mask_all_settings = false, mask_no_settings = false;
406
407 assert(argc >= 0);
408 assert(argv);
409
410 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
411
412 switch (c) {
413
414 case 'h':
415 help();
416 return 0;
417
418 case ARG_VERSION:
419 return version();
420
421 case 'D':
422 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
423 if (r < 0)
424 return r;
425 break;
426
427 case ARG_TEMPLATE:
428 r = parse_path_argument_and_warn(optarg, false, &arg_template);
429 if (r < 0)
430 return r;
431 break;
432
433 case 'i':
434 r = parse_path_argument_and_warn(optarg, false, &arg_image);
435 if (r < 0)
436 return r;
437 break;
438
439 case 'x':
440 arg_ephemeral = true;
441 break;
442
443 case 'u':
444 r = free_and_strdup(&arg_user, optarg);
445 if (r < 0)
446 return log_oom();
447
448 arg_settings_mask |= SETTING_USER;
449 break;
450
451 case ARG_NETWORK_BRIDGE:
452 r = free_and_strdup(&arg_network_bridge, optarg);
453 if (r < 0)
454 return log_oom();
455
456 /* fall through */
457
458 case 'n':
459 arg_network_veth = true;
460 arg_private_network = true;
461 arg_settings_mask |= SETTING_NETWORK;
462 break;
463
464 case ARG_NETWORK_VETH_EXTRA:
465 r = veth_extra_parse(&arg_network_veth_extra, optarg);
466 if (r < 0)
467 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
468
469 arg_private_network = true;
470 arg_settings_mask |= SETTING_NETWORK;
471 break;
472
473 case ARG_NETWORK_INTERFACE:
474 if (strv_extend(&arg_network_interfaces, optarg) < 0)
475 return log_oom();
476
477 arg_private_network = true;
478 arg_settings_mask |= SETTING_NETWORK;
479 break;
480
481 case ARG_NETWORK_MACVLAN:
482 if (strv_extend(&arg_network_macvlan, optarg) < 0)
483 return log_oom();
484
485 arg_private_network = true;
486 arg_settings_mask |= SETTING_NETWORK;
487 break;
488
489 case ARG_NETWORK_IPVLAN:
490 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
491 return log_oom();
492
493 /* fall through */
494
495 case ARG_PRIVATE_NETWORK:
496 arg_private_network = true;
497 arg_settings_mask |= SETTING_NETWORK;
498 break;
499
500 case 'b':
501 if (arg_start_mode == START_PID2) {
502 log_error("--boot and --as-pid2 may not be combined.");
503 return -EINVAL;
504 }
505
506 arg_start_mode = START_BOOT;
507 arg_settings_mask |= SETTING_START_MODE;
508 break;
509
510 case 'a':
511 if (arg_start_mode == START_BOOT) {
512 log_error("--boot and --as-pid2 may not be combined.");
513 return -EINVAL;
514 }
515
516 arg_start_mode = START_PID2;
517 arg_settings_mask |= SETTING_START_MODE;
518 break;
519
520 case ARG_UUID:
521 r = sd_id128_from_string(optarg, &arg_uuid);
522 if (r < 0) {
523 log_error("Invalid UUID: %s", optarg);
524 return r;
525 }
526
527 arg_settings_mask |= SETTING_MACHINE_ID;
528 break;
529
530 case 'S':
531 arg_slice = optarg;
532 break;
533
534 case 'M':
535 if (isempty(optarg))
536 arg_machine = mfree(arg_machine);
537 else {
538 if (!machine_name_is_valid(optarg)) {
539 log_error("Invalid machine name: %s", optarg);
540 return -EINVAL;
541 }
542
543 r = free_and_strdup(&arg_machine, optarg);
544 if (r < 0)
545 return log_oom();
546
547 break;
548 }
549
550 case 'Z':
551 arg_selinux_context = optarg;
552 break;
553
554 case 'L':
555 arg_selinux_apifs_context = optarg;
556 break;
557
558 case ARG_READ_ONLY:
559 arg_read_only = true;
560 arg_settings_mask |= SETTING_READ_ONLY;
561 break;
562
563 case ARG_CAPABILITY:
564 case ARG_DROP_CAPABILITY: {
565 p = optarg;
566 for (;;) {
567 _cleanup_free_ char *t = NULL;
568
569 r = extract_first_word(&p, &t, ",", 0);
570 if (r < 0)
571 return log_error_errno(r, "Failed to parse capability %s.", t);
572
573 if (r == 0)
574 break;
575
576 if (streq(t, "all")) {
577 if (c == ARG_CAPABILITY)
578 plus = (uint64_t) -1;
579 else
580 minus = (uint64_t) -1;
581 } else {
582 int cap;
583
584 cap = capability_from_name(t);
585 if (cap < 0) {
586 log_error("Failed to parse capability %s.", t);
587 return -EINVAL;
588 }
589
590 if (c == ARG_CAPABILITY)
591 plus |= 1ULL << (uint64_t) cap;
592 else
593 minus |= 1ULL << (uint64_t) cap;
594 }
595 }
596
597 arg_settings_mask |= SETTING_CAPABILITY;
598 break;
599 }
600
601 case 'j':
602 arg_link_journal = LINK_GUEST;
603 arg_link_journal_try = true;
604 break;
605
606 case ARG_LINK_JOURNAL:
607 if (streq(optarg, "auto")) {
608 arg_link_journal = LINK_AUTO;
609 arg_link_journal_try = false;
610 } else if (streq(optarg, "no")) {
611 arg_link_journal = LINK_NO;
612 arg_link_journal_try = false;
613 } else if (streq(optarg, "guest")) {
614 arg_link_journal = LINK_GUEST;
615 arg_link_journal_try = false;
616 } else if (streq(optarg, "host")) {
617 arg_link_journal = LINK_HOST;
618 arg_link_journal_try = false;
619 } else if (streq(optarg, "try-guest")) {
620 arg_link_journal = LINK_GUEST;
621 arg_link_journal_try = true;
622 } else if (streq(optarg, "try-host")) {
623 arg_link_journal = LINK_HOST;
624 arg_link_journal_try = true;
625 } else {
626 log_error("Failed to parse link journal mode %s", optarg);
627 return -EINVAL;
628 }
629
630 break;
631
632 case ARG_BIND:
633 case ARG_BIND_RO:
634 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
635 if (r < 0)
636 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
637
638 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
639 break;
640
641 case ARG_TMPFS:
642 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
643 if (r < 0)
644 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
645
646 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
647 break;
648
649 case ARG_OVERLAY:
650 case ARG_OVERLAY_RO: {
651 _cleanup_free_ char *upper = NULL, *destination = NULL;
652 _cleanup_strv_free_ char **lower = NULL;
653 CustomMount *m;
654 unsigned n = 0;
655 char **i;
656
657 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
658 if (r == -ENOMEM)
659 return log_oom();
660 else if (r < 0) {
661 log_error("Invalid overlay specification: %s", optarg);
662 return r;
663 }
664
665 STRV_FOREACH(i, lower) {
666 if (!path_is_absolute(*i)) {
667 log_error("Overlay path %s is not absolute.", *i);
668 return -EINVAL;
669 }
670
671 n++;
672 }
673
674 if (n < 2) {
675 log_error("--overlay= needs at least two colon-separated directories specified.");
676 return -EINVAL;
677 }
678
679 if (n == 2) {
680 /* If two parameters are specified,
681 * the first one is the lower, the
682 * second one the upper directory. And
683 * we'll also define the destination
684 * mount point the same as the upper. */
685 upper = lower[1];
686 lower[1] = NULL;
687
688 destination = strdup(upper);
689 if (!destination)
690 return log_oom();
691
692 } else {
693 upper = lower[n - 2];
694 destination = lower[n - 1];
695 lower[n - 2] = NULL;
696 }
697
698 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
699 if (!m)
700 return log_oom();
701
702 m->destination = destination;
703 m->source = upper;
704 m->lower = lower;
705 m->read_only = c == ARG_OVERLAY_RO;
706
707 upper = destination = NULL;
708 lower = NULL;
709
710 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
711 break;
712 }
713
714 case ARG_SETENV: {
715 char **n;
716
717 if (!env_assignment_is_valid(optarg)) {
718 log_error("Environment variable assignment '%s' is not valid.", optarg);
719 return -EINVAL;
720 }
721
722 n = strv_env_set(arg_setenv, optarg);
723 if (!n)
724 return log_oom();
725
726 strv_free(arg_setenv);
727 arg_setenv = n;
728
729 arg_settings_mask |= SETTING_ENVIRONMENT;
730 break;
731 }
732
733 case 'q':
734 arg_quiet = true;
735 break;
736
737 case ARG_SHARE_SYSTEM:
738 arg_share_system = true;
739 break;
740
741 case ARG_REGISTER:
742 r = parse_boolean(optarg);
743 if (r < 0) {
744 log_error("Failed to parse --register= argument: %s", optarg);
745 return r;
746 }
747
748 arg_register = r;
749 break;
750
751 case ARG_KEEP_UNIT:
752 arg_keep_unit = true;
753 break;
754
755 case ARG_PERSONALITY:
756
757 arg_personality = personality_from_string(optarg);
758 if (arg_personality == PERSONALITY_INVALID) {
759 log_error("Unknown or unsupported personality '%s'.", optarg);
760 return -EINVAL;
761 }
762
763 arg_settings_mask |= SETTING_PERSONALITY;
764 break;
765
766 case ARG_VOLATILE:
767
768 if (!optarg)
769 arg_volatile_mode = VOLATILE_YES;
770 else {
771 VolatileMode m;
772
773 m = volatile_mode_from_string(optarg);
774 if (m < 0) {
775 log_error("Failed to parse --volatile= argument: %s", optarg);
776 return -EINVAL;
777 } else
778 arg_volatile_mode = m;
779 }
780
781 arg_settings_mask |= SETTING_VOLATILE_MODE;
782 break;
783
784 case 'p':
785 r = expose_port_parse(&arg_expose_ports, optarg);
786 if (r == -EEXIST)
787 return log_error_errno(r, "Duplicate port specification: %s", optarg);
788 if (r < 0)
789 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
790
791 arg_settings_mask |= SETTING_EXPOSE_PORTS;
792 break;
793
794 case ARG_PROPERTY:
795 if (strv_extend(&arg_property, optarg) < 0)
796 return log_oom();
797
798 break;
799
800 case ARG_PRIVATE_USERS:
801 if (optarg) {
802 _cleanup_free_ char *buffer = NULL;
803 const char *range, *shift;
804
805 range = strchr(optarg, ':');
806 if (range) {
807 buffer = strndup(optarg, range - optarg);
808 if (!buffer)
809 return log_oom();
810 shift = buffer;
811
812 range++;
813 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
814 log_error("Failed to parse UID range: %s", range);
815 return -EINVAL;
816 }
817 } else
818 shift = optarg;
819
820 if (parse_uid(shift, &arg_uid_shift) < 0) {
821 log_error("Failed to parse UID: %s", optarg);
822 return -EINVAL;
823 }
824 }
825
826 arg_userns = true;
827 break;
828
829 case ARG_KILL_SIGNAL:
830 arg_kill_signal = signal_from_string_try_harder(optarg);
831 if (arg_kill_signal < 0) {
832 log_error("Cannot parse signal: %s", optarg);
833 return -EINVAL;
834 }
835
836 arg_settings_mask |= SETTING_KILL_SIGNAL;
837 break;
838
839 case ARG_SETTINGS:
840
841 /* no → do not read files
842 * yes → read files, do not override cmdline, trust only subset
843 * override → read files, override cmdline, trust only subset
844 * trusted → read files, do not override cmdline, trust all
845 */
846
847 r = parse_boolean(optarg);
848 if (r < 0) {
849 if (streq(optarg, "trusted")) {
850 mask_all_settings = false;
851 mask_no_settings = false;
852 arg_settings_trusted = true;
853
854 } else if (streq(optarg, "override")) {
855 mask_all_settings = false;
856 mask_no_settings = true;
857 arg_settings_trusted = -1;
858 } else
859 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
860 } else if (r > 0) {
861 /* yes */
862 mask_all_settings = false;
863 mask_no_settings = false;
864 arg_settings_trusted = -1;
865 } else {
866 /* no */
867 mask_all_settings = true;
868 mask_no_settings = false;
869 arg_settings_trusted = false;
870 }
871
872 break;
873
874 case ARG_CHDIR:
875 if (!path_is_absolute(optarg)) {
876 log_error("Working directory %s is not an absolute path.", optarg);
877 return -EINVAL;
878 }
879
880 r = free_and_strdup(&arg_chdir, optarg);
881 if (r < 0)
882 return log_oom();
883
884 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
885 break;
886
887 case '?':
888 return -EINVAL;
889
890 default:
891 assert_not_reached("Unhandled option");
892 }
893
894 if (arg_share_system)
895 arg_register = false;
896
897 if (arg_start_mode != START_PID1 && arg_share_system) {
898 log_error("--boot and --share-system may not be combined.");
899 return -EINVAL;
900 }
901
902 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
903 log_error("--keep-unit may not be used when invoked from a user session.");
904 return -EINVAL;
905 }
906
907 if (arg_directory && arg_image) {
908 log_error("--directory= and --image= may not be combined.");
909 return -EINVAL;
910 }
911
912 if (arg_template && arg_image) {
913 log_error("--template= and --image= may not be combined.");
914 return -EINVAL;
915 }
916
917 if (arg_template && !(arg_directory || arg_machine)) {
918 log_error("--template= needs --directory= or --machine=.");
919 return -EINVAL;
920 }
921
922 if (arg_ephemeral && arg_template) {
923 log_error("--ephemeral and --template= may not be combined.");
924 return -EINVAL;
925 }
926
927 if (arg_ephemeral && arg_image) {
928 log_error("--ephemeral and --image= may not be combined.");
929 return -EINVAL;
930 }
931
932 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
933 log_error("--ephemeral and --link-journal= may not be combined.");
934 return -EINVAL;
935 }
936
937 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
938 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
939
940 if (argc > optind) {
941 arg_parameters = strv_copy(argv + optind);
942 if (!arg_parameters)
943 return log_oom();
944
945 arg_settings_mask |= SETTING_START_MODE;
946 }
947
948 /* Load all settings from .nspawn files */
949 if (mask_no_settings)
950 arg_settings_mask = 0;
951
952 /* Don't load any settings from .nspawn files */
953 if (mask_all_settings)
954 arg_settings_mask = _SETTINGS_MASK_ALL;
955
956 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
957
958 r = detect_unified_cgroup_hierarchy();
959 if (r < 0)
960 return r;
961
962 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
963 if (e)
964 arg_container_service_name = e;
965
966 return 1;
967 }
968
969 static int verify_arguments(void) {
970
971 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
972 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
973 return -EINVAL;
974 }
975
976 if (arg_expose_ports && !arg_private_network) {
977 log_error("Cannot use --port= without private networking.");
978 return -EINVAL;
979 }
980
981 #ifndef HAVE_LIBIPTC
982 if (arg_expose_ports) {
983 log_error("--port= is not supported, compiled without libiptc support.");
984 return -EOPNOTSUPP;
985 }
986 #endif
987
988 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
989 arg_kill_signal = SIGRTMIN+3;
990
991 return 0;
992 }
993
994 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
995 assert(p);
996
997 if (!arg_userns)
998 return 0;
999
1000 if (uid == UID_INVALID && gid == GID_INVALID)
1001 return 0;
1002
1003 if (uid != UID_INVALID) {
1004 uid += arg_uid_shift;
1005
1006 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1007 return -EOVERFLOW;
1008 }
1009
1010 if (gid != GID_INVALID) {
1011 gid += (gid_t) arg_uid_shift;
1012
1013 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1014 return -EOVERFLOW;
1015 }
1016
1017 if (lchown(p, uid, gid) < 0)
1018 return -errno;
1019
1020 return 0;
1021 }
1022
/* Creates the directory 'path' below 'root' with the given mode and passes
 * it to userns_lchown() with the given UID/GID.
 *
 * Returns 0 if the directory already exists (its ownership is left alone in
 * that case), -errno if mkdir() fails otherwise, or the result of
 * userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1035
1036 static int setup_timezone(const char *dest) {
1037 _cleanup_free_ char *p = NULL, *q = NULL;
1038 const char *where, *check, *what;
1039 char *z, *y;
1040 int r;
1041
1042 assert(dest);
1043
1044 /* Fix the timezone, if possible */
1045 r = readlink_malloc("/etc/localtime", &p);
1046 if (r < 0) {
1047 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1048 return 0;
1049 }
1050
1051 z = path_startswith(p, "../usr/share/zoneinfo/");
1052 if (!z)
1053 z = path_startswith(p, "/usr/share/zoneinfo/");
1054 if (!z) {
1055 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1056 return 0;
1057 }
1058
1059 where = prefix_roota(dest, "/etc/localtime");
1060 r = readlink_malloc(where, &q);
1061 if (r >= 0) {
1062 y = path_startswith(q, "../usr/share/zoneinfo/");
1063 if (!y)
1064 y = path_startswith(q, "/usr/share/zoneinfo/");
1065
1066 /* Already pointing to the right place? Then do nothing .. */
1067 if (y && streq(y, z))
1068 return 0;
1069 }
1070
1071 check = strjoina("/usr/share/zoneinfo/", z);
1072 check = prefix_roota(dest, check);
1073 if (laccess(check, F_OK) < 0) {
1074 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1075 return 0;
1076 }
1077
1078 r = unlink(where);
1079 if (r < 0 && errno != ENOENT) {
1080 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1081 return 0;
1082 }
1083
1084 what = strjoina("../usr/share/zoneinfo/", z);
1085 if (symlink(what, where) < 0) {
1086 log_error_errno(errno, "Failed to correct timezone of container: %m");
1087 return 0;
1088 }
1089
1090 r = userns_lchown(where, 0, 0);
1091 if (r < 0)
1092 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1093
1094 return 0;
1095 }
1096
1097 static int setup_resolv_conf(const char *dest) {
1098 const char *where = NULL;
1099 int r;
1100
1101 assert(dest);
1102
1103 if (arg_private_network)
1104 return 0;
1105
1106 /* Fix resolv.conf, if possible */
1107 where = prefix_roota(dest, "/etc/resolv.conf");
1108
1109 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1110 if (r < 0) {
1111 /* If the file already exists as symlink, let's
1112 * suppress the warning, under the assumption that
1113 * resolved or something similar runs inside and the
1114 * symlink points there.
1115 *
1116 * If the disk image is read-only, there's also no
1117 * point in complaining.
1118 */
1119 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1120 "Failed to copy /etc/resolv.conf to %s: %m", where);
1121 return 0;
1122 }
1123
1124 r = userns_lchown(where, 0, 0);
1125 if (r < 0)
1126 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1127
1128 return 0;
1129 }
1130
1131 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1132 assert(s);
1133
1134 snprintf(s, 37,
1135 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1136 SD_ID128_FORMAT_VAL(id));
1137
1138 return s;
1139 }
1140
1141 static int setup_boot_id(const char *dest) {
1142 const char *from, *to;
1143 sd_id128_t rnd = {};
1144 char as_uuid[37];
1145 int r;
1146
1147 if (arg_share_system)
1148 return 0;
1149
1150 /* Generate a new randomized boot ID, so that each boot-up of
1151 * the container gets a new one */
1152
1153 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1154 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1155
1156 r = sd_id128_randomize(&rnd);
1157 if (r < 0)
1158 return log_error_errno(r, "Failed to generate random boot id: %m");
1159
1160 id128_format_as_uuid(rnd, as_uuid);
1161
1162 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1163 if (r < 0)
1164 return log_error_errno(r, "Failed to write boot id: %m");
1165
1166 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1167 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1168 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1169 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1170
1171 unlink(from);
1172 return r;
1173 }
1174
1175 static int copy_devnodes(const char *dest) {
1176
1177 static const char devnodes[] =
1178 "null\0"
1179 "zero\0"
1180 "full\0"
1181 "random\0"
1182 "urandom\0"
1183 "tty\0"
1184 "net/tun\0";
1185
1186 const char *d;
1187 int r = 0;
1188 _cleanup_umask_ mode_t u;
1189
1190 assert(dest);
1191
1192 u = umask(0000);
1193
1194 /* Create /dev/net, so that we can create /dev/net/tun in it */
1195 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1196 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1197
1198 NULSTR_FOREACH(d, devnodes) {
1199 _cleanup_free_ char *from = NULL, *to = NULL;
1200 struct stat st;
1201
1202 from = strappend("/dev/", d);
1203 to = prefix_root(dest, from);
1204
1205 if (stat(from, &st) < 0) {
1206
1207 if (errno != ENOENT)
1208 return log_error_errno(errno, "Failed to stat %s: %m", from);
1209
1210 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1211
1212 log_error("%s is not a char or block device, cannot copy.", from);
1213 return -EIO;
1214
1215 } else {
1216 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1217 if (errno != EPERM)
1218 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1219
1220 /* Some systems abusively restrict mknod but
1221 * allow bind mounts. */
1222 r = touch(to);
1223 if (r < 0)
1224 return log_error_errno(r, "touch (%s) failed: %m", to);
1225 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1226 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1227 }
1228
1229 r = userns_lchown(to, 0, 0);
1230 if (r < 0)
1231 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1232 }
1233 }
1234
1235 return r;
1236 }
1237
1238 static int setup_pts(const char *dest) {
1239 _cleanup_free_ char *options = NULL;
1240 const char *p;
1241 int r;
1242
1243 #ifdef HAVE_SELINUX
1244 if (arg_selinux_apifs_context)
1245 (void) asprintf(&options,
1246 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1247 arg_uid_shift + TTY_GID,
1248 arg_selinux_apifs_context);
1249 else
1250 #endif
1251 (void) asprintf(&options,
1252 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1253 arg_uid_shift + TTY_GID);
1254
1255 if (!options)
1256 return log_oom();
1257
1258 /* Mount /dev/pts itself */
1259 p = prefix_roota(dest, "/dev/pts");
1260 if (mkdir(p, 0755) < 0)
1261 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1262 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1263 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1264 r = userns_lchown(p, 0, 0);
1265 if (r < 0)
1266 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1267
1268 /* Create /dev/ptmx symlink */
1269 p = prefix_roota(dest, "/dev/ptmx");
1270 if (symlink("pts/ptmx", p) < 0)
1271 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1272 r = userns_lchown(p, 0, 0);
1273 if (r < 0)
1274 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1275
1276 /* And fix /dev/pts/ptmx ownership */
1277 p = prefix_roota(dest, "/dev/pts/ptmx");
1278 r = userns_lchown(p, 0, 0);
1279 if (r < 0)
1280 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1281
1282 return 0;
1283 }
1284
1285 static int setup_dev_console(const char *dest, const char *console) {
1286 _cleanup_umask_ mode_t u;
1287 const char *to;
1288 int r;
1289
1290 assert(dest);
1291 assert(console);
1292
1293 u = umask(0000);
1294
1295 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1296 if (r < 0)
1297 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1298
1299 /* We need to bind mount the right tty to /dev/console since
1300 * ptys can only exist on pts file systems. To have something
1301 * to bind mount things on we create a empty regular file. */
1302
1303 to = prefix_roota(dest, "/dev/console");
1304 r = touch(to);
1305 if (r < 0)
1306 return log_error_errno(r, "touch() for /dev/console failed: %m");
1307
1308 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1309 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1310
1311 return 0;
1312 }
1313
1314 static int setup_kmsg(const char *dest, int kmsg_socket) {
1315 const char *from, *to;
1316 _cleanup_umask_ mode_t u;
1317 int fd, r;
1318
1319 assert(kmsg_socket >= 0);
1320
1321 u = umask(0000);
1322
1323 /* We create the kmsg FIFO as /run/kmsg, but immediately
1324 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1325 * on the reading side behave very similar to /proc/kmsg,
1326 * their writing side behaves differently from /dev/kmsg in
1327 * that writing blocks when nothing is reading. In order to
1328 * avoid any problems with containers deadlocking due to this
1329 * we simply make /dev/kmsg unavailable to the container. */
1330 from = prefix_roota(dest, "/run/kmsg");
1331 to = prefix_roota(dest, "/proc/kmsg");
1332
1333 if (mkfifo(from, 0600) < 0)
1334 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1335 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1336 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1337
1338 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1339 if (fd < 0)
1340 return log_error_errno(errno, "Failed to open fifo: %m");
1341
1342 /* Store away the fd in the socket, so that it stays open as
1343 * long as we run the child */
1344 r = send_one_fd(kmsg_socket, fd, 0);
1345 safe_close(fd);
1346
1347 if (r < 0)
1348 return log_error_errno(r, "Failed to send FIFO fd: %m");
1349
1350 /* And now make the FIFO unavailable as /run/kmsg... */
1351 (void) unlink(from);
1352
1353 return 0;
1354 }
1355
1356 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1357 union in_addr_union *exposed = userdata;
1358
1359 assert(rtnl);
1360 assert(m);
1361 assert(exposed);
1362
1363 expose_port_execute(rtnl, arg_expose_ports, exposed);
1364 return 0;
1365 }
1366
1367 static int setup_hostname(void) {
1368
1369 if (arg_share_system)
1370 return 0;
1371
1372 if (sethostname_idempotent(arg_machine) < 0)
1373 return -errno;
1374
1375 return 0;
1376 }
1377
1378 static int setup_journal(const char *directory) {
1379 sd_id128_t this_id;
1380 _cleanup_free_ char *b = NULL, *d = NULL;
1381 const char *p, *q;
1382 bool try;
1383 char id[33];
1384 int r;
1385
1386 /* Don't link journals in ephemeral mode */
1387 if (arg_ephemeral)
1388 return 0;
1389
1390 if (arg_link_journal == LINK_NO)
1391 return 0;
1392
1393 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1394
1395 r = sd_id128_get_machine(&this_id);
1396 if (r < 0)
1397 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1398
1399 if (sd_id128_equal(arg_uuid, this_id)) {
1400 log_full(try ? LOG_WARNING : LOG_ERR,
1401 "Host and machine ids are equal (%s): refusing to link journals", id);
1402 if (try)
1403 return 0;
1404 return -EEXIST;
1405 }
1406
1407 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1408 if (r < 0)
1409 return log_error_errno(r, "Failed to create /var: %m");
1410
1411 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1412 if (r < 0)
1413 return log_error_errno(r, "Failed to create /var/log: %m");
1414
1415 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1416 if (r < 0)
1417 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1418
1419 (void) sd_id128_to_string(arg_uuid, id);
1420
1421 p = strjoina("/var/log/journal/", id);
1422 q = prefix_roota(directory, p);
1423
1424 if (path_is_mount_point(p, 0) > 0) {
1425 if (try)
1426 return 0;
1427
1428 log_error("%s: already a mount point, refusing to use for journal", p);
1429 return -EEXIST;
1430 }
1431
1432 if (path_is_mount_point(q, 0) > 0) {
1433 if (try)
1434 return 0;
1435
1436 log_error("%s: already a mount point, refusing to use for journal", q);
1437 return -EEXIST;
1438 }
1439
1440 r = readlink_and_make_absolute(p, &d);
1441 if (r >= 0) {
1442 if ((arg_link_journal == LINK_GUEST ||
1443 arg_link_journal == LINK_AUTO) &&
1444 path_equal(d, q)) {
1445
1446 r = userns_mkdir(directory, p, 0755, 0, 0);
1447 if (r < 0)
1448 log_warning_errno(r, "Failed to create directory %s: %m", q);
1449 return 0;
1450 }
1451
1452 if (unlink(p) < 0)
1453 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1454 } else if (r == -EINVAL) {
1455
1456 if (arg_link_journal == LINK_GUEST &&
1457 rmdir(p) < 0) {
1458
1459 if (errno == ENOTDIR) {
1460 log_error("%s already exists and is neither a symlink nor a directory", p);
1461 return r;
1462 } else
1463 return log_error_errno(errno, "Failed to remove %s: %m", p);
1464 }
1465 } else if (r != -ENOENT)
1466 return log_error_errno(r, "readlink(%s) failed: %m", p);
1467
1468 if (arg_link_journal == LINK_GUEST) {
1469
1470 if (symlink(q, p) < 0) {
1471 if (try) {
1472 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1473 return 0;
1474 } else
1475 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1476 }
1477
1478 r = userns_mkdir(directory, p, 0755, 0, 0);
1479 if (r < 0)
1480 log_warning_errno(r, "Failed to create directory %s: %m", q);
1481 return 0;
1482 }
1483
1484 if (arg_link_journal == LINK_HOST) {
1485 /* don't create parents here -- if the host doesn't have
1486 * permanent journal set up, don't force it here */
1487
1488 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
1489 if (try) {
1490 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1491 return 0;
1492 } else
1493 return log_error_errno(errno, "Failed to create %s: %m", p);
1494 }
1495
1496 } else if (access(p, F_OK) < 0)
1497 return 0;
1498
1499 if (dir_is_empty(q) == 0)
1500 log_warning("%s is not empty, proceeding anyway.", q);
1501
1502 r = userns_mkdir(directory, p, 0755, 0, 0);
1503 if (r < 0)
1504 return log_error_errno(r, "Failed to create %s: %m", q);
1505
1506 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1507 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1508
1509 return 0;
1510 }
1511
1512 static int drop_capabilities(void) {
1513 return capability_bounding_set_drop(arg_retain, false);
1514 }
1515
1516 static int reset_audit_loginuid(void) {
1517 _cleanup_free_ char *p = NULL;
1518 int r;
1519
1520 if (arg_share_system)
1521 return 0;
1522
1523 r = read_one_line_file("/proc/self/loginuid", &p);
1524 if (r == -ENOENT)
1525 return 0;
1526 if (r < 0)
1527 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1528
1529 /* Already reset? */
1530 if (streq(p, "4294967295"))
1531 return 0;
1532
1533 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1534 if (r < 0) {
1535 log_error_errno(r,
1536 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1537 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1538 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1539 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1540 "using systemd-nspawn. Sleeping for 5s... (%m)");
1541
1542 sleep(5);
1543 }
1544
1545 return 0;
1546 }
1547
/* Installs the seccomp filter of the container.
 *
 * The filter allows everything by default. Each system call in the table
 * below is made to fail with EPERM, unless the capability listed next to it
 * is part of arg_retain. In addition, socket(AF_NETLINK, *, NETLINK_AUDIT)
 * is made to fail with EAFNOSUPPORT.
 *
 * Returns 0 on success, if seccomp_load() reports -EINVAL, or if compiled
 * without seccomp support; a negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } gated_syscalls[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(gated_syscalls); idx++) {
                bool retained;

                /* Leave the call alone if its capability is retained */
                retained = !!(arg_retain & (1ULL << gated_syscalls[idx].capability));
                if (retained)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated_syscalls[idx].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace audit
         * hookup fails when run inside of one. We don't care and simply turn
         * off creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that audit
         * is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                /* Not treated as failure, the filter is simply not installed */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto finish;
        }
        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1642
1643 static int setup_propagate(const char *root) {
1644 const char *p, *q;
1645 int r;
1646
1647 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1648 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1649 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1650 (void) mkdir_p(p, 0600);
1651
1652 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1653 if (r < 0)
1654 return log_error_errno(r, "Failed to create /run/systemd: %m");
1655
1656 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1657 if (r < 0)
1658 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1659
1660 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1661 if (r < 0)
1662 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1663
1664 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1665 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1666 return log_error_errno(errno, "Failed to install propagation bind mount.");
1667
1668 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1669 return log_error_errno(errno, "Failed to make propagation mount read-only");
1670
1671 return 0;
1672 }
1673
1674 static int setup_image(char **device_path, int *loop_nr) {
1675 struct loop_info64 info = {
1676 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1677 };
1678 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1679 _cleanup_free_ char* loopdev = NULL;
1680 struct stat st;
1681 int r, nr;
1682
1683 assert(device_path);
1684 assert(loop_nr);
1685 assert(arg_image);
1686
1687 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1688 if (fd < 0)
1689 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1690
1691 if (fstat(fd, &st) < 0)
1692 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1693
1694 if (S_ISBLK(st.st_mode)) {
1695 char *p;
1696
1697 p = strdup(arg_image);
1698 if (!p)
1699 return log_oom();
1700
1701 *device_path = p;
1702
1703 *loop_nr = -1;
1704
1705 r = fd;
1706 fd = -1;
1707
1708 return r;
1709 }
1710
1711 if (!S_ISREG(st.st_mode)) {
1712 log_error("%s is not a regular file or block device.", arg_image);
1713 return -EINVAL;
1714 }
1715
1716 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1717 if (control < 0)
1718 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1719
1720 nr = ioctl(control, LOOP_CTL_GET_FREE);
1721 if (nr < 0)
1722 return log_error_errno(errno, "Failed to allocate loop device: %m");
1723
1724 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1725 return log_oom();
1726
1727 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1728 if (loop < 0)
1729 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1730
1731 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1732 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1733
1734 if (arg_read_only)
1735 info.lo_flags |= LO_FLAGS_READ_ONLY;
1736
1737 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1738 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1739
1740 *device_path = loopdev;
1741 loopdev = NULL;
1742
1743 *loop_nr = nr;
1744
1745 r = loop;
1746 loop = -1;
1747
1748 return r;
1749 }
1750
/* Explanation of the supported partition layouts; dissect_image() appends it
 * to its error messages about unusable disk images. */
#define PARTITION_TABLE_BLURB                                                                                                      \
        "Note that the disk image needs to either contain only a single MBR partition of\n"                                        \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"    \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                                            \
        "to be bootable with systemd-nspawn."
1757
/* Determines which partitions of a disk image to use for the container.
 *
 * 'fd' refers to the block device of the image. Its partition table is
 * probed with blkid; once the kernel has created the matching partition
 * devices, each of them is classified:
 *
 *   GPT: by partition type UUID as /home, /srv, native root, secondary root
 *        or generic Linux partition. Partitions flagged GPT_FLAG_NO_AUTO are
 *        ignored. If several partitions have the same type, the one with the
 *        lowest partition number wins, except for generic ones, where
 *        duplicates are remembered as ambiguity.
 *   MBR: partitions of type 0x83 whose flags are exactly 0x80 (bootable) are
 *        considered generic Linux partitions.
 *
 * The root device is picked in this order: native root, secondary root, the
 * one and only generic partition. Multiple generic partitions without any
 * dedicated root partition are refused.
 *
 * On success *root_device (and *home_device, *srv_device if such partitions
 * exist) receive newly allocated device node paths, the *_rw outputs tell
 * whether the respective partition may be written to, and *secondary tells
 * whether the secondary root was chosen. Outputs for partitions that were
 * not found are left untouched.
 *
 * Returns 0 on success, a negative errno-style value otherwise, in particular
 * -EOPNOTSUPP if compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel knows as many partitions as blkid found,
         * trying at most ten times */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev list is
                 * expected to have one entry more than that. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Remember it as generic partition (not as
                                 * root), so that a second match is detected
                                 * as ambiguity instead of silently replacing
                                 * the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2127
/* Mounts the block device 'what' on 'where', or on 'where' with 'directory'
 * appended if 'directory' is non-NULL.
 *
 * The file system type is determined with blkid. LUKS volumes are refused.
 * The mount is read-only unless 'rw' is true and arg_read_only is unset.
 *
 * Returns 0 on success, a negative errno-style value otherwise, in particular
 * -EOPNOTSUPP if compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2 is an ambivalent result, 1 means nothing was detected;
                 * in both cases the type is simply unknown. A hard probing
                 * error (-1) is handled by the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2189
2190 static int setup_machine_id(const char *directory) {
2191 int r;
2192 const char *etc_machine_id, *t;
2193 _cleanup_free_ char *s = NULL;
2194
2195 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2196
2197 r = read_one_line_file(etc_machine_id, &s);
2198 if (r < 0)
2199 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2200
2201 t = strstrip(s);
2202
2203 if (!isempty(t)) {
2204 r = sd_id128_from_string(t, &arg_uuid);
2205 if (r < 0)
2206 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2207 } else {
2208 if (sd_id128_is_null(arg_uuid)) {
2209 r = sd_id128_randomize(&arg_uuid);
2210 if (r < 0)
2211 return log_error_errno(r, "Failed to generate random machine ID: %m");
2212 }
2213 }
2214
2215 r = machine_id_setup(directory, arg_uuid);
2216 if (r < 0)
2217 return log_error_errno(r, "Failed to setup machine ID: %m");
2218
2219 return 0;
2220 }
2221
2222 static int mount_devices(
2223 const char *where,
2224 const char *root_device, bool root_device_rw,
2225 const char *home_device, bool home_device_rw,
2226 const char *srv_device, bool srv_device_rw) {
2227 int r;
2228
2229 assert(where);
2230
2231 if (root_device) {
2232 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2233 if (r < 0)
2234 return log_error_errno(r, "Failed to mount root directory: %m");
2235 }
2236
2237 if (home_device) {
2238 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2239 if (r < 0)
2240 return log_error_errno(r, "Failed to mount home directory: %m");
2241 }
2242
2243 if (srv_device) {
2244 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2245 if (r < 0)
2246 return log_error_errno(r, "Failed to mount server data directory: %m");
2247 }
2248
2249 return 0;
2250 }
2251
2252 static void loop_remove(int nr, int *image_fd) {
2253 _cleanup_close_ int control = -1;
2254 int r;
2255
2256 if (nr < 0)
2257 return;
2258
2259 if (image_fd && *image_fd >= 0) {
2260 r = ioctl(*image_fd, LOOP_CLR_FD);
2261 if (r < 0)
2262 log_debug_errno(errno, "Failed to close loop image: %m");
2263 *image_fd = safe_close(*image_fd);
2264 }
2265
2266 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2267 if (control < 0) {
2268 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2269 return;
2270 }
2271
2272 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2273 if (r < 0)
2274 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2275 }
2276
2277 /*
2278 * Return values:
2279 * < 0 : wait_for_terminate() failed to get the state of the
2280 * container, the container was terminated by a signal, or
2281 * failed for an unknown reason. No change is made to the
2282 * container argument.
2283 * > 0 : The program executed in the container terminated with an
2284 * error. The exit code of the program executed in the
2285 * container is returned. The container argument has been set
2286 * to CONTAINER_TERMINATED.
2287 * 0 : The container is being rebooted, has been shut down or exited
2288 * successfully. The container argument has been set to either
2289 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2290 *
2291 * That is, success is indicated by a return value of zero, and an
2292 * error is indicated by a non-zero value.
2293 */
2294 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2295 siginfo_t status;
2296 int r;
2297
2298 r = wait_for_terminate(pid, &status);
2299 if (r < 0)
2300 return log_warning_errno(r, "Failed to wait for container: %m");
2301
2302 switch (status.si_code) {
2303
2304 case CLD_EXITED:
2305 if (status.si_status == 0) {
2306 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2307
2308 } else
2309 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2310
2311 *container = CONTAINER_TERMINATED;
2312 return status.si_status;
2313
2314 case CLD_KILLED:
2315 if (status.si_status == SIGINT) {
2316
2317 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2318 *container = CONTAINER_TERMINATED;
2319 return 0;
2320
2321 } else if (status.si_status == SIGHUP) {
2322
2323 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2324 *container = CONTAINER_REBOOTED;
2325 return 0;
2326 }
2327
2328 /* CLD_KILLED fallthrough */
2329
2330 case CLD_DUMPED:
2331 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2332 return -EIO;
2333
2334 default:
2335 log_error("Container %s failed due to unknown reason.", arg_machine);
2336 return -EIO;
2337 }
2338
2339 return r;
2340 }
2341
2342 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2343 pid_t pid;
2344
2345 pid = PTR_TO_PID(userdata);
2346 if (pid > 0) {
2347 if (kill(pid, arg_kill_signal) >= 0) {
2348 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2349 sd_event_source_set_userdata(s, NULL);
2350 return 0;
2351 }
2352 }
2353
2354 sd_event_exit(sd_event_source_get_event(s), 0);
2355 return 0;
2356 }
2357
2358 static int determine_names(void) {
2359 int r;
2360
2361 if (arg_template && !arg_directory && arg_machine) {
2362
2363 /* If --template= was specified then we should not
2364 * search for a machine, but instead create a new one
2365 * in /var/lib/machine. */
2366
2367 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2368 if (!arg_directory)
2369 return log_oom();
2370 }
2371
2372 if (!arg_image && !arg_directory) {
2373 if (arg_machine) {
2374 _cleanup_(image_unrefp) Image *i = NULL;
2375
2376 r = image_find(arg_machine, &i);
2377 if (r < 0)
2378 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2379 else if (r == 0) {
2380 log_error("No image for machine '%s': %m", arg_machine);
2381 return -ENOENT;
2382 }
2383
2384 if (i->type == IMAGE_RAW)
2385 r = free_and_strdup(&arg_image, i->path);
2386 else
2387 r = free_and_strdup(&arg_directory, i->path);
2388 if (r < 0)
2389 return log_error_errno(r, "Invalid image directory: %m");
2390
2391 if (!arg_ephemeral)
2392 arg_read_only = arg_read_only || i->read_only;
2393 } else
2394 arg_directory = get_current_dir_name();
2395
2396 if (!arg_directory && !arg_machine) {
2397 log_error("Failed to determine path, please use -D or -i.");
2398 return -EINVAL;
2399 }
2400 }
2401
2402 if (!arg_machine) {
2403 if (arg_directory && path_equal(arg_directory, "/"))
2404 arg_machine = gethostname_malloc();
2405 else
2406 arg_machine = strdup(basename(arg_image ?: arg_directory));
2407
2408 if (!arg_machine)
2409 return log_oom();
2410
2411 hostname_cleanup(arg_machine);
2412 if (!machine_name_is_valid(arg_machine)) {
2413 log_error("Failed to determine machine name automatically, please use -M.");
2414 return -EINVAL;
2415 }
2416
2417 if (arg_ephemeral) {
2418 char *b;
2419
2420 /* Add a random suffix when this is an
2421 * ephemeral machine, so that we can run many
2422 * instances at once without manually having
2423 * to specify -M each time. */
2424
2425 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2426 return log_oom();
2427
2428 free(arg_machine);
2429 arg_machine = b;
2430 }
2431 }
2432
2433 return 0;
2434 }
2435
2436 static int determine_uid_shift(const char *directory) {
2437 int r;
2438
2439 if (!arg_userns) {
2440 arg_uid_shift = 0;
2441 return 0;
2442 }
2443
2444 if (arg_uid_shift == UID_INVALID) {
2445 struct stat st;
2446
2447 r = stat(directory, &st);
2448 if (r < 0)
2449 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2450
2451 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2452
2453 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2454 log_error("UID and GID base of %s don't match.", directory);
2455 return -EINVAL;
2456 }
2457
2458 arg_uid_range = UINT32_C(0x10000);
2459 }
2460
2461 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2462 log_error("UID base too high for UID range.");
2463 return -EINVAL;
2464 }
2465
2466 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2467 return 0;
2468 }
2469
2470 static int inner_child(
2471 Barrier *barrier,
2472 const char *directory,
2473 bool secondary,
2474 int kmsg_socket,
2475 int rtnl_socket,
2476 FDSet *fds) {
2477
2478 _cleanup_free_ char *home = NULL;
2479 char as_uuid[37];
2480 unsigned n_env = 1;
2481 const char *envp[] = {
2482 "PATH=" DEFAULT_PATH_SPLIT_USR,
2483 NULL, /* container */
2484 NULL, /* TERM */
2485 NULL, /* HOME */
2486 NULL, /* USER */
2487 NULL, /* LOGNAME */
2488 NULL, /* container_uuid */
2489 NULL, /* LISTEN_FDS */
2490 NULL, /* LISTEN_PID */
2491 NULL
2492 };
2493
2494 _cleanup_strv_free_ char **env_use = NULL;
2495 int r;
2496
2497 assert(barrier);
2498 assert(directory);
2499 assert(kmsg_socket >= 0);
2500
2501 cg_unified_flush();
2502
2503 if (arg_userns) {
2504 /* Tell the parent, that it now can write the UID map. */
2505 (void) barrier_place(barrier); /* #1 */
2506
2507 /* Wait until the parent wrote the UID map */
2508 if (!barrier_place_and_sync(barrier)) { /* #2 */
2509 log_error("Parent died too early");
2510 return -ESRCH;
2511 }
2512 }
2513
2514 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2515 if (r < 0)
2516 return r;
2517
2518 r = mount_sysfs(NULL);
2519 if (r < 0)
2520 return r;
2521
2522 /* Wait until we are cgroup-ified, so that we
2523 * can mount the right cgroup path writable */
2524 if (!barrier_place_and_sync(barrier)) { /* #3 */
2525 log_error("Parent died too early");
2526 return -ESRCH;
2527 }
2528
2529 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2530 if (r < 0)
2531 return r;
2532
2533 r = reset_uid_gid();
2534 if (r < 0)
2535 return log_error_errno(r, "Couldn't become new root: %m");
2536
2537 r = setup_boot_id(NULL);
2538 if (r < 0)
2539 return r;
2540
2541 r = setup_kmsg(NULL, kmsg_socket);
2542 if (r < 0)
2543 return r;
2544 kmsg_socket = safe_close(kmsg_socket);
2545
2546 umask(0022);
2547
2548 if (setsid() < 0)
2549 return log_error_errno(errno, "setsid() failed: %m");
2550
2551 if (arg_private_network)
2552 loopback_setup();
2553
2554 if (arg_expose_ports) {
2555 r = expose_port_send_rtnl(rtnl_socket);
2556 if (r < 0)
2557 return r;
2558 rtnl_socket = safe_close(rtnl_socket);
2559 }
2560
2561 r = drop_capabilities();
2562 if (r < 0)
2563 return log_error_errno(r, "drop_capabilities() failed: %m");
2564
2565 setup_hostname();
2566
2567 if (arg_personality != PERSONALITY_INVALID) {
2568 if (personality(arg_personality) < 0)
2569 return log_error_errno(errno, "personality() failed: %m");
2570 } else if (secondary) {
2571 if (personality(PER_LINUX32) < 0)
2572 return log_error_errno(errno, "personality() failed: %m");
2573 }
2574
2575 #ifdef HAVE_SELINUX
2576 if (arg_selinux_context)
2577 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2578 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2579 #endif
2580
2581 r = change_uid_gid(arg_user, &home);
2582 if (r < 0)
2583 return r;
2584
2585 /* LXC sets container=lxc, so follow the scheme here */
2586 envp[n_env++] = strjoina("container=", arg_container_service_name);
2587
2588 envp[n_env] = strv_find_prefix(environ, "TERM=");
2589 if (envp[n_env])
2590 n_env++;
2591
2592 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2593 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2594 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2595 return log_oom();
2596
2597 assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL));
2598
2599 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2600 return log_oom();
2601
2602 if (fdset_size(fds) > 0) {
2603 r = fdset_cloexec(fds, false);
2604 if (r < 0)
2605 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2606
2607 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2608 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2609 return log_oom();
2610 }
2611
2612 env_use = strv_env_merge(2, envp, arg_setenv);
2613 if (!env_use)
2614 return log_oom();
2615
2616 /* Let the parent know that we are ready and
2617 * wait until the parent is ready with the
2618 * setup, too... */
2619 if (!barrier_place_and_sync(barrier)) { /* #4 */
2620 log_error("Parent died too early");
2621 return -ESRCH;
2622 }
2623
2624 if (arg_chdir)
2625 if (chdir(arg_chdir) < 0)
2626 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2627
2628 if (arg_start_mode == START_PID2) {
2629 r = stub_pid1();
2630 if (r < 0)
2631 return r;
2632 }
2633
2634 /* Now, explicitly close the log, so that we
2635 * then can close all remaining fds. Closing
2636 * the log explicitly first has the benefit
2637 * that the logging subsystem knows about it,
2638 * and is thus ready to be reopened should we
2639 * need it again. Note that the other fds
2640 * closed here are at least the locking and
2641 * barrier fds. */
2642 log_close();
2643 (void) fdset_close_others(fds);
2644
2645 if (arg_start_mode == START_BOOT) {
2646 char **a;
2647 size_t m;
2648
2649 /* Automatically search for the init system */
2650
2651 m = strv_length(arg_parameters);
2652 a = newa(char*, m + 2);
2653 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2654 a[1 + m] = NULL;
2655
2656 a[0] = (char*) "/usr/lib/systemd/systemd";
2657 execve(a[0], a, env_use);
2658
2659 a[0] = (char*) "/lib/systemd/systemd";
2660 execve(a[0], a, env_use);
2661
2662 a[0] = (char*) "/sbin/init";
2663 execve(a[0], a, env_use);
2664 } else if (!strv_isempty(arg_parameters))
2665 execvpe(arg_parameters[0], arg_parameters, env_use);
2666 else {
2667 if (!arg_chdir)
2668 chdir(home ?: "/root");
2669
2670 execle("/bin/bash", "-bash", NULL, env_use);
2671 execle("/bin/sh", "-sh", NULL, env_use);
2672 }
2673
2674 r = -errno;
2675 (void) log_open();
2676 return log_error_errno(r, "execv() failed: %m");
2677 }
2678
2679 static int outer_child(
2680 Barrier *barrier,
2681 const char *directory,
2682 const char *console,
2683 const char *root_device, bool root_device_rw,
2684 const char *home_device, bool home_device_rw,
2685 const char *srv_device, bool srv_device_rw,
2686 bool interactive,
2687 bool secondary,
2688 int pid_socket,
2689 int uuid_socket,
2690 int kmsg_socket,
2691 int rtnl_socket,
2692 int uid_shift_socket,
2693 FDSet *fds) {
2694
2695 pid_t pid;
2696 ssize_t l;
2697 int r;
2698
2699 assert(barrier);
2700 assert(directory);
2701 assert(console);
2702 assert(pid_socket >= 0);
2703 assert(uuid_socket >= 0);
2704 assert(kmsg_socket >= 0);
2705
2706 cg_unified_flush();
2707
2708 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2709 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2710
2711 if (interactive) {
2712 close_nointr(STDIN_FILENO);
2713 close_nointr(STDOUT_FILENO);
2714 close_nointr(STDERR_FILENO);
2715
2716 r = open_terminal(console, O_RDWR);
2717 if (r != STDIN_FILENO) {
2718 if (r >= 0) {
2719 safe_close(r);
2720 r = -EINVAL;
2721 }
2722
2723 return log_error_errno(r, "Failed to open console: %m");
2724 }
2725
2726 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2727 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2728 return log_error_errno(errno, "Failed to duplicate console: %m");
2729 }
2730
2731 r = reset_audit_loginuid();
2732 if (r < 0)
2733 return r;
2734
2735 /* Mark everything as slave, so that we still
2736 * receive mounts from the real root, but don't
2737 * propagate mounts to the real root. */
2738 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2739 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2740
2741 r = mount_devices(directory,
2742 root_device, root_device_rw,
2743 home_device, home_device_rw,
2744 srv_device, srv_device_rw);
2745 if (r < 0)
2746 return r;
2747
2748 r = determine_uid_shift(directory);
2749 if (r < 0)
2750 return r;
2751
2752 if (arg_userns) {
2753 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2754 if (l < 0)
2755 return log_error_errno(errno, "Failed to send UID shift: %m");
2756 if (l != sizeof(arg_uid_shift)) {
2757 log_error("Short write while sending UID shift.");
2758 return -EIO;
2759 }
2760 }
2761
2762 /* Turn directory into bind mount */
2763 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2764 return log_error_errno(errno, "Failed to make bind mount: %m");
2765
2766 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2767 if (r < 0)
2768 return r;
2769
2770 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2771 if (r < 0)
2772 return r;
2773
2774 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2775 if (r < 0)
2776 return r;
2777
2778 if (arg_read_only) {
2779 r = bind_remount_recursive(directory, true);
2780 if (r < 0)
2781 return log_error_errno(r, "Failed to make tree read-only: %m");
2782 }
2783
2784 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2785 if (r < 0)
2786 return r;
2787
2788 r = copy_devnodes(directory);
2789 if (r < 0)
2790 return r;
2791
2792 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2793
2794 r = setup_pts(directory);
2795 if (r < 0)
2796 return r;
2797
2798 r = setup_propagate(directory);
2799 if (r < 0)
2800 return r;
2801
2802 r = setup_dev_console(directory, console);
2803 if (r < 0)
2804 return r;
2805
2806 r = setup_seccomp();
2807 if (r < 0)
2808 return r;
2809
2810 r = setup_timezone(directory);
2811 if (r < 0)
2812 return r;
2813
2814 r = setup_resolv_conf(directory);
2815 if (r < 0)
2816 return r;
2817
2818 r = setup_machine_id(directory);
2819 if (r < 0)
2820 return r;
2821
2822 r = setup_journal(directory);
2823 if (r < 0)
2824 return r;
2825
2826 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2827 if (r < 0)
2828 return r;
2829
2830 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2831 if (r < 0)
2832 return r;
2833
2834 r = mount_move_root(directory);
2835 if (r < 0)
2836 return log_error_errno(r, "Failed to move root directory: %m");
2837
2838 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2839 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2840 (arg_private_network ? CLONE_NEWNET : 0) |
2841 (arg_userns ? CLONE_NEWUSER : 0),
2842 NULL);
2843 if (pid < 0)
2844 return log_error_errno(errno, "Failed to fork inner child: %m");
2845 if (pid == 0) {
2846 pid_socket = safe_close(pid_socket);
2847 uuid_socket = safe_close(uuid_socket);
2848 uid_shift_socket = safe_close(uid_shift_socket);
2849
2850 /* The inner child has all namespaces that are
2851 * requested, so that we all are owned by the user if
2852 * user namespaces are turned on. */
2853
2854 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2855 if (r < 0)
2856 _exit(EXIT_FAILURE);
2857
2858 _exit(EXIT_SUCCESS);
2859 }
2860
2861 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2862 if (l < 0)
2863 return log_error_errno(errno, "Failed to send PID: %m");
2864 if (l != sizeof(pid)) {
2865 log_error("Short write while sending PID.");
2866 return -EIO;
2867 }
2868
2869 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
2870 if (l < 0)
2871 return log_error_errno(errno, "Failed to send machine ID: %m");
2872 if (l != sizeof(arg_uuid)) {
2873 log_error("Short write while sending machine ID.");
2874 return -EIO;
2875 }
2876
2877 pid_socket = safe_close(pid_socket);
2878 uuid_socket = safe_close(uuid_socket);
2879 kmsg_socket = safe_close(kmsg_socket);
2880 rtnl_socket = safe_close(rtnl_socket);
2881
2882 return 0;
2883 }
2884
2885 static int setup_uid_map(pid_t pid) {
2886 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2887 int r;
2888
2889 assert(pid > 1);
2890
2891 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2892 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2893 r = write_string_file(uid_map, line, 0);
2894 if (r < 0)
2895 return log_error_errno(r, "Failed to write UID map: %m");
2896
2897 /* We always assign the same UID and GID ranges */
2898 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2899 r = write_string_file(uid_map, line, 0);
2900 if (r < 0)
2901 return log_error_errno(r, "Failed to write GID map: %m");
2902
2903 return 0;
2904 }
2905
2906 static int load_settings(void) {
2907 _cleanup_(settings_freep) Settings *settings = NULL;
2908 _cleanup_fclose_ FILE *f = NULL;
2909 _cleanup_free_ char *p = NULL;
2910 const char *fn, *i;
2911 int r;
2912
2913 /* If all settings are masked, there's no point in looking for
2914 * the settings file */
2915 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2916 return 0;
2917
2918 fn = strjoina(arg_machine, ".nspawn");
2919
2920 /* We first look in the admin's directories in /etc and /run */
2921 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2922 _cleanup_free_ char *j = NULL;
2923
2924 j = strjoin(i, "/", fn, NULL);
2925 if (!j)
2926 return log_oom();
2927
2928 f = fopen(j, "re");
2929 if (f) {
2930 p = j;
2931 j = NULL;
2932
2933 /* By default, we trust configuration from /etc and /run */
2934 if (arg_settings_trusted < 0)
2935 arg_settings_trusted = true;
2936
2937 break;
2938 }
2939
2940 if (errno != ENOENT)
2941 return log_error_errno(errno, "Failed to open %s: %m", j);
2942 }
2943
2944 if (!f) {
2945 /* After that, let's look for a file next to the
2946 * actual image we shall boot. */
2947
2948 if (arg_image) {
2949 p = file_in_same_dir(arg_image, fn);
2950 if (!p)
2951 return log_oom();
2952 } else if (arg_directory) {
2953 p = file_in_same_dir(arg_directory, fn);
2954 if (!p)
2955 return log_oom();
2956 }
2957
2958 if (p) {
2959 f = fopen(p, "re");
2960 if (!f && errno != ENOENT)
2961 return log_error_errno(errno, "Failed to open %s: %m", p);
2962
2963 /* By default, we do not trust configuration from /var/lib/machines */
2964 if (arg_settings_trusted < 0)
2965 arg_settings_trusted = false;
2966 }
2967 }
2968
2969 if (!f)
2970 return 0;
2971
2972 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2973
2974 r = settings_load(f, p, &settings);
2975 if (r < 0)
2976 return r;
2977
2978 /* Copy over bits from the settings, unless they have been
2979 * explicitly masked by command line switches. */
2980
2981 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
2982 settings->start_mode >= 0) {
2983 arg_start_mode = settings->start_mode;
2984
2985 strv_free(arg_parameters);
2986 arg_parameters = settings->parameters;
2987 settings->parameters = NULL;
2988 }
2989
2990 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
2991 settings->working_directory) {
2992 free(arg_chdir);
2993 arg_chdir = settings->working_directory;
2994 settings->working_directory = NULL;
2995 }
2996
2997 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2998 settings->environment) {
2999 strv_free(arg_setenv);
3000 arg_setenv = settings->environment;
3001 settings->environment = NULL;
3002 }
3003
3004 if ((arg_settings_mask & SETTING_USER) == 0 &&
3005 settings->user) {
3006 free(arg_user);
3007 arg_user = settings->user;
3008 settings->user = NULL;
3009 }
3010
3011 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3012 uint64_t plus;
3013
3014 plus = settings->capability;
3015 if (settings_private_network(settings))
3016 plus |= (1ULL << CAP_NET_ADMIN);
3017
3018 if (!arg_settings_trusted && plus != 0) {
3019 if (settings->capability != 0)
3020 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3021 } else
3022 arg_retain |= plus;
3023
3024 arg_retain &= ~settings->drop_capability;
3025 }
3026
3027 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3028 settings->kill_signal > 0)
3029 arg_kill_signal = settings->kill_signal;
3030
3031 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3032 settings->personality != PERSONALITY_INVALID)
3033 arg_personality = settings->personality;
3034
3035 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3036 !sd_id128_is_null(settings->machine_id)) {
3037
3038 if (!arg_settings_trusted)
3039 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3040 else
3041 arg_uuid = settings->machine_id;
3042 }
3043
3044 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3045 settings->read_only >= 0)
3046 arg_read_only = settings->read_only;
3047
3048 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3049 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3050 arg_volatile_mode = settings->volatile_mode;
3051
3052 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3053 settings->n_custom_mounts > 0) {
3054
3055 if (!arg_settings_trusted)
3056 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3057 else {
3058 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3059 arg_custom_mounts = settings->custom_mounts;
3060 arg_n_custom_mounts = settings->n_custom_mounts;
3061
3062 settings->custom_mounts = NULL;
3063 settings->n_custom_mounts = 0;
3064 }
3065 }
3066
3067 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3068 (settings->private_network >= 0 ||
3069 settings->network_veth >= 0 ||
3070 settings->network_bridge ||
3071 settings->network_interfaces ||
3072 settings->network_macvlan ||
3073 settings->network_ipvlan ||
3074 settings->network_veth_extra)) {
3075
3076 if (!arg_settings_trusted)
3077 log_warning("Ignoring network settings, file %s is not trusted.", p);
3078 else {
3079 arg_network_veth = settings_network_veth(settings);
3080 arg_private_network = settings_private_network(settings);
3081
3082 strv_free(arg_network_interfaces);
3083 arg_network_interfaces = settings->network_interfaces;
3084 settings->network_interfaces = NULL;
3085
3086 strv_free(arg_network_macvlan);
3087 arg_network_macvlan = settings->network_macvlan;
3088 settings->network_macvlan = NULL;
3089
3090 strv_free(arg_network_ipvlan);
3091 arg_network_ipvlan = settings->network_ipvlan;
3092 settings->network_ipvlan = NULL;
3093
3094 strv_free(arg_network_veth_extra);
3095 arg_network_veth_extra = settings->network_veth_extra;
3096 settings->network_veth_extra = NULL;
3097
3098 free(arg_network_bridge);
3099 arg_network_bridge = settings->network_bridge;
3100 settings->network_bridge = NULL;
3101 }
3102 }
3103
3104 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3105 settings->expose_ports) {
3106
3107 if (!arg_settings_trusted)
3108 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3109 else {
3110 expose_port_free_all(arg_expose_ports);
3111 arg_expose_ports = settings->expose_ports;
3112 settings->expose_ports = NULL;
3113 }
3114 }
3115
3116 return 0;
3117 }
3118
3119 int main(int argc, char *argv[]) {
3120
3121 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3122 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3123 _cleanup_close_ int master = -1, image_fd = -1;
3124 _cleanup_fdset_free_ FDSet *fds = NULL;
3125 int r, n_fd_passed, loop_nr = -1;
3126 char veth_name[IFNAMSIZ];
3127 bool secondary = false, remove_subvol = false;
3128 sigset_t mask_chld;
3129 pid_t pid = 0;
3130 int ret = EXIT_SUCCESS;
3131 union in_addr_union exposed = {};
3132 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3133 bool interactive;
3134
3135 log_parse_environment();
3136 log_open();
3137
3138 /* Make sure rename_process() in the stub init process can work */
3139 saved_argv = argv;
3140 saved_argc = argc;
3141
3142 r = parse_argv(argc, argv);
3143 if (r <= 0)
3144 goto finish;
3145
3146 if (geteuid() != 0) {
3147 log_error("Need to be root.");
3148 r = -EPERM;
3149 goto finish;
3150 }
3151 r = determine_names();
3152 if (r < 0)
3153 goto finish;
3154
3155 r = load_settings();
3156 if (r < 0)
3157 goto finish;
3158
3159 r = verify_arguments();
3160 if (r < 0)
3161 goto finish;
3162
3163 n_fd_passed = sd_listen_fds(false);
3164 if (n_fd_passed > 0) {
3165 r = fdset_new_listen_fds(&fds, false);
3166 if (r < 0) {
3167 log_error_errno(r, "Failed to collect file descriptors: %m");
3168 goto finish;
3169 }
3170 }
3171
3172 if (arg_directory) {
3173 assert(!arg_image);
3174
3175 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3176 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3177 r = -EINVAL;
3178 goto finish;
3179 }
3180
3181 if (arg_ephemeral) {
3182 _cleanup_free_ char *np = NULL;
3183
3184 /* If the specified path is a mount point we
3185 * generate the new snapshot immediately
3186 * inside it under a random name. However if
3187 * the specified is not a mount point we
3188 * create the new snapshot in the parent
3189 * directory, just next to it. */
3190 r = path_is_mount_point(arg_directory, 0);
3191 if (r < 0) {
3192 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3193 goto finish;
3194 }
3195 if (r > 0)
3196 r = tempfn_random_child(arg_directory, "machine.", &np);
3197 else
3198 r = tempfn_random(arg_directory, "machine.", &np);
3199 if (r < 0) {
3200 log_error_errno(r, "Failed to generate name for snapshot: %m");
3201 goto finish;
3202 }
3203
3204 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3205 if (r < 0) {
3206 log_error_errno(r, "Failed to lock %s: %m", np);
3207 goto finish;
3208 }
3209
3210 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3211 if (r < 0) {
3212 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3213 goto finish;
3214 }
3215
3216 free(arg_directory);
3217 arg_directory = np;
3218 np = NULL;
3219
3220 remove_subvol = true;
3221
3222 } else {
3223 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3224 if (r == -EBUSY) {
3225 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3226 goto finish;
3227 }
3228 if (r < 0) {
3229 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3230 return r;
3231 }
3232
3233 if (arg_template) {
3234 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3235 if (r == -EEXIST) {
3236 if (!arg_quiet)
3237 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3238 } else if (r < 0) {
3239 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3240 goto finish;
3241 } else {
3242 if (!arg_quiet)
3243 log_info("Populated %s from template %s.", arg_directory, arg_template);
3244 }
3245 }
3246 }
3247
3248 if (arg_start_mode == START_BOOT) {
3249 if (path_is_os_tree(arg_directory) <= 0) {
3250 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3251 r = -EINVAL;
3252 goto finish;
3253 }
3254 } else {
3255 const char *p;
3256
3257 p = strjoina(arg_directory, "/usr/");
3258 if (laccess(p, F_OK) < 0) {
3259 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3260 r = -EINVAL;
3261 goto finish;
3262 }
3263 }
3264
3265 } else {
3266 char template[] = "/tmp/nspawn-root-XXXXXX";
3267
3268 assert(arg_image);
3269 assert(!arg_template);
3270
3271 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3272 if (r == -EBUSY) {
3273 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3274 goto finish;
3275 }
3276 if (r < 0) {
3277 r = log_error_errno(r, "Failed to create image lock: %m");
3278 goto finish;
3279 }
3280
3281 if (!mkdtemp(template)) {
3282 log_error_errno(errno, "Failed to create temporary directory: %m");
3283 r = -errno;
3284 goto finish;
3285 }
3286
3287 arg_directory = strdup(template);
3288 if (!arg_directory) {
3289 r = log_oom();
3290 goto finish;
3291 }
3292
3293 image_fd = setup_image(&device_path, &loop_nr);
3294 if (image_fd < 0) {
3295 r = image_fd;
3296 goto finish;
3297 }
3298
3299 r = dissect_image(image_fd,
3300 &root_device, &root_device_rw,
3301 &home_device, &home_device_rw,
3302 &srv_device, &srv_device_rw,
3303 &secondary);
3304 if (r < 0)
3305 goto finish;
3306 }
3307
3308 r = custom_mounts_prepare();
3309 if (r < 0)
3310 goto finish;
3311
3312 interactive =
3313 isatty(STDIN_FILENO) > 0 &&
3314 isatty(STDOUT_FILENO) > 0;
3315
3316 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3317 if (master < 0) {
3318 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3319 goto finish;
3320 }
3321
3322 r = ptsname_malloc(master, &console);
3323 if (r < 0) {
3324 r = log_error_errno(r, "Failed to determine tty name: %m");
3325 goto finish;
3326 }
3327
3328 if (arg_selinux_apifs_context) {
3329 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3330 if (r < 0)
3331 goto finish;
3332 }
3333
3334 if (unlockpt(master) < 0) {
3335 r = log_error_errno(errno, "Failed to unlock tty: %m");
3336 goto finish;
3337 }
3338
3339 if (!arg_quiet)
3340 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3341 arg_machine, arg_image ?: arg_directory);
3342
3343 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3344
3345 assert_se(sigemptyset(&mask_chld) == 0);
3346 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3347
3348 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3349 r = log_error_errno(errno, "Failed to become subreaper: %m");
3350 goto finish;
3351 }
3352
3353 for (;;) {
3354 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 },
3355 pid_socket_pair[2] = { -1, -1 }, uuid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
3356 ContainerStatus container_status;
3357 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3358 static const struct sigaction sa = {
3359 .sa_handler = nop_signal_handler,
3360 .sa_flags = SA_NOCLDSTOP,
3361 };
3362 int ifi = 0;
3363 ssize_t l;
3364 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3365 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3366 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3367 char last_char = 0;
3368
3369 r = barrier_create(&barrier);
3370 if (r < 0) {
3371 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3372 goto finish;
3373 }
3374
3375 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3376 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3377 goto finish;
3378 }
3379
3380 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3381 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3382 goto finish;
3383 }
3384
3385 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3386 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3387 goto finish;
3388 }
3389
3390 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
3391 r = log_error_errno(errno, "Failed to create id socket pair: %m");
3392 goto finish;
3393 }
3394
3395 if (arg_userns)
3396 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3397 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3398 goto finish;
3399 }
3400
3401 /* Child can be killed before execv(), so handle SIGCHLD
3402 * in order to interrupt parent's blocking calls and
3403 * give it a chance to call wait() and terminate. */
3404 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3405 if (r < 0) {
3406 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3407 goto finish;
3408 }
3409
3410 r = sigaction(SIGCHLD, &sa, NULL);
3411 if (r < 0) {
3412 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3413 goto finish;
3414 }
3415
3416 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3417 if (pid < 0) {
3418 if (errno == EINVAL)
3419 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3420 else
3421 r = log_error_errno(errno, "clone() failed: %m");
3422
3423 goto finish;
3424 }
3425
3426 if (pid == 0) {
3427 /* The outer child only has a file system namespace. */
3428 barrier_set_role(&barrier, BARRIER_CHILD);
3429
3430 master = safe_close(master);
3431
3432 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3433 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3434 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3435 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3436 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3437
3438 (void) reset_all_signal_handlers();
3439 (void) reset_signal_mask();
3440
3441 r = outer_child(&barrier,
3442 arg_directory,
3443 console,
3444 root_device, root_device_rw,
3445 home_device, home_device_rw,
3446 srv_device, srv_device_rw,
3447 interactive,
3448 secondary,
3449 pid_socket_pair[1],
3450 uuid_socket_pair[1],
3451 kmsg_socket_pair[1],
3452 rtnl_socket_pair[1],
3453 uid_shift_socket_pair[1],
3454 fds);
3455 if (r < 0)
3456 _exit(EXIT_FAILURE);
3457
3458 _exit(EXIT_SUCCESS);
3459 }
3460
3461 barrier_set_role(&barrier, BARRIER_PARENT);
3462
3463 fds = fdset_free(fds);
3464
3465 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3466 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3467 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3468 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3469 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3470
3471 /* Wait for the outer child. */
3472 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3473 if (r < 0)
3474 goto finish;
3475 if (r != 0) {
3476 r = -EIO;
3477 goto finish;
3478 }
3479 pid = 0;
3480
3481 /* And now retrieve the PID of the inner child. */
3482 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3483 if (l < 0) {
3484 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3485 goto finish;
3486 }
3487 if (l != sizeof(pid)) {
3488 log_error("Short read while reading inner child PID.");
3489 r = EIO;
3490 goto finish;
3491 }
3492
3493 /* We also retrieve container UUID in case it was generated by outer child */
3494 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
3495 if (l < 0) {
3496 r = log_error_errno(errno, "Failed to read container machine ID: %m");
3497 goto finish;
3498 }
3499 if (l != sizeof(arg_uuid)) {
3500 log_error("Short read while reading container machined ID.");
3501 r = EIO;
3502 goto finish;
3503 }
3504
3505 log_debug("Init process invoked as PID " PID_FMT, pid);
3506
3507 if (arg_userns) {
3508 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3509 log_error("Child died too early.");
3510 r = -ESRCH;
3511 goto finish;
3512 }
3513
3514 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3515 if (l < 0) {
3516 r = log_error_errno(errno, "Failed to read UID shift: %m");
3517 goto finish;
3518 }
3519 if (l != sizeof(arg_uid_shift)) {
3520 log_error("Short read while reading UID shift.");
3521 r = EIO;
3522 goto finish;
3523 }
3524
3525 r = setup_uid_map(pid);
3526 if (r < 0)
3527 goto finish;
3528
3529 (void) barrier_place(&barrier); /* #2 */
3530 }
3531
3532 if (arg_private_network) {
3533
3534 r = move_network_interfaces(pid, arg_network_interfaces);
3535 if (r < 0)
3536 goto finish;
3537
3538 if (arg_network_veth) {
3539 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3540 if (r < 0)
3541 goto finish;
3542 else if (r > 0)
3543 ifi = r;
3544
3545 if (arg_network_bridge) {
3546 r = setup_bridge(veth_name, arg_network_bridge);
3547 if (r < 0)
3548 goto finish;
3549 if (r > 0)
3550 ifi = r;
3551 }
3552 }
3553
3554 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3555 if (r < 0)
3556 goto finish;
3557
3558 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3559 if (r < 0)
3560 goto finish;
3561
3562 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3563 if (r < 0)
3564 goto finish;
3565 }
3566
3567 if (arg_register) {
3568 r = register_machine(
3569 arg_machine,
3570 pid,
3571 arg_directory,
3572 arg_uuid,
3573 ifi,
3574 arg_slice,
3575 arg_custom_mounts, arg_n_custom_mounts,
3576 arg_kill_signal,
3577 arg_property,
3578 arg_keep_unit,
3579 arg_container_service_name);
3580 if (r < 0)
3581 goto finish;
3582 }
3583
3584 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3585 if (r < 0)
3586 goto finish;
3587
3588 if (arg_keep_unit) {
3589 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3590 if (r < 0)
3591 goto finish;
3592 }
3593
3594 r = chown_cgroup(pid, arg_uid_shift);
3595 if (r < 0)
3596 goto finish;
3597
3598 /* Notify the child that the parent is ready with all
3599 * its setup (including cgroup-ification), and that
3600 * the child can now hand over control to the code to
3601 * run inside the container. */
3602 (void) barrier_place(&barrier); /* #3 */
3603
3604 /* Block SIGCHLD here, before notifying child.
3605 * process_pty() will handle it with the other signals. */
3606 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3607
3608 /* Reset signal to default */
3609 r = default_signals(SIGCHLD, -1);
3610 if (r < 0) {
3611 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3612 goto finish;
3613 }
3614
3615 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3616 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3617 log_error("Child died too early.");
3618 r = -ESRCH;
3619 goto finish;
3620 }
3621
3622 sd_notifyf(false,
3623 "READY=1\n"
3624 "STATUS=Container running.\n"
3625 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3626
3627 r = sd_event_new(&event);
3628 if (r < 0) {
3629 log_error_errno(r, "Failed to get default event source: %m");
3630 goto finish;
3631 }
3632
3633 if (arg_kill_signal > 0) {
3634 /* Try to kill the init system on SIGINT or SIGTERM */
3635 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3636 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
3637 } else {
3638 /* Immediately exit */
3639 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3640 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3641 }
3642
3643 /* simply exit on sigchld */
3644 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3645
3646 if (arg_expose_ports) {
3647 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3648 if (r < 0)
3649 goto finish;
3650
3651 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3652 }
3653
3654 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3655
3656 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3657 if (r < 0) {
3658 log_error_errno(r, "Failed to create PTY forwarder: %m");
3659 goto finish;
3660 }
3661
3662 r = sd_event_loop(event);
3663 if (r < 0) {
3664 log_error_errno(r, "Failed to run event loop: %m");
3665 goto finish;
3666 }
3667
3668 pty_forward_get_last_char(forward, &last_char);
3669
3670 forward = pty_forward_free(forward);
3671
3672 if (!arg_quiet && last_char != '\n')
3673 putc('\n', stdout);
3674
3675 /* Kill if it is not dead yet anyway */
3676 if (arg_register && !arg_keep_unit)
3677 terminate_machine(pid);
3678
3679 /* Normally redundant, but better safe than sorry */
3680 kill(pid, SIGKILL);
3681
3682 r = wait_for_container(pid, &container_status);
3683 pid = 0;
3684
3685 if (r < 0)
3686 /* We failed to wait for the container, or the
3687 * container exited abnormally */
3688 goto finish;
3689 else if (r > 0 || container_status == CONTAINER_TERMINATED) {
3690 /* The container exited with a non-zero
3691 * status, or with zero status and no reboot
3692 * was requested. */
3693 ret = r;
3694 break;
3695 }
3696
3697 /* CONTAINER_REBOOTED, loop again */
3698
3699 if (arg_keep_unit) {
3700 /* Special handling if we are running as a
3701 * service: instead of simply restarting the
3702 * machine we want to restart the entire
3703 * service, so let's inform systemd about this
3704 * with the special exit code 133. The service
3705 * file uses RestartForceExitStatus=133 so
3706 * that this results in a full nspawn
3707 * restart. This is necessary since we might
3708 * have cgroup parameters set we want to have
3709 * flushed out. */
3710 ret = 133;
3711 r = 0;
3712 break;
3713 }
3714
3715 expose_port_flush(arg_expose_ports, &exposed);
3716 }
3717
3718 finish:
3719 sd_notify(false,
3720 "STOPPING=1\n"
3721 "STATUS=Terminating...");
3722
3723 if (pid > 0)
3724 kill(pid, SIGKILL);
3725
3726 /* Try to flush whatever is still queued in the pty */
3727 if (master >= 0)
3728 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3729
3730 loop_remove(loop_nr, &image_fd);
3731
3732 if (remove_subvol && arg_directory) {
3733 int k;
3734
3735 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3736 if (k < 0)
3737 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3738 }
3739
3740 if (arg_machine) {
3741 const char *p;
3742
3743 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3744 (void) rm_rf(p, REMOVE_ROOT);
3745 }
3746
3747 expose_port_flush(arg_expose_ports, &exposed);
3748
3749 free(arg_directory);
3750 free(arg_template);
3751 free(arg_image);
3752 free(arg_machine);
3753 free(arg_user);
3754 free(arg_chdir);
3755 strv_free(arg_setenv);
3756 free(arg_network_bridge);
3757 strv_free(arg_network_interfaces);
3758 strv_free(arg_network_macvlan);
3759 strv_free(arg_network_ipvlan);
3760 strv_free(arg_network_veth_extra);
3761 strv_free(arg_parameters);
3762 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3763 expose_port_free_all(arg_expose_ports);
3764
3765 return r < 0 ? EXIT_FAILURE : ret;
3766 }