]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #2981 from keszybz/test-nss
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #ifdef HAVE_BLKID
21 #include <blkid/blkid.h>
22 #endif
23 #include <errno.h>
24 #include <getopt.h>
25 #include <linux/loop.h>
26 #include <sched.h>
27 #ifdef HAVE_SECCOMP
28 #include <seccomp.h>
29 #endif
30 #ifdef HAVE_SELINUX
31 #include <selinux/selinux.h>
32 #endif
33 #include <signal.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <sys/file.h>
38 #include <sys/mount.h>
39 #include <sys/personality.h>
40 #include <sys/prctl.h>
41 #include <sys/types.h>
42 #include <unistd.h>
43
44 #include "sd-daemon.h"
45 #include "sd-id128.h"
46
47 #include "alloc-util.h"
48 #include "barrier.h"
49 #include "base-filesystem.h"
50 #include "blkid-util.h"
51 #include "btrfs-util.h"
52 #include "cap-list.h"
53 #include "capability-util.h"
54 #include "cgroup-util.h"
55 #include "copy.h"
56 #include "dev-setup.h"
57 #include "env-util.h"
58 #include "fd-util.h"
59 #include "fdset.h"
60 #include "fileio.h"
61 #include "formats-util.h"
62 #include "fs-util.h"
63 #include "gpt.h"
64 #include "hostname-util.h"
65 #include "log.h"
66 #include "loopback-setup.h"
67 #include "machine-image.h"
68 #include "macro.h"
69 #include "missing.h"
70 #include "mkdir.h"
71 #include "mount-util.h"
72 #include "netlink-util.h"
73 #include "nspawn-cgroup.h"
74 #include "nspawn-expose-ports.h"
75 #include "nspawn-mount.h"
76 #include "nspawn-network.h"
77 #include "nspawn-register.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "nspawn-stub-pid1.h"
81 #include "parse-util.h"
82 #include "path-util.h"
83 #include "process-util.h"
84 #include "ptyfwd.h"
85 #include "random-util.h"
86 #include "rm-rf.h"
87 #ifdef HAVE_SECCOMP
88 #include "seccomp-util.h"
89 #endif
90 #include "selinux-util.h"
91 #include "signal-util.h"
92 #include "socket-util.h"
93 #include "stat-util.h"
94 #include "stdio-util.h"
95 #include "string-util.h"
96 #include "strv.h"
97 #include "terminal-util.h"
98 #include "udev-util.h"
99 #include "umask-util.h"
100 #include "user-util.h"
101 #include "util.h"
102
/* Status values for a container: one for a plain termination, one for a
 * reboot. The code consuming these values is outside this excerpt. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1,
} ContainerStatus;
107
/* Journal linking modes selectable with --link-journal= (and -j). The
 * "try-" variants of the option map to the same values, combined with the
 * separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO = 1,  /* --link-journal=auto, the default */
        LINK_HOST = 2,  /* --link-journal=host / try-host */
        LINK_GUEST = 3, /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
114
/* Command line state. Most of these are filled in by parse_argv(); the
 * initializers below are the values in effect when the matching option is
 * not given on the command line. */
static char *arg_directory = NULL;                      /* -D/--directory= */
static char *arg_template = NULL;                       /* --template= */
static char *arg_chdir = NULL;                          /* --chdir=, must be an absolute path */
static char *arg_user = NULL;                           /* -u/--user= */
static sd_id128_t arg_uuid = {};                        /* --uuid= */
static char *arg_machine = NULL;                        /* -M/--machine= */
static const char *arg_selinux_context = NULL;          /* -Z; set directly from optarg, not a copy */
static const char *arg_selinux_apifs_context = NULL;    /* -L; set directly from optarg, not a copy */
static const char *arg_slice = NULL;                    /* -S/--slice=; set directly from optarg, not a copy */
static bool arg_private_network = false;                /* --private-network; also turned on by the other --network-* options */
static bool arg_read_only = false;                      /* --read-only */
static StartMode arg_start_mode = START_PID1;           /* changed by -b/--boot and -a/--as-pid2 */
static bool arg_ephemeral = false;                      /* -x/--ephemeral */
static LinkJournal arg_link_journal = LINK_AUTO;        /* --link-journal= and -j */
static bool arg_link_journal_try = false;               /* true for -j and the "try-" modes of --link-journal= */
/* Capabilities in the default set. parse_argv() ORs in everything named with
 * --capability= (plus CAP_NET_ADMIN when private networking is on) and then
 * masks out everything named with --drop-capability=. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
static CustomMount *arg_custom_mounts = NULL;           /* --bind=, --bind-ro=, --tmpfs=, --overlay=, --overlay-ro= */
static unsigned arg_n_custom_mounts = 0;                /* number of entries in arg_custom_mounts */
static char **arg_setenv = NULL;                        /* --setenv= */
static bool arg_quiet = false;                          /* -q/--quiet */
static bool arg_share_system = false;                   /* --share-system; forces arg_register to false */
static bool arg_register = true;                        /* --register= */
static bool arg_keep_unit = false;                      /* --keep-unit */
static char **arg_network_interfaces = NULL;            /* --network-interface= */
static char **arg_network_macvlan = NULL;               /* --network-macvlan= */
static char **arg_network_ipvlan = NULL;                /* --network-ipvlan= */
static bool arg_network_veth = false;                   /* -n/--network-veth, also set by --network-bridge= */
static char **arg_network_veth_extra = NULL;            /* --network-veth-extra= */
static char *arg_network_bridge = NULL;                 /* --network-bridge= */
static unsigned long arg_personality = PERSONALITY_INVALID; /* --personality= */
static char *arg_image = NULL;                          /* -i/--image= */
static VolatileMode arg_volatile_mode = VOLATILE_NO;    /* --volatile[=MODE] */
static ExposePort *arg_expose_ports = NULL;             /* -p/--port= */
static char **arg_property = NULL;                      /* --property= */
/* --private-users[=UIDBASE[:NUIDS]]: base UID and number of UIDs. */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;                         /* --private-users was given */
static int arg_kill_signal = 0;                         /* --kill-signal=; verify_arguments() picks SIGRTMIN+3 for --boot if unset */
static bool arg_unified_cgroup_hierarchy = false;       /* set by detect_unified_cgroup_hierarchy() */
static SettingsMask arg_settings_mask = 0;              /* SETTING_* bits for the settings given on the command line */
static int arg_settings_trusted = -1;                   /* --settings=: true for "trusted", false for "no", -1 otherwise */
static char **arg_parameters = NULL;                    /* copy of the positional arguments following the options */
static const char *arg_container_service_name = "systemd-nspawn"; /* replaced by $SYSTEMD_NSPAWN_CONTAINER_SERVICE if set */
183
184 static void help(void) {
185 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
186 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
187 " -h --help Show this help\n"
188 " --version Print version string\n"
189 " -q --quiet Do not show status information\n"
190 " -D --directory=PATH Root directory for the container\n"
191 " --template=PATH Initialize root directory from template directory,\n"
192 " if missing\n"
193 " -x --ephemeral Run container with snapshot of root directory, and\n"
194 " remove it after exit\n"
195 " -i --image=PATH File system device or disk image for the container\n"
196 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
197 " -b --boot Boot up full system (i.e. invoke init)\n"
198 " --chdir=PATH Set working directory in the container\n"
199 " -u --user=USER Run the command under specified user or uid\n"
200 " -M --machine=NAME Set the machine name for the container\n"
201 " --uuid=UUID Set a specific machine UUID for the container\n"
202 " -S --slice=SLICE Place the container in the specified slice\n"
203 " --property=NAME=VALUE Set scope unit property\n"
204 " --private-users[=UIDBASE[:NUIDS]]\n"
205 " Run within user namespace\n"
206 " --private-network Disable network in container\n"
207 " --network-interface=INTERFACE\n"
208 " Assign an existing network interface to the\n"
209 " container\n"
210 " --network-macvlan=INTERFACE\n"
211 " Create a macvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " --network-ipvlan=INTERFACE\n"
214 " Create a ipvlan network interface based on an\n"
215 " existing network interface to the container\n"
216 " -n --network-veth Add a virtual Ethernet connection between host\n"
217 " and container\n"
218 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
219 " Add an additional virtual Ethernet link between\n"
220 " host and container\n"
221 " --network-bridge=INTERFACE\n"
222 " Add a virtual Ethernet connection between host\n"
223 " and container and add it to an existing bridge on\n"
224 " the host\n"
225 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
226 " Expose a container IP port on the host\n"
227 " -Z --selinux-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " processes in the container\n"
230 " -L --selinux-apifs-context=SECLABEL\n"
231 " Set the SELinux security context to be used by\n"
232 " API/tmpfs file systems in the container\n"
233 " --capability=CAP In addition to the default, retain specified\n"
234 " capability\n"
235 " --drop-capability=CAP Drop the specified capability from the default set\n"
236 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
237 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
238 " host, try-guest, try-host\n"
239 " -j Equivalent to --link-journal=try-guest\n"
240 " --read-only Mount the root directory read-only\n"
241 " --bind=PATH[:PATH[:OPTIONS]]\n"
242 " Bind mount a file or directory from the host into\n"
243 " the container\n"
244 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
245 " Similar, but creates a read-only bind mount\n"
246 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
247 " --overlay=PATH[:PATH...]:PATH\n"
248 " Create an overlay mount from the host to \n"
249 " the container\n"
250 " --overlay-ro=PATH[:PATH...]:PATH\n"
251 " Similar, but creates a read-only overlay mount\n"
252 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
253 " --share-system Share system namespaces with host\n"
254 " --register=BOOLEAN Register container as machine\n"
255 " --keep-unit Do not register a scope for the machine, reuse\n"
256 " the service unit nspawn is running in\n"
257 " --volatile[=MODE] Run the system in volatile mode\n"
258 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
259 , program_invocation_short_name);
260 }
261
262
263 static int custom_mounts_prepare(void) {
264 unsigned i;
265 int r;
266
267 /* Ensure the mounts are applied prefix first. */
268 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
269
270 /* Allocate working directories for the overlay file systems that need it */
271 for (i = 0; i < arg_n_custom_mounts; i++) {
272 CustomMount *m = &arg_custom_mounts[i];
273
274 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
275 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
276 return -EINVAL;
277 }
278
279 if (m->type != CUSTOM_MOUNT_OVERLAY)
280 continue;
281
282 if (m->work_dir)
283 continue;
284
285 if (m->read_only)
286 continue;
287
288 r = tempfn_random(m->source, NULL, &m->work_dir);
289 if (r < 0)
290 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
291 }
292
293 return 0;
294 }
295
296 static int detect_unified_cgroup_hierarchy(void) {
297 const char *e;
298 int r;
299
300 /* Allow the user to control whether the unified hierarchy is used */
301 e = getenv("UNIFIED_CGROUP_HIERARCHY");
302 if (e) {
303 r = parse_boolean(e);
304 if (r < 0)
305 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
306
307 arg_unified_cgroup_hierarchy = r;
308 return 0;
309 }
310
311 /* Otherwise inherit the default from the host system */
312 r = cg_unified();
313 if (r < 0)
314 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
315
316 arg_unified_cgroup_hierarchy = r;
317 return 0;
318 }
319
320 static int parse_argv(int argc, char *argv[]) {
321
322 enum {
323 ARG_VERSION = 0x100,
324 ARG_PRIVATE_NETWORK,
325 ARG_UUID,
326 ARG_READ_ONLY,
327 ARG_CAPABILITY,
328 ARG_DROP_CAPABILITY,
329 ARG_LINK_JOURNAL,
330 ARG_BIND,
331 ARG_BIND_RO,
332 ARG_TMPFS,
333 ARG_OVERLAY,
334 ARG_OVERLAY_RO,
335 ARG_SETENV,
336 ARG_SHARE_SYSTEM,
337 ARG_REGISTER,
338 ARG_KEEP_UNIT,
339 ARG_NETWORK_INTERFACE,
340 ARG_NETWORK_MACVLAN,
341 ARG_NETWORK_IPVLAN,
342 ARG_NETWORK_BRIDGE,
343 ARG_NETWORK_VETH_EXTRA,
344 ARG_PERSONALITY,
345 ARG_VOLATILE,
346 ARG_TEMPLATE,
347 ARG_PROPERTY,
348 ARG_PRIVATE_USERS,
349 ARG_KILL_SIGNAL,
350 ARG_SETTINGS,
351 ARG_CHDIR,
352 };
353
354 static const struct option options[] = {
355 { "help", no_argument, NULL, 'h' },
356 { "version", no_argument, NULL, ARG_VERSION },
357 { "directory", required_argument, NULL, 'D' },
358 { "template", required_argument, NULL, ARG_TEMPLATE },
359 { "ephemeral", no_argument, NULL, 'x' },
360 { "user", required_argument, NULL, 'u' },
361 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
362 { "as-pid2", no_argument, NULL, 'a' },
363 { "boot", no_argument, NULL, 'b' },
364 { "uuid", required_argument, NULL, ARG_UUID },
365 { "read-only", no_argument, NULL, ARG_READ_ONLY },
366 { "capability", required_argument, NULL, ARG_CAPABILITY },
367 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
368 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
369 { "bind", required_argument, NULL, ARG_BIND },
370 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
371 { "tmpfs", required_argument, NULL, ARG_TMPFS },
372 { "overlay", required_argument, NULL, ARG_OVERLAY },
373 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
374 { "machine", required_argument, NULL, 'M' },
375 { "slice", required_argument, NULL, 'S' },
376 { "setenv", required_argument, NULL, ARG_SETENV },
377 { "selinux-context", required_argument, NULL, 'Z' },
378 { "selinux-apifs-context", required_argument, NULL, 'L' },
379 { "quiet", no_argument, NULL, 'q' },
380 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
381 { "register", required_argument, NULL, ARG_REGISTER },
382 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
383 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
384 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
385 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
386 { "network-veth", no_argument, NULL, 'n' },
387 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
388 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
389 { "personality", required_argument, NULL, ARG_PERSONALITY },
390 { "image", required_argument, NULL, 'i' },
391 { "volatile", optional_argument, NULL, ARG_VOLATILE },
392 { "port", required_argument, NULL, 'p' },
393 { "property", required_argument, NULL, ARG_PROPERTY },
394 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
395 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
396 { "settings", required_argument, NULL, ARG_SETTINGS },
397 { "chdir", required_argument, NULL, ARG_CHDIR },
398 {}
399 };
400
401 int c, r;
402 const char *p, *e;
403 uint64_t plus = 0, minus = 0;
404 bool mask_all_settings = false, mask_no_settings = false;
405
406 assert(argc >= 0);
407 assert(argv);
408
409 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
410
411 switch (c) {
412
413 case 'h':
414 help();
415 return 0;
416
417 case ARG_VERSION:
418 return version();
419
420 case 'D':
421 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
422 if (r < 0)
423 return r;
424 break;
425
426 case ARG_TEMPLATE:
427 r = parse_path_argument_and_warn(optarg, false, &arg_template);
428 if (r < 0)
429 return r;
430 break;
431
432 case 'i':
433 r = parse_path_argument_and_warn(optarg, false, &arg_image);
434 if (r < 0)
435 return r;
436 break;
437
438 case 'x':
439 arg_ephemeral = true;
440 break;
441
442 case 'u':
443 r = free_and_strdup(&arg_user, optarg);
444 if (r < 0)
445 return log_oom();
446
447 arg_settings_mask |= SETTING_USER;
448 break;
449
450 case ARG_NETWORK_BRIDGE:
451 r = free_and_strdup(&arg_network_bridge, optarg);
452 if (r < 0)
453 return log_oom();
454
455 /* fall through */
456
457 case 'n':
458 arg_network_veth = true;
459 arg_private_network = true;
460 arg_settings_mask |= SETTING_NETWORK;
461 break;
462
463 case ARG_NETWORK_VETH_EXTRA:
464 r = veth_extra_parse(&arg_network_veth_extra, optarg);
465 if (r < 0)
466 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
467
468 arg_private_network = true;
469 arg_settings_mask |= SETTING_NETWORK;
470 break;
471
472 case ARG_NETWORK_INTERFACE:
473 if (strv_extend(&arg_network_interfaces, optarg) < 0)
474 return log_oom();
475
476 arg_private_network = true;
477 arg_settings_mask |= SETTING_NETWORK;
478 break;
479
480 case ARG_NETWORK_MACVLAN:
481 if (strv_extend(&arg_network_macvlan, optarg) < 0)
482 return log_oom();
483
484 arg_private_network = true;
485 arg_settings_mask |= SETTING_NETWORK;
486 break;
487
488 case ARG_NETWORK_IPVLAN:
489 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
490 return log_oom();
491
492 /* fall through */
493
494 case ARG_PRIVATE_NETWORK:
495 arg_private_network = true;
496 arg_settings_mask |= SETTING_NETWORK;
497 break;
498
499 case 'b':
500 if (arg_start_mode == START_PID2) {
501 log_error("--boot and --as-pid2 may not be combined.");
502 return -EINVAL;
503 }
504
505 arg_start_mode = START_BOOT;
506 arg_settings_mask |= SETTING_START_MODE;
507 break;
508
509 case 'a':
510 if (arg_start_mode == START_BOOT) {
511 log_error("--boot and --as-pid2 may not be combined.");
512 return -EINVAL;
513 }
514
515 arg_start_mode = START_PID2;
516 arg_settings_mask |= SETTING_START_MODE;
517 break;
518
519 case ARG_UUID:
520 r = sd_id128_from_string(optarg, &arg_uuid);
521 if (r < 0) {
522 log_error("Invalid UUID: %s", optarg);
523 return r;
524 }
525
526 arg_settings_mask |= SETTING_MACHINE_ID;
527 break;
528
529 case 'S':
530 arg_slice = optarg;
531 break;
532
533 case 'M':
534 if (isempty(optarg))
535 arg_machine = mfree(arg_machine);
536 else {
537 if (!machine_name_is_valid(optarg)) {
538 log_error("Invalid machine name: %s", optarg);
539 return -EINVAL;
540 }
541
542 r = free_and_strdup(&arg_machine, optarg);
543 if (r < 0)
544 return log_oom();
545
546 break;
547 }
548
549 case 'Z':
550 arg_selinux_context = optarg;
551 break;
552
553 case 'L':
554 arg_selinux_apifs_context = optarg;
555 break;
556
557 case ARG_READ_ONLY:
558 arg_read_only = true;
559 arg_settings_mask |= SETTING_READ_ONLY;
560 break;
561
562 case ARG_CAPABILITY:
563 case ARG_DROP_CAPABILITY: {
564 p = optarg;
565 for (;;) {
566 _cleanup_free_ char *t = NULL;
567
568 r = extract_first_word(&p, &t, ",", 0);
569 if (r < 0)
570 return log_error_errno(r, "Failed to parse capability %s.", t);
571
572 if (r == 0)
573 break;
574
575 if (streq(t, "all")) {
576 if (c == ARG_CAPABILITY)
577 plus = (uint64_t) -1;
578 else
579 minus = (uint64_t) -1;
580 } else {
581 int cap;
582
583 cap = capability_from_name(t);
584 if (cap < 0) {
585 log_error("Failed to parse capability %s.", t);
586 return -EINVAL;
587 }
588
589 if (c == ARG_CAPABILITY)
590 plus |= 1ULL << (uint64_t) cap;
591 else
592 minus |= 1ULL << (uint64_t) cap;
593 }
594 }
595
596 arg_settings_mask |= SETTING_CAPABILITY;
597 break;
598 }
599
600 case 'j':
601 arg_link_journal = LINK_GUEST;
602 arg_link_journal_try = true;
603 break;
604
605 case ARG_LINK_JOURNAL:
606 if (streq(optarg, "auto")) {
607 arg_link_journal = LINK_AUTO;
608 arg_link_journal_try = false;
609 } else if (streq(optarg, "no")) {
610 arg_link_journal = LINK_NO;
611 arg_link_journal_try = false;
612 } else if (streq(optarg, "guest")) {
613 arg_link_journal = LINK_GUEST;
614 arg_link_journal_try = false;
615 } else if (streq(optarg, "host")) {
616 arg_link_journal = LINK_HOST;
617 arg_link_journal_try = false;
618 } else if (streq(optarg, "try-guest")) {
619 arg_link_journal = LINK_GUEST;
620 arg_link_journal_try = true;
621 } else if (streq(optarg, "try-host")) {
622 arg_link_journal = LINK_HOST;
623 arg_link_journal_try = true;
624 } else {
625 log_error("Failed to parse link journal mode %s", optarg);
626 return -EINVAL;
627 }
628
629 break;
630
631 case ARG_BIND:
632 case ARG_BIND_RO:
633 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
634 if (r < 0)
635 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
636
637 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
638 break;
639
640 case ARG_TMPFS:
641 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
642 if (r < 0)
643 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
644
645 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
646 break;
647
648 case ARG_OVERLAY:
649 case ARG_OVERLAY_RO: {
650 _cleanup_free_ char *upper = NULL, *destination = NULL;
651 _cleanup_strv_free_ char **lower = NULL;
652 CustomMount *m;
653 unsigned n = 0;
654 char **i;
655
656 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
657 if (r == -ENOMEM)
658 return log_oom();
659 else if (r < 0) {
660 log_error("Invalid overlay specification: %s", optarg);
661 return r;
662 }
663
664 STRV_FOREACH(i, lower) {
665 if (!path_is_absolute(*i)) {
666 log_error("Overlay path %s is not absolute.", *i);
667 return -EINVAL;
668 }
669
670 n++;
671 }
672
673 if (n < 2) {
674 log_error("--overlay= needs at least two colon-separated directories specified.");
675 return -EINVAL;
676 }
677
678 if (n == 2) {
679 /* If two parameters are specified,
680 * the first one is the lower, the
681 * second one the upper directory. And
682 * we'll also define the destination
683 * mount point the same as the upper. */
684 upper = lower[1];
685 lower[1] = NULL;
686
687 destination = strdup(upper);
688 if (!destination)
689 return log_oom();
690
691 } else {
692 upper = lower[n - 2];
693 destination = lower[n - 1];
694 lower[n - 2] = NULL;
695 }
696
697 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
698 if (!m)
699 return log_oom();
700
701 m->destination = destination;
702 m->source = upper;
703 m->lower = lower;
704 m->read_only = c == ARG_OVERLAY_RO;
705
706 upper = destination = NULL;
707 lower = NULL;
708
709 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
710 break;
711 }
712
713 case ARG_SETENV: {
714 char **n;
715
716 if (!env_assignment_is_valid(optarg)) {
717 log_error("Environment variable assignment '%s' is not valid.", optarg);
718 return -EINVAL;
719 }
720
721 n = strv_env_set(arg_setenv, optarg);
722 if (!n)
723 return log_oom();
724
725 strv_free(arg_setenv);
726 arg_setenv = n;
727
728 arg_settings_mask |= SETTING_ENVIRONMENT;
729 break;
730 }
731
732 case 'q':
733 arg_quiet = true;
734 break;
735
736 case ARG_SHARE_SYSTEM:
737 arg_share_system = true;
738 break;
739
740 case ARG_REGISTER:
741 r = parse_boolean(optarg);
742 if (r < 0) {
743 log_error("Failed to parse --register= argument: %s", optarg);
744 return r;
745 }
746
747 arg_register = r;
748 break;
749
750 case ARG_KEEP_UNIT:
751 arg_keep_unit = true;
752 break;
753
754 case ARG_PERSONALITY:
755
756 arg_personality = personality_from_string(optarg);
757 if (arg_personality == PERSONALITY_INVALID) {
758 log_error("Unknown or unsupported personality '%s'.", optarg);
759 return -EINVAL;
760 }
761
762 arg_settings_mask |= SETTING_PERSONALITY;
763 break;
764
765 case ARG_VOLATILE:
766
767 if (!optarg)
768 arg_volatile_mode = VOLATILE_YES;
769 else {
770 VolatileMode m;
771
772 m = volatile_mode_from_string(optarg);
773 if (m < 0) {
774 log_error("Failed to parse --volatile= argument: %s", optarg);
775 return -EINVAL;
776 } else
777 arg_volatile_mode = m;
778 }
779
780 arg_settings_mask |= SETTING_VOLATILE_MODE;
781 break;
782
783 case 'p':
784 r = expose_port_parse(&arg_expose_ports, optarg);
785 if (r == -EEXIST)
786 return log_error_errno(r, "Duplicate port specification: %s", optarg);
787 if (r < 0)
788 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
789
790 arg_settings_mask |= SETTING_EXPOSE_PORTS;
791 break;
792
793 case ARG_PROPERTY:
794 if (strv_extend(&arg_property, optarg) < 0)
795 return log_oom();
796
797 break;
798
799 case ARG_PRIVATE_USERS:
800 if (optarg) {
801 _cleanup_free_ char *buffer = NULL;
802 const char *range, *shift;
803
804 range = strchr(optarg, ':');
805 if (range) {
806 buffer = strndup(optarg, range - optarg);
807 if (!buffer)
808 return log_oom();
809 shift = buffer;
810
811 range++;
812 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
813 log_error("Failed to parse UID range: %s", range);
814 return -EINVAL;
815 }
816 } else
817 shift = optarg;
818
819 if (parse_uid(shift, &arg_uid_shift) < 0) {
820 log_error("Failed to parse UID: %s", optarg);
821 return -EINVAL;
822 }
823 }
824
825 arg_userns = true;
826 break;
827
828 case ARG_KILL_SIGNAL:
829 arg_kill_signal = signal_from_string_try_harder(optarg);
830 if (arg_kill_signal < 0) {
831 log_error("Cannot parse signal: %s", optarg);
832 return -EINVAL;
833 }
834
835 arg_settings_mask |= SETTING_KILL_SIGNAL;
836 break;
837
838 case ARG_SETTINGS:
839
840 /* no → do not read files
841 * yes → read files, do not override cmdline, trust only subset
842 * override → read files, override cmdline, trust only subset
843 * trusted → read files, do not override cmdline, trust all
844 */
845
846 r = parse_boolean(optarg);
847 if (r < 0) {
848 if (streq(optarg, "trusted")) {
849 mask_all_settings = false;
850 mask_no_settings = false;
851 arg_settings_trusted = true;
852
853 } else if (streq(optarg, "override")) {
854 mask_all_settings = false;
855 mask_no_settings = true;
856 arg_settings_trusted = -1;
857 } else
858 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
859 } else if (r > 0) {
860 /* yes */
861 mask_all_settings = false;
862 mask_no_settings = false;
863 arg_settings_trusted = -1;
864 } else {
865 /* no */
866 mask_all_settings = true;
867 mask_no_settings = false;
868 arg_settings_trusted = false;
869 }
870
871 break;
872
873 case ARG_CHDIR:
874 if (!path_is_absolute(optarg)) {
875 log_error("Working directory %s is not an absolute path.", optarg);
876 return -EINVAL;
877 }
878
879 r = free_and_strdup(&arg_chdir, optarg);
880 if (r < 0)
881 return log_oom();
882
883 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
884 break;
885
886 case '?':
887 return -EINVAL;
888
889 default:
890 assert_not_reached("Unhandled option");
891 }
892
893 if (arg_share_system)
894 arg_register = false;
895
896 if (arg_start_mode != START_PID1 && arg_share_system) {
897 log_error("--boot and --share-system may not be combined.");
898 return -EINVAL;
899 }
900
901 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
902 log_error("--keep-unit may not be used when invoked from a user session.");
903 return -EINVAL;
904 }
905
906 if (arg_directory && arg_image) {
907 log_error("--directory= and --image= may not be combined.");
908 return -EINVAL;
909 }
910
911 if (arg_template && arg_image) {
912 log_error("--template= and --image= may not be combined.");
913 return -EINVAL;
914 }
915
916 if (arg_template && !(arg_directory || arg_machine)) {
917 log_error("--template= needs --directory= or --machine=.");
918 return -EINVAL;
919 }
920
921 if (arg_ephemeral && arg_template) {
922 log_error("--ephemeral and --template= may not be combined.");
923 return -EINVAL;
924 }
925
926 if (arg_ephemeral && arg_image) {
927 log_error("--ephemeral and --image= may not be combined.");
928 return -EINVAL;
929 }
930
931 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
932 log_error("--ephemeral and --link-journal= may not be combined.");
933 return -EINVAL;
934 }
935
936 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
937 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
938
939 if (argc > optind) {
940 arg_parameters = strv_copy(argv + optind);
941 if (!arg_parameters)
942 return log_oom();
943
944 arg_settings_mask |= SETTING_START_MODE;
945 }
946
947 /* Load all settings from .nspawn files */
948 if (mask_no_settings)
949 arg_settings_mask = 0;
950
951 /* Don't load any settings from .nspawn files */
952 if (mask_all_settings)
953 arg_settings_mask = _SETTINGS_MASK_ALL;
954
955 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
956
957 r = detect_unified_cgroup_hierarchy();
958 if (r < 0)
959 return r;
960
961 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
962 if (e)
963 arg_container_service_name = e;
964
965 return 1;
966 }
967
968 static int verify_arguments(void) {
969
970 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
971 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
972 return -EINVAL;
973 }
974
975 if (arg_expose_ports && !arg_private_network) {
976 log_error("Cannot use --port= without private networking.");
977 return -EINVAL;
978 }
979
980 #ifndef HAVE_LIBIPTC
981 if (arg_expose_ports) {
982 log_error("--port= is not supported, compiled without libiptc support.");
983 return -EOPNOTSUPP;
984 }
985 #endif
986
987 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
988 arg_kill_signal = SIGRTMIN+3;
989
990 return 0;
991 }
992
993 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
994 assert(p);
995
996 if (!arg_userns)
997 return 0;
998
999 if (uid == UID_INVALID && gid == GID_INVALID)
1000 return 0;
1001
1002 if (uid != UID_INVALID) {
1003 uid += arg_uid_shift;
1004
1005 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1006 return -EOVERFLOW;
1007 }
1008
1009 if (gid != GID_INVALID) {
1010 gid += (gid_t) arg_uid_shift;
1011
1012 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1013 return -EOVERFLOW;
1014 }
1015
1016 if (lchown(p, uid, gid) < 0)
1017 return -errno;
1018
1019 return 0;
1020 }
1021
/* Creates the directory "path" below "root" with the given mode and, if it
 * was newly created, hands it to userns_lchown() with the given owner.
 *
 * An already existing directory is left untouched and reported as success.
 * Returns 0 on success, -errno if mkdir() fails for any other reason, or the
 * result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1034
1035 static int setup_timezone(const char *dest) {
1036 _cleanup_free_ char *p = NULL, *q = NULL;
1037 const char *where, *check, *what;
1038 char *z, *y;
1039 int r;
1040
1041 assert(dest);
1042
1043 /* Fix the timezone, if possible */
1044 r = readlink_malloc("/etc/localtime", &p);
1045 if (r < 0) {
1046 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1047 return 0;
1048 }
1049
1050 z = path_startswith(p, "../usr/share/zoneinfo/");
1051 if (!z)
1052 z = path_startswith(p, "/usr/share/zoneinfo/");
1053 if (!z) {
1054 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1055 return 0;
1056 }
1057
1058 where = prefix_roota(dest, "/etc/localtime");
1059 r = readlink_malloc(where, &q);
1060 if (r >= 0) {
1061 y = path_startswith(q, "../usr/share/zoneinfo/");
1062 if (!y)
1063 y = path_startswith(q, "/usr/share/zoneinfo/");
1064
1065 /* Already pointing to the right place? Then do nothing .. */
1066 if (y && streq(y, z))
1067 return 0;
1068 }
1069
1070 check = strjoina("/usr/share/zoneinfo/", z);
1071 check = prefix_roota(dest, check);
1072 if (laccess(check, F_OK) < 0) {
1073 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1074 return 0;
1075 }
1076
1077 r = unlink(where);
1078 if (r < 0 && errno != ENOENT) {
1079 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1080 return 0;
1081 }
1082
1083 what = strjoina("../usr/share/zoneinfo/", z);
1084 if (symlink(what, where) < 0) {
1085 log_error_errno(errno, "Failed to correct timezone of container: %m");
1086 return 0;
1087 }
1088
1089 r = userns_lchown(where, 0, 0);
1090 if (r < 0)
1091 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1092
1093 return 0;
1094 }
1095
1096 static int setup_resolv_conf(const char *dest) {
1097 const char *where = NULL;
1098 int r;
1099
1100 assert(dest);
1101
1102 if (arg_private_network)
1103 return 0;
1104
1105 /* Fix resolv.conf, if possible */
1106 where = prefix_roota(dest, "/etc/resolv.conf");
1107
1108 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1109 if (r < 0) {
1110 /* If the file already exists as symlink, let's
1111 * suppress the warning, under the assumption that
1112 * resolved or something similar runs inside and the
1113 * symlink points there.
1114 *
1115 * If the disk image is read-only, there's also no
1116 * point in complaining.
1117 */
1118 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1119 "Failed to copy /etc/resolv.conf to %s: %m", where);
1120 return 0;
1121 }
1122
1123 r = userns_lchown(where, 0, 0);
1124 if (r < 0)
1125 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1126
1127 return 0;
1128 }
1129
1130 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1131 assert(s);
1132
1133 snprintf(s, 37,
1134 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1135 SD_ID128_FORMAT_VAL(id));
1136
1137 return s;
1138 }
1139
1140 static int setup_boot_id(const char *dest) {
1141 const char *from, *to;
1142 sd_id128_t rnd = {};
1143 char as_uuid[37];
1144 int r;
1145
1146 if (arg_share_system)
1147 return 0;
1148
1149 /* Generate a new randomized boot ID, so that each boot-up of
1150 * the container gets a new one */
1151
1152 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1153 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1154
1155 r = sd_id128_randomize(&rnd);
1156 if (r < 0)
1157 return log_error_errno(r, "Failed to generate random boot id: %m");
1158
1159 id128_format_as_uuid(rnd, as_uuid);
1160
1161 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1162 if (r < 0)
1163 return log_error_errno(r, "Failed to write boot id: %m");
1164
1165 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1166 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1167 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1168 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1169
1170 unlink(from);
1171 return r;
1172 }
1173
1174 static int copy_devnodes(const char *dest) {
1175
1176 static const char devnodes[] =
1177 "null\0"
1178 "zero\0"
1179 "full\0"
1180 "random\0"
1181 "urandom\0"
1182 "tty\0"
1183 "net/tun\0";
1184
1185 const char *d;
1186 int r = 0;
1187 _cleanup_umask_ mode_t u;
1188
1189 assert(dest);
1190
1191 u = umask(0000);
1192
1193 /* Create /dev/net, so that we can create /dev/net/tun in it */
1194 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1195 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1196
1197 NULSTR_FOREACH(d, devnodes) {
1198 _cleanup_free_ char *from = NULL, *to = NULL;
1199 struct stat st;
1200
1201 from = strappend("/dev/", d);
1202 to = prefix_root(dest, from);
1203
1204 if (stat(from, &st) < 0) {
1205
1206 if (errno != ENOENT)
1207 return log_error_errno(errno, "Failed to stat %s: %m", from);
1208
1209 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1210
1211 log_error("%s is not a char or block device, cannot copy.", from);
1212 return -EIO;
1213
1214 } else {
1215 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1216 if (errno != EPERM)
1217 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1218
1219 /* Some systems abusively restrict mknod but
1220 * allow bind mounts. */
1221 r = touch(to);
1222 if (r < 0)
1223 return log_error_errno(r, "touch (%s) failed: %m", to);
1224 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1225 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1226 }
1227
1228 r = userns_lchown(to, 0, 0);
1229 if (r < 0)
1230 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1231 }
1232 }
1233
1234 return r;
1235 }
1236
1237 static int setup_pts(const char *dest) {
1238 _cleanup_free_ char *options = NULL;
1239 const char *p;
1240 int r;
1241
1242 #ifdef HAVE_SELINUX
1243 if (arg_selinux_apifs_context)
1244 (void) asprintf(&options,
1245 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1246 arg_uid_shift + TTY_GID,
1247 arg_selinux_apifs_context);
1248 else
1249 #endif
1250 (void) asprintf(&options,
1251 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1252 arg_uid_shift + TTY_GID);
1253
1254 if (!options)
1255 return log_oom();
1256
1257 /* Mount /dev/pts itself */
1258 p = prefix_roota(dest, "/dev/pts");
1259 if (mkdir(p, 0755) < 0)
1260 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1261 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1262 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1263 r = userns_lchown(p, 0, 0);
1264 if (r < 0)
1265 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1266
1267 /* Create /dev/ptmx symlink */
1268 p = prefix_roota(dest, "/dev/ptmx");
1269 if (symlink("pts/ptmx", p) < 0)
1270 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1271 r = userns_lchown(p, 0, 0);
1272 if (r < 0)
1273 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1274
1275 /* And fix /dev/pts/ptmx ownership */
1276 p = prefix_roota(dest, "/dev/pts/ptmx");
1277 r = userns_lchown(p, 0, 0);
1278 if (r < 0)
1279 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1280
1281 return 0;
1282 }
1283
1284 static int setup_dev_console(const char *dest, const char *console) {
1285 _cleanup_umask_ mode_t u;
1286 const char *to;
1287 int r;
1288
1289 assert(dest);
1290 assert(console);
1291
1292 u = umask(0000);
1293
1294 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1295 if (r < 0)
1296 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1297
1298 /* We need to bind mount the right tty to /dev/console since
1299 * ptys can only exist on pts file systems. To have something
1300 * to bind mount things on we create a empty regular file. */
1301
1302 to = prefix_roota(dest, "/dev/console");
1303 r = touch(to);
1304 if (r < 0)
1305 return log_error_errno(r, "touch() for /dev/console failed: %m");
1306
1307 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1308 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1309
1310 return 0;
1311 }
1312
1313 static int setup_kmsg(const char *dest, int kmsg_socket) {
1314 const char *from, *to;
1315 _cleanup_umask_ mode_t u;
1316 int fd, r;
1317
1318 assert(kmsg_socket >= 0);
1319
1320 u = umask(0000);
1321
1322 /* We create the kmsg FIFO as /run/kmsg, but immediately
1323 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1324 * on the reading side behave very similar to /proc/kmsg,
1325 * their writing side behaves differently from /dev/kmsg in
1326 * that writing blocks when nothing is reading. In order to
1327 * avoid any problems with containers deadlocking due to this
1328 * we simply make /dev/kmsg unavailable to the container. */
1329 from = prefix_roota(dest, "/run/kmsg");
1330 to = prefix_roota(dest, "/proc/kmsg");
1331
1332 if (mkfifo(from, 0600) < 0)
1333 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1334 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1335 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1336
1337 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1338 if (fd < 0)
1339 return log_error_errno(errno, "Failed to open fifo: %m");
1340
1341 /* Store away the fd in the socket, so that it stays open as
1342 * long as we run the child */
1343 r = send_one_fd(kmsg_socket, fd, 0);
1344 safe_close(fd);
1345
1346 if (r < 0)
1347 return log_error_errno(r, "Failed to send FIFO fd: %m");
1348
1349 /* And now make the FIFO unavailable as /run/kmsg... */
1350 (void) unlink(from);
1351
1352 return 0;
1353 }
1354
1355 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1356 union in_addr_union *exposed = userdata;
1357
1358 assert(rtnl);
1359 assert(m);
1360 assert(exposed);
1361
1362 expose_port_execute(rtnl, arg_expose_ports, exposed);
1363 return 0;
1364 }
1365
1366 static int setup_hostname(void) {
1367
1368 if (arg_share_system)
1369 return 0;
1370
1371 if (sethostname_idempotent(arg_machine) < 0)
1372 return -errno;
1373
1374 return 0;
1375 }
1376
1377 static int setup_journal(const char *directory) {
1378 sd_id128_t machine_id, this_id;
1379 _cleanup_free_ char *b = NULL, *d = NULL;
1380 const char *etc_machine_id, *p, *q;
1381 bool try;
1382 char *id;
1383 int r;
1384
1385 /* Don't link journals in ephemeral mode */
1386 if (arg_ephemeral)
1387 return 0;
1388
1389 if (arg_link_journal == LINK_NO)
1390 return 0;
1391
1392 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1393
1394 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1395
1396 r = read_one_line_file(etc_machine_id, &b);
1397 if (r == -ENOENT && try)
1398 return 0;
1399 else if (r < 0)
1400 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1401
1402 id = strstrip(b);
1403 if (isempty(id) && try)
1404 return 0;
1405
1406 /* Verify validity */
1407 r = sd_id128_from_string(id, &machine_id);
1408 if (r < 0)
1409 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1410
1411 r = sd_id128_get_machine(&this_id);
1412 if (r < 0)
1413 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1414
1415 if (sd_id128_equal(machine_id, this_id)) {
1416 log_full(try ? LOG_WARNING : LOG_ERR,
1417 "Host and machine ids are equal (%s): refusing to link journals", id);
1418 if (try)
1419 return 0;
1420 return -EEXIST;
1421 }
1422
1423 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1424 if (r < 0)
1425 return log_error_errno(r, "Failed to create /var: %m");
1426
1427 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1428 if (r < 0)
1429 return log_error_errno(r, "Failed to create /var/log: %m");
1430
1431 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1432 if (r < 0)
1433 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1434
1435 p = strjoina("/var/log/journal/", id);
1436 q = prefix_roota(directory, p);
1437
1438 if (path_is_mount_point(p, 0) > 0) {
1439 if (try)
1440 return 0;
1441
1442 log_error("%s: already a mount point, refusing to use for journal", p);
1443 return -EEXIST;
1444 }
1445
1446 if (path_is_mount_point(q, 0) > 0) {
1447 if (try)
1448 return 0;
1449
1450 log_error("%s: already a mount point, refusing to use for journal", q);
1451 return -EEXIST;
1452 }
1453
1454 r = readlink_and_make_absolute(p, &d);
1455 if (r >= 0) {
1456 if ((arg_link_journal == LINK_GUEST ||
1457 arg_link_journal == LINK_AUTO) &&
1458 path_equal(d, q)) {
1459
1460 r = userns_mkdir(directory, p, 0755, 0, 0);
1461 if (r < 0)
1462 log_warning_errno(r, "Failed to create directory %s: %m", q);
1463 return 0;
1464 }
1465
1466 if (unlink(p) < 0)
1467 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1468 } else if (r == -EINVAL) {
1469
1470 if (arg_link_journal == LINK_GUEST &&
1471 rmdir(p) < 0) {
1472
1473 if (errno == ENOTDIR) {
1474 log_error("%s already exists and is neither a symlink nor a directory", p);
1475 return r;
1476 } else
1477 return log_error_errno(errno, "Failed to remove %s: %m", p);
1478 }
1479 } else if (r != -ENOENT)
1480 return log_error_errno(r, "readlink(%s) failed: %m", p);
1481
1482 if (arg_link_journal == LINK_GUEST) {
1483
1484 if (symlink(q, p) < 0) {
1485 if (try) {
1486 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1487 return 0;
1488 } else
1489 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1490 }
1491
1492 r = userns_mkdir(directory, p, 0755, 0, 0);
1493 if (r < 0)
1494 log_warning_errno(r, "Failed to create directory %s: %m", q);
1495 return 0;
1496 }
1497
1498 if (arg_link_journal == LINK_HOST) {
1499 /* don't create parents here -- if the host doesn't have
1500 * permanent journal set up, don't force it here */
1501
1502 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
1503 if (try) {
1504 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1505 return 0;
1506 } else
1507 return log_error_errno(errno, "Failed to create %s: %m", p);
1508 }
1509
1510 } else if (access(p, F_OK) < 0)
1511 return 0;
1512
1513 if (dir_is_empty(q) == 0)
1514 log_warning("%s is not empty, proceeding anyway.", q);
1515
1516 r = userns_mkdir(directory, p, 0755, 0, 0);
1517 if (r < 0)
1518 return log_error_errno(r, "Failed to create %s: %m", q);
1519
1520 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1521 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1522
1523 return 0;
1524 }
1525
1526 static int drop_capabilities(void) {
1527 return capability_bounding_set_drop(arg_retain, false);
1528 }
1529
1530 static int reset_audit_loginuid(void) {
1531 _cleanup_free_ char *p = NULL;
1532 int r;
1533
1534 if (arg_share_system)
1535 return 0;
1536
1537 r = read_one_line_file("/proc/self/loginuid", &p);
1538 if (r == -ENOENT)
1539 return 0;
1540 if (r < 0)
1541 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1542
1543 /* Already reset? */
1544 if (streq(p, "4294967295"))
1545 return 0;
1546
1547 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1548 if (r < 0) {
1549 log_error_errno(r,
1550 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1551 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1552 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1553 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1554 "using systemd-nspawn. Sleeping for 5s... (%m)");
1555
1556 sleep(5);
1557 }
1558
1559 return 0;
1560 }
1561
/* Installs a seccomp filter for the container payload.
 *
 * The filter allows everything by default. Each system call of the table below is made to fail
 * with EPERM, unless the capability listed next to it is part of arg_retain. In addition, the
 * creation of NETLINK_AUDIT sockets is made to fail with EAFNOSUPPORT.
 *
 * Returns 0 on success, or if loading the filter fails with EINVAL; a negative errno-style error
 * otherwise. Without seccomp support compiled in this is a NOP returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(blacklist); idx++) {
                bool retained;

                /* System calls guarded by a retained capability stay available */
                retained = (arg_retain & (1ULL << blacklist[idx].capability)) != 0;
                if (retained)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[idx].syscall_num, 0);

                /* -EFAULT is what we get for an unknown syscall, which is skipped quietly */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace audit hookup fails if run
         * inside of one. We don't care and simply turn off the creation of audit sockets.
         *
         * The rule below makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT, which
         * audit userspace takes as indication that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                /* Not fatal, we continue without a filter then */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1656
1657 static int setup_propagate(const char *root) {
1658 const char *p, *q;
1659 int r;
1660
1661 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1662 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1663 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1664 (void) mkdir_p(p, 0600);
1665
1666 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1667 if (r < 0)
1668 return log_error_errno(r, "Failed to create /run/systemd: %m");
1669
1670 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1671 if (r < 0)
1672 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1673
1674 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1675 if (r < 0)
1676 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1677
1678 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1679 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1680 return log_error_errno(errno, "Failed to install propagation bind mount.");
1681
1682 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1683 return log_error_errno(errno, "Failed to make propagation mount read-only");
1684
1685 return 0;
1686 }
1687
1688 static int setup_image(char **device_path, int *loop_nr) {
1689 struct loop_info64 info = {
1690 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1691 };
1692 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1693 _cleanup_free_ char* loopdev = NULL;
1694 struct stat st;
1695 int r, nr;
1696
1697 assert(device_path);
1698 assert(loop_nr);
1699 assert(arg_image);
1700
1701 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1702 if (fd < 0)
1703 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1704
1705 if (fstat(fd, &st) < 0)
1706 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1707
1708 if (S_ISBLK(st.st_mode)) {
1709 char *p;
1710
1711 p = strdup(arg_image);
1712 if (!p)
1713 return log_oom();
1714
1715 *device_path = p;
1716
1717 *loop_nr = -1;
1718
1719 r = fd;
1720 fd = -1;
1721
1722 return r;
1723 }
1724
1725 if (!S_ISREG(st.st_mode)) {
1726 log_error("%s is not a regular file or block device.", arg_image);
1727 return -EINVAL;
1728 }
1729
1730 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1731 if (control < 0)
1732 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1733
1734 nr = ioctl(control, LOOP_CTL_GET_FREE);
1735 if (nr < 0)
1736 return log_error_errno(errno, "Failed to allocate loop device: %m");
1737
1738 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1739 return log_oom();
1740
1741 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1742 if (loop < 0)
1743 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1744
1745 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1746 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1747
1748 if (arg_read_only)
1749 info.lo_flags |= LO_FLAGS_READ_ONLY;
1750
1751 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1752 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1753
1754 *device_path = loopdev;
1755 loopdev = NULL;
1756
1757 *loop_nr = nr;
1758
1759 r = loop;
1760 loop = -1;
1761
1762 return r;
1763 }
1764
/* Explanation of the partition table layouts that are supported, appended to the error messages
 * dissect_image() logs when a disk image cannot be made sense of. */
#define PARTITION_TABLE_BLURB                                                                  \
        "Note that the disk image needs to either contain only a single MBR partition of\n"   \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                     \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"       \
        "to be bootable with systemd-nspawn."
1771
/* Analyzes the partition table of the block device open as 'fd' and picks the partitions to mount.
 *
 * GPT: partitions carrying the "no auto" flag are ignored. Of the partitions of the home, srv,
 * native root and secondary root types the one with the lowest partition number wins each.
 * Partitions of the generic Linux type are only used as root if no root partition of a specific
 * type exists, and only if there is exactly one of them.
 *
 * MBR: only partitions of type 0x83 whose flags are exactly 0x80 (bootable) are considered, and
 * they are treated like generic partitions, i.e. there has to be exactly one of them.
 *
 * On success 0 is returned, *root_device is set to a newly allocated device node path (owned by
 * the caller), *root_device_rw tells whether it may be mounted writable, and *secondary whether it
 * is the secondary architecture's root partition. *home_device/*home_device_rw and
 * *srv_device/*srv_device_rw are only written if such a partition was found.
 *
 * Returns a negative errno-style error on failure, -EOPNOTSUPP if built without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel lists as many partitions for the device as blkid found. Gives up
         * after 10 attempts. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The comparison is against m + 1, as
                 * the enumeration is expected to contain one more entry than there are
                 * partitions (the loop further down skips the whole device itself). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Of multiple candidates the one with the lowest number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* This has to be stored in 'generic' rather than 'root', as the
                                 * check above as well as the detection of multiple candidates
                                 * further down are both keyed on 'generic'. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Pick the root partition: native root first, then secondary root, then a single generic
         * partition. The local pointers are reset whenever a string is passed to the caller, so
         * that the cleanup handlers leave it alone. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2141
/* Mounts the block device 'what' to 'where', or to the subdirectory 'directory' of 'where' if
 * 'directory' is non-NULL.
 *
 * The file system type is determined with blkid. LUKS volumes are refused. The mount is done
 * read-only if 'rw' is false or arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style error otherwise, -EOPNOTSUPP if built without blkid
 * support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2 is an ambivalent result, 1 means nothing was detected. Both mean that we
                 * do not know what we are looking at. A return value of -1 is a real error and
                 * handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2203
2204 static int mount_devices(
2205 const char *where,
2206 const char *root_device, bool root_device_rw,
2207 const char *home_device, bool home_device_rw,
2208 const char *srv_device, bool srv_device_rw) {
2209 int r;
2210
2211 assert(where);
2212
2213 if (root_device) {
2214 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2215 if (r < 0)
2216 return log_error_errno(r, "Failed to mount root directory: %m");
2217 }
2218
2219 if (home_device) {
2220 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2221 if (r < 0)
2222 return log_error_errno(r, "Failed to mount home directory: %m");
2223 }
2224
2225 if (srv_device) {
2226 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2227 if (r < 0)
2228 return log_error_errno(r, "Failed to mount server data directory: %m");
2229 }
2230
2231 return 0;
2232 }
2233
2234 static void loop_remove(int nr, int *image_fd) {
2235 _cleanup_close_ int control = -1;
2236 int r;
2237
2238 if (nr < 0)
2239 return;
2240
2241 if (image_fd && *image_fd >= 0) {
2242 r = ioctl(*image_fd, LOOP_CLR_FD);
2243 if (r < 0)
2244 log_debug_errno(errno, "Failed to close loop image: %m");
2245 *image_fd = safe_close(*image_fd);
2246 }
2247
2248 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2249 if (control < 0) {
2250 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2251 return;
2252 }
2253
2254 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2255 if (r < 0)
2256 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2257 }
2258
2259 /*
2260 * Return values:
2261 * < 0 : wait_for_terminate() failed to get the state of the
2262 * container, the container was terminated by a signal, or
2263 * failed for an unknown reason. No change is made to the
2264 * container argument.
2265 * > 0 : The program executed in the container terminated with an
2266 * error. The exit code of the program executed in the
2267 * container is returned. The container argument has been set
2268 * to CONTAINER_TERMINATED.
2269 * 0 : The container is being rebooted, has been shut down or exited
2270 * successfully. The container argument has been set to either
2271 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2272 *
2273 * That is, success is indicated by a return value of zero, and an
2274 * error is indicated by a non-zero value.
2275 */
2276 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2277 siginfo_t status;
2278 int r;
2279
2280 r = wait_for_terminate(pid, &status);
2281 if (r < 0)
2282 return log_warning_errno(r, "Failed to wait for container: %m");
2283
2284 switch (status.si_code) {
2285
2286 case CLD_EXITED:
2287 if (status.si_status == 0) {
2288 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2289
2290 } else
2291 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2292
2293 *container = CONTAINER_TERMINATED;
2294 return status.si_status;
2295
2296 case CLD_KILLED:
2297 if (status.si_status == SIGINT) {
2298
2299 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2300 *container = CONTAINER_TERMINATED;
2301 return 0;
2302
2303 } else if (status.si_status == SIGHUP) {
2304
2305 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2306 *container = CONTAINER_REBOOTED;
2307 return 0;
2308 }
2309
2310 /* CLD_KILLED fallthrough */
2311
2312 case CLD_DUMPED:
2313 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2314 return -EIO;
2315
2316 default:
2317 log_error("Container %s failed due to unknown reason.", arg_machine);
2318 return -EIO;
2319 }
2320
2321 return r;
2322 }
2323
2324 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2325 pid_t pid;
2326
2327 pid = PTR_TO_PID(userdata);
2328 if (pid > 0) {
2329 if (kill(pid, arg_kill_signal) >= 0) {
2330 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2331 sd_event_source_set_userdata(s, NULL);
2332 return 0;
2333 }
2334 }
2335
2336 sd_event_exit(sd_event_source_get_event(s), 0);
2337 return 0;
2338 }
2339
2340 static int determine_names(void) {
2341 int r;
2342
2343 if (arg_template && !arg_directory && arg_machine) {
2344
2345 /* If --template= was specified then we should not
2346 * search for a machine, but instead create a new one
2347 * in /var/lib/machine. */
2348
2349 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2350 if (!arg_directory)
2351 return log_oom();
2352 }
2353
2354 if (!arg_image && !arg_directory) {
2355 if (arg_machine) {
2356 _cleanup_(image_unrefp) Image *i = NULL;
2357
2358 r = image_find(arg_machine, &i);
2359 if (r < 0)
2360 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2361 else if (r == 0) {
2362 log_error("No image for machine '%s': %m", arg_machine);
2363 return -ENOENT;
2364 }
2365
2366 if (i->type == IMAGE_RAW)
2367 r = free_and_strdup(&arg_image, i->path);
2368 else
2369 r = free_and_strdup(&arg_directory, i->path);
2370 if (r < 0)
2371 return log_error_errno(r, "Invalid image directory: %m");
2372
2373 if (!arg_ephemeral)
2374 arg_read_only = arg_read_only || i->read_only;
2375 } else
2376 arg_directory = get_current_dir_name();
2377
2378 if (!arg_directory && !arg_machine) {
2379 log_error("Failed to determine path, please use -D or -i.");
2380 return -EINVAL;
2381 }
2382 }
2383
2384 if (!arg_machine) {
2385 if (arg_directory && path_equal(arg_directory, "/"))
2386 arg_machine = gethostname_malloc();
2387 else
2388 arg_machine = strdup(basename(arg_image ?: arg_directory));
2389
2390 if (!arg_machine)
2391 return log_oom();
2392
2393 hostname_cleanup(arg_machine);
2394 if (!machine_name_is_valid(arg_machine)) {
2395 log_error("Failed to determine machine name automatically, please use -M.");
2396 return -EINVAL;
2397 }
2398
2399 if (arg_ephemeral) {
2400 char *b;
2401
2402 /* Add a random suffix when this is an
2403 * ephemeral machine, so that we can run many
2404 * instances at once without manually having
2405 * to specify -M each time. */
2406
2407 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2408 return log_oom();
2409
2410 free(arg_machine);
2411 arg_machine = b;
2412 }
2413 }
2414
2415 return 0;
2416 }
2417
2418 static int determine_uid_shift(const char *directory) {
2419 int r;
2420
2421 if (!arg_userns) {
2422 arg_uid_shift = 0;
2423 return 0;
2424 }
2425
2426 if (arg_uid_shift == UID_INVALID) {
2427 struct stat st;
2428
2429 r = stat(directory, &st);
2430 if (r < 0)
2431 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2432
2433 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2434
2435 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2436 log_error("UID and GID base of %s don't match.", directory);
2437 return -EINVAL;
2438 }
2439
2440 arg_uid_range = UINT32_C(0x10000);
2441 }
2442
2443 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2444 log_error("UID base too high for UID range.");
2445 return -EINVAL;
2446 }
2447
2448 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2449 return 0;
2450 }
2451
2452 static int inner_child(
2453 Barrier *barrier,
2454 const char *directory,
2455 bool secondary,
2456 int kmsg_socket,
2457 int rtnl_socket,
2458 FDSet *fds) {
2459
2460 _cleanup_free_ char *home = NULL;
2461 unsigned n_env = 1;
2462 const char *envp[] = {
2463 "PATH=" DEFAULT_PATH_SPLIT_USR,
2464 NULL, /* container */
2465 NULL, /* TERM */
2466 NULL, /* HOME */
2467 NULL, /* USER */
2468 NULL, /* LOGNAME */
2469 NULL, /* container_uuid */
2470 NULL, /* LISTEN_FDS */
2471 NULL, /* LISTEN_PID */
2472 NULL
2473 };
2474
2475 _cleanup_strv_free_ char **env_use = NULL;
2476 int r;
2477
2478 assert(barrier);
2479 assert(directory);
2480 assert(kmsg_socket >= 0);
2481
2482 cg_unified_flush();
2483
2484 if (arg_userns) {
2485 /* Tell the parent, that it now can write the UID map. */
2486 (void) barrier_place(barrier); /* #1 */
2487
2488 /* Wait until the parent wrote the UID map */
2489 if (!barrier_place_and_sync(barrier)) { /* #2 */
2490 log_error("Parent died too early");
2491 return -ESRCH;
2492 }
2493 }
2494
2495 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2496 if (r < 0)
2497 return r;
2498
2499 r = mount_sysfs(NULL);
2500 if (r < 0)
2501 return r;
2502
2503 /* Wait until we are cgroup-ified, so that we
2504 * can mount the right cgroup path writable */
2505 if (!barrier_place_and_sync(barrier)) { /* #3 */
2506 log_error("Parent died too early");
2507 return -ESRCH;
2508 }
2509
2510 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2511 if (r < 0)
2512 return r;
2513
2514 r = reset_uid_gid();
2515 if (r < 0)
2516 return log_error_errno(r, "Couldn't become new root: %m");
2517
2518 r = setup_boot_id(NULL);
2519 if (r < 0)
2520 return r;
2521
2522 r = setup_kmsg(NULL, kmsg_socket);
2523 if (r < 0)
2524 return r;
2525 kmsg_socket = safe_close(kmsg_socket);
2526
2527 umask(0022);
2528
2529 if (setsid() < 0)
2530 return log_error_errno(errno, "setsid() failed: %m");
2531
2532 if (arg_private_network)
2533 loopback_setup();
2534
2535 if (arg_expose_ports) {
2536 r = expose_port_send_rtnl(rtnl_socket);
2537 if (r < 0)
2538 return r;
2539 rtnl_socket = safe_close(rtnl_socket);
2540 }
2541
2542 r = drop_capabilities();
2543 if (r < 0)
2544 return log_error_errno(r, "drop_capabilities() failed: %m");
2545
2546 setup_hostname();
2547
2548 if (arg_personality != PERSONALITY_INVALID) {
2549 if (personality(arg_personality) < 0)
2550 return log_error_errno(errno, "personality() failed: %m");
2551 } else if (secondary) {
2552 if (personality(PER_LINUX32) < 0)
2553 return log_error_errno(errno, "personality() failed: %m");
2554 }
2555
2556 #ifdef HAVE_SELINUX
2557 if (arg_selinux_context)
2558 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2559 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2560 #endif
2561
2562 r = change_uid_gid(arg_user, &home);
2563 if (r < 0)
2564 return r;
2565
2566 /* LXC sets container=lxc, so follow the scheme here */
2567 envp[n_env++] = strjoina("container=", arg_container_service_name);
2568
2569 envp[n_env] = strv_find_prefix(environ, "TERM=");
2570 if (envp[n_env])
2571 n_env++;
2572
2573 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2574 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2575 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2576 return log_oom();
2577
2578 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2579 char as_uuid[37];
2580
2581 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2582 return log_oom();
2583 }
2584
2585 if (fdset_size(fds) > 0) {
2586 r = fdset_cloexec(fds, false);
2587 if (r < 0)
2588 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2589
2590 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2591 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2592 return log_oom();
2593 }
2594
2595 env_use = strv_env_merge(2, envp, arg_setenv);
2596 if (!env_use)
2597 return log_oom();
2598
2599 /* Let the parent know that we are ready and
2600 * wait until the parent is ready with the
2601 * setup, too... */
2602 if (!barrier_place_and_sync(barrier)) { /* #4 */
2603 log_error("Parent died too early");
2604 return -ESRCH;
2605 }
2606
2607 if (arg_chdir)
2608 if (chdir(arg_chdir) < 0)
2609 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2610
2611 if (arg_start_mode == START_PID2) {
2612 r = stub_pid1();
2613 if (r < 0)
2614 return r;
2615 }
2616
2617 /* Now, explicitly close the log, so that we
2618 * then can close all remaining fds. Closing
2619 * the log explicitly first has the benefit
2620 * that the logging subsystem knows about it,
2621 * and is thus ready to be reopened should we
2622 * need it again. Note that the other fds
2623 * closed here are at least the locking and
2624 * barrier fds. */
2625 log_close();
2626 (void) fdset_close_others(fds);
2627
2628 if (arg_start_mode == START_BOOT) {
2629 char **a;
2630 size_t m;
2631
2632 /* Automatically search for the init system */
2633
2634 m = strv_length(arg_parameters);
2635 a = newa(char*, m + 2);
2636 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2637 a[1 + m] = NULL;
2638
2639 a[0] = (char*) "/usr/lib/systemd/systemd";
2640 execve(a[0], a, env_use);
2641
2642 a[0] = (char*) "/lib/systemd/systemd";
2643 execve(a[0], a, env_use);
2644
2645 a[0] = (char*) "/sbin/init";
2646 execve(a[0], a, env_use);
2647 } else if (!strv_isempty(arg_parameters))
2648 execvpe(arg_parameters[0], arg_parameters, env_use);
2649 else {
2650 if (!arg_chdir)
2651 chdir(home ?: "/root");
2652
2653 execle("/bin/bash", "-bash", NULL, env_use);
2654 execle("/bin/sh", "-sh", NULL, env_use);
2655 }
2656
2657 r = -errno;
2658 (void) log_open();
2659 return log_error_errno(r, "execv() failed: %m");
2660 }
2661
2662 static int outer_child(
2663 Barrier *barrier,
2664 const char *directory,
2665 const char *console,
2666 const char *root_device, bool root_device_rw,
2667 const char *home_device, bool home_device_rw,
2668 const char *srv_device, bool srv_device_rw,
2669 bool interactive,
2670 bool secondary,
2671 int pid_socket,
2672 int kmsg_socket,
2673 int rtnl_socket,
2674 int uid_shift_socket,
2675 FDSet *fds) {
2676
2677 pid_t pid;
2678 ssize_t l;
2679 int r;
2680
2681 assert(barrier);
2682 assert(directory);
2683 assert(console);
2684 assert(pid_socket >= 0);
2685 assert(kmsg_socket >= 0);
2686
2687 cg_unified_flush();
2688
2689 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2690 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2691
2692 if (interactive) {
2693 close_nointr(STDIN_FILENO);
2694 close_nointr(STDOUT_FILENO);
2695 close_nointr(STDERR_FILENO);
2696
2697 r = open_terminal(console, O_RDWR);
2698 if (r != STDIN_FILENO) {
2699 if (r >= 0) {
2700 safe_close(r);
2701 r = -EINVAL;
2702 }
2703
2704 return log_error_errno(r, "Failed to open console: %m");
2705 }
2706
2707 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2708 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2709 return log_error_errno(errno, "Failed to duplicate console: %m");
2710 }
2711
2712 r = reset_audit_loginuid();
2713 if (r < 0)
2714 return r;
2715
2716 /* Mark everything as slave, so that we still
2717 * receive mounts from the real root, but don't
2718 * propagate mounts to the real root. */
2719 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2720 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2721
2722 r = mount_devices(directory,
2723 root_device, root_device_rw,
2724 home_device, home_device_rw,
2725 srv_device, srv_device_rw);
2726 if (r < 0)
2727 return r;
2728
2729 r = determine_uid_shift(directory);
2730 if (r < 0)
2731 return r;
2732
2733 if (arg_userns) {
2734 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2735 if (l < 0)
2736 return log_error_errno(errno, "Failed to send UID shift: %m");
2737 if (l != sizeof(arg_uid_shift)) {
2738 log_error("Short write while sending UID shift.");
2739 return -EIO;
2740 }
2741 }
2742
2743 /* Turn directory into bind mount */
2744 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2745 return log_error_errno(errno, "Failed to make bind mount: %m");
2746
2747 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2748 if (r < 0)
2749 return r;
2750
2751 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2752 if (r < 0)
2753 return r;
2754
2755 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2756 if (r < 0)
2757 return r;
2758
2759 if (arg_read_only) {
2760 r = bind_remount_recursive(directory, true);
2761 if (r < 0)
2762 return log_error_errno(r, "Failed to make tree read-only: %m");
2763 }
2764
2765 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2766 if (r < 0)
2767 return r;
2768
2769 r = copy_devnodes(directory);
2770 if (r < 0)
2771 return r;
2772
2773 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2774
2775 r = setup_pts(directory);
2776 if (r < 0)
2777 return r;
2778
2779 r = setup_propagate(directory);
2780 if (r < 0)
2781 return r;
2782
2783 r = setup_dev_console(directory, console);
2784 if (r < 0)
2785 return r;
2786
2787 r = setup_seccomp();
2788 if (r < 0)
2789 return r;
2790
2791 r = setup_timezone(directory);
2792 if (r < 0)
2793 return r;
2794
2795 r = setup_resolv_conf(directory);
2796 if (r < 0)
2797 return r;
2798
2799 r = setup_journal(directory);
2800 if (r < 0)
2801 return r;
2802
2803 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2804 if (r < 0)
2805 return r;
2806
2807 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2808 if (r < 0)
2809 return r;
2810
2811 r = mount_move_root(directory);
2812 if (r < 0)
2813 return log_error_errno(r, "Failed to move root directory: %m");
2814
2815 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2816 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2817 (arg_private_network ? CLONE_NEWNET : 0) |
2818 (arg_userns ? CLONE_NEWUSER : 0),
2819 NULL);
2820 if (pid < 0)
2821 return log_error_errno(errno, "Failed to fork inner child: %m");
2822 if (pid == 0) {
2823 pid_socket = safe_close(pid_socket);
2824 uid_shift_socket = safe_close(uid_shift_socket);
2825
2826 /* The inner child has all namespaces that are
2827 * requested, so that we all are owned by the user if
2828 * user namespaces are turned on. */
2829
2830 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2831 if (r < 0)
2832 _exit(EXIT_FAILURE);
2833
2834 _exit(EXIT_SUCCESS);
2835 }
2836
2837 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2838 if (l < 0)
2839 return log_error_errno(errno, "Failed to send PID: %m");
2840 if (l != sizeof(pid)) {
2841 log_error("Short write while sending PID.");
2842 return -EIO;
2843 }
2844
2845 pid_socket = safe_close(pid_socket);
2846 kmsg_socket = safe_close(kmsg_socket);
2847 rtnl_socket = safe_close(rtnl_socket);
2848
2849 return 0;
2850 }
2851
2852 static int setup_uid_map(pid_t pid) {
2853 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2854 int r;
2855
2856 assert(pid > 1);
2857
2858 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2859 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2860 r = write_string_file(uid_map, line, 0);
2861 if (r < 0)
2862 return log_error_errno(r, "Failed to write UID map: %m");
2863
2864 /* We always assign the same UID and GID ranges */
2865 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2866 r = write_string_file(uid_map, line, 0);
2867 if (r < 0)
2868 return log_error_errno(r, "Failed to write GID map: %m");
2869
2870 return 0;
2871 }
2872
2873 static int load_settings(void) {
2874 _cleanup_(settings_freep) Settings *settings = NULL;
2875 _cleanup_fclose_ FILE *f = NULL;
2876 _cleanup_free_ char *p = NULL;
2877 const char *fn, *i;
2878 int r;
2879
2880 /* If all settings are masked, there's no point in looking for
2881 * the settings file */
2882 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2883 return 0;
2884
2885 fn = strjoina(arg_machine, ".nspawn");
2886
2887 /* We first look in the admin's directories in /etc and /run */
2888 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2889 _cleanup_free_ char *j = NULL;
2890
2891 j = strjoin(i, "/", fn, NULL);
2892 if (!j)
2893 return log_oom();
2894
2895 f = fopen(j, "re");
2896 if (f) {
2897 p = j;
2898 j = NULL;
2899
2900 /* By default, we trust configuration from /etc and /run */
2901 if (arg_settings_trusted < 0)
2902 arg_settings_trusted = true;
2903
2904 break;
2905 }
2906
2907 if (errno != ENOENT)
2908 return log_error_errno(errno, "Failed to open %s: %m", j);
2909 }
2910
2911 if (!f) {
2912 /* After that, let's look for a file next to the
2913 * actual image we shall boot. */
2914
2915 if (arg_image) {
2916 p = file_in_same_dir(arg_image, fn);
2917 if (!p)
2918 return log_oom();
2919 } else if (arg_directory) {
2920 p = file_in_same_dir(arg_directory, fn);
2921 if (!p)
2922 return log_oom();
2923 }
2924
2925 if (p) {
2926 f = fopen(p, "re");
2927 if (!f && errno != ENOENT)
2928 return log_error_errno(errno, "Failed to open %s: %m", p);
2929
2930 /* By default, we do not trust configuration from /var/lib/machines */
2931 if (arg_settings_trusted < 0)
2932 arg_settings_trusted = false;
2933 }
2934 }
2935
2936 if (!f)
2937 return 0;
2938
2939 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2940
2941 r = settings_load(f, p, &settings);
2942 if (r < 0)
2943 return r;
2944
2945 /* Copy over bits from the settings, unless they have been
2946 * explicitly masked by command line switches. */
2947
2948 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
2949 settings->start_mode >= 0) {
2950 arg_start_mode = settings->start_mode;
2951
2952 strv_free(arg_parameters);
2953 arg_parameters = settings->parameters;
2954 settings->parameters = NULL;
2955 }
2956
2957 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
2958 settings->working_directory) {
2959 free(arg_chdir);
2960 arg_chdir = settings->working_directory;
2961 settings->working_directory = NULL;
2962 }
2963
2964 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2965 settings->environment) {
2966 strv_free(arg_setenv);
2967 arg_setenv = settings->environment;
2968 settings->environment = NULL;
2969 }
2970
2971 if ((arg_settings_mask & SETTING_USER) == 0 &&
2972 settings->user) {
2973 free(arg_user);
2974 arg_user = settings->user;
2975 settings->user = NULL;
2976 }
2977
2978 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2979 uint64_t plus;
2980
2981 plus = settings->capability;
2982 if (settings_private_network(settings))
2983 plus |= (1ULL << CAP_NET_ADMIN);
2984
2985 if (!arg_settings_trusted && plus != 0) {
2986 if (settings->capability != 0)
2987 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2988 } else
2989 arg_retain |= plus;
2990
2991 arg_retain &= ~settings->drop_capability;
2992 }
2993
2994 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2995 settings->kill_signal > 0)
2996 arg_kill_signal = settings->kill_signal;
2997
2998 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2999 settings->personality != PERSONALITY_INVALID)
3000 arg_personality = settings->personality;
3001
3002 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3003 !sd_id128_is_null(settings->machine_id)) {
3004
3005 if (!arg_settings_trusted)
3006 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3007 else
3008 arg_uuid = settings->machine_id;
3009 }
3010
3011 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3012 settings->read_only >= 0)
3013 arg_read_only = settings->read_only;
3014
3015 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3016 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3017 arg_volatile_mode = settings->volatile_mode;
3018
3019 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3020 settings->n_custom_mounts > 0) {
3021
3022 if (!arg_settings_trusted)
3023 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3024 else {
3025 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3026 arg_custom_mounts = settings->custom_mounts;
3027 arg_n_custom_mounts = settings->n_custom_mounts;
3028
3029 settings->custom_mounts = NULL;
3030 settings->n_custom_mounts = 0;
3031 }
3032 }
3033
3034 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3035 (settings->private_network >= 0 ||
3036 settings->network_veth >= 0 ||
3037 settings->network_bridge ||
3038 settings->network_interfaces ||
3039 settings->network_macvlan ||
3040 settings->network_ipvlan ||
3041 settings->network_veth_extra)) {
3042
3043 if (!arg_settings_trusted)
3044 log_warning("Ignoring network settings, file %s is not trusted.", p);
3045 else {
3046 arg_network_veth = settings_network_veth(settings);
3047 arg_private_network = settings_private_network(settings);
3048
3049 strv_free(arg_network_interfaces);
3050 arg_network_interfaces = settings->network_interfaces;
3051 settings->network_interfaces = NULL;
3052
3053 strv_free(arg_network_macvlan);
3054 arg_network_macvlan = settings->network_macvlan;
3055 settings->network_macvlan = NULL;
3056
3057 strv_free(arg_network_ipvlan);
3058 arg_network_ipvlan = settings->network_ipvlan;
3059 settings->network_ipvlan = NULL;
3060
3061 strv_free(arg_network_veth_extra);
3062 arg_network_veth_extra = settings->network_veth_extra;
3063 settings->network_veth_extra = NULL;
3064
3065 free(arg_network_bridge);
3066 arg_network_bridge = settings->network_bridge;
3067 settings->network_bridge = NULL;
3068 }
3069 }
3070
3071 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3072 settings->expose_ports) {
3073
3074 if (!arg_settings_trusted)
3075 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3076 else {
3077 expose_port_free_all(arg_expose_ports);
3078 arg_expose_ports = settings->expose_ports;
3079 settings->expose_ports = NULL;
3080 }
3081 }
3082
3083 return 0;
3084 }
3085
3086 int main(int argc, char *argv[]) {
3087
3088 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3089 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3090 _cleanup_close_ int master = -1, image_fd = -1;
3091 _cleanup_fdset_free_ FDSet *fds = NULL;
3092 int r, n_fd_passed, loop_nr = -1;
3093 char veth_name[IFNAMSIZ];
3094 bool secondary = false, remove_subvol = false;
3095 sigset_t mask_chld;
3096 pid_t pid = 0;
3097 int ret = EXIT_SUCCESS;
3098 union in_addr_union exposed = {};
3099 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3100 bool interactive;
3101
3102 log_parse_environment();
3103 log_open();
3104
3105 /* Make sure rename_process() in the stub init process can work */
3106 saved_argv = argv;
3107 saved_argc = argc;
3108
3109 r = parse_argv(argc, argv);
3110 if (r <= 0)
3111 goto finish;
3112
3113 if (geteuid() != 0) {
3114 log_error("Need to be root.");
3115 r = -EPERM;
3116 goto finish;
3117 }
3118 r = determine_names();
3119 if (r < 0)
3120 goto finish;
3121
3122 r = load_settings();
3123 if (r < 0)
3124 goto finish;
3125
3126 r = verify_arguments();
3127 if (r < 0)
3128 goto finish;
3129
3130 n_fd_passed = sd_listen_fds(false);
3131 if (n_fd_passed > 0) {
3132 r = fdset_new_listen_fds(&fds, false);
3133 if (r < 0) {
3134 log_error_errno(r, "Failed to collect file descriptors: %m");
3135 goto finish;
3136 }
3137 }
3138
3139 if (arg_directory) {
3140 assert(!arg_image);
3141
3142 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3143 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3144 r = -EINVAL;
3145 goto finish;
3146 }
3147
3148 if (arg_ephemeral) {
3149 _cleanup_free_ char *np = NULL;
3150
3151 /* If the specified path is a mount point we
3152 * generate the new snapshot immediately
3153 * inside it under a random name. However if
3154 * the specified is not a mount point we
3155 * create the new snapshot in the parent
3156 * directory, just next to it. */
3157 r = path_is_mount_point(arg_directory, 0);
3158 if (r < 0) {
3159 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3160 goto finish;
3161 }
3162 if (r > 0)
3163 r = tempfn_random_child(arg_directory, "machine.", &np);
3164 else
3165 r = tempfn_random(arg_directory, "machine.", &np);
3166 if (r < 0) {
3167 log_error_errno(r, "Failed to generate name for snapshot: %m");
3168 goto finish;
3169 }
3170
3171 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3172 if (r < 0) {
3173 log_error_errno(r, "Failed to lock %s: %m", np);
3174 goto finish;
3175 }
3176
3177 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3178 if (r < 0) {
3179 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3180 goto finish;
3181 }
3182
3183 free(arg_directory);
3184 arg_directory = np;
3185 np = NULL;
3186
3187 remove_subvol = true;
3188
3189 } else {
3190 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3191 if (r == -EBUSY) {
3192 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3193 goto finish;
3194 }
3195 if (r < 0) {
3196 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3197 return r;
3198 }
3199
3200 if (arg_template) {
3201 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3202 if (r == -EEXIST) {
3203 if (!arg_quiet)
3204 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3205 } else if (r < 0) {
3206 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3207 goto finish;
3208 } else {
3209 if (!arg_quiet)
3210 log_info("Populated %s from template %s.", arg_directory, arg_template);
3211 }
3212 }
3213 }
3214
3215 if (arg_start_mode == START_BOOT) {
3216 if (path_is_os_tree(arg_directory) <= 0) {
3217 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3218 r = -EINVAL;
3219 goto finish;
3220 }
3221 } else {
3222 const char *p;
3223
3224 p = strjoina(arg_directory, "/usr/");
3225 if (laccess(p, F_OK) < 0) {
3226 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3227 r = -EINVAL;
3228 goto finish;
3229 }
3230 }
3231
3232 } else {
3233 char template[] = "/tmp/nspawn-root-XXXXXX";
3234
3235 assert(arg_image);
3236 assert(!arg_template);
3237
3238 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3239 if (r == -EBUSY) {
3240 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3241 goto finish;
3242 }
3243 if (r < 0) {
3244 r = log_error_errno(r, "Failed to create image lock: %m");
3245 goto finish;
3246 }
3247
3248 if (!mkdtemp(template)) {
3249 log_error_errno(errno, "Failed to create temporary directory: %m");
3250 r = -errno;
3251 goto finish;
3252 }
3253
3254 arg_directory = strdup(template);
3255 if (!arg_directory) {
3256 r = log_oom();
3257 goto finish;
3258 }
3259
3260 image_fd = setup_image(&device_path, &loop_nr);
3261 if (image_fd < 0) {
3262 r = image_fd;
3263 goto finish;
3264 }
3265
3266 r = dissect_image(image_fd,
3267 &root_device, &root_device_rw,
3268 &home_device, &home_device_rw,
3269 &srv_device, &srv_device_rw,
3270 &secondary);
3271 if (r < 0)
3272 goto finish;
3273 }
3274
3275 r = custom_mounts_prepare();
3276 if (r < 0)
3277 goto finish;
3278
3279 interactive =
3280 isatty(STDIN_FILENO) > 0 &&
3281 isatty(STDOUT_FILENO) > 0;
3282
3283 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3284 if (master < 0) {
3285 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3286 goto finish;
3287 }
3288
3289 r = ptsname_malloc(master, &console);
3290 if (r < 0) {
3291 r = log_error_errno(r, "Failed to determine tty name: %m");
3292 goto finish;
3293 }
3294
3295 if (arg_selinux_apifs_context) {
3296 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3297 if (r < 0)
3298 goto finish;
3299 }
3300
3301 if (unlockpt(master) < 0) {
3302 r = log_error_errno(errno, "Failed to unlock tty: %m");
3303 goto finish;
3304 }
3305
3306 if (!arg_quiet)
3307 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3308 arg_machine, arg_image ?: arg_directory);
3309
3310 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3311
3312 assert_se(sigemptyset(&mask_chld) == 0);
3313 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3314
3315 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3316 r = log_error_errno(errno, "Failed to become subreaper: %m");
3317 goto finish;
3318 }
3319
3320 for (;;) {
3321 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
3322 ContainerStatus container_status;
3323 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3324 static const struct sigaction sa = {
3325 .sa_handler = nop_signal_handler,
3326 .sa_flags = SA_NOCLDSTOP,
3327 };
3328 int ifi = 0;
3329 ssize_t l;
3330 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3331 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3332 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3333 char last_char = 0;
3334
3335 r = barrier_create(&barrier);
3336 if (r < 0) {
3337 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3338 goto finish;
3339 }
3340
3341 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3342 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3343 goto finish;
3344 }
3345
3346 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3347 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3348 goto finish;
3349 }
3350
3351 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3352 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3353 goto finish;
3354 }
3355
3356 if (arg_userns)
3357 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3358 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3359 goto finish;
3360 }
3361
3362 /* Child can be killed before execv(), so handle SIGCHLD
3363 * in order to interrupt parent's blocking calls and
3364 * give it a chance to call wait() and terminate. */
3365 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3366 if (r < 0) {
3367 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3368 goto finish;
3369 }
3370
3371 r = sigaction(SIGCHLD, &sa, NULL);
3372 if (r < 0) {
3373 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3374 goto finish;
3375 }
3376
3377 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3378 if (pid < 0) {
3379 if (errno == EINVAL)
3380 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3381 else
3382 r = log_error_errno(errno, "clone() failed: %m");
3383
3384 goto finish;
3385 }
3386
3387 if (pid == 0) {
3388 /* The outer child only has a file system namespace. */
3389 barrier_set_role(&barrier, BARRIER_CHILD);
3390
3391 master = safe_close(master);
3392
3393 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3394 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3395 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3396 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3397
3398 (void) reset_all_signal_handlers();
3399 (void) reset_signal_mask();
3400
3401 r = outer_child(&barrier,
3402 arg_directory,
3403 console,
3404 root_device, root_device_rw,
3405 home_device, home_device_rw,
3406 srv_device, srv_device_rw,
3407 interactive,
3408 secondary,
3409 pid_socket_pair[1],
3410 kmsg_socket_pair[1],
3411 rtnl_socket_pair[1],
3412 uid_shift_socket_pair[1],
3413 fds);
3414 if (r < 0)
3415 _exit(EXIT_FAILURE);
3416
3417 _exit(EXIT_SUCCESS);
3418 }
3419
3420 barrier_set_role(&barrier, BARRIER_PARENT);
3421
3422 fds = fdset_free(fds);
3423
3424 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3425 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3426 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3427 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3428
3429 /* Wait for the outer child. */
3430 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3431 if (r < 0)
3432 goto finish;
3433 if (r != 0) {
3434 r = -EIO;
3435 goto finish;
3436 }
3437 pid = 0;
3438
3439 /* And now retrieve the PID of the inner child. */
3440 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3441 if (l < 0) {
3442 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3443 goto finish;
3444 }
3445 if (l != sizeof(pid)) {
3446 log_error("Short read while reading inner child PID.");
3447 r = EIO;
3448 goto finish;
3449 }
3450
3451 log_debug("Init process invoked as PID " PID_FMT, pid);
3452
3453 if (arg_userns) {
3454 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3455 log_error("Child died too early.");
3456 r = -ESRCH;
3457 goto finish;
3458 }
3459
3460 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3461 if (l < 0) {
3462 r = log_error_errno(errno, "Failed to read UID shift: %m");
3463 goto finish;
3464 }
3465 if (l != sizeof(arg_uid_shift)) {
3466 log_error("Short read while reading UID shift.");
3467 r = EIO;
3468 goto finish;
3469 }
3470
3471 r = setup_uid_map(pid);
3472 if (r < 0)
3473 goto finish;
3474
3475 (void) barrier_place(&barrier); /* #2 */
3476 }
3477
3478 if (arg_private_network) {
3479
3480 r = move_network_interfaces(pid, arg_network_interfaces);
3481 if (r < 0)
3482 goto finish;
3483
3484 if (arg_network_veth) {
3485 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3486 if (r < 0)
3487 goto finish;
3488 else if (r > 0)
3489 ifi = r;
3490
3491 if (arg_network_bridge) {
3492 r = setup_bridge(veth_name, arg_network_bridge);
3493 if (r < 0)
3494 goto finish;
3495 if (r > 0)
3496 ifi = r;
3497 }
3498 }
3499
3500 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3501 if (r < 0)
3502 goto finish;
3503
3504 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3505 if (r < 0)
3506 goto finish;
3507
3508 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3509 if (r < 0)
3510 goto finish;
3511 }
3512
3513 if (arg_register) {
3514 r = register_machine(
3515 arg_machine,
3516 pid,
3517 arg_directory,
3518 arg_uuid,
3519 ifi,
3520 arg_slice,
3521 arg_custom_mounts, arg_n_custom_mounts,
3522 arg_kill_signal,
3523 arg_property,
3524 arg_keep_unit,
3525 arg_container_service_name);
3526 if (r < 0)
3527 goto finish;
3528 }
3529
3530 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3531 if (r < 0)
3532 goto finish;
3533
3534 if (arg_keep_unit) {
3535 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3536 if (r < 0)
3537 goto finish;
3538 }
3539
3540 r = chown_cgroup(pid, arg_uid_shift);
3541 if (r < 0)
3542 goto finish;
3543
3544 /* Notify the child that the parent is ready with all
3545 * its setup (including cgroup-ification), and that
3546 * the child can now hand over control to the code to
3547 * run inside the container. */
3548 (void) barrier_place(&barrier); /* #3 */
3549
3550 /* Block SIGCHLD here, before notifying child.
3551 * process_pty() will handle it with the other signals. */
3552 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3553
3554 /* Reset signal to default */
3555 r = default_signals(SIGCHLD, -1);
3556 if (r < 0) {
3557 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3558 goto finish;
3559 }
3560
3561 /* Let the child know that we are ready, then wait until the child is completely ready as well. */
3562 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3563 log_error("Child died too early.");
3564 r = -ESRCH;
3565 goto finish;
3566 }
3567
3568 sd_notifyf(false,
3569 "READY=1\n"
3570 "STATUS=Container running.\n"
3571 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3572
3573 r = sd_event_new(&event);
3574 if (r < 0) {
3575 log_error_errno(r, "Failed to get default event source: %m");
3576 goto finish;
3577 }
3578
3579 if (arg_kill_signal > 0) {
3580 /* Try to kill the init system on SIGINT or SIGTERM */
3581 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3582 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
3583 } else {
3584 /* Immediately exit */
3585 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3586 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3587 }
3588
3589 /* simply exit on sigchld */
3590 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3591
3592 if (arg_expose_ports) {
3593 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3594 if (r < 0)
3595 goto finish;
3596
3597 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3598 }
3599
3600 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3601
3602 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3603 if (r < 0) {
3604 log_error_errno(r, "Failed to create PTY forwarder: %m");
3605 goto finish;
3606 }
3607
3608 r = sd_event_loop(event);
3609 if (r < 0) {
3610 log_error_errno(r, "Failed to run event loop: %m");
3611 goto finish;
3612 }
3613
3614 pty_forward_get_last_char(forward, &last_char);
3615
3616 forward = pty_forward_free(forward);
3617
3618 if (!arg_quiet && last_char != '\n')
3619 putc('\n', stdout);
3620
3621 /* Kill if it is not dead yet anyway */
3622 if (arg_register && !arg_keep_unit)
3623 terminate_machine(pid);
3624
3625 /* Normally redundant, but better safe than sorry */
3626 kill(pid, SIGKILL);
3627
3628 r = wait_for_container(pid, &container_status);
3629 pid = 0;
3630
3631 if (r < 0)
3632 /* We failed to wait for the container, or the
3633 * container exited abnormally */
3634 goto finish;
3635 else if (r > 0 || container_status == CONTAINER_TERMINATED) {
3636 /* The container exited with a non-zero
3637 * status, or with zero status and no reboot
3638 * was requested. */
3639 ret = r;
3640 break;
3641 }
3642
3643 /* CONTAINER_REBOOTED, loop again */
3644
3645 if (arg_keep_unit) {
3646 /* Special handling if we are running as a
3647 * service: instead of simply restarting the
3648 * machine we want to restart the entire
3649 * service, so let's inform systemd about this
3650 * with the special exit code 133. The service
3651 * file uses RestartForceExitStatus=133 so
3652 * that this results in a full nspawn
3653 * restart. This is necessary since we might
3654 * have cgroup parameters set we want to have
3655 * flushed out. */
3656 ret = 133;
3657 r = 0;
3658 break;
3659 }
3660
3661 expose_port_flush(arg_expose_ports, &exposed);
3662 }
3663
3664 finish:
3665 sd_notify(false,
3666 "STOPPING=1\n"
3667 "STATUS=Terminating...");
3668
3669 if (pid > 0)
3670 kill(pid, SIGKILL);
3671
3672 /* Try to flush whatever is still queued in the pty */
3673 if (master >= 0)
3674 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3675
3676 loop_remove(loop_nr, &image_fd);
3677
3678 if (remove_subvol && arg_directory) {
3679 int k;
3680
3681 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3682 if (k < 0)
3683 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3684 }
3685
3686 if (arg_machine) {
3687 const char *p;
3688
3689 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3690 (void) rm_rf(p, REMOVE_ROOT);
3691 }
3692
3693 expose_port_flush(arg_expose_ports, &exposed);
3694
3695 free(arg_directory);
3696 free(arg_template);
3697 free(arg_image);
3698 free(arg_machine);
3699 free(arg_user);
3700 free(arg_chdir);
3701 strv_free(arg_setenv);
3702 free(arg_network_bridge);
3703 strv_free(arg_network_interfaces);
3704 strv_free(arg_network_macvlan);
3705 strv_free(arg_network_ipvlan);
3706 strv_free(arg_network_veth_extra);
3707 strv_free(arg_parameters);
3708 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3709 expose_port_free_all(arg_expose_ports);
3710
3711 return r < 0 ? EXIT_FAILURE : ret;
3712 }