]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #1871 from poettering/nspawn-veth-extra
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "alloc-util.h"
50 #include "barrier.h"
51 #include "base-filesystem.h"
52 #include "blkid-util.h"
53 #include "btrfs-util.h"
54 #include "cap-list.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
57 #include "copy.h"
58 #include "dev-setup.h"
59 #include "env-util.h"
60 #include "event-util.h"
61 #include "fd-util.h"
62 #include "fdset.h"
63 #include "fileio.h"
64 #include "formats-util.h"
65 #include "fs-util.h"
66 #include "gpt.h"
67 #include "hostname-util.h"
68 #include "log.h"
69 #include "loopback-setup.h"
70 #include "machine-image.h"
71 #include "macro.h"
72 #include "missing.h"
73 #include "mkdir.h"
74 #include "mount-util.h"
75 #include "netlink-util.h"
76 #include "nspawn-cgroup.h"
77 #include "nspawn-expose-ports.h"
78 #include "nspawn-mount.h"
79 #include "nspawn-network.h"
80 #include "nspawn-register.h"
81 #include "nspawn-settings.h"
82 #include "nspawn-setuid.h"
83 #include "parse-util.h"
84 #include "path-util.h"
85 #include "process-util.h"
86 #include "ptyfwd.h"
87 #include "random-util.h"
88 #include "rm-rf.h"
89 #ifdef HAVE_SECCOMP
90 #include "seccomp-util.h"
91 #endif
92 #include "signal-util.h"
93 #include "socket-util.h"
94 #include "stat-util.h"
95 #include "stdio-util.h"
96 #include "string-util.h"
97 #include "strv.h"
98 #include "terminal-util.h"
99 #include "udev-util.h"
100 #include "umask-util.h"
101 #include "user-util.h"
102 #include "util.h"
103
/* Final state of a container run. NOTE(review): the consumers of this enum are outside this chunk;
 * judging by the names it presumably distinguishes a plain exit from a reboot request. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
108
/* Journal linking mode, selected with --link-journal= (and -j). The "try-" variants of the
 * command line option map to LINK_GUEST/LINK_HOST plus the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto (the default) */
        LINK_HOST  = 2,  /* --link-journal=host / try-host */
        LINK_GUEST = 3,  /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
115
116 static char *arg_directory = NULL;
117 static char *arg_template = NULL;
118 static char *arg_user = NULL;
119 static sd_id128_t arg_uuid = {};
120 static char *arg_machine = NULL;
121 static const char *arg_selinux_context = NULL;
122 static const char *arg_selinux_apifs_context = NULL;
123 static const char *arg_slice = NULL;
124 static bool arg_private_network = false;
125 static bool arg_read_only = false;
126 static bool arg_boot = false;
127 static bool arg_ephemeral = false;
128 static LinkJournal arg_link_journal = LINK_AUTO;
129 static bool arg_link_journal_try = false;
130 static uint64_t arg_retain =
131 (1ULL << CAP_CHOWN) |
132 (1ULL << CAP_DAC_OVERRIDE) |
133 (1ULL << CAP_DAC_READ_SEARCH) |
134 (1ULL << CAP_FOWNER) |
135 (1ULL << CAP_FSETID) |
136 (1ULL << CAP_IPC_OWNER) |
137 (1ULL << CAP_KILL) |
138 (1ULL << CAP_LEASE) |
139 (1ULL << CAP_LINUX_IMMUTABLE) |
140 (1ULL << CAP_NET_BIND_SERVICE) |
141 (1ULL << CAP_NET_BROADCAST) |
142 (1ULL << CAP_NET_RAW) |
143 (1ULL << CAP_SETGID) |
144 (1ULL << CAP_SETFCAP) |
145 (1ULL << CAP_SETPCAP) |
146 (1ULL << CAP_SETUID) |
147 (1ULL << CAP_SYS_ADMIN) |
148 (1ULL << CAP_SYS_CHROOT) |
149 (1ULL << CAP_SYS_NICE) |
150 (1ULL << CAP_SYS_PTRACE) |
151 (1ULL << CAP_SYS_TTY_CONFIG) |
152 (1ULL << CAP_SYS_RESOURCE) |
153 (1ULL << CAP_SYS_BOOT) |
154 (1ULL << CAP_AUDIT_WRITE) |
155 (1ULL << CAP_AUDIT_CONTROL) |
156 (1ULL << CAP_MKNOD);
157 static CustomMount *arg_custom_mounts = NULL;
158 static unsigned arg_n_custom_mounts = 0;
159 static char **arg_setenv = NULL;
160 static bool arg_quiet = false;
161 static bool arg_share_system = false;
162 static bool arg_register = true;
163 static bool arg_keep_unit = false;
164 static char **arg_network_interfaces = NULL;
165 static char **arg_network_macvlan = NULL;
166 static char **arg_network_ipvlan = NULL;
167 static bool arg_network_veth = false;
168 static char **arg_network_veth_extra = NULL;
169 static char *arg_network_bridge = NULL;
170 static unsigned long arg_personality = PERSONALITY_INVALID;
171 static char *arg_image = NULL;
172 static VolatileMode arg_volatile_mode = VOLATILE_NO;
173 static ExposePort *arg_expose_ports = NULL;
174 static char **arg_property = NULL;
175 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
176 static bool arg_userns = false;
177 static int arg_kill_signal = 0;
178 static bool arg_unified_cgroup_hierarchy = false;
179 static SettingsMask arg_settings_mask = 0;
180 static int arg_settings_trusted = -1;
181 static char **arg_parameters = NULL;
182 static const char *arg_container_service_name = "systemd-nspawn";
183
184 static void help(void) {
185 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
186 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
187 " -h --help Show this help\n"
188 " --version Print version string\n"
189 " -q --quiet Do not show status information\n"
190 " -D --directory=PATH Root directory for the container\n"
191 " --template=PATH Initialize root directory from template directory,\n"
192 " if missing\n"
193 " -x --ephemeral Run container with snapshot of root directory, and\n"
194 " remove it after exit\n"
195 " -i --image=PATH File system device or disk image for the container\n"
196 " -b --boot Boot up full system (i.e. invoke init)\n"
197 " -u --user=USER Run the command under specified user or uid\n"
198 " -M --machine=NAME Set the machine name for the container\n"
199 " --uuid=UUID Set a specific machine UUID for the container\n"
200 " -S --slice=SLICE Place the container in the specified slice\n"
201 " --property=NAME=VALUE Set scope unit property\n"
202 " --private-users[=UIDBASE[:NUIDS]]\n"
203 " Run within user namespace\n"
204 " --private-network Disable network in container\n"
205 " --network-interface=INTERFACE\n"
206 " Assign an existing network interface to the\n"
207 " container\n"
208 " --network-macvlan=INTERFACE\n"
209 " Create a macvlan network interface based on an\n"
210 " existing network interface to the container\n"
211 " --network-ipvlan=INTERFACE\n"
212 " Create a ipvlan network interface based on an\n"
213 " existing network interface to the container\n"
214 " -n --network-veth Add a virtual Ethernet connection between host\n"
215 " and container\n"
216 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
217 " Add an additional virtual Ethernet link between\n"
218 " host and container\n"
219 " --network-bridge=INTERFACE\n"
220 " Add a virtual Ethernet connection between host\n"
221 " and container and add it to an existing bridge on\n"
222 " the host\n"
223 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
224 " Expose a container IP port on the host\n"
225 " -Z --selinux-context=SECLABEL\n"
226 " Set the SELinux security context to be used by\n"
227 " processes in the container\n"
228 " -L --selinux-apifs-context=SECLABEL\n"
229 " Set the SELinux security context to be used by\n"
230 " API/tmpfs file systems in the container\n"
231 " --capability=CAP In addition to the default, retain specified\n"
232 " capability\n"
233 " --drop-capability=CAP Drop the specified capability from the default set\n"
234 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
235 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
236 " try-guest, try-host\n"
237 " -j Equivalent to --link-journal=try-guest\n"
238 " --read-only Mount the root directory read-only\n"
239 " --bind=PATH[:PATH[:OPTIONS]]\n"
240 " Bind mount a file or directory from the host into\n"
241 " the container\n"
242 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
243 " Similar, but creates a read-only bind mount\n"
244 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
245 " --overlay=PATH[:PATH...]:PATH\n"
246 " Create an overlay mount from the host to \n"
247 " the container\n"
248 " --overlay-ro=PATH[:PATH...]:PATH\n"
249 " Similar, but creates a read-only overlay mount\n"
250 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
251 " --share-system Share system namespaces with host\n"
252 " --register=BOOLEAN Register container as machine\n"
253 " --keep-unit Do not register a scope for the machine, reuse\n"
254 " the service unit nspawn is running in\n"
255 " --volatile[=MODE] Run the system in volatile mode\n"
256 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
257 , program_invocation_short_name);
258 }
259
260
261 static int custom_mounts_prepare(void) {
262 unsigned i;
263 int r;
264
265 /* Ensure the mounts are applied prefix first. */
266 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
267
268 /* Allocate working directories for the overlay file systems that need it */
269 for (i = 0; i < arg_n_custom_mounts; i++) {
270 CustomMount *m = &arg_custom_mounts[i];
271
272 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
273 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
274 return -EINVAL;
275 }
276
277 if (m->type != CUSTOM_MOUNT_OVERLAY)
278 continue;
279
280 if (m->work_dir)
281 continue;
282
283 if (m->read_only)
284 continue;
285
286 r = tempfn_random(m->source, NULL, &m->work_dir);
287 if (r < 0)
288 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
289 }
290
291 return 0;
292 }
293
294 static int detect_unified_cgroup_hierarchy(void) {
295 const char *e;
296 int r;
297
298 /* Allow the user to control whether the unified hierarchy is used */
299 e = getenv("UNIFIED_CGROUP_HIERARCHY");
300 if (e) {
301 r = parse_boolean(e);
302 if (r < 0)
303 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
304
305 arg_unified_cgroup_hierarchy = r;
306 return 0;
307 }
308
309 /* Otherwise inherit the default from the host system */
310 r = cg_unified();
311 if (r < 0)
312 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
313
314 arg_unified_cgroup_hierarchy = r;
315 return 0;
316 }
317
318 static int parse_argv(int argc, char *argv[]) {
319
320 enum {
321 ARG_VERSION = 0x100,
322 ARG_PRIVATE_NETWORK,
323 ARG_UUID,
324 ARG_READ_ONLY,
325 ARG_CAPABILITY,
326 ARG_DROP_CAPABILITY,
327 ARG_LINK_JOURNAL,
328 ARG_BIND,
329 ARG_BIND_RO,
330 ARG_TMPFS,
331 ARG_OVERLAY,
332 ARG_OVERLAY_RO,
333 ARG_SETENV,
334 ARG_SHARE_SYSTEM,
335 ARG_REGISTER,
336 ARG_KEEP_UNIT,
337 ARG_NETWORK_INTERFACE,
338 ARG_NETWORK_MACVLAN,
339 ARG_NETWORK_IPVLAN,
340 ARG_NETWORK_BRIDGE,
341 ARG_NETWORK_VETH_EXTRA,
342 ARG_PERSONALITY,
343 ARG_VOLATILE,
344 ARG_TEMPLATE,
345 ARG_PROPERTY,
346 ARG_PRIVATE_USERS,
347 ARG_KILL_SIGNAL,
348 ARG_SETTINGS,
349 };
350
351 static const struct option options[] = {
352 { "help", no_argument, NULL, 'h' },
353 { "version", no_argument, NULL, ARG_VERSION },
354 { "directory", required_argument, NULL, 'D' },
355 { "template", required_argument, NULL, ARG_TEMPLATE },
356 { "ephemeral", no_argument, NULL, 'x' },
357 { "user", required_argument, NULL, 'u' },
358 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
359 { "boot", no_argument, NULL, 'b' },
360 { "uuid", required_argument, NULL, ARG_UUID },
361 { "read-only", no_argument, NULL, ARG_READ_ONLY },
362 { "capability", required_argument, NULL, ARG_CAPABILITY },
363 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
364 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
365 { "bind", required_argument, NULL, ARG_BIND },
366 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
367 { "tmpfs", required_argument, NULL, ARG_TMPFS },
368 { "overlay", required_argument, NULL, ARG_OVERLAY },
369 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
370 { "machine", required_argument, NULL, 'M' },
371 { "slice", required_argument, NULL, 'S' },
372 { "setenv", required_argument, NULL, ARG_SETENV },
373 { "selinux-context", required_argument, NULL, 'Z' },
374 { "selinux-apifs-context", required_argument, NULL, 'L' },
375 { "quiet", no_argument, NULL, 'q' },
376 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
377 { "register", required_argument, NULL, ARG_REGISTER },
378 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
379 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
380 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
381 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
382 { "network-veth", no_argument, NULL, 'n' },
383 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
384 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
385 { "personality", required_argument, NULL, ARG_PERSONALITY },
386 { "image", required_argument, NULL, 'i' },
387 { "volatile", optional_argument, NULL, ARG_VOLATILE },
388 { "port", required_argument, NULL, 'p' },
389 { "property", required_argument, NULL, ARG_PROPERTY },
390 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
391 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
392 { "settings", required_argument, NULL, ARG_SETTINGS },
393 {}
394 };
395
396 int c, r;
397 const char *p, *e;
398 uint64_t plus = 0, minus = 0;
399 bool mask_all_settings = false, mask_no_settings = false;
400
401 assert(argc >= 0);
402 assert(argv);
403
404 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
405
406 switch (c) {
407
408 case 'h':
409 help();
410 return 0;
411
412 case ARG_VERSION:
413 return version();
414
415 case 'D':
416 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
417 if (r < 0)
418 return r;
419 break;
420
421 case ARG_TEMPLATE:
422 r = parse_path_argument_and_warn(optarg, false, &arg_template);
423 if (r < 0)
424 return r;
425 break;
426
427 case 'i':
428 r = parse_path_argument_and_warn(optarg, false, &arg_image);
429 if (r < 0)
430 return r;
431 break;
432
433 case 'x':
434 arg_ephemeral = true;
435 break;
436
437 case 'u':
438 r = free_and_strdup(&arg_user, optarg);
439 if (r < 0)
440 return log_oom();
441
442 arg_settings_mask |= SETTING_USER;
443 break;
444
445 case ARG_NETWORK_BRIDGE:
446 r = free_and_strdup(&arg_network_bridge, optarg);
447 if (r < 0)
448 return log_oom();
449
450 /* fall through */
451
452 case 'n':
453 arg_network_veth = true;
454 arg_private_network = true;
455 arg_settings_mask |= SETTING_NETWORK;
456 break;
457
458 case ARG_NETWORK_VETH_EXTRA:
459 r = veth_extra_parse(&arg_network_veth_extra, optarg);
460 if (r < 0)
461 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
462
463 arg_private_network = true;
464 arg_settings_mask |= SETTING_NETWORK;
465 break;
466
467 case ARG_NETWORK_INTERFACE:
468 if (strv_extend(&arg_network_interfaces, optarg) < 0)
469 return log_oom();
470
471 arg_private_network = true;
472 arg_settings_mask |= SETTING_NETWORK;
473 break;
474
475 case ARG_NETWORK_MACVLAN:
476 if (strv_extend(&arg_network_macvlan, optarg) < 0)
477 return log_oom();
478
479 arg_private_network = true;
480 arg_settings_mask |= SETTING_NETWORK;
481 break;
482
483 case ARG_NETWORK_IPVLAN:
484 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
485 return log_oom();
486
487 /* fall through */
488
489 case ARG_PRIVATE_NETWORK:
490 arg_private_network = true;
491 arg_settings_mask |= SETTING_NETWORK;
492 break;
493
494 case 'b':
495 arg_boot = true;
496 arg_settings_mask |= SETTING_BOOT;
497 break;
498
499 case ARG_UUID:
500 r = sd_id128_from_string(optarg, &arg_uuid);
501 if (r < 0) {
502 log_error("Invalid UUID: %s", optarg);
503 return r;
504 }
505
506 arg_settings_mask |= SETTING_MACHINE_ID;
507 break;
508
509 case 'S':
510 arg_slice = optarg;
511 break;
512
513 case 'M':
514 if (isempty(optarg))
515 arg_machine = mfree(arg_machine);
516 else {
517 if (!machine_name_is_valid(optarg)) {
518 log_error("Invalid machine name: %s", optarg);
519 return -EINVAL;
520 }
521
522 r = free_and_strdup(&arg_machine, optarg);
523 if (r < 0)
524 return log_oom();
525
526 break;
527 }
528
529 case 'Z':
530 arg_selinux_context = optarg;
531 break;
532
533 case 'L':
534 arg_selinux_apifs_context = optarg;
535 break;
536
537 case ARG_READ_ONLY:
538 arg_read_only = true;
539 arg_settings_mask |= SETTING_READ_ONLY;
540 break;
541
542 case ARG_CAPABILITY:
543 case ARG_DROP_CAPABILITY: {
544 p = optarg;
545 for(;;) {
546 _cleanup_free_ char *t = NULL;
547
548 r = extract_first_word(&p, &t, ",", 0);
549 if (r < 0)
550 return log_error_errno(r, "Failed to parse capability %s.", t);
551
552 if (r == 0)
553 break;
554
555 if (streq(t, "all")) {
556 if (c == ARG_CAPABILITY)
557 plus = (uint64_t) -1;
558 else
559 minus = (uint64_t) -1;
560 } else {
561 int cap;
562
563 cap = capability_from_name(t);
564 if (cap < 0) {
565 log_error("Failed to parse capability %s.", t);
566 return -EINVAL;
567 }
568
569 if (c == ARG_CAPABILITY)
570 plus |= 1ULL << (uint64_t) cap;
571 else
572 minus |= 1ULL << (uint64_t) cap;
573 }
574 }
575
576 arg_settings_mask |= SETTING_CAPABILITY;
577 break;
578 }
579
580 case 'j':
581 arg_link_journal = LINK_GUEST;
582 arg_link_journal_try = true;
583 break;
584
585 case ARG_LINK_JOURNAL:
586 if (streq(optarg, "auto")) {
587 arg_link_journal = LINK_AUTO;
588 arg_link_journal_try = false;
589 } else if (streq(optarg, "no")) {
590 arg_link_journal = LINK_NO;
591 arg_link_journal_try = false;
592 } else if (streq(optarg, "guest")) {
593 arg_link_journal = LINK_GUEST;
594 arg_link_journal_try = false;
595 } else if (streq(optarg, "host")) {
596 arg_link_journal = LINK_HOST;
597 arg_link_journal_try = false;
598 } else if (streq(optarg, "try-guest")) {
599 arg_link_journal = LINK_GUEST;
600 arg_link_journal_try = true;
601 } else if (streq(optarg, "try-host")) {
602 arg_link_journal = LINK_HOST;
603 arg_link_journal_try = true;
604 } else {
605 log_error("Failed to parse link journal mode %s", optarg);
606 return -EINVAL;
607 }
608
609 break;
610
611 case ARG_BIND:
612 case ARG_BIND_RO:
613 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
614 if (r < 0)
615 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
616
617 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
618 break;
619
620 case ARG_TMPFS:
621 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
622 if (r < 0)
623 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
624
625 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
626 break;
627
628 case ARG_OVERLAY:
629 case ARG_OVERLAY_RO: {
630 _cleanup_free_ char *upper = NULL, *destination = NULL;
631 _cleanup_strv_free_ char **lower = NULL;
632 CustomMount *m;
633 unsigned n = 0;
634 char **i;
635
636 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
637 if (r == -ENOMEM)
638 return log_oom();
639 else if (r < 0) {
640 log_error("Invalid overlay specification: %s", optarg);
641 return r;
642 }
643
644 STRV_FOREACH(i, lower) {
645 if (!path_is_absolute(*i)) {
646 log_error("Overlay path %s is not absolute.", *i);
647 return -EINVAL;
648 }
649
650 n++;
651 }
652
653 if (n < 2) {
654 log_error("--overlay= needs at least two colon-separated directories specified.");
655 return -EINVAL;
656 }
657
658 if (n == 2) {
659 /* If two parameters are specified,
660 * the first one is the lower, the
661 * second one the upper directory. And
662 * we'll also define the destination
663 * mount point the same as the upper. */
664 upper = lower[1];
665 lower[1] = NULL;
666
667 destination = strdup(upper);
668 if (!destination)
669 return log_oom();
670
671 } else {
672 upper = lower[n - 2];
673 destination = lower[n - 1];
674 lower[n - 2] = NULL;
675 }
676
677 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
678 if (!m)
679 return log_oom();
680
681 m->destination = destination;
682 m->source = upper;
683 m->lower = lower;
684 m->read_only = c == ARG_OVERLAY_RO;
685
686 upper = destination = NULL;
687 lower = NULL;
688
689 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
690 break;
691 }
692
693 case ARG_SETENV: {
694 char **n;
695
696 if (!env_assignment_is_valid(optarg)) {
697 log_error("Environment variable assignment '%s' is not valid.", optarg);
698 return -EINVAL;
699 }
700
701 n = strv_env_set(arg_setenv, optarg);
702 if (!n)
703 return log_oom();
704
705 strv_free(arg_setenv);
706 arg_setenv = n;
707
708 arg_settings_mask |= SETTING_ENVIRONMENT;
709 break;
710 }
711
712 case 'q':
713 arg_quiet = true;
714 break;
715
716 case ARG_SHARE_SYSTEM:
717 arg_share_system = true;
718 break;
719
720 case ARG_REGISTER:
721 r = parse_boolean(optarg);
722 if (r < 0) {
723 log_error("Failed to parse --register= argument: %s", optarg);
724 return r;
725 }
726
727 arg_register = r;
728 break;
729
730 case ARG_KEEP_UNIT:
731 arg_keep_unit = true;
732 break;
733
734 case ARG_PERSONALITY:
735
736 arg_personality = personality_from_string(optarg);
737 if (arg_personality == PERSONALITY_INVALID) {
738 log_error("Unknown or unsupported personality '%s'.", optarg);
739 return -EINVAL;
740 }
741
742 arg_settings_mask |= SETTING_PERSONALITY;
743 break;
744
745 case ARG_VOLATILE:
746
747 if (!optarg)
748 arg_volatile_mode = VOLATILE_YES;
749 else {
750 VolatileMode m;
751
752 m = volatile_mode_from_string(optarg);
753 if (m < 0) {
754 log_error("Failed to parse --volatile= argument: %s", optarg);
755 return -EINVAL;
756 } else
757 arg_volatile_mode = m;
758 }
759
760 arg_settings_mask |= SETTING_VOLATILE_MODE;
761 break;
762
763 case 'p':
764 r = expose_port_parse(&arg_expose_ports, optarg);
765 if (r == -EEXIST)
766 return log_error_errno(r, "Duplicate port specification: %s", optarg);
767 if (r < 0)
768 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
769
770 arg_settings_mask |= SETTING_EXPOSE_PORTS;
771 break;
772
773 case ARG_PROPERTY:
774 if (strv_extend(&arg_property, optarg) < 0)
775 return log_oom();
776
777 break;
778
779 case ARG_PRIVATE_USERS:
780 if (optarg) {
781 _cleanup_free_ char *buffer = NULL;
782 const char *range, *shift;
783
784 range = strchr(optarg, ':');
785 if (range) {
786 buffer = strndup(optarg, range - optarg);
787 if (!buffer)
788 return log_oom();
789 shift = buffer;
790
791 range++;
792 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
793 log_error("Failed to parse UID range: %s", range);
794 return -EINVAL;
795 }
796 } else
797 shift = optarg;
798
799 if (parse_uid(shift, &arg_uid_shift) < 0) {
800 log_error("Failed to parse UID: %s", optarg);
801 return -EINVAL;
802 }
803 }
804
805 arg_userns = true;
806 break;
807
808 case ARG_KILL_SIGNAL:
809 arg_kill_signal = signal_from_string_try_harder(optarg);
810 if (arg_kill_signal < 0) {
811 log_error("Cannot parse signal: %s", optarg);
812 return -EINVAL;
813 }
814
815 arg_settings_mask |= SETTING_KILL_SIGNAL;
816 break;
817
818 case ARG_SETTINGS:
819
820 /* no → do not read files
821 * yes → read files, do not override cmdline, trust only subset
822 * override → read files, override cmdline, trust only subset
823 * trusted → read files, do not override cmdline, trust all
824 */
825
826 r = parse_boolean(optarg);
827 if (r < 0) {
828 if (streq(optarg, "trusted")) {
829 mask_all_settings = false;
830 mask_no_settings = false;
831 arg_settings_trusted = true;
832
833 } else if (streq(optarg, "override")) {
834 mask_all_settings = false;
835 mask_no_settings = true;
836 arg_settings_trusted = -1;
837 } else
838 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
839 } else if (r > 0) {
840 /* yes */
841 mask_all_settings = false;
842 mask_no_settings = false;
843 arg_settings_trusted = -1;
844 } else {
845 /* no */
846 mask_all_settings = true;
847 mask_no_settings = false;
848 arg_settings_trusted = false;
849 }
850
851 break;
852
853 case '?':
854 return -EINVAL;
855
856 default:
857 assert_not_reached("Unhandled option");
858 }
859
860 if (arg_share_system)
861 arg_register = false;
862
863 if (arg_boot && arg_share_system) {
864 log_error("--boot and --share-system may not be combined.");
865 return -EINVAL;
866 }
867
868 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
869 log_error("--keep-unit may not be used when invoked from a user session.");
870 return -EINVAL;
871 }
872
873 if (arg_directory && arg_image) {
874 log_error("--directory= and --image= may not be combined.");
875 return -EINVAL;
876 }
877
878 if (arg_template && arg_image) {
879 log_error("--template= and --image= may not be combined.");
880 return -EINVAL;
881 }
882
883 if (arg_template && !(arg_directory || arg_machine)) {
884 log_error("--template= needs --directory= or --machine=.");
885 return -EINVAL;
886 }
887
888 if (arg_ephemeral && arg_template) {
889 log_error("--ephemeral and --template= may not be combined.");
890 return -EINVAL;
891 }
892
893 if (arg_ephemeral && arg_image) {
894 log_error("--ephemeral and --image= may not be combined.");
895 return -EINVAL;
896 }
897
898 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
899 log_error("--ephemeral and --link-journal= may not be combined.");
900 return -EINVAL;
901 }
902
903 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
904 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
905
906 if (argc > optind) {
907 arg_parameters = strv_copy(argv + optind);
908 if (!arg_parameters)
909 return log_oom();
910
911 arg_settings_mask |= SETTING_BOOT;
912 }
913
914 /* Load all settings from .nspawn files */
915 if (mask_no_settings)
916 arg_settings_mask = 0;
917
918 /* Don't load any settings from .nspawn files */
919 if (mask_all_settings)
920 arg_settings_mask = _SETTINGS_MASK_ALL;
921
922 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
923
924 r = detect_unified_cgroup_hierarchy();
925 if (r < 0)
926 return r;
927
928 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
929 if (e)
930 arg_container_service_name = e;
931
932 return 1;
933 }
934
935 static int verify_arguments(void) {
936
937 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
938 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
939 return -EINVAL;
940 }
941
942 if (arg_expose_ports && !arg_private_network) {
943 log_error("Cannot use --port= without private networking.");
944 return -EINVAL;
945 }
946
947 if (arg_boot && arg_kill_signal <= 0)
948 arg_kill_signal = SIGRTMIN+3;
949
950 return 0;
951 }
952
953 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
954 assert(p);
955
956 if (!arg_userns)
957 return 0;
958
959 if (uid == UID_INVALID && gid == GID_INVALID)
960 return 0;
961
962 if (uid != UID_INVALID) {
963 uid += arg_uid_shift;
964
965 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
966 return -EOVERFLOW;
967 }
968
969 if (gid != GID_INVALID) {
970 gid += (gid_t) arg_uid_shift;
971
972 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
973 return -EOVERFLOW;
974 }
975
976 if (lchown(p, uid, gid) < 0)
977 return -errno;
978
979 return 0;
980 }
981
/* Creates directory 'path' below 'root' with the given mode and, if it was newly created, hands
 * it to uid/gid via userns_lchown().
 *
 * Returns 0 if the directory already existed (ownership is left alone in that case), -errno if
 * mkdir() fails for any other reason, and otherwise the result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
994
995 static int setup_timezone(const char *dest) {
996 _cleanup_free_ char *p = NULL, *q = NULL;
997 const char *where, *check, *what;
998 char *z, *y;
999 int r;
1000
1001 assert(dest);
1002
1003 /* Fix the timezone, if possible */
1004 r = readlink_malloc("/etc/localtime", &p);
1005 if (r < 0) {
1006 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1007 return 0;
1008 }
1009
1010 z = path_startswith(p, "../usr/share/zoneinfo/");
1011 if (!z)
1012 z = path_startswith(p, "/usr/share/zoneinfo/");
1013 if (!z) {
1014 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1015 return 0;
1016 }
1017
1018 where = prefix_roota(dest, "/etc/localtime");
1019 r = readlink_malloc(where, &q);
1020 if (r >= 0) {
1021 y = path_startswith(q, "../usr/share/zoneinfo/");
1022 if (!y)
1023 y = path_startswith(q, "/usr/share/zoneinfo/");
1024
1025 /* Already pointing to the right place? Then do nothing .. */
1026 if (y && streq(y, z))
1027 return 0;
1028 }
1029
1030 check = strjoina("/usr/share/zoneinfo/", z);
1031 check = prefix_root(dest, check);
1032 if (laccess(check, F_OK) < 0) {
1033 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1034 return 0;
1035 }
1036
1037 r = unlink(where);
1038 if (r < 0 && errno != ENOENT) {
1039 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1040 return 0;
1041 }
1042
1043 what = strjoina("../usr/share/zoneinfo/", z);
1044 if (symlink(what, where) < 0) {
1045 log_error_errno(errno, "Failed to correct timezone of container: %m");
1046 return 0;
1047 }
1048
1049 r = userns_lchown(where, 0, 0);
1050 if (r < 0)
1051 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1052
1053 return 0;
1054 }
1055
1056 static int setup_resolv_conf(const char *dest) {
1057 const char *where = NULL;
1058 int r;
1059
1060 assert(dest);
1061
1062 if (arg_private_network)
1063 return 0;
1064
1065 /* Fix resolv.conf, if possible */
1066 where = prefix_roota(dest, "/etc/resolv.conf");
1067
1068 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1069 if (r < 0) {
1070 /* If the file already exists as symlink, let's
1071 * suppress the warning, under the assumption that
1072 * resolved or something similar runs inside and the
1073 * symlink points there.
1074 *
1075 * If the disk image is read-only, there's also no
1076 * point in complaining.
1077 */
1078 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1079 "Failed to copy /etc/resolv.conf to %s: %m", where);
1080 return 0;
1081 }
1082
1083 r = userns_lchown(where, 0, 0);
1084 if (r < 0)
1085 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1086
1087 return 0;
1088 }
1089
1090 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1091 assert(s);
1092
1093 snprintf(s, 37,
1094 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1095 SD_ID128_FORMAT_VAL(id));
1096
1097 return s;
1098 }
1099
1100 static int setup_boot_id(const char *dest) {
1101 const char *from, *to;
1102 sd_id128_t rnd = {};
1103 char as_uuid[37];
1104 int r;
1105
1106 if (arg_share_system)
1107 return 0;
1108
1109 /* Generate a new randomized boot ID, so that each boot-up of
1110 * the container gets a new one */
1111
1112 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1113 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1114
1115 r = sd_id128_randomize(&rnd);
1116 if (r < 0)
1117 return log_error_errno(r, "Failed to generate random boot id: %m");
1118
1119 id128_format_as_uuid(rnd, as_uuid);
1120
1121 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1122 if (r < 0)
1123 return log_error_errno(r, "Failed to write boot id: %m");
1124
1125 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1126 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1127 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1128 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1129
1130 unlink(from);
1131 return r;
1132 }
1133
1134 static int copy_devnodes(const char *dest) {
1135
1136 static const char devnodes[] =
1137 "null\0"
1138 "zero\0"
1139 "full\0"
1140 "random\0"
1141 "urandom\0"
1142 "tty\0"
1143 "net/tun\0";
1144
1145 const char *d;
1146 int r = 0;
1147 _cleanup_umask_ mode_t u;
1148
1149 assert(dest);
1150
1151 u = umask(0000);
1152
1153 /* Create /dev/net, so that we can create /dev/net/tun in it */
1154 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1155 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1156
1157 NULSTR_FOREACH(d, devnodes) {
1158 _cleanup_free_ char *from = NULL, *to = NULL;
1159 struct stat st;
1160
1161 from = strappend("/dev/", d);
1162 to = prefix_root(dest, from);
1163
1164 if (stat(from, &st) < 0) {
1165
1166 if (errno != ENOENT)
1167 return log_error_errno(errno, "Failed to stat %s: %m", from);
1168
1169 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1170
1171 log_error("%s is not a char or block device, cannot copy.", from);
1172 return -EIO;
1173
1174 } else {
1175 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1176 if (errno != EPERM)
1177 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1178
1179 /* Some systems abusively restrict mknod but
1180 * allow bind mounts. */
1181 r = touch(to);
1182 if (r < 0)
1183 return log_error_errno(r, "touch (%s) failed: %m", to);
1184 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1185 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1186 }
1187
1188 r = userns_lchown(to, 0, 0);
1189 if (r < 0)
1190 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1191 }
1192 }
1193
1194 return r;
1195 }
1196
1197 static int setup_pts(const char *dest) {
1198 _cleanup_free_ char *options = NULL;
1199 const char *p;
1200 int r;
1201
1202 #ifdef HAVE_SELINUX
1203 if (arg_selinux_apifs_context)
1204 (void) asprintf(&options,
1205 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1206 arg_uid_shift + TTY_GID,
1207 arg_selinux_apifs_context);
1208 else
1209 #endif
1210 (void) asprintf(&options,
1211 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1212 arg_uid_shift + TTY_GID);
1213
1214 if (!options)
1215 return log_oom();
1216
1217 /* Mount /dev/pts itself */
1218 p = prefix_roota(dest, "/dev/pts");
1219 if (mkdir(p, 0755) < 0)
1220 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1221 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1222 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1223 r = userns_lchown(p, 0, 0);
1224 if (r < 0)
1225 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1226
1227 /* Create /dev/ptmx symlink */
1228 p = prefix_roota(dest, "/dev/ptmx");
1229 if (symlink("pts/ptmx", p) < 0)
1230 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1231 r = userns_lchown(p, 0, 0);
1232 if (r < 0)
1233 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1234
1235 /* And fix /dev/pts/ptmx ownership */
1236 p = prefix_roota(dest, "/dev/pts/ptmx");
1237 r = userns_lchown(p, 0, 0);
1238 if (r < 0)
1239 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1240
1241 return 0;
1242 }
1243
1244 static int setup_dev_console(const char *dest, const char *console) {
1245 _cleanup_umask_ mode_t u;
1246 const char *to;
1247 int r;
1248
1249 assert(dest);
1250 assert(console);
1251
1252 u = umask(0000);
1253
1254 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1255 if (r < 0)
1256 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1257
1258 /* We need to bind mount the right tty to /dev/console since
1259 * ptys can only exist on pts file systems. To have something
1260 * to bind mount things on we create a empty regular file. */
1261
1262 to = prefix_roota(dest, "/dev/console");
1263 r = touch(to);
1264 if (r < 0)
1265 return log_error_errno(r, "touch() for /dev/console failed: %m");
1266
1267 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1268 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1269
1270 return 0;
1271 }
1272
1273 static int setup_kmsg(const char *dest, int kmsg_socket) {
1274 const char *from, *to;
1275 _cleanup_umask_ mode_t u;
1276 int fd, r;
1277
1278 assert(kmsg_socket >= 0);
1279
1280 u = umask(0000);
1281
1282 /* We create the kmsg FIFO as /run/kmsg, but immediately
1283 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1284 * on the reading side behave very similar to /proc/kmsg,
1285 * their writing side behaves differently from /dev/kmsg in
1286 * that writing blocks when nothing is reading. In order to
1287 * avoid any problems with containers deadlocking due to this
1288 * we simply make /dev/kmsg unavailable to the container. */
1289 from = prefix_roota(dest, "/run/kmsg");
1290 to = prefix_roota(dest, "/proc/kmsg");
1291
1292 if (mkfifo(from, 0600) < 0)
1293 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1294 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1295 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1296
1297 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1298 if (fd < 0)
1299 return log_error_errno(errno, "Failed to open fifo: %m");
1300
1301 /* Store away the fd in the socket, so that it stays open as
1302 * long as we run the child */
1303 r = send_one_fd(kmsg_socket, fd, 0);
1304 safe_close(fd);
1305
1306 if (r < 0)
1307 return log_error_errno(r, "Failed to send FIFO fd: %m");
1308
1309 /* And now make the FIFO unavailable as /run/kmsg... */
1310 (void) unlink(from);
1311
1312 return 0;
1313 }
1314
1315 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1316 union in_addr_union *exposed = userdata;
1317
1318 assert(rtnl);
1319 assert(m);
1320 assert(exposed);
1321
1322 expose_port_execute(rtnl, arg_expose_ports, exposed);
1323 return 0;
1324 }
1325
1326 static int setup_hostname(void) {
1327
1328 if (arg_share_system)
1329 return 0;
1330
1331 if (sethostname_idempotent(arg_machine) < 0)
1332 return -errno;
1333
1334 return 0;
1335 }
1336
1337 static int setup_journal(const char *directory) {
1338 sd_id128_t machine_id, this_id;
1339 _cleanup_free_ char *b = NULL, *d = NULL;
1340 const char *etc_machine_id, *p, *q;
1341 char *id;
1342 int r;
1343
1344 /* Don't link journals in ephemeral mode */
1345 if (arg_ephemeral)
1346 return 0;
1347
1348 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1349
1350 r = read_one_line_file(etc_machine_id, &b);
1351 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1352 return 0;
1353 else if (r < 0)
1354 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1355
1356 id = strstrip(b);
1357 if (isempty(id) && arg_link_journal == LINK_AUTO)
1358 return 0;
1359
1360 /* Verify validity */
1361 r = sd_id128_from_string(id, &machine_id);
1362 if (r < 0)
1363 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1364
1365 r = sd_id128_get_machine(&this_id);
1366 if (r < 0)
1367 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1368
1369 if (sd_id128_equal(machine_id, this_id)) {
1370 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1371 "Host and machine ids are equal (%s): refusing to link journals", id);
1372 if (arg_link_journal == LINK_AUTO)
1373 return 0;
1374 return -EEXIST;
1375 }
1376
1377 if (arg_link_journal == LINK_NO)
1378 return 0;
1379
1380 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1381 if (r < 0)
1382 return log_error_errno(r, "Failed to create /var: %m");
1383
1384 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1385 if (r < 0)
1386 return log_error_errno(r, "Failed to create /var/log: %m");
1387
1388 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1389 if (r < 0)
1390 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1391
1392 p = strjoina("/var/log/journal/", id);
1393 q = prefix_roota(directory, p);
1394
1395 if (path_is_mount_point(p, 0) > 0) {
1396 if (arg_link_journal != LINK_AUTO) {
1397 log_error("%s: already a mount point, refusing to use for journal", p);
1398 return -EEXIST;
1399 }
1400
1401 return 0;
1402 }
1403
1404 if (path_is_mount_point(q, 0) > 0) {
1405 if (arg_link_journal != LINK_AUTO) {
1406 log_error("%s: already a mount point, refusing to use for journal", q);
1407 return -EEXIST;
1408 }
1409
1410 return 0;
1411 }
1412
1413 r = readlink_and_make_absolute(p, &d);
1414 if (r >= 0) {
1415 if ((arg_link_journal == LINK_GUEST ||
1416 arg_link_journal == LINK_AUTO) &&
1417 path_equal(d, q)) {
1418
1419 r = userns_mkdir(directory, p, 0755, 0, 0);
1420 if (r < 0)
1421 log_warning_errno(r, "Failed to create directory %s: %m", q);
1422 return 0;
1423 }
1424
1425 if (unlink(p) < 0)
1426 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1427 } else if (r == -EINVAL) {
1428
1429 if (arg_link_journal == LINK_GUEST &&
1430 rmdir(p) < 0) {
1431
1432 if (errno == ENOTDIR) {
1433 log_error("%s already exists and is neither a symlink nor a directory", p);
1434 return r;
1435 } else
1436 return log_error_errno(errno, "Failed to remove %s: %m", p);
1437 }
1438 } else if (r != -ENOENT)
1439 return log_error_errno(r, "readlink(%s) failed: %m", p);
1440
1441 if (arg_link_journal == LINK_GUEST) {
1442
1443 if (symlink(q, p) < 0) {
1444 if (arg_link_journal_try) {
1445 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1446 return 0;
1447 } else
1448 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1449 }
1450
1451 r = userns_mkdir(directory, p, 0755, 0, 0);
1452 if (r < 0)
1453 log_warning_errno(r, "Failed to create directory %s: %m", q);
1454 return 0;
1455 }
1456
1457 if (arg_link_journal == LINK_HOST) {
1458 /* don't create parents here -- if the host doesn't have
1459 * permanent journal set up, don't force it here */
1460 r = mkdir(p, 0755);
1461 if (r < 0) {
1462 if (arg_link_journal_try) {
1463 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1464 return 0;
1465 } else
1466 return log_error_errno(errno, "Failed to create %s: %m", p);
1467 }
1468
1469 } else if (access(p, F_OK) < 0)
1470 return 0;
1471
1472 if (dir_is_empty(q) == 0)
1473 log_warning("%s is not empty, proceeding anyway.", q);
1474
1475 r = userns_mkdir(directory, p, 0755, 0, 0);
1476 if (r < 0)
1477 return log_error_errno(r, "Failed to create %s: %m", q);
1478
1479 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1480 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1481
1482 return 0;
1483 }
1484
1485 static int drop_capabilities(void) {
1486 return capability_bounding_set_drop(~arg_retain, false);
1487 }
1488
1489 static int reset_audit_loginuid(void) {
1490 _cleanup_free_ char *p = NULL;
1491 int r;
1492
1493 if (arg_share_system)
1494 return 0;
1495
1496 r = read_one_line_file("/proc/self/loginuid", &p);
1497 if (r == -ENOENT)
1498 return 0;
1499 if (r < 0)
1500 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1501
1502 /* Already reset? */
1503 if (streq(p, "4294967295"))
1504 return 0;
1505
1506 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1507 if (r < 0) {
1508 log_error_errno(r,
1509 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1510 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1511 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1512 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1513 "using systemd-nspawn. Sleeping for 5s... (%m)");
1514
1515 sleep(5);
1516 }
1517
1518 return 0;
1519 }
1520
/* Installs the seccomp filter for the container.
 *
 * Each listed system call is made to fail with EPERM unless the capability
 * listed next to it is in arg_retain. In addition, creating NETLINK_AUDIT
 * sockets is made to fail with EAFNOSUPPORT. When built without HAVE_SECCOMP
 * this is a no-op returning 0. */
static int setup_seccomp(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        static const struct {
                uint64_t capability;
                int syscall_num;
        } gated_syscalls[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(gated_syscalls); k++) {

                /* Retained capability: leave the system call alone */
                if ((arg_retain & (1ULL << gated_syscalls[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated_syscalls[k].syscall_num, 0);

                /* -EFAULT indicates an unknown syscall, which is fine */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace
         * audit hookup fails when run inside of one. We don't care,
         * and simply turn off the creation of audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) will hence fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#endif

}
1615
1616 static int setup_propagate(const char *root) {
1617 const char *p, *q;
1618 int r;
1619
1620 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1621 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1622 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1623 (void) mkdir_p(p, 0600);
1624
1625 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1626 if (r < 0)
1627 return log_error_errno(r, "Failed to create /run/systemd: %m");
1628
1629 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1630 if (r < 0)
1631 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1632
1633 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1634 if (r < 0)
1635 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1636
1637 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1638 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1639 return log_error_errno(errno, "Failed to install propagation bind mount.");
1640
1641 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1642 return log_error_errno(errno, "Failed to make propagation mount read-only");
1643
1644 return 0;
1645 }
1646
1647 static int setup_image(char **device_path, int *loop_nr) {
1648 struct loop_info64 info = {
1649 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1650 };
1651 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1652 _cleanup_free_ char* loopdev = NULL;
1653 struct stat st;
1654 int r, nr;
1655
1656 assert(device_path);
1657 assert(loop_nr);
1658 assert(arg_image);
1659
1660 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1661 if (fd < 0)
1662 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1663
1664 if (fstat(fd, &st) < 0)
1665 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1666
1667 if (S_ISBLK(st.st_mode)) {
1668 char *p;
1669
1670 p = strdup(arg_image);
1671 if (!p)
1672 return log_oom();
1673
1674 *device_path = p;
1675
1676 *loop_nr = -1;
1677
1678 r = fd;
1679 fd = -1;
1680
1681 return r;
1682 }
1683
1684 if (!S_ISREG(st.st_mode)) {
1685 log_error("%s is not a regular file or block device.", arg_image);
1686 return -EINVAL;
1687 }
1688
1689 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1690 if (control < 0)
1691 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1692
1693 nr = ioctl(control, LOOP_CTL_GET_FREE);
1694 if (nr < 0)
1695 return log_error_errno(errno, "Failed to allocate loop device: %m");
1696
1697 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1698 return log_oom();
1699
1700 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1701 if (loop < 0)
1702 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1703
1704 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1705 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1706
1707 if (arg_read_only)
1708 info.lo_flags |= LO_FLAGS_READ_ONLY;
1709
1710 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1711 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1712
1713 *device_path = loopdev;
1714 loopdev = NULL;
1715
1716 *loop_nr = nr;
1717
1718 r = loop;
1719 loop = -1;
1720
1721 return r;
1722 }
1723
/* Explanation appended to the error messages about disk images whose
 * partition table cannot be used. */
#define PARTITION_TABLE_BLURB                                                                                    \
        "Note that the disk image needs to either contain only a single MBR partition of\n"                      \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                          \
        "to be bootable with systemd-nspawn."
1730
/* Analyzes the partition table of the block device open as fd, and picks the
 * partitions to mount as root, /home and /srv.
 *
 * GPT: partitions are selected by type ID; among several partitions of the
 * same type the one with the lowest partition number wins, and partitions
 * flagged GPT_FLAG_NO_AUTO are ignored. MBR: only partitions of type 0x83
 * whose flags equal 0x80 (bootable) are considered, as "generic" ones.
 *
 * The root device is chosen in this order: native root, secondary root,
 * generic Linux partition. A generic partition is only accepted if there's
 * exactly one of them.
 *
 * On success returns 0 and stores newly allocated device node paths (to be
 * freed by the caller) in *root_device and, if found, in *home_device and
 * *srv_device, together with their writability. *secondary tells whether the
 * secondary root was chosen. On failure returns a negative value. When built
 * without HAVE_BLKID always fails with -EOPNOTSUPP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has enumerated as many partitions as blkid
         * found, giving up after 10 attempts. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev list is
                 * expected to contain one entry more than that (the loop
                 * below skips the entry of the whole device itself). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Among several candidates, the one with
                                 * the lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as the generic one. It
                                 * must not be stored in "root": the check
                                 * above looks at "generic", and a second
                                 * bootable Linux partition would otherwise go
                                 * unnoticed and silently replace the first. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Hand the strings over to the caller; the local pointers are reset
         * so that the cleanup handlers don't free them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2100
/* Mounts the block device what on where (with directory appended, if that is
 * non-NULL), using the file system type detected by blkid.
 *
 * The mount is read-only if rw is false or arg_read_only is set. LUKS
 * volumes are refused with -EOPNOTSUPP. Returns 0 on success, a negative
 * value on failure. When built without HAVE_BLKID always fails with
 * -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        /* blkid_do_safeprobe() is documented to return 1 if nothing was
         * detected, -2 if the result is ambivalent, and -1 on error. Only
         * the first two mean "cannot determine"; a real error takes the
         * errno path below. This matches the check in dissect_image(). */
        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2162
2163 static int mount_devices(
2164 const char *where,
2165 const char *root_device, bool root_device_rw,
2166 const char *home_device, bool home_device_rw,
2167 const char *srv_device, bool srv_device_rw) {
2168 int r;
2169
2170 assert(where);
2171
2172 if (root_device) {
2173 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2174 if (r < 0)
2175 return log_error_errno(r, "Failed to mount root directory: %m");
2176 }
2177
2178 if (home_device) {
2179 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2180 if (r < 0)
2181 return log_error_errno(r, "Failed to mount home directory: %m");
2182 }
2183
2184 if (srv_device) {
2185 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2186 if (r < 0)
2187 return log_error_errno(r, "Failed to mount server data directory: %m");
2188 }
2189
2190 return 0;
2191 }
2192
2193 static void loop_remove(int nr, int *image_fd) {
2194 _cleanup_close_ int control = -1;
2195 int r;
2196
2197 if (nr < 0)
2198 return;
2199
2200 if (image_fd && *image_fd >= 0) {
2201 r = ioctl(*image_fd, LOOP_CLR_FD);
2202 if (r < 0)
2203 log_debug_errno(errno, "Failed to close loop image: %m");
2204 *image_fd = safe_close(*image_fd);
2205 }
2206
2207 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2208 if (control < 0) {
2209 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2210 return;
2211 }
2212
2213 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2214 if (r < 0)
2215 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2216 }
2217
2218 /*
2219 * Return values:
2220 * < 0 : wait_for_terminate() failed to get the state of the
2221 * container, the container was terminated by a signal, or
2222 * failed for an unknown reason. No change is made to the
2223 * container argument.
2224 * > 0 : The program executed in the container terminated with an
2225 * error. The exit code of the program executed in the
2226 * container is returned. The container argument has been set
2227 * to CONTAINER_TERMINATED.
2228 * 0 : The container is being rebooted, has been shut down or exited
2229 * successfully. The container argument has been set to either
2230 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2231 *
2232 * That is, success is indicated by a return value of zero, and an
2233 * error is indicated by a non-zero value.
2234 */
2235 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2236 siginfo_t status;
2237 int r;
2238
2239 r = wait_for_terminate(pid, &status);
2240 if (r < 0)
2241 return log_warning_errno(r, "Failed to wait for container: %m");
2242
2243 switch (status.si_code) {
2244
2245 case CLD_EXITED:
2246 if (status.si_status == 0) {
2247 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2248
2249 } else
2250 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2251
2252 *container = CONTAINER_TERMINATED;
2253 return status.si_status;
2254
2255 case CLD_KILLED:
2256 if (status.si_status == SIGINT) {
2257
2258 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2259 *container = CONTAINER_TERMINATED;
2260 return 0;
2261
2262 } else if (status.si_status == SIGHUP) {
2263
2264 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2265 *container = CONTAINER_REBOOTED;
2266 return 0;
2267 }
2268
2269 /* CLD_KILLED fallthrough */
2270
2271 case CLD_DUMPED:
2272 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2273 return -EIO;
2274
2275 default:
2276 log_error("Container %s failed due to unknown reason.", arg_machine);
2277 return -EIO;
2278 }
2279
2280 return r;
2281 }
2282
2283 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2284 pid_t pid;
2285
2286 pid = PTR_TO_UINT32(userdata);
2287 if (pid > 0) {
2288 if (kill(pid, arg_kill_signal) >= 0) {
2289 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2290 sd_event_source_set_userdata(s, NULL);
2291 return 0;
2292 }
2293 }
2294
2295 sd_event_exit(sd_event_source_get_event(s), 0);
2296 return 0;
2297 }
2298
2299 static int determine_names(void) {
2300 int r;
2301
2302 if (arg_template && !arg_directory && arg_machine) {
2303
2304 /* If --template= was specified then we should not
2305 * search for a machine, but instead create a new one
2306 * in /var/lib/machine. */
2307
2308 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2309 if (!arg_directory)
2310 return log_oom();
2311 }
2312
2313 if (!arg_image && !arg_directory) {
2314 if (arg_machine) {
2315 _cleanup_(image_unrefp) Image *i = NULL;
2316
2317 r = image_find(arg_machine, &i);
2318 if (r < 0)
2319 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2320 else if (r == 0) {
2321 log_error("No image for machine '%s': %m", arg_machine);
2322 return -ENOENT;
2323 }
2324
2325 if (i->type == IMAGE_RAW)
2326 r = free_and_strdup(&arg_image, i->path);
2327 else
2328 r = free_and_strdup(&arg_directory, i->path);
2329 if (r < 0)
2330 return log_error_errno(r, "Invalid image directory: %m");
2331
2332 if (!arg_ephemeral)
2333 arg_read_only = arg_read_only || i->read_only;
2334 } else
2335 arg_directory = get_current_dir_name();
2336
2337 if (!arg_directory && !arg_machine) {
2338 log_error("Failed to determine path, please use -D or -i.");
2339 return -EINVAL;
2340 }
2341 }
2342
2343 if (!arg_machine) {
2344 if (arg_directory && path_equal(arg_directory, "/"))
2345 arg_machine = gethostname_malloc();
2346 else
2347 arg_machine = strdup(basename(arg_image ?: arg_directory));
2348
2349 if (!arg_machine)
2350 return log_oom();
2351
2352 hostname_cleanup(arg_machine);
2353 if (!machine_name_is_valid(arg_machine)) {
2354 log_error("Failed to determine machine name automatically, please use -M.");
2355 return -EINVAL;
2356 }
2357
2358 if (arg_ephemeral) {
2359 char *b;
2360
2361 /* Add a random suffix when this is an
2362 * ephemeral machine, so that we can run many
2363 * instances at once without manually having
2364 * to specify -M each time. */
2365
2366 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2367 return log_oom();
2368
2369 free(arg_machine);
2370 arg_machine = b;
2371 }
2372 }
2373
2374 return 0;
2375 }
2376
2377 static int determine_uid_shift(const char *directory) {
2378 int r;
2379
2380 if (!arg_userns) {
2381 arg_uid_shift = 0;
2382 return 0;
2383 }
2384
2385 if (arg_uid_shift == UID_INVALID) {
2386 struct stat st;
2387
2388 r = stat(directory, &st);
2389 if (r < 0)
2390 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2391
2392 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2393
2394 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2395 log_error("UID and GID base of %s don't match.", directory);
2396 return -EINVAL;
2397 }
2398
2399 arg_uid_range = UINT32_C(0x10000);
2400 }
2401
2402 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2403 log_error("UID base too high for UID range.");
2404 return -EINVAL;
2405 }
2406
2407 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2408 return 0;
2409 }
2410
2411 static int inner_child(
2412 Barrier *barrier,
2413 const char *directory,
2414 bool secondary,
2415 int kmsg_socket,
2416 int rtnl_socket,
2417 FDSet *fds) {
2418
2419 _cleanup_free_ char *home = NULL;
2420 unsigned n_env = 1;
2421 const char *envp[] = {
2422 "PATH=" DEFAULT_PATH_SPLIT_USR,
2423 NULL, /* container */
2424 NULL, /* TERM */
2425 NULL, /* HOME */
2426 NULL, /* USER */
2427 NULL, /* LOGNAME */
2428 NULL, /* container_uuid */
2429 NULL, /* LISTEN_FDS */
2430 NULL, /* LISTEN_PID */
2431 NULL
2432 };
2433
2434 _cleanup_strv_free_ char **env_use = NULL;
2435 int r;
2436
2437 assert(barrier);
2438 assert(directory);
2439 assert(kmsg_socket >= 0);
2440
2441 cg_unified_flush();
2442
2443 if (arg_userns) {
2444 /* Tell the parent, that it now can write the UID map. */
2445 (void) barrier_place(barrier); /* #1 */
2446
2447 /* Wait until the parent wrote the UID map */
2448 if (!barrier_place_and_sync(barrier)) { /* #2 */
2449 log_error("Parent died too early");
2450 return -ESRCH;
2451 }
2452 }
2453
2454 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2455 if (r < 0)
2456 return r;
2457
2458 r = mount_sysfs(NULL);
2459 if (r < 0)
2460 return r;
2461
2462 /* Wait until we are cgroup-ified, so that we
2463 * can mount the right cgroup path writable */
2464 if (!barrier_place_and_sync(barrier)) { /* #3 */
2465 log_error("Parent died too early");
2466 return -ESRCH;
2467 }
2468
2469 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2470 if (r < 0)
2471 return r;
2472
2473 r = reset_uid_gid();
2474 if (r < 0)
2475 return log_error_errno(r, "Couldn't become new root: %m");
2476
2477 r = setup_boot_id(NULL);
2478 if (r < 0)
2479 return r;
2480
2481 r = setup_kmsg(NULL, kmsg_socket);
2482 if (r < 0)
2483 return r;
2484 kmsg_socket = safe_close(kmsg_socket);
2485
2486 umask(0022);
2487
2488 if (setsid() < 0)
2489 return log_error_errno(errno, "setsid() failed: %m");
2490
2491 if (arg_private_network)
2492 loopback_setup();
2493
2494 if (arg_expose_ports) {
2495 r = expose_port_send_rtnl(rtnl_socket);
2496 if (r < 0)
2497 return r;
2498 rtnl_socket = safe_close(rtnl_socket);
2499 }
2500
2501 r = drop_capabilities();
2502 if (r < 0)
2503 return log_error_errno(r, "drop_capabilities() failed: %m");
2504
2505 setup_hostname();
2506
2507 if (arg_personality != PERSONALITY_INVALID) {
2508 if (personality(arg_personality) < 0)
2509 return log_error_errno(errno, "personality() failed: %m");
2510 } else if (secondary) {
2511 if (personality(PER_LINUX32) < 0)
2512 return log_error_errno(errno, "personality() failed: %m");
2513 }
2514
2515 #ifdef HAVE_SELINUX
2516 if (arg_selinux_context)
2517 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2518 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2519 #endif
2520
2521 r = change_uid_gid(arg_user, &home);
2522 if (r < 0)
2523 return r;
2524
2525 /* LXC sets container=lxc, so follow the scheme here */
2526 envp[n_env++] = strjoina("container=", arg_container_service_name);
2527
2528 envp[n_env] = strv_find_prefix(environ, "TERM=");
2529 if (envp[n_env])
2530 n_env ++;
2531
2532 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2533 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2534 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2535 return log_oom();
2536
2537 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2538 char as_uuid[37];
2539
2540 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2541 return log_oom();
2542 }
2543
2544 if (fdset_size(fds) > 0) {
2545 r = fdset_cloexec(fds, false);
2546 if (r < 0)
2547 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2548
2549 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2550 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2551 return log_oom();
2552 }
2553
2554 env_use = strv_env_merge(2, envp, arg_setenv);
2555 if (!env_use)
2556 return log_oom();
2557
2558 /* Let the parent know that we are ready and
2559 * wait until the parent is ready with the
2560 * setup, too... */
2561 if (!barrier_place_and_sync(barrier)) { /* #4 */
2562 log_error("Parent died too early");
2563 return -ESRCH;
2564 }
2565
2566 /* Now, explicitly close the log, so that we
2567 * then can close all remaining fds. Closing
2568 * the log explicitly first has the benefit
2569 * that the logging subsystem knows about it,
2570 * and is thus ready to be reopened should we
2571 * need it again. Note that the other fds
2572 * closed here are at least the locking and
2573 * barrier fds. */
2574 log_close();
2575 (void) fdset_close_others(fds);
2576
2577 if (arg_boot) {
2578 char **a;
2579 size_t m;
2580
2581 /* Automatically search for the init system */
2582
2583 m = 1 + strv_length(arg_parameters);
2584 a = newa(char*, m + 1);
2585 if (strv_isempty(arg_parameters))
2586 a[1] = NULL;
2587 else
2588 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2589
2590 a[0] = (char*) "/usr/lib/systemd/systemd";
2591 execve(a[0], a, env_use);
2592
2593 a[0] = (char*) "/lib/systemd/systemd";
2594 execve(a[0], a, env_use);
2595
2596 a[0] = (char*) "/sbin/init";
2597 execve(a[0], a, env_use);
2598 } else if (!strv_isempty(arg_parameters))
2599 execvpe(arg_parameters[0], arg_parameters, env_use);
2600 else {
2601 chdir(home ?: "/root");
2602 execle("/bin/bash", "-bash", NULL, env_use);
2603 execle("/bin/sh", "-sh", NULL, env_use);
2604 }
2605
2606 r = -errno;
2607 (void) log_open();
2608 return log_error_errno(r, "execv() failed: %m");
2609 }
2610
/* Body of the outer child process (the "namespace helper" forked by main()).
 *
 * It optionally re-opens the container console as stdin/stdout/stderr, marks
 * all mounts as slave, mounts the dissected devices, determines the UID shift
 * and reports it over uid_shift_socket (user namespaces only), prepares the
 * container tree below directory, moves the root there, and then forks the
 * inner child with the requested namespaces. The inner child's PID is
 * reported over pid_socket.
 *
 * Returns 0 on success and a negative errno-style value on failure. The
 * forked inner child never returns from here; it calls _exit(). */
2611 static int outer_child(
2612 Barrier *barrier,
2613 const char *directory,
2614 const char *console,
2615 const char *root_device, bool root_device_rw,
2616 const char *home_device, bool home_device_rw,
2617 const char *srv_device, bool srv_device_rw,
2618 bool interactive,
2619 bool secondary,
2620 int pid_socket,
2621 int kmsg_socket,
2622 int rtnl_socket,
2623 int uid_shift_socket,
2624 FDSet *fds) {
2625
2626 pid_t pid;
2627 ssize_t l;
2628 int r;
2629
2630 assert(barrier);
2631 assert(directory);
2632 assert(console);
2633 assert(pid_socket >= 0);
2634 assert(kmsg_socket >= 0);
2635
2636 cg_unified_flush();
2637
/* Ask to be sent SIGKILL when the parent process dies. */
2638 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2639 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2640
2641 if (interactive) {
/* Replace stdin/stdout/stderr with the container console. With fds 0-2
 * closed, the terminal is required to come back as STDIN_FILENO; any other
 * fd is closed again and treated as -EINVAL. */
2642 close_nointr(STDIN_FILENO);
2643 close_nointr(STDOUT_FILENO);
2644 close_nointr(STDERR_FILENO);
2645
2646 r = open_terminal(console, O_RDWR);
2647 if (r != STDIN_FILENO) {
2648 if (r >= 0) {
2649 safe_close(r);
2650 r = -EINVAL;
2651 }
2652
2653 return log_error_errno(r, "Failed to open console: %m");
2654 }
2655
2656 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2657 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2658 return log_error_errno(errno, "Failed to duplicate console: %m");
2659 }
2660
2661 r = reset_audit_loginuid();
2662 if (r < 0)
2663 return r;
2664
2665 /* Mark everything as slave, so that we still
2666 * receive mounts from the real root, but don't
2667 * propagate mounts to the real root. */
2668 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2669 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2670
2671 r = mount_devices(directory,
2672 root_device, root_device_rw,
2673 home_device, home_device_rw,
2674 srv_device, srv_device_rw);
2675 if (r < 0)
2676 return r;
2677
2678 r = determine_uid_shift(directory);
2679 if (r < 0)
2680 return r;
2681
2682 if (arg_userns) {
/* Report the UID shift determined above through uid_shift_socket; a partial
 * write is treated as an error. */
2683 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2684 if (l < 0)
2685 return log_error_errno(errno, "Failed to send UID shift: %m");
2686 if (l != sizeof(arg_uid_shift)) {
2687 log_error("Short write while sending UID shift.");
2688 return -EIO;
2689 }
2690 }
2691
2692 /* Turn directory into bind mount */
2693 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2694 return log_error_errno(errno, "Failed to make bind mount: %m");
2695
2696 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2697 if (r < 0)
2698 return r;
2699
2700 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2701 if (r < 0)
2702 return r;
2703
2704 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2705 if (r < 0)
2706 return r;
2707
2708 if (arg_read_only) {
2709 r = bind_remount_recursive(directory, true);
2710 if (r < 0)
2711 return log_error_errno(r, "Failed to make tree read-only: %m");
2712 }
2713
2714 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2715 if (r < 0)
2716 return r;
2717
2718 r = copy_devnodes(directory);
2719 if (r < 0)
2720 return r;
2721
/* The return value of dev_setup() is not checked here. */
2722 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2723
2724 r = setup_pts(directory);
2725 if (r < 0)
2726 return r;
2727
2728 r = setup_propagate(directory);
2729 if (r < 0)
2730 return r;
2731
2732 r = setup_dev_console(directory, console);
2733 if (r < 0)
2734 return r;
2735
2736 r = setup_seccomp();
2737 if (r < 0)
2738 return r;
2739
2740 r = setup_timezone(directory);
2741 if (r < 0)
2742 return r;
2743
2744 r = setup_resolv_conf(directory);
2745 if (r < 0)
2746 return r;
2747
2748 r = setup_journal(directory);
2749 if (r < 0)
2750 return r;
2751
2752 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2753 if (r < 0)
2754 return r;
2755
2756 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2757 if (r < 0)
2758 return r;
2759
2760 r = mount_move_root(directory);
2761 if (r < 0)
2762 return log_error_errno(r, "Failed to move root directory: %m");
2763
/* Fork the inner child. A new mount namespace is always requested; IPC, PID
 * and UTS namespaces unless arg_share_system is set; network and user
 * namespaces only when arg_private_network / arg_userns are set. */
2764 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2765 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2766 (arg_private_network ? CLONE_NEWNET : 0) |
2767 (arg_userns ? CLONE_NEWUSER : 0),
2768 NULL);
2769 if (pid < 0)
2770 return log_error_errno(errno, "Failed to fork inner child: %m");
2771 if (pid == 0) {
/* In the inner child: these two sockets are only used by this (outer)
 * process, so close the inherited copies. */
2772 pid_socket = safe_close(pid_socket);
2773 uid_shift_socket = safe_close(uid_shift_socket);
2774
2775 /* The inner child has all namespaces that are
2776 * requested, so that we all are owned by the user if
2777 * user namespaces are turned on. */
2778
2779 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2780 if (r < 0)
2781 _exit(EXIT_FAILURE);
2782
2783 _exit(EXIT_SUCCESS);
2784 }
2785
/* Report the inner child's PID through pid_socket; a partial write is
 * treated as an error. */
2786 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2787 if (l < 0)
2788 return log_error_errno(errno, "Failed to send PID: %m");
2789 if (l != sizeof(pid)) {
2790 log_error("Short write while sending PID.");
2791 return -EIO;
2792 }
2793
2794 pid_socket = safe_close(pid_socket);
2795 kmsg_socket = safe_close(kmsg_socket);
2796 rtnl_socket = safe_close(rtnl_socket);
2797
2798 return 0;
2799 }
2800
2801 static int setup_uid_map(pid_t pid) {
2802 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2803 int r;
2804
2805 assert(pid > 1);
2806
2807 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2808 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2809 r = write_string_file(uid_map, line, 0);
2810 if (r < 0)
2811 return log_error_errno(r, "Failed to write UID map: %m");
2812
2813 /* We always assign the same UID and GID ranges */
2814 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2815 r = write_string_file(uid_map, line, 0);
2816 if (r < 0)
2817 return log_error_errno(r, "Failed to write GID map: %m");
2818
2819 return 0;
2820 }
2821
2822 static int load_settings(void) {
2823 _cleanup_(settings_freep) Settings *settings = NULL;
2824 _cleanup_fclose_ FILE *f = NULL;
2825 _cleanup_free_ char *p = NULL;
2826 const char *fn, *i;
2827 int r;
2828
2829 /* If all settings are masked, there's no point in looking for
2830 * the settings file */
2831 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2832 return 0;
2833
2834 fn = strjoina(arg_machine, ".nspawn");
2835
2836 /* We first look in the admin's directories in /etc and /run */
2837 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2838 _cleanup_free_ char *j = NULL;
2839
2840 j = strjoin(i, "/", fn, NULL);
2841 if (!j)
2842 return log_oom();
2843
2844 f = fopen(j, "re");
2845 if (f) {
2846 p = j;
2847 j = NULL;
2848
2849 /* By default, we trust configuration from /etc and /run */
2850 if (arg_settings_trusted < 0)
2851 arg_settings_trusted = true;
2852
2853 break;
2854 }
2855
2856 if (errno != ENOENT)
2857 return log_error_errno(errno, "Failed to open %s: %m", j);
2858 }
2859
2860 if (!f) {
2861 /* After that, let's look for a file next to the
2862 * actual image we shall boot. */
2863
2864 if (arg_image) {
2865 p = file_in_same_dir(arg_image, fn);
2866 if (!p)
2867 return log_oom();
2868 } else if (arg_directory) {
2869 p = file_in_same_dir(arg_directory, fn);
2870 if (!p)
2871 return log_oom();
2872 }
2873
2874 if (p) {
2875 f = fopen(p, "re");
2876 if (!f && errno != ENOENT)
2877 return log_error_errno(errno, "Failed to open %s: %m", p);
2878
2879 /* By default, we do not trust configuration from /var/lib/machines */
2880 if (arg_settings_trusted < 0)
2881 arg_settings_trusted = false;
2882 }
2883 }
2884
2885 if (!f)
2886 return 0;
2887
2888 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2889
2890 r = settings_load(f, p, &settings);
2891 if (r < 0)
2892 return r;
2893
2894 /* Copy over bits from the settings, unless they have been
2895 * explicitly masked by command line switches. */
2896
2897 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2898 settings->boot >= 0) {
2899 arg_boot = settings->boot;
2900
2901 strv_free(arg_parameters);
2902 arg_parameters = settings->parameters;
2903 settings->parameters = NULL;
2904 }
2905
2906 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2907 settings->environment) {
2908 strv_free(arg_setenv);
2909 arg_setenv = settings->environment;
2910 settings->environment = NULL;
2911 }
2912
2913 if ((arg_settings_mask & SETTING_USER) == 0 &&
2914 settings->user) {
2915 free(arg_user);
2916 arg_user = settings->user;
2917 settings->user = NULL;
2918 }
2919
2920 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2921 uint64_t plus;
2922
2923 plus = settings->capability;
2924 if (settings_private_network(settings))
2925 plus |= (1ULL << CAP_NET_ADMIN);
2926
2927 if (!arg_settings_trusted && plus != 0) {
2928 if (settings->capability != 0)
2929 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2930 } else
2931 arg_retain |= plus;
2932
2933 arg_retain &= ~settings->drop_capability;
2934 }
2935
2936 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2937 settings->kill_signal > 0)
2938 arg_kill_signal = settings->kill_signal;
2939
2940 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2941 settings->personality != PERSONALITY_INVALID)
2942 arg_personality = settings->personality;
2943
2944 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2945 !sd_id128_is_null(settings->machine_id)) {
2946
2947 if (!arg_settings_trusted)
2948 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2949 else
2950 arg_uuid = settings->machine_id;
2951 }
2952
2953 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2954 settings->read_only >= 0)
2955 arg_read_only = settings->read_only;
2956
2957 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2958 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2959 arg_volatile_mode = settings->volatile_mode;
2960
2961 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2962 settings->n_custom_mounts > 0) {
2963
2964 if (!arg_settings_trusted)
2965 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2966 else {
2967 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2968 arg_custom_mounts = settings->custom_mounts;
2969 arg_n_custom_mounts = settings->n_custom_mounts;
2970
2971 settings->custom_mounts = NULL;
2972 settings->n_custom_mounts = 0;
2973 }
2974 }
2975
2976 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2977 (settings->private_network >= 0 ||
2978 settings->network_veth >= 0 ||
2979 settings->network_bridge ||
2980 settings->network_interfaces ||
2981 settings->network_macvlan ||
2982 settings->network_ipvlan ||
2983 settings->network_veth_extra)) {
2984
2985 if (!arg_settings_trusted)
2986 log_warning("Ignoring network settings, file %s is not trusted.", p);
2987 else {
2988 arg_network_veth = settings_network_veth(settings);
2989 arg_private_network = settings_private_network(settings);
2990
2991 strv_free(arg_network_interfaces);
2992 arg_network_interfaces = settings->network_interfaces;
2993 settings->network_interfaces = NULL;
2994
2995 strv_free(arg_network_macvlan);
2996 arg_network_macvlan = settings->network_macvlan;
2997 settings->network_macvlan = NULL;
2998
2999 strv_free(arg_network_ipvlan);
3000 arg_network_ipvlan = settings->network_ipvlan;
3001 settings->network_ipvlan = NULL;
3002
3003 strv_free(arg_network_veth_extra);
3004 arg_network_veth_extra = settings->network_veth_extra;
3005 settings->network_veth_extra = NULL;
3006
3007 free(arg_network_bridge);
3008 arg_network_bridge = settings->network_bridge;
3009 settings->network_bridge = NULL;
3010 }
3011 }
3012
3013 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3014 settings->expose_ports) {
3015
3016 if (!arg_settings_trusted)
3017 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3018 else {
3019 expose_port_free_all(arg_expose_ports);
3020 arg_expose_ports = settings->expose_ports;
3021 settings->expose_ports = NULL;
3022 }
3023 }
3024
3025 return 0;
3026 }
3027
3028 int main(int argc, char *argv[]) {
3029
3030 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3031 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3032 _cleanup_close_ int master = -1, image_fd = -1;
3033 _cleanup_fdset_free_ FDSet *fds = NULL;
3034 int r, n_fd_passed, loop_nr = -1;
3035 char veth_name[IFNAMSIZ];
3036 bool secondary = false, remove_subvol = false;
3037 sigset_t mask_chld;
3038 pid_t pid = 0;
3039 int ret = EXIT_SUCCESS;
3040 union in_addr_union exposed = {};
3041 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3042 bool interactive;
3043
3044 log_parse_environment();
3045 log_open();
3046
3047 r = parse_argv(argc, argv);
3048 if (r <= 0)
3049 goto finish;
3050
3051 if (geteuid() != 0) {
3052 log_error("Need to be root.");
3053 r = -EPERM;
3054 goto finish;
3055 }
3056 r = determine_names();
3057 if (r < 0)
3058 goto finish;
3059
3060 r = load_settings();
3061 if (r < 0)
3062 goto finish;
3063
3064 r = verify_arguments();
3065 if (r < 0)
3066 goto finish;
3067
3068 n_fd_passed = sd_listen_fds(false);
3069 if (n_fd_passed > 0) {
3070 r = fdset_new_listen_fds(&fds, false);
3071 if (r < 0) {
3072 log_error_errno(r, "Failed to collect file descriptors: %m");
3073 goto finish;
3074 }
3075 }
3076
3077 if (arg_directory) {
3078 assert(!arg_image);
3079
3080 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3081 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3082 r = -EINVAL;
3083 goto finish;
3084 }
3085
3086 if (arg_ephemeral) {
3087 _cleanup_free_ char *np = NULL;
3088
3089 /* If the specified path is a mount point we
3090 * generate the new snapshot immediately
3091 * inside it under a random name. However if
3092 * the specified is not a mount point we
3093 * create the new snapshot in the parent
3094 * directory, just next to it. */
3095 r = path_is_mount_point(arg_directory, 0);
3096 if (r < 0) {
3097 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3098 goto finish;
3099 }
3100 if (r > 0)
3101 r = tempfn_random_child(arg_directory, "machine.", &np);
3102 else
3103 r = tempfn_random(arg_directory, "machine.", &np);
3104 if (r < 0) {
3105 log_error_errno(r, "Failed to generate name for snapshot: %m");
3106 goto finish;
3107 }
3108
3109 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3110 if (r < 0) {
3111 log_error_errno(r, "Failed to lock %s: %m", np);
3112 goto finish;
3113 }
3114
3115 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3116 if (r < 0) {
3117 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3118 goto finish;
3119 }
3120
3121 free(arg_directory);
3122 arg_directory = np;
3123 np = NULL;
3124
3125 remove_subvol = true;
3126
3127 } else {
3128 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3129 if (r == -EBUSY) {
3130 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3131 goto finish;
3132 }
3133 if (r < 0) {
3134 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3135 return r;
3136 }
3137
3138 if (arg_template) {
3139 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3140 if (r == -EEXIST) {
3141 if (!arg_quiet)
3142 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3143 } else if (r < 0) {
3144 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3145 goto finish;
3146 } else {
3147 if (!arg_quiet)
3148 log_info("Populated %s from template %s.", arg_directory, arg_template);
3149 }
3150 }
3151 }
3152
3153 if (arg_boot) {
3154 if (path_is_os_tree(arg_directory) <= 0) {
3155 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3156 r = -EINVAL;
3157 goto finish;
3158 }
3159 } else {
3160 const char *p;
3161
3162 p = strjoina(arg_directory, "/usr/");
3163 if (laccess(p, F_OK) < 0) {
3164 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3165 r = -EINVAL;
3166 goto finish;
3167 }
3168 }
3169
3170 } else {
3171 char template[] = "/tmp/nspawn-root-XXXXXX";
3172
3173 assert(arg_image);
3174 assert(!arg_template);
3175
3176 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3177 if (r == -EBUSY) {
3178 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3179 goto finish;
3180 }
3181 if (r < 0) {
3182 r = log_error_errno(r, "Failed to create image lock: %m");
3183 goto finish;
3184 }
3185
3186 if (!mkdtemp(template)) {
3187 log_error_errno(errno, "Failed to create temporary directory: %m");
3188 r = -errno;
3189 goto finish;
3190 }
3191
3192 arg_directory = strdup(template);
3193 if (!arg_directory) {
3194 r = log_oom();
3195 goto finish;
3196 }
3197
3198 image_fd = setup_image(&device_path, &loop_nr);
3199 if (image_fd < 0) {
3200 r = image_fd;
3201 goto finish;
3202 }
3203
3204 r = dissect_image(image_fd,
3205 &root_device, &root_device_rw,
3206 &home_device, &home_device_rw,
3207 &srv_device, &srv_device_rw,
3208 &secondary);
3209 if (r < 0)
3210 goto finish;
3211 }
3212
3213 r = custom_mounts_prepare();
3214 if (r < 0)
3215 goto finish;
3216
3217 interactive =
3218 isatty(STDIN_FILENO) > 0 &&
3219 isatty(STDOUT_FILENO) > 0;
3220
3221 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3222 if (master < 0) {
3223 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3224 goto finish;
3225 }
3226
3227 r = ptsname_malloc(master, &console);
3228 if (r < 0) {
3229 r = log_error_errno(r, "Failed to determine tty name: %m");
3230 goto finish;
3231 }
3232
3233 if (unlockpt(master) < 0) {
3234 r = log_error_errno(errno, "Failed to unlock tty: %m");
3235 goto finish;
3236 }
3237
3238 if (!arg_quiet)
3239 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3240 arg_machine, arg_image ?: arg_directory);
3241
3242 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3243
3244 assert_se(sigemptyset(&mask_chld) == 0);
3245 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3246
3247 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3248 r = log_error_errno(errno, "Failed to become subreaper: %m");
3249 goto finish;
3250 }
3251
3252 for (;;) {
3253 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
3254 ContainerStatus container_status;
3255 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3256 static const struct sigaction sa = {
3257 .sa_handler = nop_signal_handler,
3258 .sa_flags = SA_NOCLDSTOP,
3259 };
3260 int ifi = 0;
3261 ssize_t l;
3262 _cleanup_event_unref_ sd_event *event = NULL;
3263 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3264 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3265 char last_char = 0;
3266
3267 r = barrier_create(&barrier);
3268 if (r < 0) {
3269 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3270 goto finish;
3271 }
3272
3273 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3274 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3275 goto finish;
3276 }
3277
3278 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3279 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3280 goto finish;
3281 }
3282
3283 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3284 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3285 goto finish;
3286 }
3287
3288 if (arg_userns)
3289 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3290 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3291 goto finish;
3292 }
3293
3294 /* Child can be killed before execv(), so handle SIGCHLD
3295 * in order to interrupt parent's blocking calls and
3296 * give it a chance to call wait() and terminate. */
3297 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3298 if (r < 0) {
3299 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3300 goto finish;
3301 }
3302
3303 r = sigaction(SIGCHLD, &sa, NULL);
3304 if (r < 0) {
3305 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3306 goto finish;
3307 }
3308
3309 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3310 if (pid < 0) {
3311 if (errno == EINVAL)
3312 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3313 else
3314 r = log_error_errno(errno, "clone() failed: %m");
3315
3316 goto finish;
3317 }
3318
3319 if (pid == 0) {
3320 /* The outer child only has a file system namespace. */
3321 barrier_set_role(&barrier, BARRIER_CHILD);
3322
3323 master = safe_close(master);
3324
3325 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3326 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3327 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3328 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3329
3330 (void) reset_all_signal_handlers();
3331 (void) reset_signal_mask();
3332
3333 r = outer_child(&barrier,
3334 arg_directory,
3335 console,
3336 root_device, root_device_rw,
3337 home_device, home_device_rw,
3338 srv_device, srv_device_rw,
3339 interactive,
3340 secondary,
3341 pid_socket_pair[1],
3342 kmsg_socket_pair[1],
3343 rtnl_socket_pair[1],
3344 uid_shift_socket_pair[1],
3345 fds);
3346 if (r < 0)
3347 _exit(EXIT_FAILURE);
3348
3349 _exit(EXIT_SUCCESS);
3350 }
3351
3352 barrier_set_role(&barrier, BARRIER_PARENT);
3353
3354 fds = fdset_free(fds);
3355
3356 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3357 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3358 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3359 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3360
3361 /* Wait for the outer child. */
3362 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3363 if (r < 0)
3364 goto finish;
3365 if (r != 0) {
3366 r = -EIO;
3367 goto finish;
3368 }
3369 pid = 0;
3370
3371 /* And now retrieve the PID of the inner child. */
3372 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3373 if (l < 0) {
3374 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3375 goto finish;
3376 }
3377 if (l != sizeof(pid)) {
3378 log_error("Short read while reading inner child PID.");
3379 r = EIO;
3380 goto finish;
3381 }
3382
3383 log_debug("Init process invoked as PID " PID_FMT, pid);
3384
3385 if (arg_userns) {
3386 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3387 log_error("Child died too early.");
3388 r = -ESRCH;
3389 goto finish;
3390 }
3391
3392 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3393 if (l < 0) {
3394 r = log_error_errno(errno, "Failed to read UID shift: %m");
3395 goto finish;
3396 }
3397 if (l != sizeof(arg_uid_shift)) {
3398 log_error("Short read while reading UID shift.");
3399 r = EIO;
3400 goto finish;
3401 }
3402
3403 r = setup_uid_map(pid);
3404 if (r < 0)
3405 goto finish;
3406
3407 (void) barrier_place(&barrier); /* #2 */
3408 }
3409
3410 if (arg_private_network) {
3411
3412 r = move_network_interfaces(pid, arg_network_interfaces);
3413 if (r < 0)
3414 goto finish;
3415
3416 if (arg_network_veth) {
3417 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3418 if (r < 0)
3419 goto finish;
3420 else if (r > 0)
3421 ifi = r;
3422
3423 if (arg_network_bridge) {
3424 r = setup_bridge(veth_name, arg_network_bridge);
3425 if (r < 0)
3426 goto finish;
3427 if (r > 0)
3428 ifi = r;
3429 }
3430 }
3431
3432 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3433 if (r < 0)
3434 goto finish;
3435
3436 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3437 if (r < 0)
3438 goto finish;
3439
3440 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3441 if (r < 0)
3442 goto finish;
3443 }
3444
3445 if (arg_register) {
3446 r = register_machine(
3447 arg_machine,
3448 pid,
3449 arg_directory,
3450 arg_uuid,
3451 ifi,
3452 arg_slice,
3453 arg_custom_mounts, arg_n_custom_mounts,
3454 arg_kill_signal,
3455 arg_property,
3456 arg_keep_unit,
3457 arg_container_service_name);
3458 if (r < 0)
3459 goto finish;
3460 }
3461
3462 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3463 if (r < 0)
3464 goto finish;
3465
3466 if (arg_keep_unit) {
3467 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3468 if (r < 0)
3469 goto finish;
3470 }
3471
3472 r = chown_cgroup(pid, arg_uid_shift);
3473 if (r < 0)
3474 goto finish;
3475
3476 /* Notify the child that the parent is ready with all
3477 * its setup (including cgroup-ification), and that
3478 * the child can now hand over control to the code to
3479 * run inside the container. */
3480 (void) barrier_place(&barrier); /* #3 */
3481
3482 /* Block SIGCHLD here, before notifying child.
3483 * process_pty() will handle it with the other signals. */
3484 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3485
3486 /* Reset signal to default */
3487 r = default_signals(SIGCHLD, -1);
3488 if (r < 0) {
3489 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3490 goto finish;
3491 }
3492
3493 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3494 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3495 log_error("Child died too early.");
3496 r = -ESRCH;
3497 goto finish;
3498 }
3499
3500 sd_notifyf(false,
3501 "READY=1\n"
3502 "STATUS=Container running.\n"
3503 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3504
3505 r = sd_event_new(&event);
3506 if (r < 0) {
3507 log_error_errno(r, "Failed to get default event source: %m");
3508 goto finish;
3509 }
3510
3511 if (arg_kill_signal > 0) {
3512 /* Try to kill the init system on SIGINT or SIGTERM */
3513 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3514 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3515 } else {
3516 /* Immediately exit */
3517 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3518 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3519 }
3520
3521 /* simply exit on sigchld */
3522 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3523
3524 if (arg_expose_ports) {
3525 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3526 if (r < 0)
3527 goto finish;
3528
3529 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3530 }
3531
3532 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3533
3534 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3535 if (r < 0) {
3536 log_error_errno(r, "Failed to create PTY forwarder: %m");
3537 goto finish;
3538 }
3539
3540 r = sd_event_loop(event);
3541 if (r < 0) {
3542 log_error_errno(r, "Failed to run event loop: %m");
3543 goto finish;
3544 }
3545
3546 pty_forward_get_last_char(forward, &last_char);
3547
3548 forward = pty_forward_free(forward);
3549
3550 if (!arg_quiet && last_char != '\n')
3551 putc('\n', stdout);
3552
3553 /* Kill if it is not dead yet anyway */
3554 if (arg_register && !arg_keep_unit)
3555 terminate_machine(pid);
3556
3557 /* Normally redundant, but better safe than sorry */
3558 kill(pid, SIGKILL);
3559
3560 r = wait_for_container(pid, &container_status);
3561 pid = 0;
3562
3563 if (r < 0)
3564 /* We failed to wait for the container, or the
3565 * container exited abnormally */
3566 goto finish;
3567 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3568 /* The container exited with a non-zero
3569 * status, or with zero status and no reboot
3570 * was requested. */
3571 ret = r;
3572 break;
3573 }
3574
3575 /* CONTAINER_REBOOTED, loop again */
3576
3577 if (arg_keep_unit) {
3578 /* Special handling if we are running as a
3579 * service: instead of simply restarting the
3580 * machine we want to restart the entire
3581 * service, so let's inform systemd about this
3582 * with the special exit code 133. The service
3583 * file uses RestartForceExitStatus=133 so
3584 * that this results in a full nspawn
3585 * restart. This is necessary since we might
3586 * have cgroup parameters set we want to have
3587 * flushed out. */
3588 ret = 133;
3589 r = 0;
3590 break;
3591 }
3592
3593 expose_port_flush(arg_expose_ports, &exposed);
3594 }
3595
3596 finish:
3597 sd_notify(false,
3598 "STOPPING=1\n"
3599 "STATUS=Terminating...");
3600
3601 if (pid > 0)
3602 kill(pid, SIGKILL);
3603
3604 /* Try to flush whatever is still queued in the pty */
3605 if (master >= 0)
3606 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3607
3608 loop_remove(loop_nr, &image_fd);
3609
3610 if (remove_subvol && arg_directory) {
3611 int k;
3612
3613 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3614 if (k < 0)
3615 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3616 }
3617
3618 if (arg_machine) {
3619 const char *p;
3620
3621 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3622 (void) rm_rf(p, REMOVE_ROOT);
3623 }
3624
3625 expose_port_flush(arg_expose_ports, &exposed);
3626
3627 free(arg_directory);
3628 free(arg_template);
3629 free(arg_image);
3630 free(arg_machine);
3631 free(arg_user);
3632 strv_free(arg_setenv);
3633 free(arg_network_bridge);
3634 strv_free(arg_network_interfaces);
3635 strv_free(arg_network_macvlan);
3636 strv_free(arg_network_ipvlan);
3637 strv_free(arg_network_veth_extra);
3638 strv_free(arg_parameters);
3639 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3640 expose_port_free_all(arg_expose_ports);
3641
3642 return r < 0 ? EXIT_FAILURE : ret;
3643 }