]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: rework how we determine private networking settings
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "cap-list.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
56 #include "copy.h"
57 #include "dev-setup.h"
58 #include "env-util.h"
59 #include "event-util.h"
60 #include "fdset.h"
61 #include "fileio.h"
62 #include "formats-util.h"
63 #include "gpt.h"
64 #include "hostname-util.h"
65 #include "log.h"
66 #include "loopback-setup.h"
67 #include "machine-image.h"
68 #include "macro.h"
69 #include "missing.h"
70 #include "mkdir.h"
71 #include "netlink-util.h"
72 #include "path-util.h"
73 #include "process-util.h"
74 #include "ptyfwd.h"
75 #include "random-util.h"
76 #include "rm-rf.h"
77 #ifdef HAVE_SECCOMP
78 #include "seccomp-util.h"
79 #endif
80 #include "signal-util.h"
81 #include "strv.h"
82 #include "terminal-util.h"
83 #include "udev-util.h"
84 #include "util.h"
85
86 #include "nspawn-cgroup.h"
87 #include "nspawn-expose-ports.h"
88 #include "nspawn-mount.h"
89 #include "nspawn-network.h"
90 #include "nspawn-register.h"
91 #include "nspawn-settings.h"
92 #include "nspawn-setuid.h"
93
/* Result classification for a container run. The code consuming these values is outside this section. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
98
/* Journal linking mode, as selected by --link-journal= or -j in parse_argv(). The "try-" variants of
 * the command line values map to the same enumerators and only differ in arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO = 1,  /* --link-journal=auto; also the initial value of arg_link_journal */
        LINK_HOST = 2,  /* --link-journal=host and try-host */
        LINK_GUEST = 3  /* --link-journal=guest, try-guest and -j */
} LinkJournal;
105
106 static char *arg_directory = NULL;
107 static char *arg_template = NULL;
108 static char *arg_user = NULL;
109 static sd_id128_t arg_uuid = {};
110 static char *arg_machine = NULL;
111 static const char *arg_selinux_context = NULL;
112 static const char *arg_selinux_apifs_context = NULL;
113 static const char *arg_slice = NULL;
114 static bool arg_private_network = false;
115 static bool arg_read_only = false;
116 static bool arg_boot = false;
117 static bool arg_ephemeral = false;
118 static LinkJournal arg_link_journal = LINK_AUTO;
119 static bool arg_link_journal_try = false;
120 static uint64_t arg_retain =
121 (1ULL << CAP_CHOWN) |
122 (1ULL << CAP_DAC_OVERRIDE) |
123 (1ULL << CAP_DAC_READ_SEARCH) |
124 (1ULL << CAP_FOWNER) |
125 (1ULL << CAP_FSETID) |
126 (1ULL << CAP_IPC_OWNER) |
127 (1ULL << CAP_KILL) |
128 (1ULL << CAP_LEASE) |
129 (1ULL << CAP_LINUX_IMMUTABLE) |
130 (1ULL << CAP_NET_BIND_SERVICE) |
131 (1ULL << CAP_NET_BROADCAST) |
132 (1ULL << CAP_NET_RAW) |
133 (1ULL << CAP_SETGID) |
134 (1ULL << CAP_SETFCAP) |
135 (1ULL << CAP_SETPCAP) |
136 (1ULL << CAP_SETUID) |
137 (1ULL << CAP_SYS_ADMIN) |
138 (1ULL << CAP_SYS_CHROOT) |
139 (1ULL << CAP_SYS_NICE) |
140 (1ULL << CAP_SYS_PTRACE) |
141 (1ULL << CAP_SYS_TTY_CONFIG) |
142 (1ULL << CAP_SYS_RESOURCE) |
143 (1ULL << CAP_SYS_BOOT) |
144 (1ULL << CAP_AUDIT_WRITE) |
145 (1ULL << CAP_AUDIT_CONTROL) |
146 (1ULL << CAP_MKNOD);
147 static CustomMount *arg_custom_mounts = NULL;
148 static unsigned arg_n_custom_mounts = 0;
149 static char **arg_setenv = NULL;
150 static bool arg_quiet = false;
151 static bool arg_share_system = false;
152 static bool arg_register = true;
153 static bool arg_keep_unit = false;
154 static char **arg_network_interfaces = NULL;
155 static char **arg_network_macvlan = NULL;
156 static char **arg_network_ipvlan = NULL;
157 static bool arg_network_veth = false;
158 static char *arg_network_bridge = NULL;
159 static unsigned long arg_personality = PERSONALITY_INVALID;
160 static char *arg_image = NULL;
161 static VolatileMode arg_volatile_mode = VOLATILE_NO;
162 static ExposePort *arg_expose_ports = NULL;
163 static char **arg_property = NULL;
164 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
165 static bool arg_userns = false;
166 static int arg_kill_signal = 0;
167 static bool arg_unified_cgroup_hierarchy = false;
168 static SettingsMask arg_settings_mask = 0;
169 static int arg_settings_trusted = -1;
170 static char **arg_parameters = NULL;
171
172 static void help(void) {
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
175 " -h --help Show this help\n"
176 " --version Print version string\n"
177 " -q --quiet Do not show status information\n"
178 " -D --directory=PATH Root directory for the container\n"
179 " --template=PATH Initialize root directory from template directory,\n"
180 " if missing\n"
181 " -x --ephemeral Run container with snapshot of root directory, and\n"
182 " remove it after exit\n"
183 " -i --image=PATH File system device or disk image for the container\n"
184 " -b --boot Boot up full system (i.e. invoke init)\n"
185 " -u --user=USER Run the command under specified user or uid\n"
186 " -M --machine=NAME Set the machine name for the container\n"
187 " --uuid=UUID Set a specific machine UUID for the container\n"
188 " -S --slice=SLICE Place the container in the specified slice\n"
189 " --property=NAME=VALUE Set scope unit property\n"
190 " --private-users[=UIDBASE[:NUIDS]]\n"
191 " Run within user namespace\n"
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
195 " container\n"
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
199 " --network-ipvlan=INTERFACE\n"
200 " Create a ipvlan network interface based on an\n"
201 " existing network interface to the container\n"
202 " -n --network-veth Add a virtual ethernet connection between host\n"
203 " and container\n"
204 " --network-bridge=INTERFACE\n"
205 " Add a virtual ethernet connection between host\n"
206 " and container and add it to an existing bridge on\n"
207 " the host\n"
208 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
209 " Expose a container IP port on the host\n"
210 " -Z --selinux-context=SECLABEL\n"
211 " Set the SELinux security context to be used by\n"
212 " processes in the container\n"
213 " -L --selinux-apifs-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " API/tmpfs file systems in the container\n"
216 " --capability=CAP In addition to the default, retain specified\n"
217 " capability\n"
218 " --drop-capability=CAP Drop the specified capability from the default set\n"
219 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
220 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
221 " try-guest, try-host\n"
222 " -j Equivalent to --link-journal=try-guest\n"
223 " --read-only Mount the root directory read-only\n"
224 " --bind=PATH[:PATH[:OPTIONS]]\n"
225 " Bind mount a file or directory from the host into\n"
226 " the container\n"
227 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
228 " Similar, but creates a read-only bind mount\n"
229 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
230 " --overlay=PATH[:PATH...]:PATH\n"
231 " Create an overlay mount from the host to \n"
232 " the container\n"
233 " --overlay-ro=PATH[:PATH...]:PATH\n"
234 " Similar, but creates a read-only overlay mount\n"
235 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
236 " --share-system Share system namespaces with host\n"
237 " --register=BOOLEAN Register container as machine\n"
238 " --keep-unit Do not register a scope for the machine, reuse\n"
239 " the service unit nspawn is running in\n"
240 " --volatile[=MODE] Run the system in volatile mode\n"
241 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
242 , program_invocation_short_name);
243 }
244
245
246 static int custom_mounts_prepare(void) {
247 unsigned i;
248 int r;
249
250 /* Ensure the mounts are applied prefix first. */
251 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
252
253 /* Allocate working directories for the overlay file systems that need it */
254 for (i = 0; i < arg_n_custom_mounts; i++) {
255 CustomMount *m = &arg_custom_mounts[i];
256
257 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
258 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
259 return -EINVAL;
260 }
261
262 if (m->type != CUSTOM_MOUNT_OVERLAY)
263 continue;
264
265 if (m->work_dir)
266 continue;
267
268 if (m->read_only)
269 continue;
270
271 r = tempfn_random(m->source, NULL, &m->work_dir);
272 if (r < 0)
273 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
274 }
275
276 return 0;
277 }
278
/* Replaces *b with a cleaned-up version of path.
 *
 * The path is canonicalized if possible. If that fails with ENOENT it is only made absolute relative
 * to the current working directory. The previous value of *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for a reason other than ENOENT, and -ENOMEM
 * if the absolute path could not be allocated. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* The path does not exist (yet), settle for an absolute version of it */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
299
300 static int detect_unified_cgroup_hierarchy(void) {
301 const char *e;
302 int r;
303
304 /* Allow the user to control whether the unified hierarchy is used */
305 e = getenv("UNIFIED_CGROUP_HIERARCHY");
306 if (e) {
307 r = parse_boolean(e);
308 if (r < 0)
309 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
310
311 arg_unified_cgroup_hierarchy = r;
312 return 0;
313 }
314
315 /* Otherwise inherit the default from the host system */
316 r = cg_unified();
317 if (r < 0)
318 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
319
320 arg_unified_cgroup_hierarchy = r;
321 return 0;
322 }
323
324 static int parse_argv(int argc, char *argv[]) {
325
326 enum {
327 ARG_VERSION = 0x100,
328 ARG_PRIVATE_NETWORK,
329 ARG_UUID,
330 ARG_READ_ONLY,
331 ARG_CAPABILITY,
332 ARG_DROP_CAPABILITY,
333 ARG_LINK_JOURNAL,
334 ARG_BIND,
335 ARG_BIND_RO,
336 ARG_TMPFS,
337 ARG_OVERLAY,
338 ARG_OVERLAY_RO,
339 ARG_SETENV,
340 ARG_SHARE_SYSTEM,
341 ARG_REGISTER,
342 ARG_KEEP_UNIT,
343 ARG_NETWORK_INTERFACE,
344 ARG_NETWORK_MACVLAN,
345 ARG_NETWORK_IPVLAN,
346 ARG_NETWORK_BRIDGE,
347 ARG_PERSONALITY,
348 ARG_VOLATILE,
349 ARG_TEMPLATE,
350 ARG_PROPERTY,
351 ARG_PRIVATE_USERS,
352 ARG_KILL_SIGNAL,
353 ARG_SETTINGS,
354 };
355
356 static const struct option options[] = {
357 { "help", no_argument, NULL, 'h' },
358 { "version", no_argument, NULL, ARG_VERSION },
359 { "directory", required_argument, NULL, 'D' },
360 { "template", required_argument, NULL, ARG_TEMPLATE },
361 { "ephemeral", no_argument, NULL, 'x' },
362 { "user", required_argument, NULL, 'u' },
363 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
364 { "boot", no_argument, NULL, 'b' },
365 { "uuid", required_argument, NULL, ARG_UUID },
366 { "read-only", no_argument, NULL, ARG_READ_ONLY },
367 { "capability", required_argument, NULL, ARG_CAPABILITY },
368 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
369 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
370 { "bind", required_argument, NULL, ARG_BIND },
371 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
372 { "tmpfs", required_argument, NULL, ARG_TMPFS },
373 { "overlay", required_argument, NULL, ARG_OVERLAY },
374 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
375 { "machine", required_argument, NULL, 'M' },
376 { "slice", required_argument, NULL, 'S' },
377 { "setenv", required_argument, NULL, ARG_SETENV },
378 { "selinux-context", required_argument, NULL, 'Z' },
379 { "selinux-apifs-context", required_argument, NULL, 'L' },
380 { "quiet", no_argument, NULL, 'q' },
381 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
382 { "register", required_argument, NULL, ARG_REGISTER },
383 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
384 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
385 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
386 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
387 { "network-veth", no_argument, NULL, 'n' },
388 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
389 { "personality", required_argument, NULL, ARG_PERSONALITY },
390 { "image", required_argument, NULL, 'i' },
391 { "volatile", optional_argument, NULL, ARG_VOLATILE },
392 { "port", required_argument, NULL, 'p' },
393 { "property", required_argument, NULL, ARG_PROPERTY },
394 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
395 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
396 { "settings", required_argument, NULL, ARG_SETTINGS },
397 {}
398 };
399
400 int c, r;
401 uint64_t plus = 0, minus = 0;
402 bool mask_all_settings = false, mask_no_settings = false;
403
404 assert(argc >= 0);
405 assert(argv);
406
407 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
408
409 switch (c) {
410
411 case 'h':
412 help();
413 return 0;
414
415 case ARG_VERSION:
416 return version();
417
418 case 'D':
419 r = set_sanitized_path(&arg_directory, optarg);
420 if (r < 0)
421 return log_error_errno(r, "Invalid root directory: %m");
422
423 break;
424
425 case ARG_TEMPLATE:
426 r = set_sanitized_path(&arg_template, optarg);
427 if (r < 0)
428 return log_error_errno(r, "Invalid template directory: %m");
429
430 break;
431
432 case 'i':
433 r = set_sanitized_path(&arg_image, optarg);
434 if (r < 0)
435 return log_error_errno(r, "Invalid image path: %m");
436
437 break;
438
439 case 'x':
440 arg_ephemeral = true;
441 break;
442
443 case 'u':
444 r = free_and_strdup(&arg_user, optarg);
445 if (r < 0)
446 return log_oom();
447
448 arg_settings_mask |= SETTING_USER;
449 break;
450
451 case ARG_NETWORK_BRIDGE:
452 r = free_and_strdup(&arg_network_bridge, optarg);
453 if (r < 0)
454 return log_oom();
455
456 /* fall through */
457
458 case 'n':
459 arg_network_veth = true;
460 arg_private_network = true;
461 arg_settings_mask |= SETTING_NETWORK;
462 break;
463
464 case ARG_NETWORK_INTERFACE:
465 if (strv_extend(&arg_network_interfaces, optarg) < 0)
466 return log_oom();
467
468 arg_private_network = true;
469 arg_settings_mask |= SETTING_NETWORK;
470 break;
471
472 case ARG_NETWORK_MACVLAN:
473 if (strv_extend(&arg_network_macvlan, optarg) < 0)
474 return log_oom();
475
476 arg_private_network = true;
477 arg_settings_mask |= SETTING_NETWORK;
478 break;
479
480 case ARG_NETWORK_IPVLAN:
481 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
482 return log_oom();
483
484 /* fall through */
485
486 case ARG_PRIVATE_NETWORK:
487 arg_private_network = true;
488 arg_settings_mask |= SETTING_NETWORK;
489 break;
490
491 case 'b':
492 arg_boot = true;
493 arg_settings_mask |= SETTING_BOOT;
494 break;
495
496 case ARG_UUID:
497 r = sd_id128_from_string(optarg, &arg_uuid);
498 if (r < 0) {
499 log_error("Invalid UUID: %s", optarg);
500 return r;
501 }
502
503 arg_settings_mask |= SETTING_MACHINE_ID;
504 break;
505
506 case 'S':
507 arg_slice = optarg;
508 break;
509
510 case 'M':
511 if (isempty(optarg))
512 arg_machine = mfree(arg_machine);
513 else {
514 if (!machine_name_is_valid(optarg)) {
515 log_error("Invalid machine name: %s", optarg);
516 return -EINVAL;
517 }
518
519 r = free_and_strdup(&arg_machine, optarg);
520 if (r < 0)
521 return log_oom();
522
523 break;
524 }
525
526 case 'Z':
527 arg_selinux_context = optarg;
528 break;
529
530 case 'L':
531 arg_selinux_apifs_context = optarg;
532 break;
533
534 case ARG_READ_ONLY:
535 arg_read_only = true;
536 arg_settings_mask |= SETTING_READ_ONLY;
537 break;
538
539 case ARG_CAPABILITY:
540 case ARG_DROP_CAPABILITY: {
541 const char *state, *word;
542 size_t length;
543
544 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
545 _cleanup_free_ char *t;
546
547 t = strndup(word, length);
548 if (!t)
549 return log_oom();
550
551 if (streq(t, "all")) {
552 if (c == ARG_CAPABILITY)
553 plus = (uint64_t) -1;
554 else
555 minus = (uint64_t) -1;
556 } else {
557 int cap;
558
559 cap = capability_from_name(t);
560 if (cap < 0) {
561 log_error("Failed to parse capability %s.", t);
562 return -EINVAL;
563 }
564
565 if (c == ARG_CAPABILITY)
566 plus |= 1ULL << (uint64_t) cap;
567 else
568 minus |= 1ULL << (uint64_t) cap;
569 }
570 }
571
572 arg_settings_mask |= SETTING_CAPABILITY;
573 break;
574 }
575
576 case 'j':
577 arg_link_journal = LINK_GUEST;
578 arg_link_journal_try = true;
579 break;
580
581 case ARG_LINK_JOURNAL:
582 if (streq(optarg, "auto")) {
583 arg_link_journal = LINK_AUTO;
584 arg_link_journal_try = false;
585 } else if (streq(optarg, "no")) {
586 arg_link_journal = LINK_NO;
587 arg_link_journal_try = false;
588 } else if (streq(optarg, "guest")) {
589 arg_link_journal = LINK_GUEST;
590 arg_link_journal_try = false;
591 } else if (streq(optarg, "host")) {
592 arg_link_journal = LINK_HOST;
593 arg_link_journal_try = false;
594 } else if (streq(optarg, "try-guest")) {
595 arg_link_journal = LINK_GUEST;
596 arg_link_journal_try = true;
597 } else if (streq(optarg, "try-host")) {
598 arg_link_journal = LINK_HOST;
599 arg_link_journal_try = true;
600 } else {
601 log_error("Failed to parse link journal mode %s", optarg);
602 return -EINVAL;
603 }
604
605 break;
606
607 case ARG_BIND:
608 case ARG_BIND_RO:
609 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
610 if (r < 0)
611 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
612
613 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
614 break;
615
616 case ARG_TMPFS:
617 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
618 if (r < 0)
619 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
620
621 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
622 break;
623
624 case ARG_OVERLAY:
625 case ARG_OVERLAY_RO: {
626 _cleanup_free_ char *upper = NULL, *destination = NULL;
627 _cleanup_strv_free_ char **lower = NULL;
628 CustomMount *m;
629 unsigned n = 0;
630 char **i;
631
632 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
633 if (r == -ENOMEM)
634 return log_oom();
635 else if (r < 0) {
636 log_error("Invalid overlay specification: %s", optarg);
637 return r;
638 }
639
640 STRV_FOREACH(i, lower) {
641 if (!path_is_absolute(*i)) {
642 log_error("Overlay path %s is not absolute.", *i);
643 return -EINVAL;
644 }
645
646 n++;
647 }
648
649 if (n < 2) {
650 log_error("--overlay= needs at least two colon-separated directories specified.");
651 return -EINVAL;
652 }
653
654 if (n == 2) {
655 /* If two parameters are specified,
656 * the first one is the lower, the
657 * second one the upper directory. And
658 * we'll also define the destination
659 * mount point the same as the upper. */
660 upper = lower[1];
661 lower[1] = NULL;
662
663 destination = strdup(upper);
664 if (!destination)
665 return log_oom();
666
667 } else {
668 upper = lower[n - 2];
669 destination = lower[n - 1];
670 lower[n - 2] = NULL;
671 }
672
673 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
674 if (!m)
675 return log_oom();
676
677 m->destination = destination;
678 m->source = upper;
679 m->lower = lower;
680 m->read_only = c == ARG_OVERLAY_RO;
681
682 upper = destination = NULL;
683 lower = NULL;
684
685 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
686 break;
687 }
688
689 case ARG_SETENV: {
690 char **n;
691
692 if (!env_assignment_is_valid(optarg)) {
693 log_error("Environment variable assignment '%s' is not valid.", optarg);
694 return -EINVAL;
695 }
696
697 n = strv_env_set(arg_setenv, optarg);
698 if (!n)
699 return log_oom();
700
701 strv_free(arg_setenv);
702 arg_setenv = n;
703
704 arg_settings_mask |= SETTING_ENVIRONMENT;
705 break;
706 }
707
708 case 'q':
709 arg_quiet = true;
710 break;
711
712 case ARG_SHARE_SYSTEM:
713 arg_share_system = true;
714 break;
715
716 case ARG_REGISTER:
717 r = parse_boolean(optarg);
718 if (r < 0) {
719 log_error("Failed to parse --register= argument: %s", optarg);
720 return r;
721 }
722
723 arg_register = r;
724 break;
725
726 case ARG_KEEP_UNIT:
727 arg_keep_unit = true;
728 break;
729
730 case ARG_PERSONALITY:
731
732 arg_personality = personality_from_string(optarg);
733 if (arg_personality == PERSONALITY_INVALID) {
734 log_error("Unknown or unsupported personality '%s'.", optarg);
735 return -EINVAL;
736 }
737
738 arg_settings_mask |= SETTING_PERSONALITY;
739 break;
740
741 case ARG_VOLATILE:
742
743 if (!optarg)
744 arg_volatile_mode = VOLATILE_YES;
745 else {
746 VolatileMode m;
747
748 m = volatile_mode_from_string(optarg);
749 if (m < 0) {
750 log_error("Failed to parse --volatile= argument: %s", optarg);
751 return -EINVAL;
752 } else
753 arg_volatile_mode = m;
754 }
755
756 arg_settings_mask |= SETTING_VOLATILE_MODE;
757 break;
758
759 case 'p':
760 r = expose_port_parse(&arg_expose_ports, optarg);
761 if (r == -EEXIST)
762 return log_error_errno(r, "Duplicate port specification: %s", optarg);
763 if (r < 0)
764 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
765
766 arg_settings_mask |= SETTING_EXPOSE_PORTS;
767 break;
768
769 case ARG_PROPERTY:
770 if (strv_extend(&arg_property, optarg) < 0)
771 return log_oom();
772
773 break;
774
775 case ARG_PRIVATE_USERS:
776 if (optarg) {
777 _cleanup_free_ char *buffer = NULL;
778 const char *range, *shift;
779
780 range = strchr(optarg, ':');
781 if (range) {
782 buffer = strndup(optarg, range - optarg);
783 if (!buffer)
784 return log_oom();
785 shift = buffer;
786
787 range++;
788 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
789 log_error("Failed to parse UID range: %s", range);
790 return -EINVAL;
791 }
792 } else
793 shift = optarg;
794
795 if (parse_uid(shift, &arg_uid_shift) < 0) {
796 log_error("Failed to parse UID: %s", optarg);
797 return -EINVAL;
798 }
799 }
800
801 arg_userns = true;
802 break;
803
804 case ARG_KILL_SIGNAL:
805 arg_kill_signal = signal_from_string_try_harder(optarg);
806 if (arg_kill_signal < 0) {
807 log_error("Cannot parse signal: %s", optarg);
808 return -EINVAL;
809 }
810
811 arg_settings_mask |= SETTING_KILL_SIGNAL;
812 break;
813
814 case ARG_SETTINGS:
815
816 /* no → do not read files
817 * yes → read files, do not override cmdline, trust only subset
818 * override → read files, override cmdline, trust only subset
819 * trusted → read files, do not override cmdline, trust all
820 */
821
822 r = parse_boolean(optarg);
823 if (r < 0) {
824 if (streq(optarg, "trusted")) {
825 mask_all_settings = false;
826 mask_no_settings = false;
827 arg_settings_trusted = true;
828
829 } else if (streq(optarg, "override")) {
830 mask_all_settings = false;
831 mask_no_settings = true;
832 arg_settings_trusted = -1;
833 } else
834 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
835 } else if (r > 0) {
836 /* yes */
837 mask_all_settings = false;
838 mask_no_settings = false;
839 arg_settings_trusted = -1;
840 } else {
841 /* no */
842 mask_all_settings = true;
843 mask_no_settings = false;
844 arg_settings_trusted = false;
845 }
846
847 break;
848
849 case '?':
850 return -EINVAL;
851
852 default:
853 assert_not_reached("Unhandled option");
854 }
855
856 if (arg_share_system)
857 arg_register = false;
858
859 if (arg_boot && arg_share_system) {
860 log_error("--boot and --share-system may not be combined.");
861 return -EINVAL;
862 }
863
864 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
865 log_error("--keep-unit may not be used when invoked from a user session.");
866 return -EINVAL;
867 }
868
869 if (arg_directory && arg_image) {
870 log_error("--directory= and --image= may not be combined.");
871 return -EINVAL;
872 }
873
874 if (arg_template && arg_image) {
875 log_error("--template= and --image= may not be combined.");
876 return -EINVAL;
877 }
878
879 if (arg_template && !(arg_directory || arg_machine)) {
880 log_error("--template= needs --directory= or --machine=.");
881 return -EINVAL;
882 }
883
884 if (arg_ephemeral && arg_template) {
885 log_error("--ephemeral and --template= may not be combined.");
886 return -EINVAL;
887 }
888
889 if (arg_ephemeral && arg_image) {
890 log_error("--ephemeral and --image= may not be combined.");
891 return -EINVAL;
892 }
893
894 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
895 log_error("--ephemeral and --link-journal= may not be combined.");
896 return -EINVAL;
897 }
898
899 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
900 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
901
902 if (argc > optind) {
903 arg_parameters = strv_copy(argv + optind);
904 if (!arg_parameters)
905 return log_oom();
906
907 arg_settings_mask |= SETTING_BOOT;
908 }
909
910 /* Load all settings from .nspawn files */
911 if (mask_no_settings)
912 arg_settings_mask = 0;
913
914 /* Don't load any settings from .nspawn files */
915 if (mask_all_settings)
916 arg_settings_mask = _SETTINGS_MASK_ALL;
917
918 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
919
920 r = detect_unified_cgroup_hierarchy();
921 if (r < 0)
922 return r;
923
924 return 1;
925 }
926
927 static int verify_arguments(void) {
928
929 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
930 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
931 return -EINVAL;
932 }
933
934 if (arg_expose_ports && !arg_private_network) {
935 log_error("Cannot use --port= without private networking.");
936 return -EINVAL;
937 }
938
939 if (arg_boot && arg_kill_signal <= 0)
940 arg_kill_signal = SIGRTMIN+3;
941
942 return 0;
943 }
944
945 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
946 assert(p);
947
948 if (!arg_userns)
949 return 0;
950
951 if (uid == UID_INVALID && gid == GID_INVALID)
952 return 0;
953
954 if (uid != UID_INVALID) {
955 uid += arg_uid_shift;
956
957 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
958 return -EOVERFLOW;
959 }
960
961 if (gid != GID_INVALID) {
962 gid += (gid_t) arg_uid_shift;
963
964 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
965 return -EOVERFLOW;
966 }
967
968 if (lchown(p, uid, gid) < 0)
969 return -errno;
970
971 return 0;
972 }
973
/* Creates the directory path below root with the given mode and, if it was newly created, adjusts its
 * ownership through userns_lchown().
 *
 * An already existing directory counts as success and is left untouched. Returns 0 on success, -errno
 * if mkdir() fails for any other reason, or the error from userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
986
987 static int setup_timezone(const char *dest) {
988 _cleanup_free_ char *p = NULL, *q = NULL;
989 const char *where, *check, *what;
990 char *z, *y;
991 int r;
992
993 assert(dest);
994
995 /* Fix the timezone, if possible */
996 r = readlink_malloc("/etc/localtime", &p);
997 if (r < 0) {
998 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
999 return 0;
1000 }
1001
1002 z = path_startswith(p, "../usr/share/zoneinfo/");
1003 if (!z)
1004 z = path_startswith(p, "/usr/share/zoneinfo/");
1005 if (!z) {
1006 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1007 return 0;
1008 }
1009
1010 where = prefix_roota(dest, "/etc/localtime");
1011 r = readlink_malloc(where, &q);
1012 if (r >= 0) {
1013 y = path_startswith(q, "../usr/share/zoneinfo/");
1014 if (!y)
1015 y = path_startswith(q, "/usr/share/zoneinfo/");
1016
1017 /* Already pointing to the right place? Then do nothing .. */
1018 if (y && streq(y, z))
1019 return 0;
1020 }
1021
1022 check = strjoina("/usr/share/zoneinfo/", z);
1023 check = prefix_root(dest, check);
1024 if (laccess(check, F_OK) < 0) {
1025 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1026 return 0;
1027 }
1028
1029 r = unlink(where);
1030 if (r < 0 && errno != ENOENT) {
1031 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1032 return 0;
1033 }
1034
1035 what = strjoina("../usr/share/zoneinfo/", z);
1036 if (symlink(what, where) < 0) {
1037 log_error_errno(errno, "Failed to correct timezone of container: %m");
1038 return 0;
1039 }
1040
1041 r = userns_lchown(where, 0, 0);
1042 if (r < 0)
1043 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1044
1045 return 0;
1046 }
1047
1048 static int setup_resolv_conf(const char *dest) {
1049 const char *where = NULL;
1050 int r;
1051
1052 assert(dest);
1053
1054 if (arg_private_network)
1055 return 0;
1056
1057 /* Fix resolv.conf, if possible */
1058 where = prefix_roota(dest, "/etc/resolv.conf");
1059
1060 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1061 if (r < 0) {
1062 /* If the file already exists as symlink, let's
1063 * suppress the warning, under the assumption that
1064 * resolved or something similar runs inside and the
1065 * symlink points there.
1066 *
1067 * If the disk image is read-only, there's also no
1068 * point in complaining.
1069 */
1070 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1071 "Failed to copy /etc/resolv.conf to %s: %m", where);
1072 return 0;
1073 }
1074
1075 r = userns_lchown(where, 0, 0);
1076 if (r < 0)
1077 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1078
1079 return 0;
1080 }
1081
1082 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1083 assert(s);
1084
1085 snprintf(s, 37,
1086 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1087 SD_ID128_FORMAT_VAL(id));
1088
1089 return s;
1090 }
1091
1092 static int setup_boot_id(const char *dest) {
1093 const char *from, *to;
1094 sd_id128_t rnd = {};
1095 char as_uuid[37];
1096 int r;
1097
1098 if (arg_share_system)
1099 return 0;
1100
1101 /* Generate a new randomized boot ID, so that each boot-up of
1102 * the container gets a new one */
1103
1104 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1105 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1106
1107 r = sd_id128_randomize(&rnd);
1108 if (r < 0)
1109 return log_error_errno(r, "Failed to generate random boot id: %m");
1110
1111 id128_format_as_uuid(rnd, as_uuid);
1112
1113 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1114 if (r < 0)
1115 return log_error_errno(r, "Failed to write boot id: %m");
1116
1117 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1118 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1119 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1120 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1121
1122 unlink(from);
1123 return r;
1124 }
1125
1126 static int copy_devnodes(const char *dest) {
1127
1128 static const char devnodes[] =
1129 "null\0"
1130 "zero\0"
1131 "full\0"
1132 "random\0"
1133 "urandom\0"
1134 "tty\0"
1135 "net/tun\0";
1136
1137 const char *d;
1138 int r = 0;
1139 _cleanup_umask_ mode_t u;
1140
1141 assert(dest);
1142
1143 u = umask(0000);
1144
1145 /* Create /dev/net, so that we can create /dev/net/tun in it */
1146 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1147 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1148
1149 NULSTR_FOREACH(d, devnodes) {
1150 _cleanup_free_ char *from = NULL, *to = NULL;
1151 struct stat st;
1152
1153 from = strappend("/dev/", d);
1154 to = prefix_root(dest, from);
1155
1156 if (stat(from, &st) < 0) {
1157
1158 if (errno != ENOENT)
1159 return log_error_errno(errno, "Failed to stat %s: %m", from);
1160
1161 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1162
1163 log_error("%s is not a char or block device, cannot copy.", from);
1164 return -EIO;
1165
1166 } else {
1167 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1168 if (errno != EPERM)
1169 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1170
1171 /* Some systems abusively restrict mknod but
1172 * allow bind mounts. */
1173 r = touch(to);
1174 if (r < 0)
1175 return log_error_errno(r, "touch (%s) failed: %m", to);
1176 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1177 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1178 }
1179
1180 r = userns_lchown(to, 0, 0);
1181 if (r < 0)
1182 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1183 }
1184 }
1185
1186 return r;
1187 }
1188
1189 static int setup_pts(const char *dest) {
1190 _cleanup_free_ char *options = NULL;
1191 const char *p;
1192
1193 #ifdef HAVE_SELINUX
1194 if (arg_selinux_apifs_context)
1195 (void) asprintf(&options,
1196 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1197 arg_uid_shift + TTY_GID,
1198 arg_selinux_apifs_context);
1199 else
1200 #endif
1201 (void) asprintf(&options,
1202 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1203 arg_uid_shift + TTY_GID);
1204
1205 if (!options)
1206 return log_oom();
1207
1208 /* Mount /dev/pts itself */
1209 p = prefix_roota(dest, "/dev/pts");
1210 if (mkdir(p, 0755) < 0)
1211 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1212 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1213 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1214 if (userns_lchown(p, 0, 0) < 0)
1215 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1216
1217 /* Create /dev/ptmx symlink */
1218 p = prefix_roota(dest, "/dev/ptmx");
1219 if (symlink("pts/ptmx", p) < 0)
1220 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1221 if (userns_lchown(p, 0, 0) < 0)
1222 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1223
1224 /* And fix /dev/pts/ptmx ownership */
1225 p = prefix_roota(dest, "/dev/pts/ptmx");
1226 if (userns_lchown(p, 0, 0) < 0)
1227 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1228
1229 return 0;
1230 }
1231
1232 static int setup_dev_console(const char *dest, const char *console) {
1233 _cleanup_umask_ mode_t u;
1234 const char *to;
1235 int r;
1236
1237 assert(dest);
1238 assert(console);
1239
1240 u = umask(0000);
1241
1242 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1243 if (r < 0)
1244 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1245
1246 /* We need to bind mount the right tty to /dev/console since
1247 * ptys can only exist on pts file systems. To have something
1248 * to bind mount things on we create a empty regular file. */
1249
1250 to = prefix_roota(dest, "/dev/console");
1251 r = touch(to);
1252 if (r < 0)
1253 return log_error_errno(r, "touch() for /dev/console failed: %m");
1254
1255 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1256 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1257
1258 return 0;
1259 }
1260
1261 static int setup_kmsg(const char *dest, int kmsg_socket) {
1262 const char *from, *to;
1263 _cleanup_umask_ mode_t u;
1264 int fd, r;
1265
1266 assert(kmsg_socket >= 0);
1267
1268 u = umask(0000);
1269
1270 /* We create the kmsg FIFO as /run/kmsg, but immediately
1271 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1272 * on the reading side behave very similar to /proc/kmsg,
1273 * their writing side behaves differently from /dev/kmsg in
1274 * that writing blocks when nothing is reading. In order to
1275 * avoid any problems with containers deadlocking due to this
1276 * we simply make /dev/kmsg unavailable to the container. */
1277 from = prefix_roota(dest, "/run/kmsg");
1278 to = prefix_roota(dest, "/proc/kmsg");
1279
1280 if (mkfifo(from, 0600) < 0)
1281 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1282 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1283 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1284
1285 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1286 if (fd < 0)
1287 return log_error_errno(errno, "Failed to open fifo: %m");
1288
1289 /* Store away the fd in the socket, so that it stays open as
1290 * long as we run the child */
1291 r = send_one_fd(kmsg_socket, fd, 0);
1292 safe_close(fd);
1293
1294 if (r < 0)
1295 return log_error_errno(r, "Failed to send FIFO fd: %m");
1296
1297 /* And now make the FIFO unavailable as /run/kmsg... */
1298 (void) unlink(from);
1299
1300 return 0;
1301 }
1302
1303 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1304 union in_addr_union *exposed = userdata;
1305
1306 assert(rtnl);
1307 assert(m);
1308 assert(exposed);
1309
1310 expose_port_execute(rtnl, arg_expose_ports, exposed);
1311 return 0;
1312 }
1313
1314 static int setup_hostname(void) {
1315
1316 if (arg_share_system)
1317 return 0;
1318
1319 if (sethostname_idempotent(arg_machine) < 0)
1320 return -errno;
1321
1322 return 0;
1323 }
1324
1325 static int setup_journal(const char *directory) {
1326 sd_id128_t machine_id, this_id;
1327 _cleanup_free_ char *b = NULL, *d = NULL;
1328 const char *etc_machine_id, *p, *q;
1329 char *id;
1330 int r;
1331
1332 /* Don't link journals in ephemeral mode */
1333 if (arg_ephemeral)
1334 return 0;
1335
1336 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1337
1338 r = read_one_line_file(etc_machine_id, &b);
1339 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1340 return 0;
1341 else if (r < 0)
1342 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1343
1344 id = strstrip(b);
1345 if (isempty(id) && arg_link_journal == LINK_AUTO)
1346 return 0;
1347
1348 /* Verify validity */
1349 r = sd_id128_from_string(id, &machine_id);
1350 if (r < 0)
1351 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1352
1353 r = sd_id128_get_machine(&this_id);
1354 if (r < 0)
1355 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1356
1357 if (sd_id128_equal(machine_id, this_id)) {
1358 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1359 "Host and machine ids are equal (%s): refusing to link journals", id);
1360 if (arg_link_journal == LINK_AUTO)
1361 return 0;
1362 return -EEXIST;
1363 }
1364
1365 if (arg_link_journal == LINK_NO)
1366 return 0;
1367
1368 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1369 if (r < 0)
1370 return log_error_errno(r, "Failed to create /var: %m");
1371
1372 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1373 if (r < 0)
1374 return log_error_errno(r, "Failed to create /var/log: %m");
1375
1376 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1377 if (r < 0)
1378 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1379
1380 p = strjoina("/var/log/journal/", id);
1381 q = prefix_roota(directory, p);
1382
1383 if (path_is_mount_point(p, 0) > 0) {
1384 if (arg_link_journal != LINK_AUTO) {
1385 log_error("%s: already a mount point, refusing to use for journal", p);
1386 return -EEXIST;
1387 }
1388
1389 return 0;
1390 }
1391
1392 if (path_is_mount_point(q, 0) > 0) {
1393 if (arg_link_journal != LINK_AUTO) {
1394 log_error("%s: already a mount point, refusing to use for journal", q);
1395 return -EEXIST;
1396 }
1397
1398 return 0;
1399 }
1400
1401 r = readlink_and_make_absolute(p, &d);
1402 if (r >= 0) {
1403 if ((arg_link_journal == LINK_GUEST ||
1404 arg_link_journal == LINK_AUTO) &&
1405 path_equal(d, q)) {
1406
1407 r = userns_mkdir(directory, p, 0755, 0, 0);
1408 if (r < 0)
1409 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1410 return 0;
1411 }
1412
1413 if (unlink(p) < 0)
1414 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1415 } else if (r == -EINVAL) {
1416
1417 if (arg_link_journal == LINK_GUEST &&
1418 rmdir(p) < 0) {
1419
1420 if (errno == ENOTDIR) {
1421 log_error("%s already exists and is neither a symlink nor a directory", p);
1422 return r;
1423 } else {
1424 log_error_errno(errno, "Failed to remove %s: %m", p);
1425 return -errno;
1426 }
1427 }
1428 } else if (r != -ENOENT) {
1429 log_error_errno(errno, "readlink(%s) failed: %m", p);
1430 return r;
1431 }
1432
1433 if (arg_link_journal == LINK_GUEST) {
1434
1435 if (symlink(q, p) < 0) {
1436 if (arg_link_journal_try) {
1437 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1438 return 0;
1439 } else {
1440 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1441 return -errno;
1442 }
1443 }
1444
1445 r = userns_mkdir(directory, p, 0755, 0, 0);
1446 if (r < 0)
1447 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1448 return 0;
1449 }
1450
1451 if (arg_link_journal == LINK_HOST) {
1452 /* don't create parents here -- if the host doesn't have
1453 * permanent journal set up, don't force it here */
1454 r = mkdir(p, 0755);
1455 if (r < 0) {
1456 if (arg_link_journal_try) {
1457 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1458 return 0;
1459 } else {
1460 log_error_errno(errno, "Failed to create %s: %m", p);
1461 return r;
1462 }
1463 }
1464
1465 } else if (access(p, F_OK) < 0)
1466 return 0;
1467
1468 if (dir_is_empty(q) == 0)
1469 log_warning("%s is not empty, proceeding anyway.", q);
1470
1471 r = userns_mkdir(directory, p, 0755, 0, 0);
1472 if (r < 0) {
1473 log_error_errno(errno, "Failed to create %s: %m", q);
1474 return r;
1475 }
1476
1477 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1478 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1479
1480 return 0;
1481 }
1482
1483 static int drop_capabilities(void) {
1484 return capability_bounding_set_drop(~arg_retain, false);
1485 }
1486
1487 static int reset_audit_loginuid(void) {
1488 _cleanup_free_ char *p = NULL;
1489 int r;
1490
1491 if (arg_share_system)
1492 return 0;
1493
1494 r = read_one_line_file("/proc/self/loginuid", &p);
1495 if (r == -ENOENT)
1496 return 0;
1497 if (r < 0)
1498 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1499
1500 /* Already reset? */
1501 if (streq(p, "4294967295"))
1502 return 0;
1503
1504 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1505 if (r < 0) {
1506 log_error_errno(r,
1507 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1508 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1509 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1510 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1511 "using systemd-nspawn. Sleeping for 5s... (%m)");
1512
1513 sleep(5);
1514 }
1515
1516 return 0;
1517 }
1518
/* Installs the seccomp filter for the container.
 *
 * The filter allows everything by default and then
 *   - makes a fixed list of system calls fail with EPERM, each one only if
 *     the capability associated with it is not contained in arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success, when built without seccomp support, or when loading
 * the filter fails with EINVAL; a negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } denied[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(denied); k++) {
                /* A retained capability keeps its system calls usable */
                if (arg_retain & (1ULL << denied[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit does not work in containers, and much of the userspace
           audit hookup fails when run inside of one. We do not care and
           simply turn off the creation of audit sockets.

           As a result socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
           EAFNOSUPPORT, which audit userspace takes as an indication that
           audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                /* Treated as "no seccomp available" rather than as an error */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto finish;
        }
        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1613
1614 static int setup_propagate(const char *root) {
1615 const char *p, *q;
1616
1617 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1618 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1619 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1620 (void) mkdir_p(p, 0600);
1621
1622 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1623 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1624
1625 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1626 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1627
1628 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1629 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1630
1631 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1632 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1633 return log_error_errno(errno, "Failed to install propagation bind mount.");
1634
1635 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1636 return log_error_errno(errno, "Failed to make propagation mount read-only");
1637
1638 return 0;
1639 }
1640
1641 static int setup_image(char **device_path, int *loop_nr) {
1642 struct loop_info64 info = {
1643 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1644 };
1645 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1646 _cleanup_free_ char* loopdev = NULL;
1647 struct stat st;
1648 int r, nr;
1649
1650 assert(device_path);
1651 assert(loop_nr);
1652 assert(arg_image);
1653
1654 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1655 if (fd < 0)
1656 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1657
1658 if (fstat(fd, &st) < 0)
1659 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1660
1661 if (S_ISBLK(st.st_mode)) {
1662 char *p;
1663
1664 p = strdup(arg_image);
1665 if (!p)
1666 return log_oom();
1667
1668 *device_path = p;
1669
1670 *loop_nr = -1;
1671
1672 r = fd;
1673 fd = -1;
1674
1675 return r;
1676 }
1677
1678 if (!S_ISREG(st.st_mode)) {
1679 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1680 return -EINVAL;
1681 }
1682
1683 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1684 if (control < 0)
1685 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1686
1687 nr = ioctl(control, LOOP_CTL_GET_FREE);
1688 if (nr < 0)
1689 return log_error_errno(errno, "Failed to allocate loop device: %m");
1690
1691 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1692 return log_oom();
1693
1694 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1695 if (loop < 0)
1696 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1697
1698 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1699 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1700
1701 if (arg_read_only)
1702 info.lo_flags |= LO_FLAGS_READ_ONLY;
1703
1704 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1705 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1706
1707 *device_path = loopdev;
1708 loopdev = NULL;
1709
1710 *loop_nr = nr;
1711
1712 r = loop;
1713 loop = -1;
1714
1715 return r;
1716 }
1717
/* Explanation that dissect_image() appends to its error messages whenever
 * the partition table of the image cannot be used. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Inspects the partition table of the block device open as fd and picks the
 * partitions to mount as root, /home and /srv.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. For each of the
 * home, srv, native root and secondary root types the partition with the
 * lowest partition number is chosen. If neither kind of root partition is
 * present, a single generic Linux partition is used as root.
 *
 * MBR: a single partition of type 0x83 whose flags are exactly 0x80
 * (bootable) is used as root.
 *
 * More than one generic candidate is an error in both cases.
 *
 * On success:
 *   *root_device / *root_device_rw   node of the root partition and whether
 *                                    it may be mounted writable
 *   *home_device / *home_device_rw   the same for /home; only written if
 *   *srv_device  / *srv_device_rw    such a partition exists, likewise /srv
 *   *secondary                       true if the secondary architecture's
 *                                    root partition was chosen
 * The device strings are newly allocated and owned by the caller.
 *
 * Returns 0 on success, a negative errno-style value on failure, and
 * -EOPNOTSUPP when built without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel exposes as many partition devices as blkid
         * found in the partition table. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The kernel's
                 * count is compared against m + 1 below. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Of several candidates the one with the
                                 * lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the candidate as the generic
                                 * partition, so that a second bootable
                                 * Linux partition is detected above and
                                 * rejected below. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2097
/* Mounts the block device what on where, or on where + directory if
 * directory is not NULL.
 *
 * The file system type is detected with blkid. The mount is read-only
 * unless rw is true and arg_read_only is unset. LUKS volumes are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure, and
 * -EOPNOTSUPP when built without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Either way the
                 * type is unknown. A hard probing error (-1) is handled by
                 * the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2161
2162 static int mount_devices(
2163 const char *where,
2164 const char *root_device, bool root_device_rw,
2165 const char *home_device, bool home_device_rw,
2166 const char *srv_device, bool srv_device_rw) {
2167 int r;
2168
2169 assert(where);
2170
2171 if (root_device) {
2172 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2173 if (r < 0)
2174 return log_error_errno(r, "Failed to mount root directory: %m");
2175 }
2176
2177 if (home_device) {
2178 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2179 if (r < 0)
2180 return log_error_errno(r, "Failed to mount home directory: %m");
2181 }
2182
2183 if (srv_device) {
2184 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2185 if (r < 0)
2186 return log_error_errno(r, "Failed to mount server data directory: %m");
2187 }
2188
2189 return 0;
2190 }
2191
2192 static void loop_remove(int nr, int *image_fd) {
2193 _cleanup_close_ int control = -1;
2194 int r;
2195
2196 if (nr < 0)
2197 return;
2198
2199 if (image_fd && *image_fd >= 0) {
2200 r = ioctl(*image_fd, LOOP_CLR_FD);
2201 if (r < 0)
2202 log_debug_errno(errno, "Failed to close loop image: %m");
2203 *image_fd = safe_close(*image_fd);
2204 }
2205
2206 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2207 if (control < 0) {
2208 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2209 return;
2210 }
2211
2212 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2213 if (r < 0)
2214 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2215 }
2216
2217 /*
2218 * Return values:
2219 * < 0 : wait_for_terminate() failed to get the state of the
2220 * container, the container was terminated by a signal, or
2221 * failed for an unknown reason. No change is made to the
2222 * container argument.
2223 * > 0 : The program executed in the container terminated with an
2224 * error. The exit code of the program executed in the
2225 * container is returned. The container argument has been set
2226 * to CONTAINER_TERMINATED.
2227 * 0 : The container is being rebooted, has been shut down or exited
2228 * successfully. The container argument has been set to either
2229 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2230 *
2231 * That is, success is indicated by a return value of zero, and an
2232 * error is indicated by a non-zero value.
2233 */
2234 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2235 siginfo_t status;
2236 int r;
2237
2238 r = wait_for_terminate(pid, &status);
2239 if (r < 0)
2240 return log_warning_errno(r, "Failed to wait for container: %m");
2241
2242 switch (status.si_code) {
2243
2244 case CLD_EXITED:
2245 if (status.si_status == 0) {
2246 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2247
2248 } else
2249 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2250
2251 *container = CONTAINER_TERMINATED;
2252 return status.si_status;
2253
2254 case CLD_KILLED:
2255 if (status.si_status == SIGINT) {
2256
2257 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2258 *container = CONTAINER_TERMINATED;
2259 return 0;
2260
2261 } else if (status.si_status == SIGHUP) {
2262
2263 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2264 *container = CONTAINER_REBOOTED;
2265 return 0;
2266 }
2267
2268 /* CLD_KILLED fallthrough */
2269
2270 case CLD_DUMPED:
2271 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2272 return -EIO;
2273
2274 default:
2275 log_error("Container %s failed due to unknown reason.", arg_machine);
2276 return -EIO;
2277 }
2278
2279 return r;
2280 }
2281
2282 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2283 pid_t pid;
2284
2285 pid = PTR_TO_UINT32(userdata);
2286 if (pid > 0) {
2287 if (kill(pid, arg_kill_signal) >= 0) {
2288 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2289 sd_event_source_set_userdata(s, NULL);
2290 return 0;
2291 }
2292 }
2293
2294 sd_event_exit(sd_event_source_get_event(s), 0);
2295 return 0;
2296 }
2297
2298 static int determine_names(void) {
2299 int r;
2300
2301 if (arg_template && !arg_directory && arg_machine) {
2302
2303 /* If --template= was specified then we should not
2304 * search for a machine, but instead create a new one
2305 * in /var/lib/machine. */
2306
2307 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2308 if (!arg_directory)
2309 return log_oom();
2310 }
2311
2312 if (!arg_image && !arg_directory) {
2313 if (arg_machine) {
2314 _cleanup_(image_unrefp) Image *i = NULL;
2315
2316 r = image_find(arg_machine, &i);
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2319 else if (r == 0) {
2320 log_error("No image for machine '%s': %m", arg_machine);
2321 return -ENOENT;
2322 }
2323
2324 if (i->type == IMAGE_RAW)
2325 r = set_sanitized_path(&arg_image, i->path);
2326 else
2327 r = set_sanitized_path(&arg_directory, i->path);
2328 if (r < 0)
2329 return log_error_errno(r, "Invalid image directory: %m");
2330
2331 if (!arg_ephemeral)
2332 arg_read_only = arg_read_only || i->read_only;
2333 } else
2334 arg_directory = get_current_dir_name();
2335
2336 if (!arg_directory && !arg_machine) {
2337 log_error("Failed to determine path, please use -D or -i.");
2338 return -EINVAL;
2339 }
2340 }
2341
2342 if (!arg_machine) {
2343 if (arg_directory && path_equal(arg_directory, "/"))
2344 arg_machine = gethostname_malloc();
2345 else
2346 arg_machine = strdup(basename(arg_image ?: arg_directory));
2347
2348 if (!arg_machine)
2349 return log_oom();
2350
2351 hostname_cleanup(arg_machine);
2352 if (!machine_name_is_valid(arg_machine)) {
2353 log_error("Failed to determine machine name automatically, please use -M.");
2354 return -EINVAL;
2355 }
2356
2357 if (arg_ephemeral) {
2358 char *b;
2359
2360 /* Add a random suffix when this is an
2361 * ephemeral machine, so that we can run many
2362 * instances at once without manually having
2363 * to specify -M each time. */
2364
2365 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2366 return log_oom();
2367
2368 free(arg_machine);
2369 arg_machine = b;
2370 }
2371 }
2372
2373 return 0;
2374 }
2375
2376 static int determine_uid_shift(const char *directory) {
2377 int r;
2378
2379 if (!arg_userns) {
2380 arg_uid_shift = 0;
2381 return 0;
2382 }
2383
2384 if (arg_uid_shift == UID_INVALID) {
2385 struct stat st;
2386
2387 r = stat(directory, &st);
2388 if (r < 0)
2389 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2390
2391 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2392
2393 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2394 log_error("UID and GID base of %s don't match.", directory);
2395 return -EINVAL;
2396 }
2397
2398 arg_uid_range = UINT32_C(0x10000);
2399 }
2400
2401 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2402 log_error("UID base too high for UID range.");
2403 return -EINVAL;
2404 }
2405
2406 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2407 return 0;
2408 }
2409
2410 static int inner_child(
2411 Barrier *barrier,
2412 const char *directory,
2413 bool secondary,
2414 int kmsg_socket,
2415 int rtnl_socket,
2416 FDSet *fds) {
2417
2418 _cleanup_free_ char *home = NULL;
2419 unsigned n_env = 2;
2420 const char *envp[] = {
2421 "PATH=" DEFAULT_PATH_SPLIT_USR,
2422 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2423 NULL, /* TERM */
2424 NULL, /* HOME */
2425 NULL, /* USER */
2426 NULL, /* LOGNAME */
2427 NULL, /* container_uuid */
2428 NULL, /* LISTEN_FDS */
2429 NULL, /* LISTEN_PID */
2430 NULL
2431 };
2432
2433 _cleanup_strv_free_ char **env_use = NULL;
2434 int r;
2435
2436 assert(barrier);
2437 assert(directory);
2438 assert(kmsg_socket >= 0);
2439
2440 cg_unified_flush();
2441
2442 if (arg_userns) {
2443 /* Tell the parent, that it now can write the UID map. */
2444 (void) barrier_place(barrier); /* #1 */
2445
2446 /* Wait until the parent wrote the UID map */
2447 if (!barrier_place_and_sync(barrier)) { /* #2 */
2448 log_error("Parent died too early");
2449 return -ESRCH;
2450 }
2451 }
2452
2453 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2454 if (r < 0)
2455 return r;
2456
2457 r = mount_sysfs(NULL);
2458 if (r < 0)
2459 return r;
2460
2461 /* Wait until we are cgroup-ified, so that we
2462 * can mount the right cgroup path writable */
2463 if (!barrier_place_and_sync(barrier)) { /* #3 */
2464 log_error("Parent died too early");
2465 return -ESRCH;
2466 }
2467
2468 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2469 if (r < 0)
2470 return r;
2471
2472 r = reset_uid_gid();
2473 if (r < 0)
2474 return log_error_errno(r, "Couldn't become new root: %m");
2475
2476 r = setup_boot_id(NULL);
2477 if (r < 0)
2478 return r;
2479
2480 r = setup_kmsg(NULL, kmsg_socket);
2481 if (r < 0)
2482 return r;
2483 kmsg_socket = safe_close(kmsg_socket);
2484
2485 umask(0022);
2486
2487 if (setsid() < 0)
2488 return log_error_errno(errno, "setsid() failed: %m");
2489
2490 if (arg_private_network)
2491 loopback_setup();
2492
2493 if (arg_expose_ports) {
2494 r = expose_port_send_rtnl(rtnl_socket);
2495 if (r < 0)
2496 return r;
2497 rtnl_socket = safe_close(rtnl_socket);
2498 }
2499
2500 if (drop_capabilities() < 0)
2501 return log_error_errno(errno, "drop_capabilities() failed: %m");
2502
2503 setup_hostname();
2504
2505 if (arg_personality != PERSONALITY_INVALID) {
2506 if (personality(arg_personality) < 0)
2507 return log_error_errno(errno, "personality() failed: %m");
2508 } else if (secondary) {
2509 if (personality(PER_LINUX32) < 0)
2510 return log_error_errno(errno, "personality() failed: %m");
2511 }
2512
2513 #ifdef HAVE_SELINUX
2514 if (arg_selinux_context)
2515 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2516 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2517 #endif
2518
2519 r = change_uid_gid(arg_user, &home);
2520 if (r < 0)
2521 return r;
2522
2523 envp[n_env] = strv_find_prefix(environ, "TERM=");
2524 if (envp[n_env])
2525 n_env ++;
2526
2527 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2528 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2529 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2530 return log_oom();
2531
2532 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2533 char as_uuid[37];
2534
2535 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2536 return log_oom();
2537 }
2538
2539 if (fdset_size(fds) > 0) {
2540 r = fdset_cloexec(fds, false);
2541 if (r < 0)
2542 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2543
2544 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2545 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2546 return log_oom();
2547 }
2548
2549 env_use = strv_env_merge(2, envp, arg_setenv);
2550 if (!env_use)
2551 return log_oom();
2552
2553 /* Let the parent know that we are ready and
2554 * wait until the parent is ready with the
2555 * setup, too... */
2556 if (!barrier_place_and_sync(barrier)) { /* #4 */
2557 log_error("Parent died too early");
2558 return -ESRCH;
2559 }
2560
2561 /* Now, explicitly close the log, so that we
2562 * then can close all remaining fds. Closing
2563 * the log explicitly first has the benefit
2564 * that the logging subsystem knows about it,
2565 * and is thus ready to be reopened should we
2566 * need it again. Note that the other fds
2567 * closed here are at least the locking and
2568 * barrier fds. */
2569 log_close();
2570 (void) fdset_close_others(fds);
2571
2572 if (arg_boot) {
2573 char **a;
2574 size_t m;
2575
2576 /* Automatically search for the init system */
2577
2578 m = 1 + strv_length(arg_parameters);
2579 a = newa(char*, m + 1);
2580 if (strv_isempty(arg_parameters))
2581 a[1] = NULL;
2582 else
2583 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2584
2585 a[0] = (char*) "/usr/lib/systemd/systemd";
2586 execve(a[0], a, env_use);
2587
2588 a[0] = (char*) "/lib/systemd/systemd";
2589 execve(a[0], a, env_use);
2590
2591 a[0] = (char*) "/sbin/init";
2592 execve(a[0], a, env_use);
2593 } else if (!strv_isempty(arg_parameters))
2594 execvpe(arg_parameters[0], arg_parameters, env_use);
2595 else {
2596 chdir(home ?: "/root");
2597 execle("/bin/bash", "-bash", NULL, env_use);
2598 execle("/bin/sh", "-sh", NULL, env_use);
2599 }
2600
2601 (void) log_open();
2602 return log_error_errno(errno, "execv() failed: %m");
2603 }
2604
2605 static int outer_child(
2606 Barrier *barrier,
2607 const char *directory,
2608 const char *console,
2609 const char *root_device, bool root_device_rw,
2610 const char *home_device, bool home_device_rw,
2611 const char *srv_device, bool srv_device_rw,
2612 bool interactive,
2613 bool secondary,
2614 int pid_socket,
2615 int kmsg_socket,
2616 int rtnl_socket,
2617 int uid_shift_socket,
2618 FDSet *fds) {
2619
2620 pid_t pid;
2621 ssize_t l;
2622 int r;
2623
2624 assert(barrier);
2625 assert(directory);
2626 assert(console);
2627 assert(pid_socket >= 0);
2628 assert(kmsg_socket >= 0);
2629
2630 cg_unified_flush();
2631
2632 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2633 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2634
2635 if (interactive) {
2636 close_nointr(STDIN_FILENO);
2637 close_nointr(STDOUT_FILENO);
2638 close_nointr(STDERR_FILENO);
2639
2640 r = open_terminal(console, O_RDWR);
2641 if (r != STDIN_FILENO) {
2642 if (r >= 0) {
2643 safe_close(r);
2644 r = -EINVAL;
2645 }
2646
2647 return log_error_errno(r, "Failed to open console: %m");
2648 }
2649
2650 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2651 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2652 return log_error_errno(errno, "Failed to duplicate console: %m");
2653 }
2654
2655 r = reset_audit_loginuid();
2656 if (r < 0)
2657 return r;
2658
2659 /* Mark everything as slave, so that we still
2660 * receive mounts from the real root, but don't
2661 * propagate mounts to the real root. */
2662 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2663 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2664
2665 r = mount_devices(directory,
2666 root_device, root_device_rw,
2667 home_device, home_device_rw,
2668 srv_device, srv_device_rw);
2669 if (r < 0)
2670 return r;
2671
2672 r = determine_uid_shift(directory);
2673 if (r < 0)
2674 return r;
2675
2676 if (arg_userns) {
2677 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2678 if (l < 0)
2679 return log_error_errno(errno, "Failed to send UID shift: %m");
2680 if (l != sizeof(arg_uid_shift)) {
2681 log_error("Short write while sending UID shift.");
2682 return -EIO;
2683 }
2684 }
2685
2686 /* Turn directory into bind mount */
2687 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2688 return log_error_errno(errno, "Failed to make bind mount: %m");
2689
2690 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2691 if (r < 0)
2692 return r;
2693
2694 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2695 if (r < 0)
2696 return r;
2697
2698 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2699 if (r < 0)
2700 return r;
2701
2702 if (arg_read_only) {
2703 r = bind_remount_recursive(directory, true);
2704 if (r < 0)
2705 return log_error_errno(r, "Failed to make tree read-only: %m");
2706 }
2707
2708 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2709 if (r < 0)
2710 return r;
2711
2712 r = copy_devnodes(directory);
2713 if (r < 0)
2714 return r;
2715
2716 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2717
2718 r = setup_pts(directory);
2719 if (r < 0)
2720 return r;
2721
2722 r = setup_propagate(directory);
2723 if (r < 0)
2724 return r;
2725
2726 r = setup_dev_console(directory, console);
2727 if (r < 0)
2728 return r;
2729
2730 r = setup_seccomp();
2731 if (r < 0)
2732 return r;
2733
2734 r = setup_timezone(directory);
2735 if (r < 0)
2736 return r;
2737
2738 r = setup_resolv_conf(directory);
2739 if (r < 0)
2740 return r;
2741
2742 r = setup_journal(directory);
2743 if (r < 0)
2744 return r;
2745
2746 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2747 if (r < 0)
2748 return r;
2749
2750 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2751 if (r < 0)
2752 return r;
2753
2754 r = mount_move_root(directory);
2755 if (r < 0)
2756 return log_error_errno(r, "Failed to move root directory: %m");
2757
2758 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2759 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2760 (arg_private_network ? CLONE_NEWNET : 0) |
2761 (arg_userns ? CLONE_NEWUSER : 0),
2762 NULL);
2763 if (pid < 0)
2764 return log_error_errno(errno, "Failed to fork inner child: %m");
2765 if (pid == 0) {
2766 pid_socket = safe_close(pid_socket);
2767 uid_shift_socket = safe_close(uid_shift_socket);
2768
2769 /* The inner child has all namespaces that are
2770 * requested, so that we all are owned by the user if
2771 * user namespaces are turned on. */
2772
2773 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2774 if (r < 0)
2775 _exit(EXIT_FAILURE);
2776
2777 _exit(EXIT_SUCCESS);
2778 }
2779
2780 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2781 if (l < 0)
2782 return log_error_errno(errno, "Failed to send PID: %m");
2783 if (l != sizeof(pid)) {
2784 log_error("Short write while sending PID.");
2785 return -EIO;
2786 }
2787
2788 pid_socket = safe_close(pid_socket);
2789 kmsg_socket = safe_close(kmsg_socket);
2790 rtnl_socket = safe_close(rtnl_socket);
2791
2792 return 0;
2793 }
2794
2795 static int setup_uid_map(pid_t pid) {
2796 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2797 int r;
2798
2799 assert(pid > 1);
2800
2801 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2802 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2803 r = write_string_file(uid_map, line, 0);
2804 if (r < 0)
2805 return log_error_errno(r, "Failed to write UID map: %m");
2806
2807 /* We always assign the same UID and GID ranges */
2808 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2809 r = write_string_file(uid_map, line, 0);
2810 if (r < 0)
2811 return log_error_errno(r, "Failed to write GID map: %m");
2812
2813 return 0;
2814 }
2815
2816 static int load_settings(void) {
2817 _cleanup_(settings_freep) Settings *settings = NULL;
2818 _cleanup_fclose_ FILE *f = NULL;
2819 _cleanup_free_ char *p = NULL;
2820 const char *fn, *i;
2821 int r;
2822
2823 /* If all settings are masked, there's no point in looking for
2824 * the settings file */
2825 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2826 return 0;
2827
2828 fn = strjoina(arg_machine, ".nspawn");
2829
2830 /* We first look in the admin's directories in /etc and /run */
2831 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2832 _cleanup_free_ char *j = NULL;
2833
2834 j = strjoin(i, "/", fn, NULL);
2835 if (!j)
2836 return log_oom();
2837
2838 f = fopen(j, "re");
2839 if (f) {
2840 p = j;
2841 j = NULL;
2842
2843 /* By default we trust configuration from /etc and /run */
2844 if (arg_settings_trusted < 0)
2845 arg_settings_trusted = true;
2846
2847 break;
2848 }
2849
2850 if (errno != ENOENT)
2851 return log_error_errno(errno, "Failed to open %s: %m", j);
2852 }
2853
2854 if (!f) {
2855 /* After that, let's look for a file next to the
2856 * actual image we shall boot. */
2857
2858 if (arg_image) {
2859 p = file_in_same_dir(arg_image, fn);
2860 if (!p)
2861 return log_oom();
2862 } else if (arg_directory) {
2863 p = file_in_same_dir(arg_directory, fn);
2864 if (!p)
2865 return log_oom();
2866 }
2867
2868 if (p) {
2869 f = fopen(p, "re");
2870 if (!f && errno != ENOENT)
2871 return log_error_errno(errno, "Failed to open %s: %m", p);
2872
2873 /* By default we do not trust configuration from /var/lib/machines */
2874 if (arg_settings_trusted < 0)
2875 arg_settings_trusted = false;
2876 }
2877 }
2878
2879 if (!f)
2880 return 0;
2881
2882 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2883
2884 r = settings_load(f, p, &settings);
2885 if (r < 0)
2886 return r;
2887
2888 /* Copy over bits from the settings, unless they have been
2889 * explicitly masked by command line switches. */
2890
2891 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2892 settings->boot >= 0) {
2893 arg_boot = settings->boot;
2894
2895 strv_free(arg_parameters);
2896 arg_parameters = settings->parameters;
2897 settings->parameters = NULL;
2898 }
2899
2900 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2901 settings->environment) {
2902 strv_free(arg_setenv);
2903 arg_setenv = settings->environment;
2904 settings->environment = NULL;
2905 }
2906
2907 if ((arg_settings_mask & SETTING_USER) == 0 &&
2908 settings->user) {
2909 free(arg_user);
2910 arg_user = settings->user;
2911 settings->user = NULL;
2912 }
2913
2914 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2915 uint64_t plus;
2916
2917 plus = settings->capability;
2918 if (settings_private_network(settings))
2919 plus |= (1ULL << CAP_NET_ADMIN);
2920
2921 if (!arg_settings_trusted && plus != 0) {
2922 if (settings->capability != 0)
2923 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2924 } else
2925 arg_retain |= plus;
2926
2927 arg_retain &= ~settings->drop_capability;
2928 }
2929
2930 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2931 settings->kill_signal > 0)
2932 arg_kill_signal = settings->kill_signal;
2933
2934 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2935 settings->personality != PERSONALITY_INVALID)
2936 arg_personality = settings->personality;
2937
2938 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2939 !sd_id128_is_null(settings->machine_id)) {
2940
2941 if (!arg_settings_trusted)
2942 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2943 else
2944 arg_uuid = settings->machine_id;
2945 }
2946
2947 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2948 settings->read_only >= 0)
2949 arg_read_only = settings->read_only;
2950
2951 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2952 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2953 arg_volatile_mode = settings->volatile_mode;
2954
2955 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2956 settings->n_custom_mounts > 0) {
2957
2958 if (!arg_settings_trusted)
2959 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2960 else {
2961 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2962 arg_custom_mounts = settings->custom_mounts;
2963 arg_n_custom_mounts = settings->n_custom_mounts;
2964
2965 settings->custom_mounts = NULL;
2966 settings->n_custom_mounts = 0;
2967 }
2968 }
2969
2970 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2971 (settings->private_network >= 0 ||
2972 settings->network_veth >= 0 ||
2973 settings->network_bridge ||
2974 settings->network_interfaces ||
2975 settings->network_macvlan ||
2976 settings->network_ipvlan)) {
2977
2978 if (!arg_settings_trusted)
2979 log_warning("Ignoring network settings, file %s is not trusted.", p);
2980 else {
2981 arg_network_veth = settings_private_network(settings);
2982 arg_private_network = settings_private_network(settings);
2983
2984 strv_free(arg_network_interfaces);
2985 arg_network_interfaces = settings->network_interfaces;
2986 settings->network_interfaces = NULL;
2987
2988 strv_free(arg_network_macvlan);
2989 arg_network_macvlan = settings->network_macvlan;
2990 settings->network_macvlan = NULL;
2991
2992 strv_free(arg_network_ipvlan);
2993 arg_network_ipvlan = settings->network_ipvlan;
2994 settings->network_ipvlan = NULL;
2995
2996 free(arg_network_bridge);
2997 arg_network_bridge = settings->network_bridge;
2998 settings->network_bridge = NULL;
2999 }
3000 }
3001
3002 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3003 settings->expose_ports) {
3004
3005 if (!arg_settings_trusted)
3006 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3007 else {
3008 expose_port_free_all(arg_expose_ports);
3009 arg_expose_ports = settings->expose_ports;
3010 settings->expose_ports = NULL;
3011 }
3012 }
3013
3014 return 0;
3015 }
3016
3017 int main(int argc, char *argv[]) {
3018
3019 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3020 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3021 _cleanup_close_ int master = -1, image_fd = -1;
3022 _cleanup_fdset_free_ FDSet *fds = NULL;
3023 int r, n_fd_passed, loop_nr = -1;
3024 char veth_name[IFNAMSIZ];
3025 bool secondary = false, remove_subvol = false;
3026 sigset_t mask_chld;
3027 pid_t pid = 0;
3028 int ret = EXIT_SUCCESS;
3029 union in_addr_union exposed = {};
3030 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3031 bool interactive;
3032
3033 log_parse_environment();
3034 log_open();
3035
3036 r = parse_argv(argc, argv);
3037 if (r <= 0)
3038 goto finish;
3039
3040 if (geteuid() != 0) {
3041 log_error("Need to be root.");
3042 r = -EPERM;
3043 goto finish;
3044 }
3045 r = determine_names();
3046 if (r < 0)
3047 goto finish;
3048
3049 r = load_settings();
3050 if (r < 0)
3051 goto finish;
3052
3053 r = verify_arguments();
3054 if (r < 0)
3055 goto finish;
3056
3057 n_fd_passed = sd_listen_fds(false);
3058 if (n_fd_passed > 0) {
3059 r = fdset_new_listen_fds(&fds, false);
3060 if (r < 0) {
3061 log_error_errno(r, "Failed to collect file descriptors: %m");
3062 goto finish;
3063 }
3064 }
3065
3066 if (arg_directory) {
3067 assert(!arg_image);
3068
3069 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3070 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3071 r = -EINVAL;
3072 goto finish;
3073 }
3074
3075 if (arg_ephemeral) {
3076 _cleanup_free_ char *np = NULL;
3077
3078 /* If the specified path is a mount point we
3079 * generate the new snapshot immediately
3080 * inside it under a random name. However if
3081 * the specified is not a mount point we
3082 * create the new snapshot in the parent
3083 * directory, just next to it. */
3084 r = path_is_mount_point(arg_directory, 0);
3085 if (r < 0) {
3086 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3087 goto finish;
3088 }
3089 if (r > 0)
3090 r = tempfn_random_child(arg_directory, "machine.", &np);
3091 else
3092 r = tempfn_random(arg_directory, "machine.", &np);
3093 if (r < 0) {
3094 log_error_errno(r, "Failed to generate name for snapshot: %m");
3095 goto finish;
3096 }
3097
3098 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3099 if (r < 0) {
3100 log_error_errno(r, "Failed to lock %s: %m", np);
3101 goto finish;
3102 }
3103
3104 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3105 if (r < 0) {
3106 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3107 goto finish;
3108 }
3109
3110 free(arg_directory);
3111 arg_directory = np;
3112 np = NULL;
3113
3114 remove_subvol = true;
3115
3116 } else {
3117 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3118 if (r == -EBUSY) {
3119 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3120 goto finish;
3121 }
3122 if (r < 0) {
3123 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3124 return r;
3125 }
3126
3127 if (arg_template) {
3128 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3129 if (r == -EEXIST) {
3130 if (!arg_quiet)
3131 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3132 } else if (r < 0) {
3133 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3134 goto finish;
3135 } else {
3136 if (!arg_quiet)
3137 log_info("Populated %s from template %s.", arg_directory, arg_template);
3138 }
3139 }
3140 }
3141
3142 if (arg_boot) {
3143 if (path_is_os_tree(arg_directory) <= 0) {
3144 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3145 r = -EINVAL;
3146 goto finish;
3147 }
3148 } else {
3149 const char *p;
3150
3151 p = strjoina(arg_directory,
3152 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3153 if (access(p, F_OK) < 0) {
3154 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3155 r = -EINVAL;
3156 goto finish;
3157 }
3158 }
3159
3160 } else {
3161 char template[] = "/tmp/nspawn-root-XXXXXX";
3162
3163 assert(arg_image);
3164 assert(!arg_template);
3165
3166 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3167 if (r == -EBUSY) {
3168 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3169 goto finish;
3170 }
3171 if (r < 0) {
3172 r = log_error_errno(r, "Failed to create image lock: %m");
3173 goto finish;
3174 }
3175
3176 if (!mkdtemp(template)) {
3177 log_error_errno(errno, "Failed to create temporary directory: %m");
3178 r = -errno;
3179 goto finish;
3180 }
3181
3182 arg_directory = strdup(template);
3183 if (!arg_directory) {
3184 r = log_oom();
3185 goto finish;
3186 }
3187
3188 image_fd = setup_image(&device_path, &loop_nr);
3189 if (image_fd < 0) {
3190 r = image_fd;
3191 goto finish;
3192 }
3193
3194 r = dissect_image(image_fd,
3195 &root_device, &root_device_rw,
3196 &home_device, &home_device_rw,
3197 &srv_device, &srv_device_rw,
3198 &secondary);
3199 if (r < 0)
3200 goto finish;
3201 }
3202
3203 r = custom_mounts_prepare();
3204 if (r < 0)
3205 goto finish;
3206
3207 interactive =
3208 isatty(STDIN_FILENO) > 0 &&
3209 isatty(STDOUT_FILENO) > 0;
3210
3211 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3212 if (master < 0) {
3213 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3214 goto finish;
3215 }
3216
3217 r = ptsname_malloc(master, &console);
3218 if (r < 0) {
3219 r = log_error_errno(r, "Failed to determine tty name: %m");
3220 goto finish;
3221 }
3222
3223 if (unlockpt(master) < 0) {
3224 r = log_error_errno(errno, "Failed to unlock tty: %m");
3225 goto finish;
3226 }
3227
3228 if (!arg_quiet)
3229 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3230 arg_machine, arg_image ?: arg_directory);
3231
3232 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3233
3234 assert_se(sigemptyset(&mask_chld) == 0);
3235 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3236
3237 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3238 r = log_error_errno(errno, "Failed to become subreaper: %m");
3239 goto finish;
3240 }
3241
3242 for (;;) {
3243 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3244 uid_shift_socket_pair[2] = { -1, -1 };
3245 ContainerStatus container_status;
3246 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3247 static const struct sigaction sa = {
3248 .sa_handler = nop_signal_handler,
3249 .sa_flags = SA_NOCLDSTOP,
3250 };
3251 int ifi = 0;
3252 ssize_t l;
3253 _cleanup_event_unref_ sd_event *event = NULL;
3254 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3255 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3256 char last_char = 0;
3257
3258 r = barrier_create(&barrier);
3259 if (r < 0) {
3260 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3261 goto finish;
3262 }
3263
3264 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3265 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3266 goto finish;
3267 }
3268
3269 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3270 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3271 goto finish;
3272 }
3273
3274 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3275 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3276 goto finish;
3277 }
3278
3279 if (arg_userns)
3280 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3281 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3282 goto finish;
3283 }
3284
3285 /* Child can be killed before execv(), so handle SIGCHLD
3286 * in order to interrupt parent's blocking calls and
3287 * give it a chance to call wait() and terminate. */
3288 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3289 if (r < 0) {
3290 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3291 goto finish;
3292 }
3293
3294 r = sigaction(SIGCHLD, &sa, NULL);
3295 if (r < 0) {
3296 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3297 goto finish;
3298 }
3299
3300 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3301 if (pid < 0) {
3302 if (errno == EINVAL)
3303 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3304 else
3305 r = log_error_errno(errno, "clone() failed: %m");
3306
3307 goto finish;
3308 }
3309
3310 if (pid == 0) {
3311 /* The outer child only has a file system namespace. */
3312 barrier_set_role(&barrier, BARRIER_CHILD);
3313
3314 master = safe_close(master);
3315
3316 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3317 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3318 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3319 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3320
3321 (void) reset_all_signal_handlers();
3322 (void) reset_signal_mask();
3323
3324 r = outer_child(&barrier,
3325 arg_directory,
3326 console,
3327 root_device, root_device_rw,
3328 home_device, home_device_rw,
3329 srv_device, srv_device_rw,
3330 interactive,
3331 secondary,
3332 pid_socket_pair[1],
3333 kmsg_socket_pair[1],
3334 rtnl_socket_pair[1],
3335 uid_shift_socket_pair[1],
3336 fds);
3337 if (r < 0)
3338 _exit(EXIT_FAILURE);
3339
3340 _exit(EXIT_SUCCESS);
3341 }
3342
3343 barrier_set_role(&barrier, BARRIER_PARENT);
3344
3345 fds = fdset_free(fds);
3346
3347 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3348 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3349 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3350 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3351
3352 /* Wait for the outer child. */
3353 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3354 if (r < 0)
3355 goto finish;
3356 if (r != 0) {
3357 r = -EIO;
3358 goto finish;
3359 }
3360 pid = 0;
3361
3362 /* And now retrieve the PID of the inner child. */
3363 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3364 if (l < 0) {
3365 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3366 goto finish;
3367 }
3368 if (l != sizeof(pid)) {
3369 log_error("Short read while reading inner child PID.");
3370 r = EIO;
3371 goto finish;
3372 }
3373
3374 log_debug("Init process invoked as PID " PID_FMT, pid);
3375
3376 if (arg_userns) {
3377 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3378 log_error("Child died too early.");
3379 r = -ESRCH;
3380 goto finish;
3381 }
3382
3383 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3384 if (l < 0) {
3385 r = log_error_errno(errno, "Failed to read UID shift: %m");
3386 goto finish;
3387 }
3388 if (l != sizeof(arg_uid_shift)) {
3389 log_error("Short read while reading UID shift.");
3390 r = EIO;
3391 goto finish;
3392 }
3393
3394 r = setup_uid_map(pid);
3395 if (r < 0)
3396 goto finish;
3397
3398 (void) barrier_place(&barrier); /* #2 */
3399 }
3400
3401 if (arg_private_network) {
3402
3403 r = move_network_interfaces(pid, arg_network_interfaces);
3404 if (r < 0)
3405 goto finish;
3406
3407 if (arg_network_veth) {
3408 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3409 if (r < 0)
3410 goto finish;
3411 else if (r > 0)
3412 ifi = r;
3413
3414 if (arg_network_bridge) {
3415 r = setup_bridge(veth_name, arg_network_bridge);
3416 if (r < 0)
3417 goto finish;
3418 if (r > 0)
3419 ifi = r;
3420 }
3421 }
3422
3423 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3424 if (r < 0)
3425 goto finish;
3426
3427 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3428 if (r < 0)
3429 goto finish;
3430 }
3431
3432 if (arg_register) {
3433 r = register_machine(
3434 arg_machine,
3435 pid,
3436 arg_directory,
3437 arg_uuid,
3438 ifi,
3439 arg_slice,
3440 arg_custom_mounts, arg_n_custom_mounts,
3441 arg_kill_signal,
3442 arg_property,
3443 arg_keep_unit);
3444 if (r < 0)
3445 goto finish;
3446 }
3447
3448 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3449 if (r < 0)
3450 goto finish;
3451
3452 if (arg_keep_unit) {
3453 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3454 if (r < 0)
3455 goto finish;
3456 }
3457
3458 r = chown_cgroup(pid, arg_uid_shift);
3459 if (r < 0)
3460 goto finish;
3461
3462 /* Notify the child that the parent is ready with all
3463 * its setup (including cgroup-ification), and that
3464 * the child can now hand over control to the code to
3465 * run inside the container. */
3466 (void) barrier_place(&barrier); /* #3 */
3467
3468 /* Block SIGCHLD here, before notifying child.
3469 * process_pty() will handle it with the other signals. */
3470 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3471
3472 /* Reset signal to default */
3473 r = default_signals(SIGCHLD, -1);
3474 if (r < 0) {
3475 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3476 goto finish;
3477 }
3478
3479 /* Let the child know that we are ready and wait that the child is completely ready now. */
3480 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3481 log_error("Child died too early.");
3482 r = -ESRCH;
3483 goto finish;
3484 }
3485
3486 sd_notifyf(false,
3487 "READY=1\n"
3488 "STATUS=Container running.\n"
3489 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3490
3491 r = sd_event_new(&event);
3492 if (r < 0) {
3493 log_error_errno(r, "Failed to get default event source: %m");
3494 goto finish;
3495 }
3496
3497 if (arg_kill_signal > 0) {
3498 /* Try to kill the init system on SIGINT or SIGTERM */
3499 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3500 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3501 } else {
3502 /* Immediately exit */
3503 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3504 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3505 }
3506
3507 /* simply exit on sigchld */
3508 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3509
3510 if (arg_expose_ports) {
3511 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3512 if (r < 0)
3513 goto finish;
3514
3515 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3516 }
3517
3518 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3519
3520 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3521 if (r < 0) {
3522 log_error_errno(r, "Failed to create PTY forwarder: %m");
3523 goto finish;
3524 }
3525
3526 r = sd_event_loop(event);
3527 if (r < 0) {
3528 log_error_errno(r, "Failed to run event loop: %m");
3529 goto finish;
3530 }
3531
3532 pty_forward_get_last_char(forward, &last_char);
3533
3534 forward = pty_forward_free(forward);
3535
3536 if (!arg_quiet && last_char != '\n')
3537 putc('\n', stdout);
3538
3539 /* Kill if it is not dead yet anyway */
3540 if (arg_register && !arg_keep_unit)
3541 terminate_machine(pid);
3542
3543 /* Normally redundant, but better safe than sorry */
3544 kill(pid, SIGKILL);
3545
3546 r = wait_for_container(pid, &container_status);
3547 pid = 0;
3548
3549 if (r < 0)
3550 /* We failed to wait for the container, or the
3551 * container exited abnormally */
3552 goto finish;
3553 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3554 /* The container exited with a non-zero
3555 * status, or with zero status and no reboot
3556 * was requested. */
3557 ret = r;
3558 break;
3559 }
3560
3561 /* CONTAINER_REBOOTED, loop again */
3562
3563 if (arg_keep_unit) {
3564 /* Special handling if we are running as a
3565 * service: instead of simply restarting the
3566 * machine we want to restart the entire
3567 * service, so let's inform systemd about this
3568 * with the special exit code 133. The service
3569 * file uses RestartForceExitStatus=133 so
3570 * that this results in a full nspawn
3571 * restart. This is necessary since we might
3572 * have cgroup parameters set we want to have
3573 * flushed out. */
3574 ret = 133;
3575 r = 0;
3576 break;
3577 }
3578
3579 expose_port_flush(arg_expose_ports, &exposed);
3580 }
3581
3582 finish:
3583 sd_notify(false,
3584 "STOPPING=1\n"
3585 "STATUS=Terminating...");
3586
3587 if (pid > 0)
3588 kill(pid, SIGKILL);
3589
3590 /* Try to flush whatever is still queued in the pty */
3591 if (master >= 0)
3592 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3593
3594 loop_remove(loop_nr, &image_fd);
3595
3596 if (remove_subvol && arg_directory) {
3597 int k;
3598
3599 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3600 if (k < 0)
3601 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3602 }
3603
3604 if (arg_machine) {
3605 const char *p;
3606
3607 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3608 (void) rm_rf(p, REMOVE_ROOT);
3609 }
3610
3611 expose_port_flush(arg_expose_ports, &exposed);
3612
3613 free(arg_directory);
3614 free(arg_template);
3615 free(arg_image);
3616 free(arg_machine);
3617 free(arg_user);
3618 strv_free(arg_setenv);
3619 free(arg_network_bridge);
3620 strv_free(arg_network_interfaces);
3621 strv_free(arg_network_macvlan);
3622 strv_free(arg_network_ipvlan);
3623 strv_free(arg_parameters);
3624 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3625 expose_port_free_all(arg_expose_ports);
3626
3627 return r < 0 ? EXIT_FAILURE : ret;
3628 }