]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: split out network related code to nspawn-network.[ch]
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <sys/personality.h>
38 #include <linux/loop.h>
39 #include <sys/file.h>
40
41 #ifdef HAVE_SELINUX
42 #include <selinux/selinux.h>
43 #endif
44
45 #ifdef HAVE_SECCOMP
46 #include <seccomp.h>
47 #endif
48
49 #ifdef HAVE_BLKID
50 #include <blkid/blkid.h>
51 #endif
52
53 #include "sd-daemon.h"
54 #include "sd-bus.h"
55 #include "sd-id128.h"
56 #include "random-util.h"
57 #include "log.h"
58 #include "util.h"
59 #include "mkdir.h"
60 #include "rm-rf.h"
61 #include "macro.h"
62 #include "missing.h"
63 #include "cgroup-util.h"
64 #include "strv.h"
65 #include "path-util.h"
66 #include "loopback-setup.h"
67 #include "dev-setup.h"
68 #include "fdset.h"
69 #include "build.h"
70 #include "fileio.h"
71 #include "bus-util.h"
72 #include "bus-error.h"
73 #include "ptyfwd.h"
74 #include "env-util.h"
75 #include "netlink-util.h"
76 #include "udev-util.h"
77 #include "blkid-util.h"
78 #include "gpt.h"
79 #include "copy.h"
80 #include "base-filesystem.h"
81 #include "barrier.h"
82 #include "event-util.h"
83 #include "capability.h"
84 #include "cap-list.h"
85 #include "btrfs-util.h"
86 #include "machine-image.h"
87 #include "list.h"
88 #include "in-addr-util.h"
89 #include "formats-util.h"
90 #include "process-util.h"
91 #include "terminal-util.h"
92 #include "hostname-util.h"
93 #include "signal-util.h"
94
95 #ifdef HAVE_SECCOMP
96 #include "seccomp-util.h"
97 #endif
98
99 #include "nspawn.h"
100 #include "nspawn-settings.h"
101 #include "nspawn-mount.h"
102 #include "nspawn-network.h"
103 #include "nspawn-expose-ports.h"
104
/* The two ways a container run can end. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED,
} ContainerStatus;
109
/* Journal linking modes selectable with --link-journal= (see parse_argv()). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, also the built-in default */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST,     /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
116
117 static char *arg_directory = NULL;
118 static char *arg_template = NULL;
119 static char *arg_user = NULL;
120 static sd_id128_t arg_uuid = {};
121 static char *arg_machine = NULL;
122 static const char *arg_selinux_context = NULL;
123 static const char *arg_selinux_apifs_context = NULL;
124 static const char *arg_slice = NULL;
125 static bool arg_private_network = false;
126 static bool arg_read_only = false;
127 static bool arg_boot = false;
128 static bool arg_ephemeral = false;
129 static LinkJournal arg_link_journal = LINK_AUTO;
130 static bool arg_link_journal_try = false;
131 static uint64_t arg_retain =
132 (1ULL << CAP_CHOWN) |
133 (1ULL << CAP_DAC_OVERRIDE) |
134 (1ULL << CAP_DAC_READ_SEARCH) |
135 (1ULL << CAP_FOWNER) |
136 (1ULL << CAP_FSETID) |
137 (1ULL << CAP_IPC_OWNER) |
138 (1ULL << CAP_KILL) |
139 (1ULL << CAP_LEASE) |
140 (1ULL << CAP_LINUX_IMMUTABLE) |
141 (1ULL << CAP_NET_BIND_SERVICE) |
142 (1ULL << CAP_NET_BROADCAST) |
143 (1ULL << CAP_NET_RAW) |
144 (1ULL << CAP_SETGID) |
145 (1ULL << CAP_SETFCAP) |
146 (1ULL << CAP_SETPCAP) |
147 (1ULL << CAP_SETUID) |
148 (1ULL << CAP_SYS_ADMIN) |
149 (1ULL << CAP_SYS_CHROOT) |
150 (1ULL << CAP_SYS_NICE) |
151 (1ULL << CAP_SYS_PTRACE) |
152 (1ULL << CAP_SYS_TTY_CONFIG) |
153 (1ULL << CAP_SYS_RESOURCE) |
154 (1ULL << CAP_SYS_BOOT) |
155 (1ULL << CAP_AUDIT_WRITE) |
156 (1ULL << CAP_AUDIT_CONTROL) |
157 (1ULL << CAP_MKNOD);
158 static CustomMount *arg_custom_mounts = NULL;
159 static unsigned arg_n_custom_mounts = 0;
160 static char **arg_setenv = NULL;
161 static bool arg_quiet = false;
162 static bool arg_share_system = false;
163 static bool arg_register = true;
164 static bool arg_keep_unit = false;
165 static char **arg_network_interfaces = NULL;
166 static char **arg_network_macvlan = NULL;
167 static char **arg_network_ipvlan = NULL;
168 static bool arg_network_veth = false;
169 static char *arg_network_bridge = NULL;
170 static unsigned long arg_personality = PERSONALITY_INVALID;
171 static char *arg_image = NULL;
172 static VolatileMode arg_volatile_mode = VOLATILE_NO;
173 static ExposePort *arg_expose_ports = NULL;
174 static char **arg_property = NULL;
175 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
176 static bool arg_userns = false;
177 static int arg_kill_signal = 0;
178 static bool arg_unified_cgroup_hierarchy = false;
179 static SettingsMask arg_settings_mask = 0;
180 static int arg_settings_trusted = -1;
181 static char **arg_parameters = NULL;
182
183 static void help(void) {
184 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
185 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
186 " -h --help Show this help\n"
187 " --version Print version string\n"
188 " -q --quiet Do not show status information\n"
189 " -D --directory=PATH Root directory for the container\n"
190 " --template=PATH Initialize root directory from template directory,\n"
191 " if missing\n"
192 " -x --ephemeral Run container with snapshot of root directory, and\n"
193 " remove it after exit\n"
194 " -i --image=PATH File system device or disk image for the container\n"
195 " -b --boot Boot up full system (i.e. invoke init)\n"
196 " -u --user=USER Run the command under specified user or uid\n"
197 " -M --machine=NAME Set the machine name for the container\n"
198 " --uuid=UUID Set a specific machine UUID for the container\n"
199 " -S --slice=SLICE Place the container in the specified slice\n"
200 " --property=NAME=VALUE Set scope unit property\n"
201 " --private-users[=UIDBASE[:NUIDS]]\n"
202 " Run within user namespace\n"
203 " --private-network Disable network in container\n"
204 " --network-interface=INTERFACE\n"
205 " Assign an existing network interface to the\n"
206 " container\n"
207 " --network-macvlan=INTERFACE\n"
208 " Create a macvlan network interface based on an\n"
209 " existing network interface to the container\n"
210 " --network-ipvlan=INTERFACE\n"
211 " Create a ipvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " -n --network-veth Add a virtual ethernet connection between host\n"
214 " and container\n"
215 " --network-bridge=INTERFACE\n"
216 " Add a virtual ethernet connection between host\n"
217 " and container and add it to an existing bridge on\n"
218 " the host\n"
219 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
220 " Expose a container IP port on the host\n"
221 " -Z --selinux-context=SECLABEL\n"
222 " Set the SELinux security context to be used by\n"
223 " processes in the container\n"
224 " -L --selinux-apifs-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " API/tmpfs file systems in the container\n"
227 " --capability=CAP In addition to the default, retain specified\n"
228 " capability\n"
229 " --drop-capability=CAP Drop the specified capability from the default set\n"
230 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
231 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
232 " try-guest, try-host\n"
233 " -j Equivalent to --link-journal=try-guest\n"
234 " --read-only Mount the root directory read-only\n"
235 " --bind=PATH[:PATH[:OPTIONS]]\n"
236 " Bind mount a file or directory from the host into\n"
237 " the container\n"
238 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
239 " Similar, but creates a read-only bind mount\n"
240 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
241 " --overlay=PATH[:PATH...]:PATH\n"
242 " Create an overlay mount from the host to \n"
243 " the container\n"
244 " --overlay-ro=PATH[:PATH...]:PATH\n"
245 " Similar, but creates a read-only overlay mount\n"
246 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
247 " --share-system Share system namespaces with host\n"
248 " --register=BOOLEAN Register container as machine\n"
249 " --keep-unit Do not register a scope for the machine, reuse\n"
250 " the service unit nspawn is running in\n"
251 " --volatile[=MODE] Run the system in volatile mode\n"
252 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
253 , program_invocation_short_name);
254 }
255
256
257 static int custom_mounts_prepare(void) {
258 unsigned i;
259 int r;
260
261 /* Ensure the mounts are applied prefix first. */
262 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
263
264 /* Allocate working directories for the overlay file systems that need it */
265 for (i = 0; i < arg_n_custom_mounts; i++) {
266 CustomMount *m = &arg_custom_mounts[i];
267
268 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
269 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
270 return -EINVAL;
271 }
272
273 if (m->type != CUSTOM_MOUNT_OVERLAY)
274 continue;
275
276 if (m->work_dir)
277 continue;
278
279 if (m->read_only)
280 continue;
281
282 r = tempfn_random(m->source, NULL, &m->work_dir);
283 if (r < 0)
284 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
285 }
286
287 return 0;
288 }
289
/* Replaces *b with a cleaned-up version of path.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT) it
 * is merely made absolute relative to the current working directory. The
 * previous string in *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization failed for any reason
 * other than ENOENT, or -ENOMEM. On failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Not existing yet: fall back to a purely lexical absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
310
311 static int detect_unified_cgroup_hierarchy(void) {
312 const char *e;
313 int r;
314
315 /* Allow the user to control whether the unified hierarchy is used */
316 e = getenv("UNIFIED_CGROUP_HIERARCHY");
317 if (e) {
318 r = parse_boolean(e);
319 if (r < 0)
320 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
321
322 arg_unified_cgroup_hierarchy = r;
323 return 0;
324 }
325
326 /* Otherwise inherit the default from the host system */
327 r = cg_unified();
328 if (r < 0)
329 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
330
331 arg_unified_cgroup_hierarchy = r;
332 return 0;
333 }
334
335 static int parse_argv(int argc, char *argv[]) {
336
337 enum {
338 ARG_VERSION = 0x100,
339 ARG_PRIVATE_NETWORK,
340 ARG_UUID,
341 ARG_READ_ONLY,
342 ARG_CAPABILITY,
343 ARG_DROP_CAPABILITY,
344 ARG_LINK_JOURNAL,
345 ARG_BIND,
346 ARG_BIND_RO,
347 ARG_TMPFS,
348 ARG_OVERLAY,
349 ARG_OVERLAY_RO,
350 ARG_SETENV,
351 ARG_SHARE_SYSTEM,
352 ARG_REGISTER,
353 ARG_KEEP_UNIT,
354 ARG_NETWORK_INTERFACE,
355 ARG_NETWORK_MACVLAN,
356 ARG_NETWORK_IPVLAN,
357 ARG_NETWORK_BRIDGE,
358 ARG_PERSONALITY,
359 ARG_VOLATILE,
360 ARG_TEMPLATE,
361 ARG_PROPERTY,
362 ARG_PRIVATE_USERS,
363 ARG_KILL_SIGNAL,
364 ARG_SETTINGS,
365 };
366
367 static const struct option options[] = {
368 { "help", no_argument, NULL, 'h' },
369 { "version", no_argument, NULL, ARG_VERSION },
370 { "directory", required_argument, NULL, 'D' },
371 { "template", required_argument, NULL, ARG_TEMPLATE },
372 { "ephemeral", no_argument, NULL, 'x' },
373 { "user", required_argument, NULL, 'u' },
374 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
375 { "boot", no_argument, NULL, 'b' },
376 { "uuid", required_argument, NULL, ARG_UUID },
377 { "read-only", no_argument, NULL, ARG_READ_ONLY },
378 { "capability", required_argument, NULL, ARG_CAPABILITY },
379 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
380 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
381 { "bind", required_argument, NULL, ARG_BIND },
382 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
383 { "tmpfs", required_argument, NULL, ARG_TMPFS },
384 { "overlay", required_argument, NULL, ARG_OVERLAY },
385 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
386 { "machine", required_argument, NULL, 'M' },
387 { "slice", required_argument, NULL, 'S' },
388 { "setenv", required_argument, NULL, ARG_SETENV },
389 { "selinux-context", required_argument, NULL, 'Z' },
390 { "selinux-apifs-context", required_argument, NULL, 'L' },
391 { "quiet", no_argument, NULL, 'q' },
392 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
393 { "register", required_argument, NULL, ARG_REGISTER },
394 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
395 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
396 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
397 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
398 { "network-veth", no_argument, NULL, 'n' },
399 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
400 { "personality", required_argument, NULL, ARG_PERSONALITY },
401 { "image", required_argument, NULL, 'i' },
402 { "volatile", optional_argument, NULL, ARG_VOLATILE },
403 { "port", required_argument, NULL, 'p' },
404 { "property", required_argument, NULL, ARG_PROPERTY },
405 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
406 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
407 { "settings", required_argument, NULL, ARG_SETTINGS },
408 {}
409 };
410
411 int c, r;
412 uint64_t plus = 0, minus = 0;
413 bool mask_all_settings = false, mask_no_settings = false;
414
415 assert(argc >= 0);
416 assert(argv);
417
418 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
419
420 switch (c) {
421
422 case 'h':
423 help();
424 return 0;
425
426 case ARG_VERSION:
427 puts(PACKAGE_STRING);
428 puts(SYSTEMD_FEATURES);
429 return 0;
430
431 case 'D':
432 r = set_sanitized_path(&arg_directory, optarg);
433 if (r < 0)
434 return log_error_errno(r, "Invalid root directory: %m");
435
436 break;
437
438 case ARG_TEMPLATE:
439 r = set_sanitized_path(&arg_template, optarg);
440 if (r < 0)
441 return log_error_errno(r, "Invalid template directory: %m");
442
443 break;
444
445 case 'i':
446 r = set_sanitized_path(&arg_image, optarg);
447 if (r < 0)
448 return log_error_errno(r, "Invalid image path: %m");
449
450 break;
451
452 case 'x':
453 arg_ephemeral = true;
454 break;
455
456 case 'u':
457 r = free_and_strdup(&arg_user, optarg);
458 if (r < 0)
459 return log_oom();
460
461 arg_settings_mask |= SETTING_USER;
462 break;
463
464 case ARG_NETWORK_BRIDGE:
465 r = free_and_strdup(&arg_network_bridge, optarg);
466 if (r < 0)
467 return log_oom();
468
469 /* fall through */
470
471 case 'n':
472 arg_network_veth = true;
473 arg_private_network = true;
474 arg_settings_mask |= SETTING_NETWORK;
475 break;
476
477 case ARG_NETWORK_INTERFACE:
478 if (strv_extend(&arg_network_interfaces, optarg) < 0)
479 return log_oom();
480
481 arg_private_network = true;
482 arg_settings_mask |= SETTING_NETWORK;
483 break;
484
485 case ARG_NETWORK_MACVLAN:
486 if (strv_extend(&arg_network_macvlan, optarg) < 0)
487 return log_oom();
488
489 arg_private_network = true;
490 arg_settings_mask |= SETTING_NETWORK;
491 break;
492
493 case ARG_NETWORK_IPVLAN:
494 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
495 return log_oom();
496
497 /* fall through */
498
499 case ARG_PRIVATE_NETWORK:
500 arg_private_network = true;
501 arg_settings_mask |= SETTING_NETWORK;
502 break;
503
504 case 'b':
505 arg_boot = true;
506 arg_settings_mask |= SETTING_BOOT;
507 break;
508
509 case ARG_UUID:
510 r = sd_id128_from_string(optarg, &arg_uuid);
511 if (r < 0) {
512 log_error("Invalid UUID: %s", optarg);
513 return r;
514 }
515
516 arg_settings_mask |= SETTING_MACHINE_ID;
517 break;
518
519 case 'S':
520 arg_slice = optarg;
521 break;
522
523 case 'M':
524 if (isempty(optarg))
525 arg_machine = mfree(arg_machine);
526 else {
527 if (!machine_name_is_valid(optarg)) {
528 log_error("Invalid machine name: %s", optarg);
529 return -EINVAL;
530 }
531
532 r = free_and_strdup(&arg_machine, optarg);
533 if (r < 0)
534 return log_oom();
535
536 break;
537 }
538
539 case 'Z':
540 arg_selinux_context = optarg;
541 break;
542
543 case 'L':
544 arg_selinux_apifs_context = optarg;
545 break;
546
547 case ARG_READ_ONLY:
548 arg_read_only = true;
549 arg_settings_mask |= SETTING_READ_ONLY;
550 break;
551
552 case ARG_CAPABILITY:
553 case ARG_DROP_CAPABILITY: {
554 const char *state, *word;
555 size_t length;
556
557 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
558 _cleanup_free_ char *t;
559
560 t = strndup(word, length);
561 if (!t)
562 return log_oom();
563
564 if (streq(t, "all")) {
565 if (c == ARG_CAPABILITY)
566 plus = (uint64_t) -1;
567 else
568 minus = (uint64_t) -1;
569 } else {
570 int cap;
571
572 cap = capability_from_name(t);
573 if (cap < 0) {
574 log_error("Failed to parse capability %s.", t);
575 return -EINVAL;
576 }
577
578 if (c == ARG_CAPABILITY)
579 plus |= 1ULL << (uint64_t) cap;
580 else
581 minus |= 1ULL << (uint64_t) cap;
582 }
583 }
584
585 arg_settings_mask |= SETTING_CAPABILITY;
586 break;
587 }
588
589 case 'j':
590 arg_link_journal = LINK_GUEST;
591 arg_link_journal_try = true;
592 break;
593
594 case ARG_LINK_JOURNAL:
595 if (streq(optarg, "auto")) {
596 arg_link_journal = LINK_AUTO;
597 arg_link_journal_try = false;
598 } else if (streq(optarg, "no")) {
599 arg_link_journal = LINK_NO;
600 arg_link_journal_try = false;
601 } else if (streq(optarg, "guest")) {
602 arg_link_journal = LINK_GUEST;
603 arg_link_journal_try = false;
604 } else if (streq(optarg, "host")) {
605 arg_link_journal = LINK_HOST;
606 arg_link_journal_try = false;
607 } else if (streq(optarg, "try-guest")) {
608 arg_link_journal = LINK_GUEST;
609 arg_link_journal_try = true;
610 } else if (streq(optarg, "try-host")) {
611 arg_link_journal = LINK_HOST;
612 arg_link_journal_try = true;
613 } else {
614 log_error("Failed to parse link journal mode %s", optarg);
615 return -EINVAL;
616 }
617
618 break;
619
620 case ARG_BIND:
621 case ARG_BIND_RO:
622 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
623 if (r < 0)
624 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
625
626 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
627 break;
628
629 case ARG_TMPFS:
630 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
631 if (r < 0)
632 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
633
634 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
635 break;
636
637 case ARG_OVERLAY:
638 case ARG_OVERLAY_RO: {
639 _cleanup_free_ char *upper = NULL, *destination = NULL;
640 _cleanup_strv_free_ char **lower = NULL;
641 CustomMount *m;
642 unsigned n = 0;
643 char **i;
644
645 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
646 if (r == -ENOMEM)
647 return log_oom();
648 else if (r < 0) {
649 log_error("Invalid overlay specification: %s", optarg);
650 return r;
651 }
652
653 STRV_FOREACH(i, lower) {
654 if (!path_is_absolute(*i)) {
655 log_error("Overlay path %s is not absolute.", *i);
656 return -EINVAL;
657 }
658
659 n++;
660 }
661
662 if (n < 2) {
663 log_error("--overlay= needs at least two colon-separated directories specified.");
664 return -EINVAL;
665 }
666
667 if (n == 2) {
668 /* If two parameters are specified,
669 * the first one is the lower, the
670 * second one the upper directory. And
671 * we'll also define the destination
672 * mount point the same as the upper. */
673 upper = lower[1];
674 lower[1] = NULL;
675
676 destination = strdup(upper);
677 if (!destination)
678 return log_oom();
679
680 } else {
681 upper = lower[n - 2];
682 destination = lower[n - 1];
683 lower[n - 2] = NULL;
684 }
685
686 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
687 if (!m)
688 return log_oom();
689
690 m->destination = destination;
691 m->source = upper;
692 m->lower = lower;
693 m->read_only = c == ARG_OVERLAY_RO;
694
695 upper = destination = NULL;
696 lower = NULL;
697
698 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
699 break;
700 }
701
702 case ARG_SETENV: {
703 char **n;
704
705 if (!env_assignment_is_valid(optarg)) {
706 log_error("Environment variable assignment '%s' is not valid.", optarg);
707 return -EINVAL;
708 }
709
710 n = strv_env_set(arg_setenv, optarg);
711 if (!n)
712 return log_oom();
713
714 strv_free(arg_setenv);
715 arg_setenv = n;
716
717 arg_settings_mask |= SETTING_ENVIRONMENT;
718 break;
719 }
720
721 case 'q':
722 arg_quiet = true;
723 break;
724
725 case ARG_SHARE_SYSTEM:
726 arg_share_system = true;
727 break;
728
729 case ARG_REGISTER:
730 r = parse_boolean(optarg);
731 if (r < 0) {
732 log_error("Failed to parse --register= argument: %s", optarg);
733 return r;
734 }
735
736 arg_register = r;
737 break;
738
739 case ARG_KEEP_UNIT:
740 arg_keep_unit = true;
741 break;
742
743 case ARG_PERSONALITY:
744
745 arg_personality = personality_from_string(optarg);
746 if (arg_personality == PERSONALITY_INVALID) {
747 log_error("Unknown or unsupported personality '%s'.", optarg);
748 return -EINVAL;
749 }
750
751 arg_settings_mask |= SETTING_PERSONALITY;
752 break;
753
754 case ARG_VOLATILE:
755
756 if (!optarg)
757 arg_volatile_mode = VOLATILE_YES;
758 else {
759 VolatileMode m;
760
761 m = volatile_mode_from_string(optarg);
762 if (m < 0) {
763 log_error("Failed to parse --volatile= argument: %s", optarg);
764 return -EINVAL;
765 } else
766 arg_volatile_mode = m;
767 }
768
769 arg_settings_mask |= SETTING_VOLATILE_MODE;
770 break;
771
772 case 'p':
773 r = expose_port_parse(&arg_expose_ports, optarg);
774 if (r == -EEXIST)
775 return log_error_errno(r, "Duplicate port specification: %s", optarg);
776 if (r < 0)
777 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
778
779 arg_settings_mask |= SETTING_EXPOSE_PORTS;
780 break;
781
782 case ARG_PROPERTY:
783 if (strv_extend(&arg_property, optarg) < 0)
784 return log_oom();
785
786 break;
787
788 case ARG_PRIVATE_USERS:
789 if (optarg) {
790 _cleanup_free_ char *buffer = NULL;
791 const char *range, *shift;
792
793 range = strchr(optarg, ':');
794 if (range) {
795 buffer = strndup(optarg, range - optarg);
796 if (!buffer)
797 return log_oom();
798 shift = buffer;
799
800 range++;
801 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
802 log_error("Failed to parse UID range: %s", range);
803 return -EINVAL;
804 }
805 } else
806 shift = optarg;
807
808 if (parse_uid(shift, &arg_uid_shift) < 0) {
809 log_error("Failed to parse UID: %s", optarg);
810 return -EINVAL;
811 }
812 }
813
814 arg_userns = true;
815 break;
816
817 case ARG_KILL_SIGNAL:
818 arg_kill_signal = signal_from_string_try_harder(optarg);
819 if (arg_kill_signal < 0) {
820 log_error("Cannot parse signal: %s", optarg);
821 return -EINVAL;
822 }
823
824 arg_settings_mask |= SETTING_KILL_SIGNAL;
825 break;
826
827 case ARG_SETTINGS:
828
829 /* no → do not read files
830 * yes → read files, do not override cmdline, trust only subset
831 * override → read files, override cmdline, trust only subset
832 * trusted → read files, do not override cmdline, trust all
833 */
834
835 r = parse_boolean(optarg);
836 if (r < 0) {
837 if (streq(optarg, "trusted")) {
838 mask_all_settings = false;
839 mask_no_settings = false;
840 arg_settings_trusted = true;
841
842 } else if (streq(optarg, "override")) {
843 mask_all_settings = false;
844 mask_no_settings = true;
845 arg_settings_trusted = -1;
846 } else
847 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
848 } else if (r > 0) {
849 /* yes */
850 mask_all_settings = false;
851 mask_no_settings = false;
852 arg_settings_trusted = -1;
853 } else {
854 /* no */
855 mask_all_settings = true;
856 mask_no_settings = false;
857 arg_settings_trusted = false;
858 }
859
860 break;
861
862 case '?':
863 return -EINVAL;
864
865 default:
866 assert_not_reached("Unhandled option");
867 }
868
869 if (arg_share_system)
870 arg_register = false;
871
872 if (arg_boot && arg_share_system) {
873 log_error("--boot and --share-system may not be combined.");
874 return -EINVAL;
875 }
876
877 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
878 log_error("--keep-unit may not be used when invoked from a user session.");
879 return -EINVAL;
880 }
881
882 if (arg_directory && arg_image) {
883 log_error("--directory= and --image= may not be combined.");
884 return -EINVAL;
885 }
886
887 if (arg_template && arg_image) {
888 log_error("--template= and --image= may not be combined.");
889 return -EINVAL;
890 }
891
892 if (arg_template && !(arg_directory || arg_machine)) {
893 log_error("--template= needs --directory= or --machine=.");
894 return -EINVAL;
895 }
896
897 if (arg_ephemeral && arg_template) {
898 log_error("--ephemeral and --template= may not be combined.");
899 return -EINVAL;
900 }
901
902 if (arg_ephemeral && arg_image) {
903 log_error("--ephemeral and --image= may not be combined.");
904 return -EINVAL;
905 }
906
907 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
908 log_error("--ephemeral and --link-journal= may not be combined.");
909 return -EINVAL;
910 }
911
912 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
913 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
914
915 if (argc > optind) {
916 arg_parameters = strv_copy(argv + optind);
917 if (!arg_parameters)
918 return log_oom();
919
920 arg_settings_mask |= SETTING_BOOT;
921 }
922
923 /* Load all settings from .nspawn files */
924 if (mask_no_settings)
925 arg_settings_mask = 0;
926
927 /* Don't load any settings from .nspawn files */
928 if (mask_all_settings)
929 arg_settings_mask = _SETTINGS_MASK_ALL;
930
931 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
932
933 r = detect_unified_cgroup_hierarchy();
934 if (r < 0)
935 return r;
936
937 return 1;
938 }
939
940 static int verify_arguments(void) {
941
942 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
943 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
944 return -EINVAL;
945 }
946
947 if (arg_expose_ports && !arg_private_network) {
948 log_error("Cannot use --port= without private networking.");
949 return -EINVAL;
950 }
951
952 if (arg_boot && arg_kill_signal <= 0)
953 arg_kill_signal = SIGRTMIN+3;
954
955 return 0;
956 }
957
958 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
959 assert(p);
960
961 if (!arg_userns)
962 return 0;
963
964 if (uid == UID_INVALID && gid == GID_INVALID)
965 return 0;
966
967 if (uid != UID_INVALID) {
968 uid += arg_uid_shift;
969
970 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
971 return -EOVERFLOW;
972 }
973
974 if (gid != GID_INVALID) {
975 gid += (gid_t) arg_uid_shift;
976
977 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
978 return -EOVERFLOW;
979 }
980
981 if (lchown(p, uid, gid) < 0)
982 return -errno;
983
984 return 0;
985 }
986
/* Creates the directory path below root with the given mode and hands the
 * new directory to userns_lchown() for the given uid/gid.
 *
 * An already existing directory counts as success and is left untouched,
 * ownership included. Returns 0 on success, -errno if mkdir() fails for any
 * other reason, or the error from userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
999
1000 static int setup_timezone(const char *dest) {
1001 _cleanup_free_ char *p = NULL, *q = NULL;
1002 const char *where, *check, *what;
1003 char *z, *y;
1004 int r;
1005
1006 assert(dest);
1007
1008 /* Fix the timezone, if possible */
1009 r = readlink_malloc("/etc/localtime", &p);
1010 if (r < 0) {
1011 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1012 return 0;
1013 }
1014
1015 z = path_startswith(p, "../usr/share/zoneinfo/");
1016 if (!z)
1017 z = path_startswith(p, "/usr/share/zoneinfo/");
1018 if (!z) {
1019 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1020 return 0;
1021 }
1022
1023 where = prefix_roota(dest, "/etc/localtime");
1024 r = readlink_malloc(where, &q);
1025 if (r >= 0) {
1026 y = path_startswith(q, "../usr/share/zoneinfo/");
1027 if (!y)
1028 y = path_startswith(q, "/usr/share/zoneinfo/");
1029
1030 /* Already pointing to the right place? Then do nothing .. */
1031 if (y && streq(y, z))
1032 return 0;
1033 }
1034
1035 check = strjoina("/usr/share/zoneinfo/", z);
1036 check = prefix_root(dest, check);
1037 if (laccess(check, F_OK) < 0) {
1038 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1039 return 0;
1040 }
1041
1042 r = unlink(where);
1043 if (r < 0 && errno != ENOENT) {
1044 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1045 return 0;
1046 }
1047
1048 what = strjoina("../usr/share/zoneinfo/", z);
1049 if (symlink(what, where) < 0) {
1050 log_error_errno(errno, "Failed to correct timezone of container: %m");
1051 return 0;
1052 }
1053
1054 r = userns_lchown(where, 0, 0);
1055 if (r < 0)
1056 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1057
1058 return 0;
1059 }
1060
1061 static int setup_resolv_conf(const char *dest) {
1062 const char *where = NULL;
1063 int r;
1064
1065 assert(dest);
1066
1067 if (arg_private_network)
1068 return 0;
1069
1070 /* Fix resolv.conf, if possible */
1071 where = prefix_roota(dest, "/etc/resolv.conf");
1072
1073 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1074 if (r < 0) {
1075 /* If the file already exists as symlink, let's
1076 * suppress the warning, under the assumption that
1077 * resolved or something similar runs inside and the
1078 * symlink points there.
1079 *
1080 * If the disk image is read-only, there's also no
1081 * point in complaining.
1082 */
1083 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1084 "Failed to copy /etc/resolv.conf to %s: %m", where);
1085 return 0;
1086 }
1087
1088 r = userns_lchown(where, 0, 0);
1089 if (r < 0)
1090 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1091
1092 return 0;
1093 }
1094
1095 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1096 assert(s);
1097
1098 snprintf(s, 37,
1099 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1100 SD_ID128_FORMAT_VAL(id));
1101
1102 return s;
1103 }
1104
1105 static int setup_boot_id(const char *dest) {
1106 const char *from, *to;
1107 sd_id128_t rnd = {};
1108 char as_uuid[37];
1109 int r;
1110
1111 if (arg_share_system)
1112 return 0;
1113
1114 /* Generate a new randomized boot ID, so that each boot-up of
1115 * the container gets a new one */
1116
1117 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1118 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1119
1120 r = sd_id128_randomize(&rnd);
1121 if (r < 0)
1122 return log_error_errno(r, "Failed to generate random boot id: %m");
1123
1124 id128_format_as_uuid(rnd, as_uuid);
1125
1126 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1127 if (r < 0)
1128 return log_error_errno(r, "Failed to write boot id: %m");
1129
1130 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1131 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1132 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1133 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1134
1135 unlink(from);
1136 return r;
1137 }
1138
1139 static int copy_devnodes(const char *dest) {
1140
1141 static const char devnodes[] =
1142 "null\0"
1143 "zero\0"
1144 "full\0"
1145 "random\0"
1146 "urandom\0"
1147 "tty\0"
1148 "net/tun\0";
1149
1150 const char *d;
1151 int r = 0;
1152 _cleanup_umask_ mode_t u;
1153
1154 assert(dest);
1155
1156 u = umask(0000);
1157
1158 /* Create /dev/net, so that we can create /dev/net/tun in it */
1159 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1160 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1161
1162 NULSTR_FOREACH(d, devnodes) {
1163 _cleanup_free_ char *from = NULL, *to = NULL;
1164 struct stat st;
1165
1166 from = strappend("/dev/", d);
1167 to = prefix_root(dest, from);
1168
1169 if (stat(from, &st) < 0) {
1170
1171 if (errno != ENOENT)
1172 return log_error_errno(errno, "Failed to stat %s: %m", from);
1173
1174 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1175
1176 log_error("%s is not a char or block device, cannot copy.", from);
1177 return -EIO;
1178
1179 } else {
1180 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1181 if (errno != EPERM)
1182 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1183
1184 /* Some systems abusively restrict mknod but
1185 * allow bind mounts. */
1186 r = touch(to);
1187 if (r < 0)
1188 return log_error_errno(r, "touch (%s) failed: %m", to);
1189 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1190 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1191 }
1192
1193 r = userns_lchown(to, 0, 0);
1194 if (r < 0)
1195 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1196 }
1197 }
1198
1199 return r;
1200 }
1201
1202 static int setup_pts(const char *dest) {
1203 _cleanup_free_ char *options = NULL;
1204 const char *p;
1205
1206 #ifdef HAVE_SELINUX
1207 if (arg_selinux_apifs_context)
1208 (void) asprintf(&options,
1209 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1210 arg_uid_shift + TTY_GID,
1211 arg_selinux_apifs_context);
1212 else
1213 #endif
1214 (void) asprintf(&options,
1215 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1216 arg_uid_shift + TTY_GID);
1217
1218 if (!options)
1219 return log_oom();
1220
1221 /* Mount /dev/pts itself */
1222 p = prefix_roota(dest, "/dev/pts");
1223 if (mkdir(p, 0755) < 0)
1224 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1225 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1226 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1227 if (userns_lchown(p, 0, 0) < 0)
1228 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1229
1230 /* Create /dev/ptmx symlink */
1231 p = prefix_roota(dest, "/dev/ptmx");
1232 if (symlink("pts/ptmx", p) < 0)
1233 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1234 if (userns_lchown(p, 0, 0) < 0)
1235 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1236
1237 /* And fix /dev/pts/ptmx ownership */
1238 p = prefix_roota(dest, "/dev/pts/ptmx");
1239 if (userns_lchown(p, 0, 0) < 0)
1240 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1241
1242 return 0;
1243 }
1244
1245 static int setup_dev_console(const char *dest, const char *console) {
1246 _cleanup_umask_ mode_t u;
1247 const char *to;
1248 int r;
1249
1250 assert(dest);
1251 assert(console);
1252
1253 u = umask(0000);
1254
1255 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1256 if (r < 0)
1257 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1258
1259 /* We need to bind mount the right tty to /dev/console since
1260 * ptys can only exist on pts file systems. To have something
1261 * to bind mount things on we create a empty regular file. */
1262
1263 to = prefix_roota(dest, "/dev/console");
1264 r = touch(to);
1265 if (r < 0)
1266 return log_error_errno(r, "touch() for /dev/console failed: %m");
1267
1268 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1269 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1270
1271 return 0;
1272 }
1273
1274 static int setup_kmsg(const char *dest, int kmsg_socket) {
1275 const char *from, *to;
1276 _cleanup_umask_ mode_t u;
1277 int fd, k;
1278 union {
1279 struct cmsghdr cmsghdr;
1280 uint8_t buf[CMSG_SPACE(sizeof(int))];
1281 } control = {};
1282 struct msghdr mh = {
1283 .msg_control = &control,
1284 .msg_controllen = sizeof(control),
1285 };
1286 struct cmsghdr *cmsg;
1287
1288 assert(kmsg_socket >= 0);
1289
1290 u = umask(0000);
1291
1292 /* We create the kmsg FIFO as /run/kmsg, but immediately
1293 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1294 * on the reading side behave very similar to /proc/kmsg,
1295 * their writing side behaves differently from /dev/kmsg in
1296 * that writing blocks when nothing is reading. In order to
1297 * avoid any problems with containers deadlocking due to this
1298 * we simply make /dev/kmsg unavailable to the container. */
1299 from = prefix_roota(dest, "/run/kmsg");
1300 to = prefix_roota(dest, "/proc/kmsg");
1301
1302 if (mkfifo(from, 0600) < 0)
1303 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1304 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1305 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1306
1307 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1308 if (fd < 0)
1309 return log_error_errno(errno, "Failed to open fifo: %m");
1310
1311 cmsg = CMSG_FIRSTHDR(&mh);
1312 cmsg->cmsg_level = SOL_SOCKET;
1313 cmsg->cmsg_type = SCM_RIGHTS;
1314 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1315 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1316
1317 mh.msg_controllen = cmsg->cmsg_len;
1318
1319 /* Store away the fd in the socket, so that it stays open as
1320 * long as we run the child */
1321 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1322 safe_close(fd);
1323
1324 if (k < 0)
1325 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1326
1327 /* And now make the FIFO unavailable as /run/kmsg... */
1328 (void) unlink(from);
1329
1330 return 0;
1331 }
1332
1333 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1334 union in_addr_union *exposed = userdata;
1335
1336 assert(rtnl);
1337 assert(m);
1338 assert(exposed);
1339
1340 expose_port_execute(rtnl, arg_expose_ports, exposed);
1341 return 0;
1342 }
1343
1344 static int setup_hostname(void) {
1345
1346 if (arg_share_system)
1347 return 0;
1348
1349 if (sethostname_idempotent(arg_machine) < 0)
1350 return -errno;
1351
1352 return 0;
1353 }
1354
1355 static int setup_journal(const char *directory) {
1356 sd_id128_t machine_id, this_id;
1357 _cleanup_free_ char *b = NULL, *d = NULL;
1358 const char *etc_machine_id, *p, *q;
1359 char *id;
1360 int r;
1361
1362 /* Don't link journals in ephemeral mode */
1363 if (arg_ephemeral)
1364 return 0;
1365
1366 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1367
1368 r = read_one_line_file(etc_machine_id, &b);
1369 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1370 return 0;
1371 else if (r < 0)
1372 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1373
1374 id = strstrip(b);
1375 if (isempty(id) && arg_link_journal == LINK_AUTO)
1376 return 0;
1377
1378 /* Verify validity */
1379 r = sd_id128_from_string(id, &machine_id);
1380 if (r < 0)
1381 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1382
1383 r = sd_id128_get_machine(&this_id);
1384 if (r < 0)
1385 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1386
1387 if (sd_id128_equal(machine_id, this_id)) {
1388 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1389 "Host and machine ids are equal (%s): refusing to link journals", id);
1390 if (arg_link_journal == LINK_AUTO)
1391 return 0;
1392 return -EEXIST;
1393 }
1394
1395 if (arg_link_journal == LINK_NO)
1396 return 0;
1397
1398 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1399 if (r < 0)
1400 return log_error_errno(r, "Failed to create /var: %m");
1401
1402 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1403 if (r < 0)
1404 return log_error_errno(r, "Failed to create /var/log: %m");
1405
1406 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1407 if (r < 0)
1408 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1409
1410 p = strjoina("/var/log/journal/", id);
1411 q = prefix_roota(directory, p);
1412
1413 if (path_is_mount_point(p, 0) > 0) {
1414 if (arg_link_journal != LINK_AUTO) {
1415 log_error("%s: already a mount point, refusing to use for journal", p);
1416 return -EEXIST;
1417 }
1418
1419 return 0;
1420 }
1421
1422 if (path_is_mount_point(q, 0) > 0) {
1423 if (arg_link_journal != LINK_AUTO) {
1424 log_error("%s: already a mount point, refusing to use for journal", q);
1425 return -EEXIST;
1426 }
1427
1428 return 0;
1429 }
1430
1431 r = readlink_and_make_absolute(p, &d);
1432 if (r >= 0) {
1433 if ((arg_link_journal == LINK_GUEST ||
1434 arg_link_journal == LINK_AUTO) &&
1435 path_equal(d, q)) {
1436
1437 r = userns_mkdir(directory, p, 0755, 0, 0);
1438 if (r < 0)
1439 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1440 return 0;
1441 }
1442
1443 if (unlink(p) < 0)
1444 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1445 } else if (r == -EINVAL) {
1446
1447 if (arg_link_journal == LINK_GUEST &&
1448 rmdir(p) < 0) {
1449
1450 if (errno == ENOTDIR) {
1451 log_error("%s already exists and is neither a symlink nor a directory", p);
1452 return r;
1453 } else {
1454 log_error_errno(errno, "Failed to remove %s: %m", p);
1455 return -errno;
1456 }
1457 }
1458 } else if (r != -ENOENT) {
1459 log_error_errno(errno, "readlink(%s) failed: %m", p);
1460 return r;
1461 }
1462
1463 if (arg_link_journal == LINK_GUEST) {
1464
1465 if (symlink(q, p) < 0) {
1466 if (arg_link_journal_try) {
1467 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1468 return 0;
1469 } else {
1470 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1471 return -errno;
1472 }
1473 }
1474
1475 r = userns_mkdir(directory, p, 0755, 0, 0);
1476 if (r < 0)
1477 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1478 return 0;
1479 }
1480
1481 if (arg_link_journal == LINK_HOST) {
1482 /* don't create parents here -- if the host doesn't have
1483 * permanent journal set up, don't force it here */
1484 r = mkdir(p, 0755);
1485 if (r < 0) {
1486 if (arg_link_journal_try) {
1487 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1488 return 0;
1489 } else {
1490 log_error_errno(errno, "Failed to create %s: %m", p);
1491 return r;
1492 }
1493 }
1494
1495 } else if (access(p, F_OK) < 0)
1496 return 0;
1497
1498 if (dir_is_empty(q) == 0)
1499 log_warning("%s is not empty, proceeding anyway.", q);
1500
1501 r = userns_mkdir(directory, p, 0755, 0, 0);
1502 if (r < 0) {
1503 log_error_errno(errno, "Failed to create %s: %m", q);
1504 return r;
1505 }
1506
1507 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1508 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1509
1510 return 0;
1511 }
1512
1513 static int drop_capabilities(void) {
1514 return capability_bounding_set_drop(~arg_retain, false);
1515 }
1516
1517 static int register_machine(pid_t pid, int local_ifindex) {
1518 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1519 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
1520 int r;
1521
1522 if (!arg_register)
1523 return 0;
1524
1525 r = sd_bus_default_system(&bus);
1526 if (r < 0)
1527 return log_error_errno(r, "Failed to open system bus: %m");
1528
1529 if (arg_keep_unit) {
1530 r = sd_bus_call_method(
1531 bus,
1532 "org.freedesktop.machine1",
1533 "/org/freedesktop/machine1",
1534 "org.freedesktop.machine1.Manager",
1535 "RegisterMachineWithNetwork",
1536 &error,
1537 NULL,
1538 "sayssusai",
1539 arg_machine,
1540 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1541 "nspawn",
1542 "container",
1543 (uint32_t) pid,
1544 strempty(arg_directory),
1545 local_ifindex > 0 ? 1 : 0, local_ifindex);
1546 } else {
1547 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1548 char **i;
1549 unsigned j;
1550
1551 r = sd_bus_message_new_method_call(
1552 bus,
1553 &m,
1554 "org.freedesktop.machine1",
1555 "/org/freedesktop/machine1",
1556 "org.freedesktop.machine1.Manager",
1557 "CreateMachineWithNetwork");
1558 if (r < 0)
1559 return bus_log_create_error(r);
1560
1561 r = sd_bus_message_append(
1562 m,
1563 "sayssusai",
1564 arg_machine,
1565 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1566 "nspawn",
1567 "container",
1568 (uint32_t) pid,
1569 strempty(arg_directory),
1570 local_ifindex > 0 ? 1 : 0, local_ifindex);
1571 if (r < 0)
1572 return bus_log_create_error(r);
1573
1574 r = sd_bus_message_open_container(m, 'a', "(sv)");
1575 if (r < 0)
1576 return bus_log_create_error(r);
1577
1578 if (!isempty(arg_slice)) {
1579 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1580 if (r < 0)
1581 return bus_log_create_error(r);
1582 }
1583
1584 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1585 if (r < 0)
1586 return bus_log_create_error(r);
1587
1588 /* If you make changes here, also make sure to update
1589 * systemd-nspawn@.service, to keep the device
1590 * policies in sync regardless if we are run with or
1591 * without the --keep-unit switch. */
1592 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1593 /* Allow the container to
1594 * access and create the API
1595 * device nodes, so that
1596 * PrivateDevices= in the
1597 * container can work
1598 * fine */
1599 "/dev/null", "rwm",
1600 "/dev/zero", "rwm",
1601 "/dev/full", "rwm",
1602 "/dev/random", "rwm",
1603 "/dev/urandom", "rwm",
1604 "/dev/tty", "rwm",
1605 "/dev/net/tun", "rwm",
1606 /* Allow the container
1607 * access to ptys. However,
1608 * do not permit the
1609 * container to ever create
1610 * these device nodes. */
1611 "/dev/pts/ptmx", "rw",
1612 "char-pts", "rw");
1613 if (r < 0)
1614 return bus_log_create_error(r);
1615
1616 for (j = 0; j < arg_n_custom_mounts; j++) {
1617 CustomMount *cm = &arg_custom_mounts[j];
1618
1619 if (cm->type != CUSTOM_MOUNT_BIND)
1620 continue;
1621
1622 r = is_device_node(cm->source);
1623 if (r < 0)
1624 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
1625
1626 if (r) {
1627 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
1628 cm->source, cm->read_only ? "r" : "rw");
1629 if (r < 0)
1630 return log_error_errno(r, "Failed to append message arguments: %m");
1631 }
1632 }
1633
1634 if (arg_kill_signal != 0) {
1635 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
1636 if (r < 0)
1637 return bus_log_create_error(r);
1638
1639 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
1640 if (r < 0)
1641 return bus_log_create_error(r);
1642 }
1643
1644 STRV_FOREACH(i, arg_property) {
1645 r = sd_bus_message_open_container(m, 'r', "sv");
1646 if (r < 0)
1647 return bus_log_create_error(r);
1648
1649 r = bus_append_unit_property_assignment(m, *i);
1650 if (r < 0)
1651 return r;
1652
1653 r = sd_bus_message_close_container(m);
1654 if (r < 0)
1655 return bus_log_create_error(r);
1656 }
1657
1658 r = sd_bus_message_close_container(m);
1659 if (r < 0)
1660 return bus_log_create_error(r);
1661
1662 r = sd_bus_call(bus, m, 0, &error, NULL);
1663 }
1664
1665 if (r < 0) {
1666 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1667 return r;
1668 }
1669
1670 return 0;
1671 }
1672
1673 static int terminate_machine(pid_t pid) {
1674 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1675 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1676 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
1677 const char *path;
1678 int r;
1679
1680 if (!arg_register)
1681 return 0;
1682
1683 /* If we are reusing the unit, then just exit, systemd will do
1684 * the right thing when we exit. */
1685 if (arg_keep_unit)
1686 return 0;
1687
1688 r = sd_bus_default_system(&bus);
1689 if (r < 0)
1690 return log_error_errno(r, "Failed to open system bus: %m");
1691
1692 r = sd_bus_call_method(
1693 bus,
1694 "org.freedesktop.machine1",
1695 "/org/freedesktop/machine1",
1696 "org.freedesktop.machine1.Manager",
1697 "GetMachineByPID",
1698 &error,
1699 &reply,
1700 "u",
1701 (uint32_t) pid);
1702 if (r < 0) {
1703 /* Note that the machine might already have been
1704 * cleaned up automatically, hence don't consider it a
1705 * failure if we cannot get the machine object. */
1706 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1707 return 0;
1708 }
1709
1710 r = sd_bus_message_read(reply, "o", &path);
1711 if (r < 0)
1712 return bus_log_parse_error(r);
1713
1714 r = sd_bus_call_method(
1715 bus,
1716 "org.freedesktop.machine1",
1717 path,
1718 "org.freedesktop.machine1.Machine",
1719 "Terminate",
1720 &error,
1721 NULL,
1722 NULL);
1723 if (r < 0) {
1724 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1725 return 0;
1726 }
1727
1728 return 0;
1729 }
1730
1731 static int reset_audit_loginuid(void) {
1732 _cleanup_free_ char *p = NULL;
1733 int r;
1734
1735 if (arg_share_system)
1736 return 0;
1737
1738 r = read_one_line_file("/proc/self/loginuid", &p);
1739 if (r == -ENOENT)
1740 return 0;
1741 if (r < 0)
1742 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1743
1744 /* Already reset? */
1745 if (streq(p, "4294967295"))
1746 return 0;
1747
1748 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1749 if (r < 0) {
1750 log_error_errno(r,
1751 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1752 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1753 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1754 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1755 "using systemd-nspawn. Sleeping for 5s... (%m)");
1756
1757 sleep(5);
1758 }
1759
1760 return 0;
1761 }
1762
/* Installs a seccomp filter for the container payload, if compiled with
 * HAVE_SECCOMP (otherwise this is a NOP returning 0). The filter allows
 * everything by default, and:
 *
 *   - makes the system calls listed in "blacklist" fail with EPERM, unless
 *     the capability listed next to them is retained (arg_retain);
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success (or if loading the filter fails with EINVAL), a
 * negative errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {
                /* Calls guarded by a retained capability stay available */
                if ((arg_retain & (1ULL << blacklist[n].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[n].syscall_num, 0);
                /* -EFAULT means the syscall is unknown, which is fine */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, much of the userspace audit
         * hookup will fail if running inside a container. We don't care
         * and just turn off creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1857
1858 static int setup_propagate(const char *root) {
1859 const char *p, *q;
1860
1861 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1862 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1863 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1864 (void) mkdir_p(p, 0600);
1865
1866 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1867 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1868
1869 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1870 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1871
1872 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1873 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1874
1875 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1876 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1877 return log_error_errno(errno, "Failed to install propagation bind mount.");
1878
1879 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1880 return log_error_errno(errno, "Failed to make propagation mount read-only");
1881
1882 return 0;
1883 }
1884
1885 static int setup_image(char **device_path, int *loop_nr) {
1886 struct loop_info64 info = {
1887 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1888 };
1889 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1890 _cleanup_free_ char* loopdev = NULL;
1891 struct stat st;
1892 int r, nr;
1893
1894 assert(device_path);
1895 assert(loop_nr);
1896 assert(arg_image);
1897
1898 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1899 if (fd < 0)
1900 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1901
1902 if (fstat(fd, &st) < 0)
1903 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1904
1905 if (S_ISBLK(st.st_mode)) {
1906 char *p;
1907
1908 p = strdup(arg_image);
1909 if (!p)
1910 return log_oom();
1911
1912 *device_path = p;
1913
1914 *loop_nr = -1;
1915
1916 r = fd;
1917 fd = -1;
1918
1919 return r;
1920 }
1921
1922 if (!S_ISREG(st.st_mode)) {
1923 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1924 return -EINVAL;
1925 }
1926
1927 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1928 if (control < 0)
1929 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1930
1931 nr = ioctl(control, LOOP_CTL_GET_FREE);
1932 if (nr < 0)
1933 return log_error_errno(errno, "Failed to allocate loop device: %m");
1934
1935 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1936 return log_oom();
1937
1938 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1939 if (loop < 0)
1940 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1941
1942 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1943 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1944
1945 if (arg_read_only)
1946 info.lo_flags |= LO_FLAGS_READ_ONLY;
1947
1948 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1949 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1950
1951 *device_path = loopdev;
1952 loopdev = NULL;
1953
1954 *loop_nr = nr;
1955
1956 r = loop;
1957 loop = -1;
1958
1959 return r;
1960 }
1961
/* Explanatory text appended to the error messages logged when no usable
 * partition table or root partition can be identified in the disk image. It
 * lists the image layouts that can be booted. */
1962 #define PARTITION_TABLE_BLURB \
1963 "Note that the disk image needs to either contain only a single MBR partition of\n" \
1964 "type 0x83 that is marked bootable, or a single GPT partition of type " \
1965 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
1966 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
1967 "to be bootable with systemd-nspawn."
1968
1969 static int dissect_image(
1970 int fd,
1971 char **root_device, bool *root_device_rw,
1972 char **home_device, bool *home_device_rw,
1973 char **srv_device, bool *srv_device_rw,
1974 bool *secondary) {
1975
1976 #ifdef HAVE_BLKID
1977 int home_nr = -1, srv_nr = -1;
1978 #ifdef GPT_ROOT_NATIVE
1979 int root_nr = -1;
1980 #endif
1981 #ifdef GPT_ROOT_SECONDARY
1982 int secondary_root_nr = -1;
1983 #endif
1984 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1985 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1986 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1987 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1988 _cleanup_udev_unref_ struct udev *udev = NULL;
1989 struct udev_list_entry *first, *item;
1990 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
1991 bool is_gpt, is_mbr, multiple_generic = false;
1992 const char *pttype = NULL;
1993 blkid_partlist pl;
1994 struct stat st;
1995 unsigned i;
1996 int r;
1997
1998 assert(fd >= 0);
1999 assert(root_device);
2000 assert(home_device);
2001 assert(srv_device);
2002 assert(secondary);
2003 assert(arg_image);
2004
2005 b = blkid_new_probe();
2006 if (!b)
2007 return log_oom();
2008
2009 errno = 0;
2010 r = blkid_probe_set_device(b, fd, 0, 0);
2011 if (r != 0) {
2012 if (errno == 0)
2013 return log_oom();
2014
2015 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2016 return -errno;
2017 }
2018
2019 blkid_probe_enable_partitions(b, 1);
2020 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2021
2022 errno = 0;
2023 r = blkid_do_safeprobe(b);
2024 if (r == -2 || r == 1) {
2025 log_error("Failed to identify any partition table on\n"
2026 " %s\n"
2027 PARTITION_TABLE_BLURB, arg_image);
2028 return -EINVAL;
2029 } else if (r != 0) {
2030 if (errno == 0)
2031 errno = EIO;
2032 log_error_errno(errno, "Failed to probe: %m");
2033 return -errno;
2034 }
2035
2036 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2037
2038 is_gpt = streq_ptr(pttype, "gpt");
2039 is_mbr = streq_ptr(pttype, "dos");
2040
2041 if (!is_gpt && !is_mbr) {
2042 log_error("No GPT or MBR partition table discovered on\n"
2043 " %s\n"
2044 PARTITION_TABLE_BLURB, arg_image);
2045 return -EINVAL;
2046 }
2047
2048 errno = 0;
2049 pl = blkid_probe_get_partitions(b);
2050 if (!pl) {
2051 if (errno == 0)
2052 return log_oom();
2053
2054 log_error("Failed to list partitions of %s", arg_image);
2055 return -errno;
2056 }
2057
2058 udev = udev_new();
2059 if (!udev)
2060 return log_oom();
2061
2062 if (fstat(fd, &st) < 0)
2063 return log_error_errno(errno, "Failed to stat block device: %m");
2064
2065 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2066 if (!d)
2067 return log_oom();
2068
2069 for (i = 0;; i++) {
2070 int n, m;
2071
2072 if (i >= 10) {
2073 log_error("Kernel partitions never appeared.");
2074 return -ENXIO;
2075 }
2076
2077 e = udev_enumerate_new(udev);
2078 if (!e)
2079 return log_oom();
2080
2081 r = udev_enumerate_add_match_parent(e, d);
2082 if (r < 0)
2083 return log_oom();
2084
2085 r = udev_enumerate_scan_devices(e);
2086 if (r < 0)
2087 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2088
2089 /* Count the partitions enumerated by the kernel */
2090 n = 0;
2091 first = udev_enumerate_get_list_entry(e);
2092 udev_list_entry_foreach(item, first)
2093 n++;
2094
2095 /* Count the partitions enumerated by blkid */
2096 m = blkid_partlist_numof_partitions(pl);
2097 if (n == m + 1)
2098 break;
2099 if (n > m + 1) {
2100 log_error("blkid and kernel partition list do not match.");
2101 return -EIO;
2102 }
2103 if (n < m + 1) {
2104 unsigned j;
2105
2106 /* The kernel has probed fewer partitions than
2107 * blkid? Maybe the kernel prober is still
2108 * running or it got EBUSY because udev
2109 * already opened the device. Let's reprobe
2110 * the device, which is a synchronous call
2111 * that waits until probing is complete. */
2112
2113 for (j = 0; j < 20; j++) {
2114
2115 r = ioctl(fd, BLKRRPART, 0);
2116 if (r < 0)
2117 r = -errno;
2118 if (r >= 0 || r != -EBUSY)
2119 break;
2120
2121 /* If something else has the device
2122 * open, such as an udev rule, the
2123 * ioctl will return EBUSY. Since
2124 * there's no way to wait until it
2125 * isn't busy anymore, let's just wait
2126 * a bit, and try again.
2127 *
2128 * This is really something they
2129 * should fix in the kernel! */
2130
2131 usleep(50 * USEC_PER_MSEC);
2132 }
2133
2134 if (r < 0)
2135 return log_error_errno(r, "Failed to reread partition table: %m");
2136 }
2137
2138 e = udev_enumerate_unref(e);
2139 }
2140
2141 first = udev_enumerate_get_list_entry(e);
2142 udev_list_entry_foreach(item, first) {
2143 _cleanup_udev_device_unref_ struct udev_device *q;
2144 const char *node;
2145 unsigned long long flags;
2146 blkid_partition pp;
2147 dev_t qn;
2148 int nr;
2149
2150 errno = 0;
2151 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2152 if (!q) {
2153 if (!errno)
2154 errno = ENOMEM;
2155
2156 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2157 return -errno;
2158 }
2159
2160 qn = udev_device_get_devnum(q);
2161 if (major(qn) == 0)
2162 continue;
2163
2164 if (st.st_rdev == qn)
2165 continue;
2166
2167 node = udev_device_get_devnode(q);
2168 if (!node)
2169 continue;
2170
2171 pp = blkid_partlist_devno_to_partition(pl, qn);
2172 if (!pp)
2173 continue;
2174
2175 flags = blkid_partition_get_flags(pp);
2176
2177 nr = blkid_partition_get_partno(pp);
2178 if (nr < 0)
2179 continue;
2180
2181 if (is_gpt) {
2182 sd_id128_t type_id;
2183 const char *stype;
2184
2185 if (flags & GPT_FLAG_NO_AUTO)
2186 continue;
2187
2188 stype = blkid_partition_get_type_string(pp);
2189 if (!stype)
2190 continue;
2191
2192 if (sd_id128_from_string(stype, &type_id) < 0)
2193 continue;
2194
2195 if (sd_id128_equal(type_id, GPT_HOME)) {
2196
2197 if (home && nr >= home_nr)
2198 continue;
2199
2200 home_nr = nr;
2201 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2202
2203 r = free_and_strdup(&home, node);
2204 if (r < 0)
2205 return log_oom();
2206
2207 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2208
2209 if (srv && nr >= srv_nr)
2210 continue;
2211
2212 srv_nr = nr;
2213 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2214
2215 r = free_and_strdup(&srv, node);
2216 if (r < 0)
2217 return log_oom();
2218 }
2219 #ifdef GPT_ROOT_NATIVE
2220 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2221
2222 if (root && nr >= root_nr)
2223 continue;
2224
2225 root_nr = nr;
2226 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2227
2228 r = free_and_strdup(&root, node);
2229 if (r < 0)
2230 return log_oom();
2231 }
2232 #endif
2233 #ifdef GPT_ROOT_SECONDARY
2234 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2235
2236 if (secondary_root && nr >= secondary_root_nr)
2237 continue;
2238
2239 secondary_root_nr = nr;
2240 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2241
2242 r = free_and_strdup(&secondary_root, node);
2243 if (r < 0)
2244 return log_oom();
2245 }
2246 #endif
2247 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
2248
2249 if (generic)
2250 multiple_generic = true;
2251 else {
2252 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
2253
2254 r = free_and_strdup(&generic, node);
2255 if (r < 0)
2256 return log_oom();
2257 }
2258 }
2259
2260 } else if (is_mbr) {
2261 int type;
2262
2263 if (flags != 0x80) /* Bootable flag */
2264 continue;
2265
2266 type = blkid_partition_get_type(pp);
2267 if (type != 0x83) /* Linux partition */
2268 continue;
2269
2270 if (generic)
2271 multiple_generic = true;
2272 else {
2273 generic_rw = true;
2274
2275 r = free_and_strdup(&root, node);
2276 if (r < 0)
2277 return log_oom();
2278 }
2279 }
2280 }
2281
2282 if (root) {
2283 *root_device = root;
2284 root = NULL;
2285
2286 *root_device_rw = root_rw;
2287 *secondary = false;
2288 } else if (secondary_root) {
2289 *root_device = secondary_root;
2290 secondary_root = NULL;
2291
2292 *root_device_rw = secondary_root_rw;
2293 *secondary = true;
2294 } else if (generic) {
2295
2296 /* There were no partitions with precise meanings
2297 * around, but we found generic partitions. In this
2298 * case, if there's only one, we can go ahead and boot
2299 * it, otherwise we bail out, because we really cannot
2300 * make any sense of it. */
2301
2302 if (multiple_generic) {
2303 log_error("Identified multiple bootable Linux partitions on\n"
2304 " %s\n"
2305 PARTITION_TABLE_BLURB, arg_image);
2306 return -EINVAL;
2307 }
2308
2309 *root_device = generic;
2310 generic = NULL;
2311
2312 *root_device_rw = generic_rw;
2313 *secondary = false;
2314 } else {
2315 log_error("Failed to identify root partition in disk image\n"
2316 " %s\n"
2317 PARTITION_TABLE_BLURB, arg_image);
2318 return -EINVAL;
2319 }
2320
2321 if (home) {
2322 *home_device = home;
2323 home = NULL;
2324
2325 *home_device_rw = home_rw;
2326 }
2327
2328 if (srv) {
2329 *srv_device = srv;
2330 srv = NULL;
2331
2332 *srv_device_rw = srv_rw;
2333 }
2334
2335 return 0;
2336 #else
2337 log_error("--image= is not supported, compiled without blkid support.");
2338 return -EOPNOTSUPP;
2339 #endif
2340 }
2341
/* Probes the block device "what" for its file system type and mounts it.
 *
 * what:      path of the block device to mount
 * where:     base directory of the mount point
 * directory: optional suffix appended to "where" (may be NULL)
 * rw:        request a writable mount; overridden by arg_read_only
 *
 * Returns 0 on success, a negative errno-style value on failure. LUKS volumes
 * and builds without blkid support yield -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* errno is cleared before each blkid call so that a failure which
         * leaves it untouched can be recognized afterwards. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -1 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the return value before logging, so that it does not
                 * depend on the logger leaving errno alone. */
                r = errno != 0 ? -errno : -EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2405
2406 static int mount_devices(
2407 const char *where,
2408 const char *root_device, bool root_device_rw,
2409 const char *home_device, bool home_device_rw,
2410 const char *srv_device, bool srv_device_rw) {
2411 int r;
2412
2413 assert(where);
2414
2415 if (root_device) {
2416 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2417 if (r < 0)
2418 return log_error_errno(r, "Failed to mount root directory: %m");
2419 }
2420
2421 if (home_device) {
2422 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2423 if (r < 0)
2424 return log_error_errno(r, "Failed to mount home directory: %m");
2425 }
2426
2427 if (srv_device) {
2428 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2429 if (r < 0)
2430 return log_error_errno(r, "Failed to mount server data directory: %m");
2431 }
2432
2433 return 0;
2434 }
2435
2436 static void loop_remove(int nr, int *image_fd) {
2437 _cleanup_close_ int control = -1;
2438 int r;
2439
2440 if (nr < 0)
2441 return;
2442
2443 if (image_fd && *image_fd >= 0) {
2444 r = ioctl(*image_fd, LOOP_CLR_FD);
2445 if (r < 0)
2446 log_debug_errno(errno, "Failed to close loop image: %m");
2447 *image_fd = safe_close(*image_fd);
2448 }
2449
2450 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2451 if (control < 0) {
2452 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2453 return;
2454 }
2455
2456 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2457 if (r < 0)
2458 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2459 }
2460
2461 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2462 int pipe_fds[2];
2463 pid_t pid;
2464
2465 assert(database);
2466 assert(key);
2467 assert(rpid);
2468
2469 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2470 return log_error_errno(errno, "Failed to allocate pipe: %m");
2471
2472 pid = fork();
2473 if (pid < 0)
2474 return log_error_errno(errno, "Failed to fork getent child: %m");
2475 else if (pid == 0) {
2476 int nullfd;
2477 char *empty_env = NULL;
2478
2479 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2480 _exit(EXIT_FAILURE);
2481
2482 if (pipe_fds[0] > 2)
2483 safe_close(pipe_fds[0]);
2484 if (pipe_fds[1] > 2)
2485 safe_close(pipe_fds[1]);
2486
2487 nullfd = open("/dev/null", O_RDWR);
2488 if (nullfd < 0)
2489 _exit(EXIT_FAILURE);
2490
2491 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2492 _exit(EXIT_FAILURE);
2493
2494 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2495 _exit(EXIT_FAILURE);
2496
2497 if (nullfd > 2)
2498 safe_close(nullfd);
2499
2500 (void) reset_all_signal_handlers();
2501 (void) reset_signal_mask();
2502 close_all_fds(NULL, 0);
2503
2504 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2505 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2506 _exit(EXIT_FAILURE);
2507 }
2508
2509 pipe_fds[1] = safe_close(pipe_fds[1]);
2510
2511 *rpid = pid;
2512
2513 return pipe_fds[0];
2514 }
2515
2516 static int change_uid_gid(char **_home) {
2517 char line[LINE_MAX], *x, *u, *g, *h;
2518 const char *word, *state;
2519 _cleanup_free_ uid_t *uids = NULL;
2520 _cleanup_free_ char *home = NULL;
2521 _cleanup_fclose_ FILE *f = NULL;
2522 _cleanup_close_ int fd = -1;
2523 unsigned n_uids = 0;
2524 size_t sz = 0, l;
2525 uid_t uid;
2526 gid_t gid;
2527 pid_t pid;
2528 int r;
2529
2530 assert(_home);
2531
2532 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2533 /* Reset everything fully to 0, just in case */
2534
2535 r = reset_uid_gid();
2536 if (r < 0)
2537 return log_error_errno(r, "Failed to become root: %m");
2538
2539 *_home = NULL;
2540 return 0;
2541 }
2542
2543 /* First, get user credentials */
2544 fd = spawn_getent("passwd", arg_user, &pid);
2545 if (fd < 0)
2546 return fd;
2547
2548 f = fdopen(fd, "r");
2549 if (!f)
2550 return log_oom();
2551 fd = -1;
2552
2553 if (!fgets(line, sizeof(line), f)) {
2554
2555 if (!ferror(f)) {
2556 log_error("Failed to resolve user %s.", arg_user);
2557 return -ESRCH;
2558 }
2559
2560 log_error_errno(errno, "Failed to read from getent: %m");
2561 return -errno;
2562 }
2563
2564 truncate_nl(line);
2565
2566 wait_for_terminate_and_warn("getent passwd", pid, true);
2567
2568 x = strchr(line, ':');
2569 if (!x) {
2570 log_error("/etc/passwd entry has invalid user field.");
2571 return -EIO;
2572 }
2573
2574 u = strchr(x+1, ':');
2575 if (!u) {
2576 log_error("/etc/passwd entry has invalid password field.");
2577 return -EIO;
2578 }
2579
2580 u++;
2581 g = strchr(u, ':');
2582 if (!g) {
2583 log_error("/etc/passwd entry has invalid UID field.");
2584 return -EIO;
2585 }
2586
2587 *g = 0;
2588 g++;
2589 x = strchr(g, ':');
2590 if (!x) {
2591 log_error("/etc/passwd entry has invalid GID field.");
2592 return -EIO;
2593 }
2594
2595 *x = 0;
2596 h = strchr(x+1, ':');
2597 if (!h) {
2598 log_error("/etc/passwd entry has invalid GECOS field.");
2599 return -EIO;
2600 }
2601
2602 h++;
2603 x = strchr(h, ':');
2604 if (!x) {
2605 log_error("/etc/passwd entry has invalid home directory field.");
2606 return -EIO;
2607 }
2608
2609 *x = 0;
2610
2611 r = parse_uid(u, &uid);
2612 if (r < 0) {
2613 log_error("Failed to parse UID of user.");
2614 return -EIO;
2615 }
2616
2617 r = parse_gid(g, &gid);
2618 if (r < 0) {
2619 log_error("Failed to parse GID of user.");
2620 return -EIO;
2621 }
2622
2623 home = strdup(h);
2624 if (!home)
2625 return log_oom();
2626
2627 /* Second, get group memberships */
2628 fd = spawn_getent("initgroups", arg_user, &pid);
2629 if (fd < 0)
2630 return fd;
2631
2632 fclose(f);
2633 f = fdopen(fd, "r");
2634 if (!f)
2635 return log_oom();
2636 fd = -1;
2637
2638 if (!fgets(line, sizeof(line), f)) {
2639 if (!ferror(f)) {
2640 log_error("Failed to resolve user %s.", arg_user);
2641 return -ESRCH;
2642 }
2643
2644 log_error_errno(errno, "Failed to read from getent: %m");
2645 return -errno;
2646 }
2647
2648 truncate_nl(line);
2649
2650 wait_for_terminate_and_warn("getent initgroups", pid, true);
2651
2652 /* Skip over the username and subsequent separator whitespace */
2653 x = line;
2654 x += strcspn(x, WHITESPACE);
2655 x += strspn(x, WHITESPACE);
2656
2657 FOREACH_WORD(word, l, x, state) {
2658 char c[l+1];
2659
2660 memcpy(c, word, l);
2661 c[l] = 0;
2662
2663 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2664 return log_oom();
2665
2666 r = parse_uid(c, &uids[n_uids++]);
2667 if (r < 0) {
2668 log_error("Failed to parse group data from getent.");
2669 return -EIO;
2670 }
2671 }
2672
2673 r = mkdir_parents(home, 0775);
2674 if (r < 0)
2675 return log_error_errno(r, "Failed to make home root directory: %m");
2676
2677 r = mkdir_safe(home, 0755, uid, gid);
2678 if (r < 0 && r != -EEXIST)
2679 return log_error_errno(r, "Failed to make home directory: %m");
2680
2681 (void) fchown(STDIN_FILENO, uid, gid);
2682 (void) fchown(STDOUT_FILENO, uid, gid);
2683 (void) fchown(STDERR_FILENO, uid, gid);
2684
2685 if (setgroups(n_uids, uids) < 0)
2686 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2687
2688 if (setresgid(gid, gid, gid) < 0)
2689 return log_error_errno(errno, "setregid() failed: %m");
2690
2691 if (setresuid(uid, uid, uid) < 0)
2692 return log_error_errno(errno, "setreuid() failed: %m");
2693
2694 if (_home) {
2695 *_home = home;
2696 home = NULL;
2697 }
2698
2699 return 0;
2700 }
2701
2702 /*
2703 * Return values:
2704 * < 0 : wait_for_terminate() failed to get the state of the
2705 * container, the container was terminated by a signal, or
2706 * failed for an unknown reason. No change is made to the
2707 * container argument.
2708 * > 0 : The program executed in the container terminated with an
2709 * error. The exit code of the program executed in the
2710 * container is returned. The container argument has been set
2711 * to CONTAINER_TERMINATED.
2712 * 0 : The container is being rebooted, has been shut down or exited
2713 * successfully. The container argument has been set to either
2714 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2715 *
2716 * That is, success is indicated by a return value of zero, and an
2717 * error is indicated by a non-zero value.
2718 */
2719 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2720 siginfo_t status;
2721 int r;
2722
2723 r = wait_for_terminate(pid, &status);
2724 if (r < 0)
2725 return log_warning_errno(r, "Failed to wait for container: %m");
2726
2727 switch (status.si_code) {
2728
2729 case CLD_EXITED:
2730 if (status.si_status == 0) {
2731 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2732
2733 } else
2734 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2735
2736 *container = CONTAINER_TERMINATED;
2737 return status.si_status;
2738
2739 case CLD_KILLED:
2740 if (status.si_status == SIGINT) {
2741
2742 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2743 *container = CONTAINER_TERMINATED;
2744 return 0;
2745
2746 } else if (status.si_status == SIGHUP) {
2747
2748 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2749 *container = CONTAINER_REBOOTED;
2750 return 0;
2751 }
2752
2753 /* CLD_KILLED fallthrough */
2754
2755 case CLD_DUMPED:
2756 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2757 return -EIO;
2758
2759 default:
2760 log_error("Container %s failed due to unknown reason.", arg_machine);
2761 return -EIO;
2762 }
2763
2764 return r;
2765 }
2766
/* Handler that deliberately does nothing (presumably installed as a signal
 * handler elsewhere in this file -- verify against the caller). */
static void nop_handler(int sig) {
        (void) sig;
}
2768
2769 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2770 pid_t pid;
2771
2772 pid = PTR_TO_UINT32(userdata);
2773 if (pid > 0) {
2774 if (kill(pid, arg_kill_signal) >= 0) {
2775 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2776 sd_event_source_set_userdata(s, NULL);
2777 return 0;
2778 }
2779 }
2780
2781 sd_event_exit(sd_event_source_get_event(s), 0);
2782 return 0;
2783 }
2784
2785 static int determine_names(void) {
2786 int r;
2787
2788 if (arg_template && !arg_directory && arg_machine) {
2789
2790 /* If --template= was specified then we should not
2791 * search for a machine, but instead create a new one
2792 * in /var/lib/machine. */
2793
2794 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2795 if (!arg_directory)
2796 return log_oom();
2797 }
2798
2799 if (!arg_image && !arg_directory) {
2800 if (arg_machine) {
2801 _cleanup_(image_unrefp) Image *i = NULL;
2802
2803 r = image_find(arg_machine, &i);
2804 if (r < 0)
2805 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2806 else if (r == 0) {
2807 log_error("No image for machine '%s': %m", arg_machine);
2808 return -ENOENT;
2809 }
2810
2811 if (i->type == IMAGE_RAW)
2812 r = set_sanitized_path(&arg_image, i->path);
2813 else
2814 r = set_sanitized_path(&arg_directory, i->path);
2815 if (r < 0)
2816 return log_error_errno(r, "Invalid image directory: %m");
2817
2818 if (!arg_ephemeral)
2819 arg_read_only = arg_read_only || i->read_only;
2820 } else
2821 arg_directory = get_current_dir_name();
2822
2823 if (!arg_directory && !arg_machine) {
2824 log_error("Failed to determine path, please use -D or -i.");
2825 return -EINVAL;
2826 }
2827 }
2828
2829 if (!arg_machine) {
2830 if (arg_directory && path_equal(arg_directory, "/"))
2831 arg_machine = gethostname_malloc();
2832 else
2833 arg_machine = strdup(basename(arg_image ?: arg_directory));
2834
2835 if (!arg_machine)
2836 return log_oom();
2837
2838 hostname_cleanup(arg_machine);
2839 if (!machine_name_is_valid(arg_machine)) {
2840 log_error("Failed to determine machine name automatically, please use -M.");
2841 return -EINVAL;
2842 }
2843
2844 if (arg_ephemeral) {
2845 char *b;
2846
2847 /* Add a random suffix when this is an
2848 * ephemeral machine, so that we can run many
2849 * instances at once without manually having
2850 * to specify -M each time. */
2851
2852 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2853 return log_oom();
2854
2855 free(arg_machine);
2856 arg_machine = b;
2857 }
2858 }
2859
2860 return 0;
2861 }
2862
2863 static int determine_uid_shift(const char *directory) {
2864 int r;
2865
2866 if (!arg_userns) {
2867 arg_uid_shift = 0;
2868 return 0;
2869 }
2870
2871 if (arg_uid_shift == UID_INVALID) {
2872 struct stat st;
2873
2874 r = stat(directory, &st);
2875 if (r < 0)
2876 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2877
2878 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2879
2880 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2881 log_error("UID and GID base of %s don't match.", directory);
2882 return -EINVAL;
2883 }
2884
2885 arg_uid_range = UINT32_C(0x10000);
2886 }
2887
2888 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2889 log_error("UID base too high for UID range.");
2890 return -EINVAL;
2891 }
2892
2893 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2894 return 0;
2895 }
2896
2897 static int inner_child(
2898 Barrier *barrier,
2899 const char *directory,
2900 bool secondary,
2901 int kmsg_socket,
2902 int rtnl_socket,
2903 FDSet *fds) {
2904
2905 _cleanup_free_ char *home = NULL;
2906 unsigned n_env = 2;
2907 const char *envp[] = {
2908 "PATH=" DEFAULT_PATH_SPLIT_USR,
2909 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2910 NULL, /* TERM */
2911 NULL, /* HOME */
2912 NULL, /* USER */
2913 NULL, /* LOGNAME */
2914 NULL, /* container_uuid */
2915 NULL, /* LISTEN_FDS */
2916 NULL, /* LISTEN_PID */
2917 NULL
2918 };
2919
2920 _cleanup_strv_free_ char **env_use = NULL;
2921 int r;
2922
2923 assert(barrier);
2924 assert(directory);
2925 assert(kmsg_socket >= 0);
2926
2927 cg_unified_flush();
2928
2929 if (arg_userns) {
2930 /* Tell the parent, that it now can write the UID map. */
2931 (void) barrier_place(barrier); /* #1 */
2932
2933 /* Wait until the parent wrote the UID map */
2934 if (!barrier_place_and_sync(barrier)) { /* #2 */
2935 log_error("Parent died too early");
2936 return -ESRCH;
2937 }
2938 }
2939
2940 r = mount_all(NULL, true, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2941 if (r < 0)
2942 return r;
2943
2944 /* Wait until we are cgroup-ified, so that we
2945 * can mount the right cgroup path writable */
2946 if (!barrier_place_and_sync(barrier)) { /* #3 */
2947 log_error("Parent died too early");
2948 return -ESRCH;
2949 }
2950
2951 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2952 if (r < 0)
2953 return r;
2954
2955 r = reset_uid_gid();
2956 if (r < 0)
2957 return log_error_errno(r, "Couldn't become new root: %m");
2958
2959 r = setup_boot_id(NULL);
2960 if (r < 0)
2961 return r;
2962
2963 r = setup_kmsg(NULL, kmsg_socket);
2964 if (r < 0)
2965 return r;
2966 kmsg_socket = safe_close(kmsg_socket);
2967
2968 umask(0022);
2969
2970 if (setsid() < 0)
2971 return log_error_errno(errno, "setsid() failed: %m");
2972
2973 if (arg_private_network)
2974 loopback_setup();
2975
2976 if (arg_expose_ports) {
2977 r = expose_port_send_rtnl(rtnl_socket);
2978 if (r < 0)
2979 return r;
2980 rtnl_socket = safe_close(rtnl_socket);
2981 }
2982
2983 if (drop_capabilities() < 0)
2984 return log_error_errno(errno, "drop_capabilities() failed: %m");
2985
2986 setup_hostname();
2987
2988 if (arg_personality != PERSONALITY_INVALID) {
2989 if (personality(arg_personality) < 0)
2990 return log_error_errno(errno, "personality() failed: %m");
2991 } else if (secondary) {
2992 if (personality(PER_LINUX32) < 0)
2993 return log_error_errno(errno, "personality() failed: %m");
2994 }
2995
2996 #ifdef HAVE_SELINUX
2997 if (arg_selinux_context)
2998 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2999 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3000 #endif
3001
3002 r = change_uid_gid(&home);
3003 if (r < 0)
3004 return r;
3005
3006 envp[n_env] = strv_find_prefix(environ, "TERM=");
3007 if (envp[n_env])
3008 n_env ++;
3009
3010 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3011 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3012 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
3013 return log_oom();
3014
3015 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3016 char as_uuid[37];
3017
3018 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
3019 return log_oom();
3020 }
3021
3022 if (fdset_size(fds) > 0) {
3023 r = fdset_cloexec(fds, false);
3024 if (r < 0)
3025 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3026
3027 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3028 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3029 return log_oom();
3030 }
3031
3032 env_use = strv_env_merge(2, envp, arg_setenv);
3033 if (!env_use)
3034 return log_oom();
3035
3036 /* Let the parent know that we are ready and
3037 * wait until the parent is ready with the
3038 * setup, too... */
3039 if (!barrier_place_and_sync(barrier)) { /* #4 */
3040 log_error("Parent died too early");
3041 return -ESRCH;
3042 }
3043
3044 /* Now, explicitly close the log, so that we
3045 * then can close all remaining fds. Closing
3046 * the log explicitly first has the benefit
3047 * that the logging subsystem knows about it,
3048 * and is thus ready to be reopened should we
3049 * need it again. Note that the other fds
3050 * closed here are at least the locking and
3051 * barrier fds. */
3052 log_close();
3053 (void) fdset_close_others(fds);
3054
3055 if (arg_boot) {
3056 char **a;
3057 size_t m;
3058
3059 /* Automatically search for the init system */
3060
3061 m = 1 + strv_length(arg_parameters);
3062 a = newa(char*, m + 1);
3063 if (strv_isempty(arg_parameters))
3064 a[1] = NULL;
3065 else
3066 memcpy(a + 1, arg_parameters, m * sizeof(char*));
3067
3068 a[0] = (char*) "/usr/lib/systemd/systemd";
3069 execve(a[0], a, env_use);
3070
3071 a[0] = (char*) "/lib/systemd/systemd";
3072 execve(a[0], a, env_use);
3073
3074 a[0] = (char*) "/sbin/init";
3075 execve(a[0], a, env_use);
3076 } else if (!strv_isempty(arg_parameters))
3077 execvpe(arg_parameters[0], arg_parameters, env_use);
3078 else {
3079 chdir(home ?: "/root");
3080 execle("/bin/bash", "-bash", NULL, env_use);
3081 execle("/bin/sh", "-sh", NULL, env_use);
3082 }
3083
3084 (void) log_open();
3085 return log_error_errno(errno, "execv() failed: %m");
3086 }
3087
3088 static int outer_child(
3089 Barrier *barrier,
3090 const char *directory,
3091 const char *console,
3092 const char *root_device, bool root_device_rw,
3093 const char *home_device, bool home_device_rw,
3094 const char *srv_device, bool srv_device_rw,
3095 bool interactive,
3096 bool secondary,
3097 int pid_socket,
3098 int kmsg_socket,
3099 int rtnl_socket,
3100 int uid_shift_socket,
3101 FDSet *fds) {
3102
3103 pid_t pid;
3104 ssize_t l;
3105 int r;
3106
3107 assert(barrier);
3108 assert(directory);
3109 assert(console);
3110 assert(pid_socket >= 0);
3111 assert(kmsg_socket >= 0);
3112
3113 cg_unified_flush();
3114
3115 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3116 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3117
3118 if (interactive) {
3119 close_nointr(STDIN_FILENO);
3120 close_nointr(STDOUT_FILENO);
3121 close_nointr(STDERR_FILENO);
3122
3123 r = open_terminal(console, O_RDWR);
3124 if (r != STDIN_FILENO) {
3125 if (r >= 0) {
3126 safe_close(r);
3127 r = -EINVAL;
3128 }
3129
3130 return log_error_errno(r, "Failed to open console: %m");
3131 }
3132
3133 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3134 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
3135 return log_error_errno(errno, "Failed to duplicate console: %m");
3136 }
3137
3138 r = reset_audit_loginuid();
3139 if (r < 0)
3140 return r;
3141
3142 /* Mark everything as slave, so that we still
3143 * receive mounts from the real root, but don't
3144 * propagate mounts to the real root. */
3145 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
3146 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3147
3148 r = mount_devices(directory,
3149 root_device, root_device_rw,
3150 home_device, home_device_rw,
3151 srv_device, srv_device_rw);
3152 if (r < 0)
3153 return r;
3154
3155 r = determine_uid_shift(directory);
3156 if (r < 0)
3157 return r;
3158
3159 if (arg_userns) {
3160 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3161 if (l < 0)
3162 return log_error_errno(errno, "Failed to send UID shift: %m");
3163 if (l != sizeof(arg_uid_shift)) {
3164 log_error("Short write while sending UID shift.");
3165 return -EIO;
3166 }
3167 }
3168
3169 /* Turn directory into bind mount */
3170 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
3171 return log_error_errno(errno, "Failed to make bind mount: %m");
3172
3173 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
3174 if (r < 0)
3175 return r;
3176
3177 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
3178 if (r < 0)
3179 return r;
3180
3181 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3182 if (r < 0)
3183 return r;
3184
3185 if (arg_read_only) {
3186 r = bind_remount_recursive(directory, true);
3187 if (r < 0)
3188 return log_error_errno(r, "Failed to make tree read-only: %m");
3189 }
3190
3191 r = mount_all(directory, false, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
3192 if (r < 0)
3193 return r;
3194
3195 if (copy_devnodes(directory) < 0)
3196 return r;
3197
3198 dev_setup(directory, arg_uid_shift, arg_uid_shift);
3199
3200 if (setup_pts(directory) < 0)
3201 return r;
3202
3203 r = setup_propagate(directory);
3204 if (r < 0)
3205 return r;
3206
3207 r = setup_dev_console(directory, console);
3208 if (r < 0)
3209 return r;
3210
3211 r = setup_seccomp();
3212 if (r < 0)
3213 return r;
3214
3215 r = setup_timezone(directory);
3216 if (r < 0)
3217 return r;
3218
3219 r = setup_resolv_conf(directory);
3220 if (r < 0)
3221 return r;
3222
3223 r = setup_journal(directory);
3224 if (r < 0)
3225 return r;
3226
3227 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
3228 if (r < 0)
3229 return r;
3230
3231 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
3232 if (r < 0)
3233 return r;
3234
3235 r = mount_move_root(directory);
3236 if (r < 0)
3237 return log_error_errno(r, "Failed to move root directory: %m");
3238
3239 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3240 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
3241 (arg_private_network ? CLONE_NEWNET : 0) |
3242 (arg_userns ? CLONE_NEWUSER : 0),
3243 NULL);
3244 if (pid < 0)
3245 return log_error_errno(errno, "Failed to fork inner child: %m");
3246 if (pid == 0) {
3247 pid_socket = safe_close(pid_socket);
3248 uid_shift_socket = safe_close(uid_shift_socket);
3249
3250 /* The inner child has all namespaces that are
3251 * requested, so that we all are owned by the user if
3252 * user namespaces are turned on. */
3253
3254 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
3255 if (r < 0)
3256 _exit(EXIT_FAILURE);
3257
3258 _exit(EXIT_SUCCESS);
3259 }
3260
3261 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3262 if (l < 0)
3263 return log_error_errno(errno, "Failed to send PID: %m");
3264 if (l != sizeof(pid)) {
3265 log_error("Short write while sending PID.");
3266 return -EIO;
3267 }
3268
3269 pid_socket = safe_close(pid_socket);
3270
3271 return 0;
3272 }
3273
3274 static int setup_uid_map(pid_t pid) {
3275 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3276 int r;
3277
3278 assert(pid > 1);
3279
3280 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3281 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3282 r = write_string_file(uid_map, line, 0);
3283 if (r < 0)
3284 return log_error_errno(r, "Failed to write UID map: %m");
3285
3286 /* We always assign the same UID and GID ranges */
3287 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3288 r = write_string_file(uid_map, line, 0);
3289 if (r < 0)
3290 return log_error_errno(r, "Failed to write GID map: %m");
3291
3292 return 0;
3293 }
3294
3295 static int chown_cgroup(pid_t pid) {
3296 _cleanup_free_ char *path = NULL, *fs = NULL;
3297 _cleanup_close_ int fd = -1;
3298 const char *fn;
3299 int r;
3300
3301 r = cg_pid_get_path(NULL, pid, &path);
3302 if (r < 0)
3303 return log_error_errno(r, "Failed to get container cgroup path: %m");
3304
3305 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
3306 if (r < 0)
3307 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
3308
3309 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
3310 if (fd < 0)
3311 return log_error_errno(errno, "Failed to open %s: %m", fs);
3312
3313 FOREACH_STRING(fn,
3314 ".",
3315 "tasks",
3316 "notify_on_release",
3317 "cgroup.procs",
3318 "cgroup.clone_children",
3319 "cgroup.controllers",
3320 "cgroup.subtree_control",
3321 "cgroup.populated")
3322 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
3323 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
3324 "Failed to chown() cgroup file %s, ignoring: %m", fn);
3325
3326 return 0;
3327 }
3328
3329 static int sync_cgroup(pid_t pid) {
3330 _cleanup_free_ char *cgroup = NULL;
3331 char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
3332 bool undo_mount = false;
3333 const char *fn;
3334 int unified, r;
3335
3336 unified = cg_unified();
3337 if (unified < 0)
3338 return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
3339
3340 if ((unified > 0) == arg_unified_cgroup_hierarchy)
3341 return 0;
3342
3343 /* When the host uses the legacy cgroup setup, but the
3344 * container shall use the unified hierarchy, let's make sure
3345 * we copy the path from the name=systemd hierarchy into the
3346 * unified hierarchy. Similar for the reverse situation. */
3347
3348 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
3349 if (r < 0)
3350 return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
3351
3352 /* In order to access the unified hierarchy we need to mount it */
3353 if (!mkdtemp(tree))
3354 return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
3355
3356 if (unified)
3357 r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
3358 else
3359 r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
3360 if (r < 0) {
3361 r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
3362 goto finish;
3363 }
3364
3365 undo_mount = true;
3366
3367 fn = strjoina(tree, cgroup, "/cgroup.procs");
3368 (void) mkdir_parents(fn, 0755);
3369
3370 sprintf(pid_string, PID_FMT, pid);
3371 r = write_string_file(fn, pid_string, 0);
3372 if (r < 0)
3373 log_error_errno(r, "Failed to move process: %m");
3374
3375 finish:
3376 if (undo_mount)
3377 (void) umount(tree);
3378
3379 (void) rmdir(tree);
3380 return r;
3381 }
3382
3383 static int create_subcgroup(pid_t pid) {
3384 _cleanup_free_ char *cgroup = NULL;
3385 const char *child;
3386 int unified, r;
3387 CGroupMask supported;
3388
3389 /* In the unified hierarchy inner nodes may only only contain
3390 * subgroups, but not processes. Hence, if we running in the
3391 * unified hierarchy and the container does the same, and we
3392 * did not create a scope unit for the container move us and
3393 * the container into two separate subcgroups. */
3394
3395 if (!arg_keep_unit)
3396 return 0;
3397
3398 if (!arg_unified_cgroup_hierarchy)
3399 return 0;
3400
3401 unified = cg_unified();
3402 if (unified < 0)
3403 return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
3404 if (unified == 0)
3405 return 0;
3406
3407 r = cg_mask_supported(&supported);
3408 if (r < 0)
3409 return log_error_errno(r, "Failed to determine supported controllers: %m");
3410
3411 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
3412 if (r < 0)
3413 return log_error_errno(r, "Failed to get our control group: %m");
3414
3415 child = strjoina(cgroup, "/payload");
3416 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
3417 if (r < 0)
3418 return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
3419
3420 child = strjoina(cgroup, "/supervisor");
3421 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
3422 if (r < 0)
3423 return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
3424
3425 /* Try to enable as many controllers as possible for the new payload. */
3426 (void) cg_enable_everywhere(supported, supported, cgroup);
3427 return 0;
3428 }
3429
3430 static int load_settings(void) {
3431 _cleanup_(settings_freep) Settings *settings = NULL;
3432 _cleanup_fclose_ FILE *f = NULL;
3433 _cleanup_free_ char *p = NULL;
3434 const char *fn, *i;
3435 int r;
3436
3437 /* If all settings are masked, there's no point in looking for
3438 * the settings file */
3439 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3440 return 0;
3441
3442 fn = strjoina(arg_machine, ".nspawn");
3443
3444 /* We first look in the admin's directories in /etc and /run */
3445 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3446 _cleanup_free_ char *j = NULL;
3447
3448 j = strjoin(i, "/", fn, NULL);
3449 if (!j)
3450 return log_oom();
3451
3452 f = fopen(j, "re");
3453 if (f) {
3454 p = j;
3455 j = NULL;
3456
3457 /* By default we trust configuration from /etc and /run */
3458 if (arg_settings_trusted < 0)
3459 arg_settings_trusted = true;
3460
3461 break;
3462 }
3463
3464 if (errno != ENOENT)
3465 return log_error_errno(errno, "Failed to open %s: %m", j);
3466 }
3467
3468 if (!f) {
3469 /* After that, let's look for a file next to the
3470 * actual image we shall boot. */
3471
3472 if (arg_image) {
3473 p = file_in_same_dir(arg_image, fn);
3474 if (!p)
3475 return log_oom();
3476 } else if (arg_directory) {
3477 p = file_in_same_dir(arg_directory, fn);
3478 if (!p)
3479 return log_oom();
3480 }
3481
3482 if (p) {
3483 f = fopen(p, "re");
3484 if (!f && errno != ENOENT)
3485 return log_error_errno(errno, "Failed to open %s: %m", p);
3486
3487 /* By default we do not trust configuration from /var/lib/machines */
3488 if (arg_settings_trusted < 0)
3489 arg_settings_trusted = false;
3490 }
3491 }
3492
3493 if (!f)
3494 return 0;
3495
3496 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3497
3498 r = settings_load(f, p, &settings);
3499 if (r < 0)
3500 return r;
3501
3502 /* Copy over bits from the settings, unless they have been
3503 * explicitly masked by command line switches. */
3504
3505 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
3506 settings->boot >= 0) {
3507 arg_boot = settings->boot;
3508
3509 strv_free(arg_parameters);
3510 arg_parameters = settings->parameters;
3511 settings->parameters = NULL;
3512 }
3513
3514 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3515 settings->environment) {
3516 strv_free(arg_setenv);
3517 arg_setenv = settings->environment;
3518 settings->environment = NULL;
3519 }
3520
3521 if ((arg_settings_mask & SETTING_USER) == 0 &&
3522 settings->user) {
3523 free(arg_user);
3524 arg_user = settings->user;
3525 settings->user = NULL;
3526 }
3527
3528 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3529
3530 if (!arg_settings_trusted && settings->capability != 0)
3531 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3532 else
3533 arg_retain |= settings->capability;
3534
3535 arg_retain &= ~settings->drop_capability;
3536 }
3537
3538 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3539 settings->kill_signal > 0)
3540 arg_kill_signal = settings->kill_signal;
3541
3542 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3543 settings->personality != PERSONALITY_INVALID)
3544 arg_personality = settings->personality;
3545
3546 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3547 !sd_id128_is_null(settings->machine_id)) {
3548
3549 if (!arg_settings_trusted)
3550 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3551 else
3552 arg_uuid = settings->machine_id;
3553 }
3554
3555 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3556 settings->read_only >= 0)
3557 arg_read_only = settings->read_only;
3558
3559 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3560 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3561 arg_volatile_mode = settings->volatile_mode;
3562
3563 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3564 settings->n_custom_mounts > 0) {
3565
3566 if (!arg_settings_trusted)
3567 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3568 else {
3569 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3570 arg_custom_mounts = settings->custom_mounts;
3571 arg_n_custom_mounts = settings->n_custom_mounts;
3572
3573 settings->custom_mounts = NULL;
3574 settings->n_custom_mounts = 0;
3575 }
3576 }
3577
3578 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3579 (settings->private_network >= 0 ||
3580 settings->network_veth >= 0 ||
3581 settings->network_bridge ||
3582 settings->network_interfaces ||
3583 settings->network_macvlan ||
3584 settings->network_ipvlan)) {
3585
3586 if (!arg_settings_trusted)
3587 log_warning("Ignoring network settings, file %s is not trusted.", p);
3588 else {
3589 strv_free(arg_network_interfaces);
3590 arg_network_interfaces = settings->network_interfaces;
3591 settings->network_interfaces = NULL;
3592
3593 strv_free(arg_network_macvlan);
3594 arg_network_macvlan = settings->network_macvlan;
3595 settings->network_macvlan = NULL;
3596
3597 strv_free(arg_network_ipvlan);
3598 arg_network_ipvlan = settings->network_ipvlan;
3599 settings->network_ipvlan = NULL;
3600
3601 free(arg_network_bridge);
3602 arg_network_bridge = settings->network_bridge;
3603 settings->network_bridge = NULL;
3604
3605 arg_network_veth = settings->network_veth > 0 || settings->network_bridge;
3606
3607 arg_private_network = true; /* all these settings imply private networking */
3608 }
3609 }
3610
3611 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3612 settings->expose_ports) {
3613
3614 if (!arg_settings_trusted)
3615 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3616 else {
3617 expose_port_free_all(arg_expose_ports);
3618 arg_expose_ports = settings->expose_ports;
3619 settings->expose_ports = NULL;
3620 }
3621 }
3622
3623 return 0;
3624 }
3625
3626 int main(int argc, char *argv[]) {
3627
3628 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3629 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3630 _cleanup_close_ int master = -1, image_fd = -1;
3631 _cleanup_fdset_free_ FDSet *fds = NULL;
3632 int r, n_fd_passed, loop_nr = -1;
3633 char veth_name[IFNAMSIZ];
3634 bool secondary = false, remove_subvol = false;
3635 sigset_t mask_chld;
3636 pid_t pid = 0;
3637 int ret = EXIT_SUCCESS;
3638 union in_addr_union exposed = {};
3639 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3640 bool interactive;
3641
3642 log_parse_environment();
3643 log_open();
3644
3645 r = parse_argv(argc, argv);
3646 if (r <= 0)
3647 goto finish;
3648
3649 if (geteuid() != 0) {
3650 log_error("Need to be root.");
3651 r = -EPERM;
3652 goto finish;
3653 }
3654 r = determine_names();
3655 if (r < 0)
3656 goto finish;
3657
3658 r = load_settings();
3659 if (r < 0)
3660 goto finish;
3661
3662 r = verify_arguments();
3663 if (r < 0)
3664 goto finish;
3665
3666 n_fd_passed = sd_listen_fds(false);
3667 if (n_fd_passed > 0) {
3668 r = fdset_new_listen_fds(&fds, false);
3669 if (r < 0) {
3670 log_error_errno(r, "Failed to collect file descriptors: %m");
3671 goto finish;
3672 }
3673 }
3674
3675 if (arg_directory) {
3676 assert(!arg_image);
3677
3678 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3679 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3680 r = -EINVAL;
3681 goto finish;
3682 }
3683
3684 if (arg_ephemeral) {
3685 _cleanup_free_ char *np = NULL;
3686
3687 /* If the specified path is a mount point we
3688 * generate the new snapshot immediately
3689 * inside it under a random name. However if
3690 * the specified is not a mount point we
3691 * create the new snapshot in the parent
3692 * directory, just next to it. */
3693 r = path_is_mount_point(arg_directory, 0);
3694 if (r < 0) {
3695 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3696 goto finish;
3697 }
3698 if (r > 0)
3699 r = tempfn_random_child(arg_directory, "machine.", &np);
3700 else
3701 r = tempfn_random(arg_directory, "machine.", &np);
3702 if (r < 0) {
3703 log_error_errno(r, "Failed to generate name for snapshot: %m");
3704 goto finish;
3705 }
3706
3707 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3708 if (r < 0) {
3709 log_error_errno(r, "Failed to lock %s: %m", np);
3710 goto finish;
3711 }
3712
3713 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3714 if (r < 0) {
3715 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3716 goto finish;
3717 }
3718
3719 free(arg_directory);
3720 arg_directory = np;
3721 np = NULL;
3722
3723 remove_subvol = true;
3724
3725 } else {
3726 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3727 if (r == -EBUSY) {
3728 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3729 goto finish;
3730 }
3731 if (r < 0) {
3732 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3733 return r;
3734 }
3735
3736 if (arg_template) {
3737 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3738 if (r == -EEXIST) {
3739 if (!arg_quiet)
3740 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3741 } else if (r < 0) {
3742 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3743 goto finish;
3744 } else {
3745 if (!arg_quiet)
3746 log_info("Populated %s from template %s.", arg_directory, arg_template);
3747 }
3748 }
3749 }
3750
3751 if (arg_boot) {
3752 if (path_is_os_tree(arg_directory) <= 0) {
3753 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3754 r = -EINVAL;
3755 goto finish;
3756 }
3757 } else {
3758 const char *p;
3759
3760 p = strjoina(arg_directory,
3761 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3762 if (access(p, F_OK) < 0) {
3763 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3764 r = -EINVAL;
3765 goto finish;
3766 }
3767 }
3768
3769 } else {
3770 char template[] = "/tmp/nspawn-root-XXXXXX";
3771
3772 assert(arg_image);
3773 assert(!arg_template);
3774
3775 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3776 if (r == -EBUSY) {
3777 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3778 goto finish;
3779 }
3780 if (r < 0) {
3781 r = log_error_errno(r, "Failed to create image lock: %m");
3782 goto finish;
3783 }
3784
3785 if (!mkdtemp(template)) {
3786 log_error_errno(errno, "Failed to create temporary directory: %m");
3787 r = -errno;
3788 goto finish;
3789 }
3790
3791 arg_directory = strdup(template);
3792 if (!arg_directory) {
3793 r = log_oom();
3794 goto finish;
3795 }
3796
3797 image_fd = setup_image(&device_path, &loop_nr);
3798 if (image_fd < 0) {
3799 r = image_fd;
3800 goto finish;
3801 }
3802
3803 r = dissect_image(image_fd,
3804 &root_device, &root_device_rw,
3805 &home_device, &home_device_rw,
3806 &srv_device, &srv_device_rw,
3807 &secondary);
3808 if (r < 0)
3809 goto finish;
3810 }
3811
3812 r = custom_mounts_prepare();
3813 if (r < 0)
3814 goto finish;
3815
3816 interactive =
3817 isatty(STDIN_FILENO) > 0 &&
3818 isatty(STDOUT_FILENO) > 0;
3819
3820 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3821 if (master < 0) {
3822 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3823 goto finish;
3824 }
3825
3826 r = ptsname_malloc(master, &console);
3827 if (r < 0) {
3828 r = log_error_errno(r, "Failed to determine tty name: %m");
3829 goto finish;
3830 }
3831
3832 if (unlockpt(master) < 0) {
3833 r = log_error_errno(errno, "Failed to unlock tty: %m");
3834 goto finish;
3835 }
3836
3837 if (!arg_quiet)
3838 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3839 arg_machine, arg_image ?: arg_directory);
3840
3841 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3842
3843 assert_se(sigemptyset(&mask_chld) == 0);
3844 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3845
3846 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3847 r = log_error_errno(errno, "Failed to become subreaper: %m");
3848 goto finish;
3849 }
3850
3851 for (;;) {
3852 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3853 uid_shift_socket_pair[2] = { -1, -1 };
3854 ContainerStatus container_status;
3855 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3856 static const struct sigaction sa = {
3857 .sa_handler = nop_handler,
3858 .sa_flags = SA_NOCLDSTOP,
3859 };
3860 int ifi = 0;
3861 ssize_t l;
3862 _cleanup_event_unref_ sd_event *event = NULL;
3863 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3864 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3865 char last_char = 0;
3866
3867 r = barrier_create(&barrier);
3868 if (r < 0) {
3869 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3870 goto finish;
3871 }
3872
3873 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3874 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3875 goto finish;
3876 }
3877
3878 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3879 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3880 goto finish;
3881 }
3882
3883 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3884 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3885 goto finish;
3886 }
3887
3888 if (arg_userns)
3889 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3890 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3891 goto finish;
3892 }
3893
3894 /* Child can be killed before execv(), so handle SIGCHLD
3895 * in order to interrupt parent's blocking calls and
3896 * give it a chance to call wait() and terminate. */
3897 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3898 if (r < 0) {
3899 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3900 goto finish;
3901 }
3902
3903 r = sigaction(SIGCHLD, &sa, NULL);
3904 if (r < 0) {
3905 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3906 goto finish;
3907 }
3908
3909 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3910 if (pid < 0) {
3911 if (errno == EINVAL)
3912 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3913 else
3914 r = log_error_errno(errno, "clone() failed: %m");
3915
3916 goto finish;
3917 }
3918
3919 if (pid == 0) {
3920 /* The outer child only has a file system namespace. */
3921 barrier_set_role(&barrier, BARRIER_CHILD);
3922
3923 master = safe_close(master);
3924
3925 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3926 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3927 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3928 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3929
3930 (void) reset_all_signal_handlers();
3931 (void) reset_signal_mask();
3932
3933 r = outer_child(&barrier,
3934 arg_directory,
3935 console,
3936 root_device, root_device_rw,
3937 home_device, home_device_rw,
3938 srv_device, srv_device_rw,
3939 interactive,
3940 secondary,
3941 pid_socket_pair[1],
3942 kmsg_socket_pair[1],
3943 rtnl_socket_pair[1],
3944 uid_shift_socket_pair[1],
3945 fds);
3946 if (r < 0)
3947 _exit(EXIT_FAILURE);
3948
3949 _exit(EXIT_SUCCESS);
3950 }
3951
3952 barrier_set_role(&barrier, BARRIER_PARENT);
3953
3954 fdset_free(fds);
3955 fds = NULL;
3956
3957 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3958 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3959 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3960
3961 /* Wait for the outer child. */
3962 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3963 if (r < 0)
3964 goto finish;
3965 if (r != 0) {
3966 r = -EIO;
3967 goto finish;
3968 }
3969 pid = 0;
3970
3971 /* And now retrieve the PID of the inner child. */
3972 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3973 if (l < 0) {
3974 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3975 goto finish;
3976 }
3977 if (l != sizeof(pid)) {
3978 log_error("Short read while reading inner child PID: %m");
3979 r = EIO;
3980 goto finish;
3981 }
3982
3983 log_debug("Init process invoked as PID " PID_FMT, pid);
3984
3985 if (arg_userns) {
3986 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3987 log_error("Child died too early.");
3988 r = -ESRCH;
3989 goto finish;
3990 }
3991
3992 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3993 if (l < 0) {
3994 r = log_error_errno(errno, "Failed to read UID shift: %m");
3995 goto finish;
3996 }
3997 if (l != sizeof(arg_uid_shift)) {
3998 log_error("Short read while reading UID shift: %m");
3999 r = EIO;
4000 goto finish;
4001 }
4002
4003 r = setup_uid_map(pid);
4004 if (r < 0)
4005 goto finish;
4006
4007 (void) barrier_place(&barrier); /* #2 */
4008 }
4009
4010 if (arg_private_network) {
4011
4012 r = move_network_interfaces(pid, arg_network_interfaces);
4013 if (r < 0)
4014 goto finish;
4015
4016 if (arg_network_veth) {
4017 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
4018 if (r < 0)
4019 goto finish;
4020 else if (r > 0)
4021 ifi = r;
4022
4023 if (arg_network_bridge) {
4024 r = setup_bridge(veth_name, arg_network_bridge);
4025 if (r < 0)
4026 goto finish;
4027 if (r > 0)
4028 ifi = r;
4029 }
4030 }
4031
4032 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
4033 if (r < 0)
4034 goto finish;
4035
4036 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
4037 if (r < 0)
4038 goto finish;
4039 }
4040
4041 r = register_machine(pid, ifi);
4042 if (r < 0)
4043 goto finish;
4044
4045 r = sync_cgroup(pid);
4046 if (r < 0)
4047 goto finish;
4048
4049 r = create_subcgroup(pid);
4050 if (r < 0)
4051 goto finish;
4052
4053 r = chown_cgroup(pid);
4054 if (r < 0)
4055 goto finish;
4056
4057 /* Notify the child that the parent is ready with all
4058 * its setup (including cgroup-ification), and that
4059 * the child can now hand over control to the code to
4060 * run inside the container. */
4061 (void) barrier_place(&barrier); /* #3 */
4062
4063 /* Block SIGCHLD here, before notifying child.
4064 * process_pty() will handle it with the other signals. */
4065 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4066
4067 /* Reset signal to default */
4068 r = default_signals(SIGCHLD, -1);
4069 if (r < 0) {
4070 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4071 goto finish;
4072 }
4073
4074 /* Let the child know that we are ready and wait that the child is completely ready now. */
4075 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4076 log_error("Client died too early.");
4077 r = -ESRCH;
4078 goto finish;
4079 }
4080
4081 sd_notifyf(false,
4082 "READY=1\n"
4083 "STATUS=Container running.\n"
4084 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4085
4086 r = sd_event_new(&event);
4087 if (r < 0) {
4088 log_error_errno(r, "Failed to get default event source: %m");
4089 goto finish;
4090 }
4091
4092 if (arg_kill_signal > 0) {
4093 /* Try to kill the init system on SIGINT or SIGTERM */
4094 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4095 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4096 } else {
4097 /* Immediately exit */
4098 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4099 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4100 }
4101
4102 /* simply exit on sigchld */
4103 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4104
4105 if (arg_expose_ports) {
4106 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
4107 if (r < 0)
4108 goto finish;
4109
4110 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
4111 }
4112
4113 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4114
4115 r = pty_forward_new(event, master, true, !interactive, &forward);
4116 if (r < 0) {
4117 log_error_errno(r, "Failed to create PTY forwarder: %m");
4118 goto finish;
4119 }
4120
4121 r = sd_event_loop(event);
4122 if (r < 0) {
4123 log_error_errno(r, "Failed to run event loop: %m");
4124 goto finish;
4125 }
4126
4127 pty_forward_get_last_char(forward, &last_char);
4128
4129 forward = pty_forward_free(forward);
4130
4131 if (!arg_quiet && last_char != '\n')
4132 putc('\n', stdout);
4133
4134 /* Kill if it is not dead yet anyway */
4135 terminate_machine(pid);
4136
4137 /* Normally redundant, but better safe than sorry */
4138 kill(pid, SIGKILL);
4139
4140 r = wait_for_container(pid, &container_status);
4141 pid = 0;
4142
4143 if (r < 0)
4144 /* We failed to wait for the container, or the
4145 * container exited abnormally */
4146 goto finish;
4147 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4148 /* The container exited with a non-zero
4149 * status, or with zero status and no reboot
4150 * was requested. */
4151 ret = r;
4152 break;
4153 }
4154
4155 /* CONTAINER_REBOOTED, loop again */
4156
4157 if (arg_keep_unit) {
4158 /* Special handling if we are running as a
4159 * service: instead of simply restarting the
4160 * machine we want to restart the entire
4161 * service, so let's inform systemd about this
4162 * with the special exit code 133. The service
4163 * file uses RestartForceExitStatus=133 so
4164 * that this results in a full nspawn
4165 * restart. This is necessary since we might
4166 * have cgroup parameters set we want to have
4167 * flushed out. */
4168 ret = 133;
4169 r = 0;
4170 break;
4171 }
4172
4173 expose_port_flush(arg_expose_ports, &exposed);
4174 }
4175
4176 finish:
4177 sd_notify(false,
4178 "STOPPING=1\n"
4179 "STATUS=Terminating...");
4180
4181 if (pid > 0)
4182 kill(pid, SIGKILL);
4183
4184 /* Try to flush whatever is still queued in the pty */
4185 if (master >= 0)
4186 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
4187
4188 loop_remove(loop_nr, &image_fd);
4189
4190 if (remove_subvol && arg_directory) {
4191 int k;
4192
4193 k = btrfs_subvol_remove(arg_directory, true);
4194 if (k < 0)
4195 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4196 }
4197
4198 if (arg_machine) {
4199 const char *p;
4200
4201 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4202 (void) rm_rf(p, REMOVE_ROOT);
4203 }
4204
4205 expose_port_flush(arg_expose_ports, &exposed);
4206
4207 free(arg_directory);
4208 free(arg_template);
4209 free(arg_image);
4210 free(arg_machine);
4211 free(arg_user);
4212 strv_free(arg_setenv);
4213 free(arg_network_bridge);
4214 strv_free(arg_network_interfaces);
4215 strv_free(arg_network_macvlan);
4216 strv_free(arg_network_ipvlan);
4217 strv_free(arg_parameters);
4218 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4219 expose_port_free_all(arg_expose_ports);
4220
4221 return r < 0 ? EXIT_FAILURE : ret;
4222 }