]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: split out --uid= logic into nspawn-setuid.[ch]
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <sys/personality.h>
38 #include <linux/loop.h>
39 #include <sys/file.h>
40
41 #ifdef HAVE_SELINUX
42 #include <selinux/selinux.h>
43 #endif
44
45 #ifdef HAVE_SECCOMP
46 #include <seccomp.h>
47 #endif
48
49 #ifdef HAVE_BLKID
50 #include <blkid/blkid.h>
51 #endif
52
53 #include "sd-daemon.h"
54 #include "sd-bus.h"
55 #include "sd-id128.h"
56 #include "random-util.h"
57 #include "log.h"
58 #include "util.h"
59 #include "mkdir.h"
60 #include "rm-rf.h"
61 #include "macro.h"
62 #include "missing.h"
63 #include "cgroup-util.h"
64 #include "strv.h"
65 #include "path-util.h"
66 #include "loopback-setup.h"
67 #include "dev-setup.h"
68 #include "fdset.h"
69 #include "build.h"
70 #include "fileio.h"
71 #include "bus-util.h"
72 #include "bus-error.h"
73 #include "ptyfwd.h"
74 #include "env-util.h"
75 #include "netlink-util.h"
76 #include "udev-util.h"
77 #include "blkid-util.h"
78 #include "gpt.h"
79 #include "copy.h"
80 #include "base-filesystem.h"
81 #include "barrier.h"
82 #include "event-util.h"
83 #include "capability.h"
84 #include "cap-list.h"
85 #include "btrfs-util.h"
86 #include "machine-image.h"
87 #include "list.h"
88 #include "in-addr-util.h"
89 #include "formats-util.h"
90 #include "process-util.h"
91 #include "terminal-util.h"
92 #include "hostname-util.h"
93 #include "signal-util.h"
94
95 #ifdef HAVE_SECCOMP
96 #include "seccomp-util.h"
97 #endif
98
99 #include "nspawn.h"
100 #include "nspawn-settings.h"
101 #include "nspawn-mount.h"
102 #include "nspawn-network.h"
103 #include "nspawn-expose-ports.h"
104 #include "nspawn-cgroup.h"
105 #include "nspawn-register.h"
106 #include "nspawn-setuid.h"
107
/* Outcome of a container run.
 * NOTE(review): the code consuming these values is outside this excerpt; the
 * names suggest "container exited" vs. "container asked to be restarted" — confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
112
/* Journal linking mode, as selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default) */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST,     /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
119
120 static char *arg_directory = NULL;
121 static char *arg_template = NULL;
122 static char *arg_user = NULL;
123 static sd_id128_t arg_uuid = {};
124 static char *arg_machine = NULL;
125 static const char *arg_selinux_context = NULL;
126 static const char *arg_selinux_apifs_context = NULL;
127 static const char *arg_slice = NULL;
128 static bool arg_private_network = false;
129 static bool arg_read_only = false;
130 static bool arg_boot = false;
131 static bool arg_ephemeral = false;
132 static LinkJournal arg_link_journal = LINK_AUTO;
133 static bool arg_link_journal_try = false;
134 static uint64_t arg_retain =
135 (1ULL << CAP_CHOWN) |
136 (1ULL << CAP_DAC_OVERRIDE) |
137 (1ULL << CAP_DAC_READ_SEARCH) |
138 (1ULL << CAP_FOWNER) |
139 (1ULL << CAP_FSETID) |
140 (1ULL << CAP_IPC_OWNER) |
141 (1ULL << CAP_KILL) |
142 (1ULL << CAP_LEASE) |
143 (1ULL << CAP_LINUX_IMMUTABLE) |
144 (1ULL << CAP_NET_BIND_SERVICE) |
145 (1ULL << CAP_NET_BROADCAST) |
146 (1ULL << CAP_NET_RAW) |
147 (1ULL << CAP_SETGID) |
148 (1ULL << CAP_SETFCAP) |
149 (1ULL << CAP_SETPCAP) |
150 (1ULL << CAP_SETUID) |
151 (1ULL << CAP_SYS_ADMIN) |
152 (1ULL << CAP_SYS_CHROOT) |
153 (1ULL << CAP_SYS_NICE) |
154 (1ULL << CAP_SYS_PTRACE) |
155 (1ULL << CAP_SYS_TTY_CONFIG) |
156 (1ULL << CAP_SYS_RESOURCE) |
157 (1ULL << CAP_SYS_BOOT) |
158 (1ULL << CAP_AUDIT_WRITE) |
159 (1ULL << CAP_AUDIT_CONTROL) |
160 (1ULL << CAP_MKNOD);
161 static CustomMount *arg_custom_mounts = NULL;
162 static unsigned arg_n_custom_mounts = 0;
163 static char **arg_setenv = NULL;
164 static bool arg_quiet = false;
165 static bool arg_share_system = false;
166 static bool arg_register = true;
167 static bool arg_keep_unit = false;
168 static char **arg_network_interfaces = NULL;
169 static char **arg_network_macvlan = NULL;
170 static char **arg_network_ipvlan = NULL;
171 static bool arg_network_veth = false;
172 static char *arg_network_bridge = NULL;
173 static unsigned long arg_personality = PERSONALITY_INVALID;
174 static char *arg_image = NULL;
175 static VolatileMode arg_volatile_mode = VOLATILE_NO;
176 static ExposePort *arg_expose_ports = NULL;
177 static char **arg_property = NULL;
178 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
179 static bool arg_userns = false;
180 static int arg_kill_signal = 0;
181 static bool arg_unified_cgroup_hierarchy = false;
182 static SettingsMask arg_settings_mask = 0;
183 static int arg_settings_trusted = -1;
184 static char **arg_parameters = NULL;
185
186 static void help(void) {
187 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
188 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
189 " -h --help Show this help\n"
190 " --version Print version string\n"
191 " -q --quiet Do not show status information\n"
192 " -D --directory=PATH Root directory for the container\n"
193 " --template=PATH Initialize root directory from template directory,\n"
194 " if missing\n"
195 " -x --ephemeral Run container with snapshot of root directory, and\n"
196 " remove it after exit\n"
197 " -i --image=PATH File system device or disk image for the container\n"
198 " -b --boot Boot up full system (i.e. invoke init)\n"
199 " -u --user=USER Run the command under specified user or uid\n"
200 " -M --machine=NAME Set the machine name for the container\n"
201 " --uuid=UUID Set a specific machine UUID for the container\n"
202 " -S --slice=SLICE Place the container in the specified slice\n"
203 " --property=NAME=VALUE Set scope unit property\n"
204 " --private-users[=UIDBASE[:NUIDS]]\n"
205 " Run within user namespace\n"
206 " --private-network Disable network in container\n"
207 " --network-interface=INTERFACE\n"
208 " Assign an existing network interface to the\n"
209 " container\n"
210 " --network-macvlan=INTERFACE\n"
211 " Create a macvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " --network-ipvlan=INTERFACE\n"
214 " Create a ipvlan network interface based on an\n"
215 " existing network interface to the container\n"
216 " -n --network-veth Add a virtual ethernet connection between host\n"
217 " and container\n"
218 " --network-bridge=INTERFACE\n"
219 " Add a virtual ethernet connection between host\n"
220 " and container and add it to an existing bridge on\n"
221 " the host\n"
222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223 " Expose a container IP port on the host\n"
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
230 " --capability=CAP In addition to the default, retain specified\n"
231 " capability\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
237 " --read-only Mount the root directory read-only\n"
238 " --bind=PATH[:PATH[:OPTIONS]]\n"
239 " Bind mount a file or directory from the host into\n"
240 " the container\n"
241 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
242 " Similar, but creates a read-only bind mount\n"
243 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
244 " --overlay=PATH[:PATH...]:PATH\n"
245 " Create an overlay mount from the host to \n"
246 " the container\n"
247 " --overlay-ro=PATH[:PATH...]:PATH\n"
248 " Similar, but creates a read-only overlay mount\n"
249 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
250 " --share-system Share system namespaces with host\n"
251 " --register=BOOLEAN Register container as machine\n"
252 " --keep-unit Do not register a scope for the machine, reuse\n"
253 " the service unit nspawn is running in\n"
254 " --volatile[=MODE] Run the system in volatile mode\n"
255 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
256 , program_invocation_short_name);
257 }
258
259
260 static int custom_mounts_prepare(void) {
261 unsigned i;
262 int r;
263
264 /* Ensure the mounts are applied prefix first. */
265 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
266
267 /* Allocate working directories for the overlay file systems that need it */
268 for (i = 0; i < arg_n_custom_mounts; i++) {
269 CustomMount *m = &arg_custom_mounts[i];
270
271 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
272 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
273 return -EINVAL;
274 }
275
276 if (m->type != CUSTOM_MOUNT_OVERLAY)
277 continue;
278
279 if (m->work_dir)
280 continue;
281
282 if (m->read_only)
283 continue;
284
285 r = tempfn_random(m->source, NULL, &m->work_dir);
286 if (r < 0)
287 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
288 }
289
290 return 0;
291 }
292
/* Replaces *b with a cleaned-up, newly allocated version of 'path'.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT) it is only made absolute
 * relative to the current working directory. The previous value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM if making the path absolute failed, or -errno if canonicalization
 * failed for a reason other than ENOENT. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Path does not exist (yet), settle for an absolute version of it. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
313
314 static int detect_unified_cgroup_hierarchy(void) {
315 const char *e;
316 int r;
317
318 /* Allow the user to control whether the unified hierarchy is used */
319 e = getenv("UNIFIED_CGROUP_HIERARCHY");
320 if (e) {
321 r = parse_boolean(e);
322 if (r < 0)
323 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
324
325 arg_unified_cgroup_hierarchy = r;
326 return 0;
327 }
328
329 /* Otherwise inherit the default from the host system */
330 r = cg_unified();
331 if (r < 0)
332 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
333
334 arg_unified_cgroup_hierarchy = r;
335 return 0;
336 }
337
338 static int parse_argv(int argc, char *argv[]) {
339
340 enum {
341 ARG_VERSION = 0x100,
342 ARG_PRIVATE_NETWORK,
343 ARG_UUID,
344 ARG_READ_ONLY,
345 ARG_CAPABILITY,
346 ARG_DROP_CAPABILITY,
347 ARG_LINK_JOURNAL,
348 ARG_BIND,
349 ARG_BIND_RO,
350 ARG_TMPFS,
351 ARG_OVERLAY,
352 ARG_OVERLAY_RO,
353 ARG_SETENV,
354 ARG_SHARE_SYSTEM,
355 ARG_REGISTER,
356 ARG_KEEP_UNIT,
357 ARG_NETWORK_INTERFACE,
358 ARG_NETWORK_MACVLAN,
359 ARG_NETWORK_IPVLAN,
360 ARG_NETWORK_BRIDGE,
361 ARG_PERSONALITY,
362 ARG_VOLATILE,
363 ARG_TEMPLATE,
364 ARG_PROPERTY,
365 ARG_PRIVATE_USERS,
366 ARG_KILL_SIGNAL,
367 ARG_SETTINGS,
368 };
369
370 static const struct option options[] = {
371 { "help", no_argument, NULL, 'h' },
372 { "version", no_argument, NULL, ARG_VERSION },
373 { "directory", required_argument, NULL, 'D' },
374 { "template", required_argument, NULL, ARG_TEMPLATE },
375 { "ephemeral", no_argument, NULL, 'x' },
376 { "user", required_argument, NULL, 'u' },
377 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
378 { "boot", no_argument, NULL, 'b' },
379 { "uuid", required_argument, NULL, ARG_UUID },
380 { "read-only", no_argument, NULL, ARG_READ_ONLY },
381 { "capability", required_argument, NULL, ARG_CAPABILITY },
382 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
383 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
384 { "bind", required_argument, NULL, ARG_BIND },
385 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
386 { "tmpfs", required_argument, NULL, ARG_TMPFS },
387 { "overlay", required_argument, NULL, ARG_OVERLAY },
388 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
389 { "machine", required_argument, NULL, 'M' },
390 { "slice", required_argument, NULL, 'S' },
391 { "setenv", required_argument, NULL, ARG_SETENV },
392 { "selinux-context", required_argument, NULL, 'Z' },
393 { "selinux-apifs-context", required_argument, NULL, 'L' },
394 { "quiet", no_argument, NULL, 'q' },
395 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
396 { "register", required_argument, NULL, ARG_REGISTER },
397 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
398 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
399 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
400 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
401 { "network-veth", no_argument, NULL, 'n' },
402 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
403 { "personality", required_argument, NULL, ARG_PERSONALITY },
404 { "image", required_argument, NULL, 'i' },
405 { "volatile", optional_argument, NULL, ARG_VOLATILE },
406 { "port", required_argument, NULL, 'p' },
407 { "property", required_argument, NULL, ARG_PROPERTY },
408 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
409 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
410 { "settings", required_argument, NULL, ARG_SETTINGS },
411 {}
412 };
413
414 int c, r;
415 uint64_t plus = 0, minus = 0;
416 bool mask_all_settings = false, mask_no_settings = false;
417
418 assert(argc >= 0);
419 assert(argv);
420
421 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
422
423 switch (c) {
424
425 case 'h':
426 help();
427 return 0;
428
429 case ARG_VERSION:
430 puts(PACKAGE_STRING);
431 puts(SYSTEMD_FEATURES);
432 return 0;
433
434 case 'D':
435 r = set_sanitized_path(&arg_directory, optarg);
436 if (r < 0)
437 return log_error_errno(r, "Invalid root directory: %m");
438
439 break;
440
441 case ARG_TEMPLATE:
442 r = set_sanitized_path(&arg_template, optarg);
443 if (r < 0)
444 return log_error_errno(r, "Invalid template directory: %m");
445
446 break;
447
448 case 'i':
449 r = set_sanitized_path(&arg_image, optarg);
450 if (r < 0)
451 return log_error_errno(r, "Invalid image path: %m");
452
453 break;
454
455 case 'x':
456 arg_ephemeral = true;
457 break;
458
459 case 'u':
460 r = free_and_strdup(&arg_user, optarg);
461 if (r < 0)
462 return log_oom();
463
464 arg_settings_mask |= SETTING_USER;
465 break;
466
467 case ARG_NETWORK_BRIDGE:
468 r = free_and_strdup(&arg_network_bridge, optarg);
469 if (r < 0)
470 return log_oom();
471
472 /* fall through */
473
474 case 'n':
475 arg_network_veth = true;
476 arg_private_network = true;
477 arg_settings_mask |= SETTING_NETWORK;
478 break;
479
480 case ARG_NETWORK_INTERFACE:
481 if (strv_extend(&arg_network_interfaces, optarg) < 0)
482 return log_oom();
483
484 arg_private_network = true;
485 arg_settings_mask |= SETTING_NETWORK;
486 break;
487
488 case ARG_NETWORK_MACVLAN:
489 if (strv_extend(&arg_network_macvlan, optarg) < 0)
490 return log_oom();
491
492 arg_private_network = true;
493 arg_settings_mask |= SETTING_NETWORK;
494 break;
495
496 case ARG_NETWORK_IPVLAN:
497 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
498 return log_oom();
499
500 /* fall through */
501
502 case ARG_PRIVATE_NETWORK:
503 arg_private_network = true;
504 arg_settings_mask |= SETTING_NETWORK;
505 break;
506
507 case 'b':
508 arg_boot = true;
509 arg_settings_mask |= SETTING_BOOT;
510 break;
511
512 case ARG_UUID:
513 r = sd_id128_from_string(optarg, &arg_uuid);
514 if (r < 0) {
515 log_error("Invalid UUID: %s", optarg);
516 return r;
517 }
518
519 arg_settings_mask |= SETTING_MACHINE_ID;
520 break;
521
522 case 'S':
523 arg_slice = optarg;
524 break;
525
526 case 'M':
527 if (isempty(optarg))
528 arg_machine = mfree(arg_machine);
529 else {
530 if (!machine_name_is_valid(optarg)) {
531 log_error("Invalid machine name: %s", optarg);
532 return -EINVAL;
533 }
534
535 r = free_and_strdup(&arg_machine, optarg);
536 if (r < 0)
537 return log_oom();
538
539 break;
540 }
541
542 case 'Z':
543 arg_selinux_context = optarg;
544 break;
545
546 case 'L':
547 arg_selinux_apifs_context = optarg;
548 break;
549
550 case ARG_READ_ONLY:
551 arg_read_only = true;
552 arg_settings_mask |= SETTING_READ_ONLY;
553 break;
554
555 case ARG_CAPABILITY:
556 case ARG_DROP_CAPABILITY: {
557 const char *state, *word;
558 size_t length;
559
560 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
561 _cleanup_free_ char *t;
562
563 t = strndup(word, length);
564 if (!t)
565 return log_oom();
566
567 if (streq(t, "all")) {
568 if (c == ARG_CAPABILITY)
569 plus = (uint64_t) -1;
570 else
571 minus = (uint64_t) -1;
572 } else {
573 int cap;
574
575 cap = capability_from_name(t);
576 if (cap < 0) {
577 log_error("Failed to parse capability %s.", t);
578 return -EINVAL;
579 }
580
581 if (c == ARG_CAPABILITY)
582 plus |= 1ULL << (uint64_t) cap;
583 else
584 minus |= 1ULL << (uint64_t) cap;
585 }
586 }
587
588 arg_settings_mask |= SETTING_CAPABILITY;
589 break;
590 }
591
592 case 'j':
593 arg_link_journal = LINK_GUEST;
594 arg_link_journal_try = true;
595 break;
596
597 case ARG_LINK_JOURNAL:
598 if (streq(optarg, "auto")) {
599 arg_link_journal = LINK_AUTO;
600 arg_link_journal_try = false;
601 } else if (streq(optarg, "no")) {
602 arg_link_journal = LINK_NO;
603 arg_link_journal_try = false;
604 } else if (streq(optarg, "guest")) {
605 arg_link_journal = LINK_GUEST;
606 arg_link_journal_try = false;
607 } else if (streq(optarg, "host")) {
608 arg_link_journal = LINK_HOST;
609 arg_link_journal_try = false;
610 } else if (streq(optarg, "try-guest")) {
611 arg_link_journal = LINK_GUEST;
612 arg_link_journal_try = true;
613 } else if (streq(optarg, "try-host")) {
614 arg_link_journal = LINK_HOST;
615 arg_link_journal_try = true;
616 } else {
617 log_error("Failed to parse link journal mode %s", optarg);
618 return -EINVAL;
619 }
620
621 break;
622
623 case ARG_BIND:
624 case ARG_BIND_RO:
625 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
626 if (r < 0)
627 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
628
629 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
630 break;
631
632 case ARG_TMPFS:
633 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
634 if (r < 0)
635 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
636
637 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
638 break;
639
640 case ARG_OVERLAY:
641 case ARG_OVERLAY_RO: {
642 _cleanup_free_ char *upper = NULL, *destination = NULL;
643 _cleanup_strv_free_ char **lower = NULL;
644 CustomMount *m;
645 unsigned n = 0;
646 char **i;
647
648 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
649 if (r == -ENOMEM)
650 return log_oom();
651 else if (r < 0) {
652 log_error("Invalid overlay specification: %s", optarg);
653 return r;
654 }
655
656 STRV_FOREACH(i, lower) {
657 if (!path_is_absolute(*i)) {
658 log_error("Overlay path %s is not absolute.", *i);
659 return -EINVAL;
660 }
661
662 n++;
663 }
664
665 if (n < 2) {
666 log_error("--overlay= needs at least two colon-separated directories specified.");
667 return -EINVAL;
668 }
669
670 if (n == 2) {
671 /* If two parameters are specified,
672 * the first one is the lower, the
673 * second one the upper directory. And
674 * we'll also define the destination
675 * mount point the same as the upper. */
676 upper = lower[1];
677 lower[1] = NULL;
678
679 destination = strdup(upper);
680 if (!destination)
681 return log_oom();
682
683 } else {
684 upper = lower[n - 2];
685 destination = lower[n - 1];
686 lower[n - 2] = NULL;
687 }
688
689 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
690 if (!m)
691 return log_oom();
692
693 m->destination = destination;
694 m->source = upper;
695 m->lower = lower;
696 m->read_only = c == ARG_OVERLAY_RO;
697
698 upper = destination = NULL;
699 lower = NULL;
700
701 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
702 break;
703 }
704
705 case ARG_SETENV: {
706 char **n;
707
708 if (!env_assignment_is_valid(optarg)) {
709 log_error("Environment variable assignment '%s' is not valid.", optarg);
710 return -EINVAL;
711 }
712
713 n = strv_env_set(arg_setenv, optarg);
714 if (!n)
715 return log_oom();
716
717 strv_free(arg_setenv);
718 arg_setenv = n;
719
720 arg_settings_mask |= SETTING_ENVIRONMENT;
721 break;
722 }
723
724 case 'q':
725 arg_quiet = true;
726 break;
727
728 case ARG_SHARE_SYSTEM:
729 arg_share_system = true;
730 break;
731
732 case ARG_REGISTER:
733 r = parse_boolean(optarg);
734 if (r < 0) {
735 log_error("Failed to parse --register= argument: %s", optarg);
736 return r;
737 }
738
739 arg_register = r;
740 break;
741
742 case ARG_KEEP_UNIT:
743 arg_keep_unit = true;
744 break;
745
746 case ARG_PERSONALITY:
747
748 arg_personality = personality_from_string(optarg);
749 if (arg_personality == PERSONALITY_INVALID) {
750 log_error("Unknown or unsupported personality '%s'.", optarg);
751 return -EINVAL;
752 }
753
754 arg_settings_mask |= SETTING_PERSONALITY;
755 break;
756
757 case ARG_VOLATILE:
758
759 if (!optarg)
760 arg_volatile_mode = VOLATILE_YES;
761 else {
762 VolatileMode m;
763
764 m = volatile_mode_from_string(optarg);
765 if (m < 0) {
766 log_error("Failed to parse --volatile= argument: %s", optarg);
767 return -EINVAL;
768 } else
769 arg_volatile_mode = m;
770 }
771
772 arg_settings_mask |= SETTING_VOLATILE_MODE;
773 break;
774
775 case 'p':
776 r = expose_port_parse(&arg_expose_ports, optarg);
777 if (r == -EEXIST)
778 return log_error_errno(r, "Duplicate port specification: %s", optarg);
779 if (r < 0)
780 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
781
782 arg_settings_mask |= SETTING_EXPOSE_PORTS;
783 break;
784
785 case ARG_PROPERTY:
786 if (strv_extend(&arg_property, optarg) < 0)
787 return log_oom();
788
789 break;
790
791 case ARG_PRIVATE_USERS:
792 if (optarg) {
793 _cleanup_free_ char *buffer = NULL;
794 const char *range, *shift;
795
796 range = strchr(optarg, ':');
797 if (range) {
798 buffer = strndup(optarg, range - optarg);
799 if (!buffer)
800 return log_oom();
801 shift = buffer;
802
803 range++;
804 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
805 log_error("Failed to parse UID range: %s", range);
806 return -EINVAL;
807 }
808 } else
809 shift = optarg;
810
811 if (parse_uid(shift, &arg_uid_shift) < 0) {
812 log_error("Failed to parse UID: %s", optarg);
813 return -EINVAL;
814 }
815 }
816
817 arg_userns = true;
818 break;
819
820 case ARG_KILL_SIGNAL:
821 arg_kill_signal = signal_from_string_try_harder(optarg);
822 if (arg_kill_signal < 0) {
823 log_error("Cannot parse signal: %s", optarg);
824 return -EINVAL;
825 }
826
827 arg_settings_mask |= SETTING_KILL_SIGNAL;
828 break;
829
830 case ARG_SETTINGS:
831
832 /* no → do not read files
833 * yes → read files, do not override cmdline, trust only subset
834 * override → read files, override cmdline, trust only subset
835 * trusted → read files, do not override cmdline, trust all
836 */
837
838 r = parse_boolean(optarg);
839 if (r < 0) {
840 if (streq(optarg, "trusted")) {
841 mask_all_settings = false;
842 mask_no_settings = false;
843 arg_settings_trusted = true;
844
845 } else if (streq(optarg, "override")) {
846 mask_all_settings = false;
847 mask_no_settings = true;
848 arg_settings_trusted = -1;
849 } else
850 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
851 } else if (r > 0) {
852 /* yes */
853 mask_all_settings = false;
854 mask_no_settings = false;
855 arg_settings_trusted = -1;
856 } else {
857 /* no */
858 mask_all_settings = true;
859 mask_no_settings = false;
860 arg_settings_trusted = false;
861 }
862
863 break;
864
865 case '?':
866 return -EINVAL;
867
868 default:
869 assert_not_reached("Unhandled option");
870 }
871
872 if (arg_share_system)
873 arg_register = false;
874
875 if (arg_boot && arg_share_system) {
876 log_error("--boot and --share-system may not be combined.");
877 return -EINVAL;
878 }
879
880 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
881 log_error("--keep-unit may not be used when invoked from a user session.");
882 return -EINVAL;
883 }
884
885 if (arg_directory && arg_image) {
886 log_error("--directory= and --image= may not be combined.");
887 return -EINVAL;
888 }
889
890 if (arg_template && arg_image) {
891 log_error("--template= and --image= may not be combined.");
892 return -EINVAL;
893 }
894
895 if (arg_template && !(arg_directory || arg_machine)) {
896 log_error("--template= needs --directory= or --machine=.");
897 return -EINVAL;
898 }
899
900 if (arg_ephemeral && arg_template) {
901 log_error("--ephemeral and --template= may not be combined.");
902 return -EINVAL;
903 }
904
905 if (arg_ephemeral && arg_image) {
906 log_error("--ephemeral and --image= may not be combined.");
907 return -EINVAL;
908 }
909
910 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
911 log_error("--ephemeral and --link-journal= may not be combined.");
912 return -EINVAL;
913 }
914
915 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
916 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
917
918 if (argc > optind) {
919 arg_parameters = strv_copy(argv + optind);
920 if (!arg_parameters)
921 return log_oom();
922
923 arg_settings_mask |= SETTING_BOOT;
924 }
925
926 /* Load all settings from .nspawn files */
927 if (mask_no_settings)
928 arg_settings_mask = 0;
929
930 /* Don't load any settings from .nspawn files */
931 if (mask_all_settings)
932 arg_settings_mask = _SETTINGS_MASK_ALL;
933
934 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
935
936 r = detect_unified_cgroup_hierarchy();
937 if (r < 0)
938 return r;
939
940 return 1;
941 }
942
943 static int verify_arguments(void) {
944
945 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
946 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
947 return -EINVAL;
948 }
949
950 if (arg_expose_ports && !arg_private_network) {
951 log_error("Cannot use --port= without private networking.");
952 return -EINVAL;
953 }
954
955 if (arg_boot && arg_kill_signal <= 0)
956 arg_kill_signal = SIGRTMIN+3;
957
958 return 0;
959 }
960
961 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
962 assert(p);
963
964 if (!arg_userns)
965 return 0;
966
967 if (uid == UID_INVALID && gid == GID_INVALID)
968 return 0;
969
970 if (uid != UID_INVALID) {
971 uid += arg_uid_shift;
972
973 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
974 return -EOVERFLOW;
975 }
976
977 if (gid != GID_INVALID) {
978 gid += (gid_t) arg_uid_shift;
979
980 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
981 return -EOVERFLOW;
982 }
983
984 if (lchown(p, uid, gid) < 0)
985 return -errno;
986
987 return 0;
988 }
989
/* Creates directory 'path' below 'root' with the given mode and hands it to userns_lchown().
 *
 * A directory that already exists is left alone (no ownership change) and counts as success.
 * Returns 0 on success, -errno if mkdir() fails otherwise, or the result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        if (errno == EEXIST)
                return 0;

        return -errno;
}
1002
1003 static int setup_timezone(const char *dest) {
1004 _cleanup_free_ char *p = NULL, *q = NULL;
1005 const char *where, *check, *what;
1006 char *z, *y;
1007 int r;
1008
1009 assert(dest);
1010
1011 /* Fix the timezone, if possible */
1012 r = readlink_malloc("/etc/localtime", &p);
1013 if (r < 0) {
1014 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1015 return 0;
1016 }
1017
1018 z = path_startswith(p, "../usr/share/zoneinfo/");
1019 if (!z)
1020 z = path_startswith(p, "/usr/share/zoneinfo/");
1021 if (!z) {
1022 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1023 return 0;
1024 }
1025
1026 where = prefix_roota(dest, "/etc/localtime");
1027 r = readlink_malloc(where, &q);
1028 if (r >= 0) {
1029 y = path_startswith(q, "../usr/share/zoneinfo/");
1030 if (!y)
1031 y = path_startswith(q, "/usr/share/zoneinfo/");
1032
1033 /* Already pointing to the right place? Then do nothing .. */
1034 if (y && streq(y, z))
1035 return 0;
1036 }
1037
1038 check = strjoina("/usr/share/zoneinfo/", z);
1039 check = prefix_root(dest, check);
1040 if (laccess(check, F_OK) < 0) {
1041 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1042 return 0;
1043 }
1044
1045 r = unlink(where);
1046 if (r < 0 && errno != ENOENT) {
1047 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1048 return 0;
1049 }
1050
1051 what = strjoina("../usr/share/zoneinfo/", z);
1052 if (symlink(what, where) < 0) {
1053 log_error_errno(errno, "Failed to correct timezone of container: %m");
1054 return 0;
1055 }
1056
1057 r = userns_lchown(where, 0, 0);
1058 if (r < 0)
1059 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1060
1061 return 0;
1062 }
1063
1064 static int setup_resolv_conf(const char *dest) {
1065 const char *where = NULL;
1066 int r;
1067
1068 assert(dest);
1069
1070 if (arg_private_network)
1071 return 0;
1072
1073 /* Fix resolv.conf, if possible */
1074 where = prefix_roota(dest, "/etc/resolv.conf");
1075
1076 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1077 if (r < 0) {
1078 /* If the file already exists as symlink, let's
1079 * suppress the warning, under the assumption that
1080 * resolved or something similar runs inside and the
1081 * symlink points there.
1082 *
1083 * If the disk image is read-only, there's also no
1084 * point in complaining.
1085 */
1086 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1087 "Failed to copy /etc/resolv.conf to %s: %m", where);
1088 return 0;
1089 }
1090
1091 r = userns_lchown(where, 0, 0);
1092 if (r < 0)
1093 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1094
1095 return 0;
1096 }
1097
1098 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1099 assert(s);
1100
1101 snprintf(s, 37,
1102 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1103 SD_ID128_FORMAT_VAL(id));
1104
1105 return s;
1106 }
1107
1108 static int setup_boot_id(const char *dest) {
1109 const char *from, *to;
1110 sd_id128_t rnd = {};
1111 char as_uuid[37];
1112 int r;
1113
1114 if (arg_share_system)
1115 return 0;
1116
1117 /* Generate a new randomized boot ID, so that each boot-up of
1118 * the container gets a new one */
1119
1120 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1121 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1122
1123 r = sd_id128_randomize(&rnd);
1124 if (r < 0)
1125 return log_error_errno(r, "Failed to generate random boot id: %m");
1126
1127 id128_format_as_uuid(rnd, as_uuid);
1128
1129 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1130 if (r < 0)
1131 return log_error_errno(r, "Failed to write boot id: %m");
1132
1133 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1134 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1135 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1136 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1137
1138 unlink(from);
1139 return r;
1140 }
1141
1142 static int copy_devnodes(const char *dest) {
1143
1144 static const char devnodes[] =
1145 "null\0"
1146 "zero\0"
1147 "full\0"
1148 "random\0"
1149 "urandom\0"
1150 "tty\0"
1151 "net/tun\0";
1152
1153 const char *d;
1154 int r = 0;
1155 _cleanup_umask_ mode_t u;
1156
1157 assert(dest);
1158
1159 u = umask(0000);
1160
1161 /* Create /dev/net, so that we can create /dev/net/tun in it */
1162 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1163 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1164
1165 NULSTR_FOREACH(d, devnodes) {
1166 _cleanup_free_ char *from = NULL, *to = NULL;
1167 struct stat st;
1168
1169 from = strappend("/dev/", d);
1170 to = prefix_root(dest, from);
1171
1172 if (stat(from, &st) < 0) {
1173
1174 if (errno != ENOENT)
1175 return log_error_errno(errno, "Failed to stat %s: %m", from);
1176
1177 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1178
1179 log_error("%s is not a char or block device, cannot copy.", from);
1180 return -EIO;
1181
1182 } else {
1183 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1184 if (errno != EPERM)
1185 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1186
1187 /* Some systems abusively restrict mknod but
1188 * allow bind mounts. */
1189 r = touch(to);
1190 if (r < 0)
1191 return log_error_errno(r, "touch (%s) failed: %m", to);
1192 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1193 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1194 }
1195
1196 r = userns_lchown(to, 0, 0);
1197 if (r < 0)
1198 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1199 }
1200 }
1201
1202 return r;
1203 }
1204
1205 static int setup_pts(const char *dest) {
1206 _cleanup_free_ char *options = NULL;
1207 const char *p;
1208
1209 #ifdef HAVE_SELINUX
1210 if (arg_selinux_apifs_context)
1211 (void) asprintf(&options,
1212 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1213 arg_uid_shift + TTY_GID,
1214 arg_selinux_apifs_context);
1215 else
1216 #endif
1217 (void) asprintf(&options,
1218 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1219 arg_uid_shift + TTY_GID);
1220
1221 if (!options)
1222 return log_oom();
1223
1224 /* Mount /dev/pts itself */
1225 p = prefix_roota(dest, "/dev/pts");
1226 if (mkdir(p, 0755) < 0)
1227 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1228 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1229 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1230 if (userns_lchown(p, 0, 0) < 0)
1231 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1232
1233 /* Create /dev/ptmx symlink */
1234 p = prefix_roota(dest, "/dev/ptmx");
1235 if (symlink("pts/ptmx", p) < 0)
1236 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1237 if (userns_lchown(p, 0, 0) < 0)
1238 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1239
1240 /* And fix /dev/pts/ptmx ownership */
1241 p = prefix_roota(dest, "/dev/pts/ptmx");
1242 if (userns_lchown(p, 0, 0) < 0)
1243 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1244
1245 return 0;
1246 }
1247
1248 static int setup_dev_console(const char *dest, const char *console) {
1249 _cleanup_umask_ mode_t u;
1250 const char *to;
1251 int r;
1252
1253 assert(dest);
1254 assert(console);
1255
1256 u = umask(0000);
1257
1258 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1259 if (r < 0)
1260 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1261
1262 /* We need to bind mount the right tty to /dev/console since
1263 * ptys can only exist on pts file systems. To have something
1264 * to bind mount things on we create a empty regular file. */
1265
1266 to = prefix_roota(dest, "/dev/console");
1267 r = touch(to);
1268 if (r < 0)
1269 return log_error_errno(r, "touch() for /dev/console failed: %m");
1270
1271 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1272 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1273
1274 return 0;
1275 }
1276
1277 static int setup_kmsg(const char *dest, int kmsg_socket) {
1278 const char *from, *to;
1279 _cleanup_umask_ mode_t u;
1280 int fd, k;
1281 union {
1282 struct cmsghdr cmsghdr;
1283 uint8_t buf[CMSG_SPACE(sizeof(int))];
1284 } control = {};
1285 struct msghdr mh = {
1286 .msg_control = &control,
1287 .msg_controllen = sizeof(control),
1288 };
1289 struct cmsghdr *cmsg;
1290
1291 assert(kmsg_socket >= 0);
1292
1293 u = umask(0000);
1294
1295 /* We create the kmsg FIFO as /run/kmsg, but immediately
1296 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1297 * on the reading side behave very similar to /proc/kmsg,
1298 * their writing side behaves differently from /dev/kmsg in
1299 * that writing blocks when nothing is reading. In order to
1300 * avoid any problems with containers deadlocking due to this
1301 * we simply make /dev/kmsg unavailable to the container. */
1302 from = prefix_roota(dest, "/run/kmsg");
1303 to = prefix_roota(dest, "/proc/kmsg");
1304
1305 if (mkfifo(from, 0600) < 0)
1306 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1307 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1308 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1309
1310 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1311 if (fd < 0)
1312 return log_error_errno(errno, "Failed to open fifo: %m");
1313
1314 cmsg = CMSG_FIRSTHDR(&mh);
1315 cmsg->cmsg_level = SOL_SOCKET;
1316 cmsg->cmsg_type = SCM_RIGHTS;
1317 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1318 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1319
1320 mh.msg_controllen = cmsg->cmsg_len;
1321
1322 /* Store away the fd in the socket, so that it stays open as
1323 * long as we run the child */
1324 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1325 safe_close(fd);
1326
1327 if (k < 0)
1328 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1329
1330 /* And now make the FIFO unavailable as /run/kmsg... */
1331 (void) unlink(from);
1332
1333 return 0;
1334 }
1335
1336 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1337 union in_addr_union *exposed = userdata;
1338
1339 assert(rtnl);
1340 assert(m);
1341 assert(exposed);
1342
1343 expose_port_execute(rtnl, arg_expose_ports, exposed);
1344 return 0;
1345 }
1346
1347 static int setup_hostname(void) {
1348
1349 if (arg_share_system)
1350 return 0;
1351
1352 if (sethostname_idempotent(arg_machine) < 0)
1353 return -errno;
1354
1355 return 0;
1356 }
1357
1358 static int setup_journal(const char *directory) {
1359 sd_id128_t machine_id, this_id;
1360 _cleanup_free_ char *b = NULL, *d = NULL;
1361 const char *etc_machine_id, *p, *q;
1362 char *id;
1363 int r;
1364
1365 /* Don't link journals in ephemeral mode */
1366 if (arg_ephemeral)
1367 return 0;
1368
1369 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1370
1371 r = read_one_line_file(etc_machine_id, &b);
1372 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1373 return 0;
1374 else if (r < 0)
1375 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1376
1377 id = strstrip(b);
1378 if (isempty(id) && arg_link_journal == LINK_AUTO)
1379 return 0;
1380
1381 /* Verify validity */
1382 r = sd_id128_from_string(id, &machine_id);
1383 if (r < 0)
1384 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1385
1386 r = sd_id128_get_machine(&this_id);
1387 if (r < 0)
1388 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1389
1390 if (sd_id128_equal(machine_id, this_id)) {
1391 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1392 "Host and machine ids are equal (%s): refusing to link journals", id);
1393 if (arg_link_journal == LINK_AUTO)
1394 return 0;
1395 return -EEXIST;
1396 }
1397
1398 if (arg_link_journal == LINK_NO)
1399 return 0;
1400
1401 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1402 if (r < 0)
1403 return log_error_errno(r, "Failed to create /var: %m");
1404
1405 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1406 if (r < 0)
1407 return log_error_errno(r, "Failed to create /var/log: %m");
1408
1409 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1410 if (r < 0)
1411 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1412
1413 p = strjoina("/var/log/journal/", id);
1414 q = prefix_roota(directory, p);
1415
1416 if (path_is_mount_point(p, 0) > 0) {
1417 if (arg_link_journal != LINK_AUTO) {
1418 log_error("%s: already a mount point, refusing to use for journal", p);
1419 return -EEXIST;
1420 }
1421
1422 return 0;
1423 }
1424
1425 if (path_is_mount_point(q, 0) > 0) {
1426 if (arg_link_journal != LINK_AUTO) {
1427 log_error("%s: already a mount point, refusing to use for journal", q);
1428 return -EEXIST;
1429 }
1430
1431 return 0;
1432 }
1433
1434 r = readlink_and_make_absolute(p, &d);
1435 if (r >= 0) {
1436 if ((arg_link_journal == LINK_GUEST ||
1437 arg_link_journal == LINK_AUTO) &&
1438 path_equal(d, q)) {
1439
1440 r = userns_mkdir(directory, p, 0755, 0, 0);
1441 if (r < 0)
1442 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1443 return 0;
1444 }
1445
1446 if (unlink(p) < 0)
1447 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1448 } else if (r == -EINVAL) {
1449
1450 if (arg_link_journal == LINK_GUEST &&
1451 rmdir(p) < 0) {
1452
1453 if (errno == ENOTDIR) {
1454 log_error("%s already exists and is neither a symlink nor a directory", p);
1455 return r;
1456 } else {
1457 log_error_errno(errno, "Failed to remove %s: %m", p);
1458 return -errno;
1459 }
1460 }
1461 } else if (r != -ENOENT) {
1462 log_error_errno(errno, "readlink(%s) failed: %m", p);
1463 return r;
1464 }
1465
1466 if (arg_link_journal == LINK_GUEST) {
1467
1468 if (symlink(q, p) < 0) {
1469 if (arg_link_journal_try) {
1470 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1471 return 0;
1472 } else {
1473 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1474 return -errno;
1475 }
1476 }
1477
1478 r = userns_mkdir(directory, p, 0755, 0, 0);
1479 if (r < 0)
1480 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1481 return 0;
1482 }
1483
1484 if (arg_link_journal == LINK_HOST) {
1485 /* don't create parents here -- if the host doesn't have
1486 * permanent journal set up, don't force it here */
1487 r = mkdir(p, 0755);
1488 if (r < 0) {
1489 if (arg_link_journal_try) {
1490 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1491 return 0;
1492 } else {
1493 log_error_errno(errno, "Failed to create %s: %m", p);
1494 return r;
1495 }
1496 }
1497
1498 } else if (access(p, F_OK) < 0)
1499 return 0;
1500
1501 if (dir_is_empty(q) == 0)
1502 log_warning("%s is not empty, proceeding anyway.", q);
1503
1504 r = userns_mkdir(directory, p, 0755, 0, 0);
1505 if (r < 0) {
1506 log_error_errno(errno, "Failed to create %s: %m", q);
1507 return r;
1508 }
1509
1510 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1511 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1512
1513 return 0;
1514 }
1515
/* Calls capability_bounding_set_drop() with the complement of the
 * arg_retain mask and returns its result. */
static int drop_capabilities(void) {
        return capability_bounding_set_drop(~arg_retain, false);
}
1519
1520 static int reset_audit_loginuid(void) {
1521 _cleanup_free_ char *p = NULL;
1522 int r;
1523
1524 if (arg_share_system)
1525 return 0;
1526
1527 r = read_one_line_file("/proc/self/loginuid", &p);
1528 if (r == -ENOENT)
1529 return 0;
1530 if (r < 0)
1531 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1532
1533 /* Already reset? */
1534 if (streq(p, "4294967295"))
1535 return 0;
1536
1537 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1538 if (r < 0) {
1539 log_error_errno(r,
1540 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1541 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1542 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1543 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1544 "using systemd-nspawn. Sleeping for 5s... (%m)");
1545
1546 sleep(5);
1547 }
1548
1549 return 0;
1550 }
1551
/* Installs a seccomp filter for the container payload, if compiled with
 * seccomp support (otherwise this is a NOP returning 0).
 *
 * The filter allows everything by default. Each syscall in the table
 * below is made to fail with EPERM, unless the capability listed next
 * to it is part of arg_retain. In addition, creating NETLINK_AUDIT
 * sockets is made to fail with EAFNOSUPPORT.
 *
 * Returns 0 on success (also if loading the filter fails with -EINVAL),
 * a negative error code otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx seccomp;
        unsigned i;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (i = 0; i < ELEMENTSOF(blacklist); i++) {

                /* If the capability is retained, the syscall stays available */
                if ((arg_retain & (1ULL << blacklist[i].capability)) != 0)
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);

                /* -EFAULT means the syscall is unknown, which is fine */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
        */

        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r == -EINVAL) {
                /* Not treated as failure, the container simply runs without the filter */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
1646
1647 static int setup_propagate(const char *root) {
1648 const char *p, *q;
1649
1650 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1651 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1652 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1653 (void) mkdir_p(p, 0600);
1654
1655 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1656 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1657
1658 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1659 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1660
1661 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1662 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1663
1664 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1665 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1666 return log_error_errno(errno, "Failed to install propagation bind mount.");
1667
1668 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1669 return log_error_errno(errno, "Failed to make propagation mount read-only");
1670
1671 return 0;
1672 }
1673
1674 static int setup_image(char **device_path, int *loop_nr) {
1675 struct loop_info64 info = {
1676 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1677 };
1678 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1679 _cleanup_free_ char* loopdev = NULL;
1680 struct stat st;
1681 int r, nr;
1682
1683 assert(device_path);
1684 assert(loop_nr);
1685 assert(arg_image);
1686
1687 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1688 if (fd < 0)
1689 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1690
1691 if (fstat(fd, &st) < 0)
1692 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1693
1694 if (S_ISBLK(st.st_mode)) {
1695 char *p;
1696
1697 p = strdup(arg_image);
1698 if (!p)
1699 return log_oom();
1700
1701 *device_path = p;
1702
1703 *loop_nr = -1;
1704
1705 r = fd;
1706 fd = -1;
1707
1708 return r;
1709 }
1710
1711 if (!S_ISREG(st.st_mode)) {
1712 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1713 return -EINVAL;
1714 }
1715
1716 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1717 if (control < 0)
1718 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1719
1720 nr = ioctl(control, LOOP_CTL_GET_FREE);
1721 if (nr < 0)
1722 return log_error_errno(errno, "Failed to allocate loop device: %m");
1723
1724 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1725 return log_oom();
1726
1727 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1728 if (loop < 0)
1729 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1730
1731 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1732 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1733
1734 if (arg_read_only)
1735 info.lo_flags |= LO_FLAGS_READ_ONLY;
1736
1737 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1738 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1739
1740 *device_path = loopdev;
1741 loopdev = NULL;
1742
1743 *loop_nr = nr;
1744
1745 r = loop;
1746 loop = -1;
1747
1748 return r;
1749 }
1750
/* Explanatory text that dissect_image() appends to its error messages
 * when it cannot identify a usable root partition in a disk image. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1757
/* Analyzes the partition table of the block device referred to by fd and
 * picks the partitions to use as root, /home and /srv file systems.
 *
 * GPT: partitions with the "no auto" flag are ignored. Partitions are
 *      picked by their type UUID (home, srv, native root, secondary
 *      root, generic Linux). If there are multiple partitions of the
 *      same specific type the one with the lowest partition number
 *      wins. A generic Linux partition is only used as root if no
 *      specific root partition was found, and only if it is the only
 *      one of its kind.
 * MBR: only partitions of type 0x83 whose flags equal 0x80 (bootable)
 *      are considered, as generic root partition. There must be
 *      exactly one.
 *
 * On success:
 *   *root_device / *root_device_rw are always set.
 *   *secondary is set to true if the secondary root partition was
 *              picked, false otherwise.
 *   *home_device / *home_device_rw and *srv_device / *srv_device_rw
 *              are only set if a matching partition was found.
 * The device strings are newly allocated, the caller has to free them.
 *
 * Returns 0 on success, a negative error code on failure, and
 * -EOPNOTSUPP if compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel's view of the partitions matches the
         * one of blkid, retrying a limited number of times. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. Presumably
                 * the +1 below accounts for the whole-disk device,
                 * which the enumeration lists too (it is skipped by
                 * its device number further down). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* The partition with the lowest number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Store this as generic partition (not
                                 * as root), so that a second matching
                                 * partition is detected above and
                                 * refused below. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2130
/* Mounts the block device what on where, or on the sub-directory
 * directory of where if directory is non-NULL. The file system type is
 * probed with blkid. The mount is read-only if rw is false or
 * arg_read_only is set. LUKS volumes are refused.
 *
 * Returns 0 on success, a negative error code on failure, and
 * -EOPNOTSUPP if compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *target;
        unsigned long mount_flags = MS_NODEV;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-device setting */
        if (arg_read_only || !rw)
                mount_flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strjoina(where, directory);

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -1 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        }
        if (r != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, target, fstype, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2194
2195 static int mount_devices(
2196 const char *where,
2197 const char *root_device, bool root_device_rw,
2198 const char *home_device, bool home_device_rw,
2199 const char *srv_device, bool srv_device_rw) {
2200 int r;
2201
2202 assert(where);
2203
2204 if (root_device) {
2205 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2206 if (r < 0)
2207 return log_error_errno(r, "Failed to mount root directory: %m");
2208 }
2209
2210 if (home_device) {
2211 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2212 if (r < 0)
2213 return log_error_errno(r, "Failed to mount home directory: %m");
2214 }
2215
2216 if (srv_device) {
2217 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2218 if (r < 0)
2219 return log_error_errno(r, "Failed to mount server data directory: %m");
2220 }
2221
2222 return 0;
2223 }
2224
2225 static void loop_remove(int nr, int *image_fd) {
2226 _cleanup_close_ int control = -1;
2227 int r;
2228
2229 if (nr < 0)
2230 return;
2231
2232 if (image_fd && *image_fd >= 0) {
2233 r = ioctl(*image_fd, LOOP_CLR_FD);
2234 if (r < 0)
2235 log_debug_errno(errno, "Failed to close loop image: %m");
2236 *image_fd = safe_close(*image_fd);
2237 }
2238
2239 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2240 if (control < 0) {
2241 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2242 return;
2243 }
2244
2245 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2246 if (r < 0)
2247 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2248 }
2249
2250 /*
2251 * Return values:
2252 * < 0 : wait_for_terminate() failed to get the state of the
2253 * container, the container was terminated by a signal, or
2254 * failed for an unknown reason. No change is made to the
2255 * container argument.
2256 * > 0 : The program executed in the container terminated with an
2257 * error. The exit code of the program executed in the
2258 * container is returned. The container argument has been set
2259 * to CONTAINER_TERMINATED.
2260 * 0 : The container is being rebooted, has been shut down or exited
2261 * successfully. The container argument has been set to either
2262 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2263 *
2264 * That is, success is indicated by a return value of zero, and an
2265 * error is indicated by a non-zero value.
2266 */
2267 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2268 siginfo_t status;
2269 int r;
2270
2271 r = wait_for_terminate(pid, &status);
2272 if (r < 0)
2273 return log_warning_errno(r, "Failed to wait for container: %m");
2274
2275 switch (status.si_code) {
2276
2277 case CLD_EXITED:
2278 if (status.si_status == 0) {
2279 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2280
2281 } else
2282 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2283
2284 *container = CONTAINER_TERMINATED;
2285 return status.si_status;
2286
2287 case CLD_KILLED:
2288 if (status.si_status == SIGINT) {
2289
2290 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2291 *container = CONTAINER_TERMINATED;
2292 return 0;
2293
2294 } else if (status.si_status == SIGHUP) {
2295
2296 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2297 *container = CONTAINER_REBOOTED;
2298 return 0;
2299 }
2300
2301 /* CLD_KILLED fallthrough */
2302
2303 case CLD_DUMPED:
2304 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2305 return -EIO;
2306
2307 default:
2308 log_error("Container %s failed due to unknown reason.", arg_machine);
2309 return -EIO;
2310 }
2311
2312 return r;
2313 }
2314
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
2316
2317 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2318 pid_t pid;
2319
2320 pid = PTR_TO_UINT32(userdata);
2321 if (pid > 0) {
2322 if (kill(pid, arg_kill_signal) >= 0) {
2323 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2324 sd_event_source_set_userdata(s, NULL);
2325 return 0;
2326 }
2327 }
2328
2329 sd_event_exit(sd_event_source_get_event(s), 0);
2330 return 0;
2331 }
2332
2333 static int determine_names(void) {
2334 int r;
2335
2336 if (arg_template && !arg_directory && arg_machine) {
2337
2338 /* If --template= was specified then we should not
2339 * search for a machine, but instead create a new one
2340 * in /var/lib/machine. */
2341
2342 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2343 if (!arg_directory)
2344 return log_oom();
2345 }
2346
2347 if (!arg_image && !arg_directory) {
2348 if (arg_machine) {
2349 _cleanup_(image_unrefp) Image *i = NULL;
2350
2351 r = image_find(arg_machine, &i);
2352 if (r < 0)
2353 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2354 else if (r == 0) {
2355 log_error("No image for machine '%s': %m", arg_machine);
2356 return -ENOENT;
2357 }
2358
2359 if (i->type == IMAGE_RAW)
2360 r = set_sanitized_path(&arg_image, i->path);
2361 else
2362 r = set_sanitized_path(&arg_directory, i->path);
2363 if (r < 0)
2364 return log_error_errno(r, "Invalid image directory: %m");
2365
2366 if (!arg_ephemeral)
2367 arg_read_only = arg_read_only || i->read_only;
2368 } else
2369 arg_directory = get_current_dir_name();
2370
2371 if (!arg_directory && !arg_machine) {
2372 log_error("Failed to determine path, please use -D or -i.");
2373 return -EINVAL;
2374 }
2375 }
2376
2377 if (!arg_machine) {
2378 if (arg_directory && path_equal(arg_directory, "/"))
2379 arg_machine = gethostname_malloc();
2380 else
2381 arg_machine = strdup(basename(arg_image ?: arg_directory));
2382
2383 if (!arg_machine)
2384 return log_oom();
2385
2386 hostname_cleanup(arg_machine);
2387 if (!machine_name_is_valid(arg_machine)) {
2388 log_error("Failed to determine machine name automatically, please use -M.");
2389 return -EINVAL;
2390 }
2391
2392 if (arg_ephemeral) {
2393 char *b;
2394
2395 /* Add a random suffix when this is an
2396 * ephemeral machine, so that we can run many
2397 * instances at once without manually having
2398 * to specify -M each time. */
2399
2400 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2401 return log_oom();
2402
2403 free(arg_machine);
2404 arg_machine = b;
2405 }
2406 }
2407
2408 return 0;
2409 }
2410
2411 static int determine_uid_shift(const char *directory) {
2412 int r;
2413
2414 if (!arg_userns) {
2415 arg_uid_shift = 0;
2416 return 0;
2417 }
2418
2419 if (arg_uid_shift == UID_INVALID) {
2420 struct stat st;
2421
2422 r = stat(directory, &st);
2423 if (r < 0)
2424 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2425
2426 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2427
2428 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2429 log_error("UID and GID base of %s don't match.", directory);
2430 return -EINVAL;
2431 }
2432
2433 arg_uid_range = UINT32_C(0x10000);
2434 }
2435
2436 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2437 log_error("UID base too high for UID range.");
2438 return -EINVAL;
2439 }
2440
2441 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2442 return 0;
2443 }
2444
2445 static int inner_child(
2446 Barrier *barrier,
2447 const char *directory,
2448 bool secondary,
2449 int kmsg_socket,
2450 int rtnl_socket,
2451 FDSet *fds) {
2452
2453 _cleanup_free_ char *home = NULL;
2454 unsigned n_env = 2;
2455 const char *envp[] = {
2456 "PATH=" DEFAULT_PATH_SPLIT_USR,
2457 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2458 NULL, /* TERM */
2459 NULL, /* HOME */
2460 NULL, /* USER */
2461 NULL, /* LOGNAME */
2462 NULL, /* container_uuid */
2463 NULL, /* LISTEN_FDS */
2464 NULL, /* LISTEN_PID */
2465 NULL
2466 };
2467
2468 _cleanup_strv_free_ char **env_use = NULL;
2469 int r;
2470
2471 assert(barrier);
2472 assert(directory);
2473 assert(kmsg_socket >= 0);
2474
2475 cg_unified_flush();
2476
2477 if (arg_userns) {
2478 /* Tell the parent, that it now can write the UID map. */
2479 (void) barrier_place(barrier); /* #1 */
2480
2481 /* Wait until the parent wrote the UID map */
2482 if (!barrier_place_and_sync(barrier)) { /* #2 */
2483 log_error("Parent died too early");
2484 return -ESRCH;
2485 }
2486 }
2487
2488 r = mount_all(NULL, true, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2489 if (r < 0)
2490 return r;
2491
2492 /* Wait until we are cgroup-ified, so that we
2493 * can mount the right cgroup path writable */
2494 if (!barrier_place_and_sync(barrier)) { /* #3 */
2495 log_error("Parent died too early");
2496 return -ESRCH;
2497 }
2498
2499 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2500 if (r < 0)
2501 return r;
2502
2503 r = reset_uid_gid();
2504 if (r < 0)
2505 return log_error_errno(r, "Couldn't become new root: %m");
2506
2507 r = setup_boot_id(NULL);
2508 if (r < 0)
2509 return r;
2510
2511 r = setup_kmsg(NULL, kmsg_socket);
2512 if (r < 0)
2513 return r;
2514 kmsg_socket = safe_close(kmsg_socket);
2515
2516 umask(0022);
2517
2518 if (setsid() < 0)
2519 return log_error_errno(errno, "setsid() failed: %m");
2520
2521 if (arg_private_network)
2522 loopback_setup();
2523
2524 if (arg_expose_ports) {
2525 r = expose_port_send_rtnl(rtnl_socket);
2526 if (r < 0)
2527 return r;
2528 rtnl_socket = safe_close(rtnl_socket);
2529 }
2530
2531 if (drop_capabilities() < 0)
2532 return log_error_errno(errno, "drop_capabilities() failed: %m");
2533
2534 setup_hostname();
2535
2536 if (arg_personality != PERSONALITY_INVALID) {
2537 if (personality(arg_personality) < 0)
2538 return log_error_errno(errno, "personality() failed: %m");
2539 } else if (secondary) {
2540 if (personality(PER_LINUX32) < 0)
2541 return log_error_errno(errno, "personality() failed: %m");
2542 }
2543
2544 #ifdef HAVE_SELINUX
2545 if (arg_selinux_context)
2546 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2547 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2548 #endif
2549
2550 r = change_uid_gid(arg_user, &home);
2551 if (r < 0)
2552 return r;
2553
2554 envp[n_env] = strv_find_prefix(environ, "TERM=");
2555 if (envp[n_env])
2556 n_env ++;
2557
2558 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2559 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2560 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2561 return log_oom();
2562
2563 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2564 char as_uuid[37];
2565
2566 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2567 return log_oom();
2568 }
2569
2570 if (fdset_size(fds) > 0) {
2571 r = fdset_cloexec(fds, false);
2572 if (r < 0)
2573 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2574
2575 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2576 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2577 return log_oom();
2578 }
2579
2580 env_use = strv_env_merge(2, envp, arg_setenv);
2581 if (!env_use)
2582 return log_oom();
2583
2584 /* Let the parent know that we are ready and
2585 * wait until the parent is ready with the
2586 * setup, too... */
2587 if (!barrier_place_and_sync(barrier)) { /* #4 */
2588 log_error("Parent died too early");
2589 return -ESRCH;
2590 }
2591
2592 /* Now, explicitly close the log, so that we
2593 * then can close all remaining fds. Closing
2594 * the log explicitly first has the benefit
2595 * that the logging subsystem knows about it,
2596 * and is thus ready to be reopened should we
2597 * need it again. Note that the other fds
2598 * closed here are at least the locking and
2599 * barrier fds. */
2600 log_close();
2601 (void) fdset_close_others(fds);
2602
2603 if (arg_boot) {
2604 char **a;
2605 size_t m;
2606
2607 /* Automatically search for the init system */
2608
2609 m = 1 + strv_length(arg_parameters);
2610 a = newa(char*, m + 1);
2611 if (strv_isempty(arg_parameters))
2612 a[1] = NULL;
2613 else
2614 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2615
2616 a[0] = (char*) "/usr/lib/systemd/systemd";
2617 execve(a[0], a, env_use);
2618
2619 a[0] = (char*) "/lib/systemd/systemd";
2620 execve(a[0], a, env_use);
2621
2622 a[0] = (char*) "/sbin/init";
2623 execve(a[0], a, env_use);
2624 } else if (!strv_isempty(arg_parameters))
2625 execvpe(arg_parameters[0], arg_parameters, env_use);
2626 else {
2627 chdir(home ?: "/root");
2628 execle("/bin/bash", "-bash", NULL, env_use);
2629 execle("/bin/sh", "-sh", NULL, env_use);
2630 }
2631
2632 (void) log_open();
2633 return log_error_errno(errno, "execv() failed: %m");
2634 }
2635
2636 static int outer_child(
2637 Barrier *barrier,
2638 const char *directory,
2639 const char *console,
2640 const char *root_device, bool root_device_rw,
2641 const char *home_device, bool home_device_rw,
2642 const char *srv_device, bool srv_device_rw,
2643 bool interactive,
2644 bool secondary,
2645 int pid_socket,
2646 int kmsg_socket,
2647 int rtnl_socket,
2648 int uid_shift_socket,
2649 FDSet *fds) {
2650
2651 pid_t pid;
2652 ssize_t l;
2653 int r;
2654
2655 assert(barrier);
2656 assert(directory);
2657 assert(console);
2658 assert(pid_socket >= 0);
2659 assert(kmsg_socket >= 0);
2660
2661 cg_unified_flush();
2662
2663 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2664 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2665
2666 if (interactive) {
2667 close_nointr(STDIN_FILENO);
2668 close_nointr(STDOUT_FILENO);
2669 close_nointr(STDERR_FILENO);
2670
2671 r = open_terminal(console, O_RDWR);
2672 if (r != STDIN_FILENO) {
2673 if (r >= 0) {
2674 safe_close(r);
2675 r = -EINVAL;
2676 }
2677
2678 return log_error_errno(r, "Failed to open console: %m");
2679 }
2680
2681 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2682 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2683 return log_error_errno(errno, "Failed to duplicate console: %m");
2684 }
2685
2686 r = reset_audit_loginuid();
2687 if (r < 0)
2688 return r;
2689
2690 /* Mark everything as slave, so that we still
2691 * receive mounts from the real root, but don't
2692 * propagate mounts to the real root. */
2693 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2694 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2695
2696 r = mount_devices(directory,
2697 root_device, root_device_rw,
2698 home_device, home_device_rw,
2699 srv_device, srv_device_rw);
2700 if (r < 0)
2701 return r;
2702
2703 r = determine_uid_shift(directory);
2704 if (r < 0)
2705 return r;
2706
2707 if (arg_userns) {
2708 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2709 if (l < 0)
2710 return log_error_errno(errno, "Failed to send UID shift: %m");
2711 if (l != sizeof(arg_uid_shift)) {
2712 log_error("Short write while sending UID shift.");
2713 return -EIO;
2714 }
2715 }
2716
2717 /* Turn directory into bind mount */
2718 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2719 return log_error_errno(errno, "Failed to make bind mount: %m");
2720
2721 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2722 if (r < 0)
2723 return r;
2724
2725 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2726 if (r < 0)
2727 return r;
2728
2729 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2730 if (r < 0)
2731 return r;
2732
2733 if (arg_read_only) {
2734 r = bind_remount_recursive(directory, true);
2735 if (r < 0)
2736 return log_error_errno(r, "Failed to make tree read-only: %m");
2737 }
2738
2739 r = mount_all(directory, false, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2740 if (r < 0)
2741 return r;
2742
2743 if (copy_devnodes(directory) < 0)
2744 return r;
2745
2746 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2747
2748 if (setup_pts(directory) < 0)
2749 return r;
2750
2751 r = setup_propagate(directory);
2752 if (r < 0)
2753 return r;
2754
2755 r = setup_dev_console(directory, console);
2756 if (r < 0)
2757 return r;
2758
2759 r = setup_seccomp();
2760 if (r < 0)
2761 return r;
2762
2763 r = setup_timezone(directory);
2764 if (r < 0)
2765 return r;
2766
2767 r = setup_resolv_conf(directory);
2768 if (r < 0)
2769 return r;
2770
2771 r = setup_journal(directory);
2772 if (r < 0)
2773 return r;
2774
2775 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2776 if (r < 0)
2777 return r;
2778
2779 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2780 if (r < 0)
2781 return r;
2782
2783 r = mount_move_root(directory);
2784 if (r < 0)
2785 return log_error_errno(r, "Failed to move root directory: %m");
2786
2787 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2788 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2789 (arg_private_network ? CLONE_NEWNET : 0) |
2790 (arg_userns ? CLONE_NEWUSER : 0),
2791 NULL);
2792 if (pid < 0)
2793 return log_error_errno(errno, "Failed to fork inner child: %m");
2794 if (pid == 0) {
2795 pid_socket = safe_close(pid_socket);
2796 uid_shift_socket = safe_close(uid_shift_socket);
2797
2798 /* The inner child has all namespaces that are
2799 * requested, so that we all are owned by the user if
2800 * user namespaces are turned on. */
2801
2802 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2803 if (r < 0)
2804 _exit(EXIT_FAILURE);
2805
2806 _exit(EXIT_SUCCESS);
2807 }
2808
2809 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2810 if (l < 0)
2811 return log_error_errno(errno, "Failed to send PID: %m");
2812 if (l != sizeof(pid)) {
2813 log_error("Short write while sending PID.");
2814 return -EIO;
2815 }
2816
2817 pid_socket = safe_close(pid_socket);
2818
2819 return 0;
2820 }
2821
2822 static int setup_uid_map(pid_t pid) {
2823 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2824 int r;
2825
2826 assert(pid > 1);
2827
2828 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2829 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2830 r = write_string_file(uid_map, line, 0);
2831 if (r < 0)
2832 return log_error_errno(r, "Failed to write UID map: %m");
2833
2834 /* We always assign the same UID and GID ranges */
2835 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2836 r = write_string_file(uid_map, line, 0);
2837 if (r < 0)
2838 return log_error_errno(r, "Failed to write GID map: %m");
2839
2840 return 0;
2841 }
2842
2843 static int load_settings(void) {
2844 _cleanup_(settings_freep) Settings *settings = NULL;
2845 _cleanup_fclose_ FILE *f = NULL;
2846 _cleanup_free_ char *p = NULL;
2847 const char *fn, *i;
2848 int r;
2849
2850 /* If all settings are masked, there's no point in looking for
2851 * the settings file */
2852 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2853 return 0;
2854
2855 fn = strjoina(arg_machine, ".nspawn");
2856
2857 /* We first look in the admin's directories in /etc and /run */
2858 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2859 _cleanup_free_ char *j = NULL;
2860
2861 j = strjoin(i, "/", fn, NULL);
2862 if (!j)
2863 return log_oom();
2864
2865 f = fopen(j, "re");
2866 if (f) {
2867 p = j;
2868 j = NULL;
2869
2870 /* By default we trust configuration from /etc and /run */
2871 if (arg_settings_trusted < 0)
2872 arg_settings_trusted = true;
2873
2874 break;
2875 }
2876
2877 if (errno != ENOENT)
2878 return log_error_errno(errno, "Failed to open %s: %m", j);
2879 }
2880
2881 if (!f) {
2882 /* After that, let's look for a file next to the
2883 * actual image we shall boot. */
2884
2885 if (arg_image) {
2886 p = file_in_same_dir(arg_image, fn);
2887 if (!p)
2888 return log_oom();
2889 } else if (arg_directory) {
2890 p = file_in_same_dir(arg_directory, fn);
2891 if (!p)
2892 return log_oom();
2893 }
2894
2895 if (p) {
2896 f = fopen(p, "re");
2897 if (!f && errno != ENOENT)
2898 return log_error_errno(errno, "Failed to open %s: %m", p);
2899
2900 /* By default we do not trust configuration from /var/lib/machines */
2901 if (arg_settings_trusted < 0)
2902 arg_settings_trusted = false;
2903 }
2904 }
2905
2906 if (!f)
2907 return 0;
2908
2909 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2910
2911 r = settings_load(f, p, &settings);
2912 if (r < 0)
2913 return r;
2914
2915 /* Copy over bits from the settings, unless they have been
2916 * explicitly masked by command line switches. */
2917
2918 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2919 settings->boot >= 0) {
2920 arg_boot = settings->boot;
2921
2922 strv_free(arg_parameters);
2923 arg_parameters = settings->parameters;
2924 settings->parameters = NULL;
2925 }
2926
2927 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2928 settings->environment) {
2929 strv_free(arg_setenv);
2930 arg_setenv = settings->environment;
2931 settings->environment = NULL;
2932 }
2933
2934 if ((arg_settings_mask & SETTING_USER) == 0 &&
2935 settings->user) {
2936 free(arg_user);
2937 arg_user = settings->user;
2938 settings->user = NULL;
2939 }
2940
2941 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2942
2943 if (!arg_settings_trusted && settings->capability != 0)
2944 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2945 else
2946 arg_retain |= settings->capability;
2947
2948 arg_retain &= ~settings->drop_capability;
2949 }
2950
2951 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2952 settings->kill_signal > 0)
2953 arg_kill_signal = settings->kill_signal;
2954
2955 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2956 settings->personality != PERSONALITY_INVALID)
2957 arg_personality = settings->personality;
2958
2959 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2960 !sd_id128_is_null(settings->machine_id)) {
2961
2962 if (!arg_settings_trusted)
2963 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2964 else
2965 arg_uuid = settings->machine_id;
2966 }
2967
2968 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2969 settings->read_only >= 0)
2970 arg_read_only = settings->read_only;
2971
2972 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2973 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2974 arg_volatile_mode = settings->volatile_mode;
2975
2976 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2977 settings->n_custom_mounts > 0) {
2978
2979 if (!arg_settings_trusted)
2980 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2981 else {
2982 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2983 arg_custom_mounts = settings->custom_mounts;
2984 arg_n_custom_mounts = settings->n_custom_mounts;
2985
2986 settings->custom_mounts = NULL;
2987 settings->n_custom_mounts = 0;
2988 }
2989 }
2990
2991 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2992 (settings->private_network >= 0 ||
2993 settings->network_veth >= 0 ||
2994 settings->network_bridge ||
2995 settings->network_interfaces ||
2996 settings->network_macvlan ||
2997 settings->network_ipvlan)) {
2998
2999 if (!arg_settings_trusted)
3000 log_warning("Ignoring network settings, file %s is not trusted.", p);
3001 else {
3002 strv_free(arg_network_interfaces);
3003 arg_network_interfaces = settings->network_interfaces;
3004 settings->network_interfaces = NULL;
3005
3006 strv_free(arg_network_macvlan);
3007 arg_network_macvlan = settings->network_macvlan;
3008 settings->network_macvlan = NULL;
3009
3010 strv_free(arg_network_ipvlan);
3011 arg_network_ipvlan = settings->network_ipvlan;
3012 settings->network_ipvlan = NULL;
3013
3014 free(arg_network_bridge);
3015 arg_network_bridge = settings->network_bridge;
3016 settings->network_bridge = NULL;
3017
3018 arg_network_veth = settings->network_veth > 0 || settings->network_bridge;
3019
3020 arg_private_network = true; /* all these settings imply private networking */
3021 }
3022 }
3023
3024 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3025 settings->expose_ports) {
3026
3027 if (!arg_settings_trusted)
3028 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3029 else {
3030 expose_port_free_all(arg_expose_ports);
3031 arg_expose_ports = settings->expose_ports;
3032 settings->expose_ports = NULL;
3033 }
3034 }
3035
3036 return 0;
3037 }
3038
3039 int main(int argc, char *argv[]) {
3040
3041 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3042 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3043 _cleanup_close_ int master = -1, image_fd = -1;
3044 _cleanup_fdset_free_ FDSet *fds = NULL;
3045 int r, n_fd_passed, loop_nr = -1;
3046 char veth_name[IFNAMSIZ];
3047 bool secondary = false, remove_subvol = false;
3048 sigset_t mask_chld;
3049 pid_t pid = 0;
3050 int ret = EXIT_SUCCESS;
3051 union in_addr_union exposed = {};
3052 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3053 bool interactive;
3054
3055 log_parse_environment();
3056 log_open();
3057
3058 r = parse_argv(argc, argv);
3059 if (r <= 0)
3060 goto finish;
3061
3062 if (geteuid() != 0) {
3063 log_error("Need to be root.");
3064 r = -EPERM;
3065 goto finish;
3066 }
3067 r = determine_names();
3068 if (r < 0)
3069 goto finish;
3070
3071 r = load_settings();
3072 if (r < 0)
3073 goto finish;
3074
3075 r = verify_arguments();
3076 if (r < 0)
3077 goto finish;
3078
3079 n_fd_passed = sd_listen_fds(false);
3080 if (n_fd_passed > 0) {
3081 r = fdset_new_listen_fds(&fds, false);
3082 if (r < 0) {
3083 log_error_errno(r, "Failed to collect file descriptors: %m");
3084 goto finish;
3085 }
3086 }
3087
3088 if (arg_directory) {
3089 assert(!arg_image);
3090
3091 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3092 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3093 r = -EINVAL;
3094 goto finish;
3095 }
3096
3097 if (arg_ephemeral) {
3098 _cleanup_free_ char *np = NULL;
3099
3100 /* If the specified path is a mount point we
3101 * generate the new snapshot immediately
3102 * inside it under a random name. However if
3103 * the specified is not a mount point we
3104 * create the new snapshot in the parent
3105 * directory, just next to it. */
3106 r = path_is_mount_point(arg_directory, 0);
3107 if (r < 0) {
3108 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3109 goto finish;
3110 }
3111 if (r > 0)
3112 r = tempfn_random_child(arg_directory, "machine.", &np);
3113 else
3114 r = tempfn_random(arg_directory, "machine.", &np);
3115 if (r < 0) {
3116 log_error_errno(r, "Failed to generate name for snapshot: %m");
3117 goto finish;
3118 }
3119
3120 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3121 if (r < 0) {
3122 log_error_errno(r, "Failed to lock %s: %m", np);
3123 goto finish;
3124 }
3125
3126 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3127 if (r < 0) {
3128 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3129 goto finish;
3130 }
3131
3132 free(arg_directory);
3133 arg_directory = np;
3134 np = NULL;
3135
3136 remove_subvol = true;
3137
3138 } else {
3139 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3140 if (r == -EBUSY) {
3141 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3142 goto finish;
3143 }
3144 if (r < 0) {
3145 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3146 return r;
3147 }
3148
3149 if (arg_template) {
3150 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3151 if (r == -EEXIST) {
3152 if (!arg_quiet)
3153 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3154 } else if (r < 0) {
3155 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3156 goto finish;
3157 } else {
3158 if (!arg_quiet)
3159 log_info("Populated %s from template %s.", arg_directory, arg_template);
3160 }
3161 }
3162 }
3163
3164 if (arg_boot) {
3165 if (path_is_os_tree(arg_directory) <= 0) {
3166 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3167 r = -EINVAL;
3168 goto finish;
3169 }
3170 } else {
3171 const char *p;
3172
3173 p = strjoina(arg_directory,
3174 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3175 if (access(p, F_OK) < 0) {
3176 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3177 r = -EINVAL;
3178 goto finish;
3179 }
3180 }
3181
3182 } else {
3183 char template[] = "/tmp/nspawn-root-XXXXXX";
3184
3185 assert(arg_image);
3186 assert(!arg_template);
3187
3188 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3189 if (r == -EBUSY) {
3190 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3191 goto finish;
3192 }
3193 if (r < 0) {
3194 r = log_error_errno(r, "Failed to create image lock: %m");
3195 goto finish;
3196 }
3197
3198 if (!mkdtemp(template)) {
3199 log_error_errno(errno, "Failed to create temporary directory: %m");
3200 r = -errno;
3201 goto finish;
3202 }
3203
3204 arg_directory = strdup(template);
3205 if (!arg_directory) {
3206 r = log_oom();
3207 goto finish;
3208 }
3209
3210 image_fd = setup_image(&device_path, &loop_nr);
3211 if (image_fd < 0) {
3212 r = image_fd;
3213 goto finish;
3214 }
3215
3216 r = dissect_image(image_fd,
3217 &root_device, &root_device_rw,
3218 &home_device, &home_device_rw,
3219 &srv_device, &srv_device_rw,
3220 &secondary);
3221 if (r < 0)
3222 goto finish;
3223 }
3224
3225 r = custom_mounts_prepare();
3226 if (r < 0)
3227 goto finish;
3228
3229 interactive =
3230 isatty(STDIN_FILENO) > 0 &&
3231 isatty(STDOUT_FILENO) > 0;
3232
3233 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3234 if (master < 0) {
3235 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3236 goto finish;
3237 }
3238
3239 r = ptsname_malloc(master, &console);
3240 if (r < 0) {
3241 r = log_error_errno(r, "Failed to determine tty name: %m");
3242 goto finish;
3243 }
3244
3245 if (unlockpt(master) < 0) {
3246 r = log_error_errno(errno, "Failed to unlock tty: %m");
3247 goto finish;
3248 }
3249
3250 if (!arg_quiet)
3251 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3252 arg_machine, arg_image ?: arg_directory);
3253
3254 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3255
3256 assert_se(sigemptyset(&mask_chld) == 0);
3257 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3258
3259 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3260 r = log_error_errno(errno, "Failed to become subreaper: %m");
3261 goto finish;
3262 }
3263
3264 for (;;) {
3265 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3266 uid_shift_socket_pair[2] = { -1, -1 };
3267 ContainerStatus container_status;
3268 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3269 static const struct sigaction sa = {
3270 .sa_handler = nop_handler,
3271 .sa_flags = SA_NOCLDSTOP,
3272 };
3273 int ifi = 0;
3274 ssize_t l;
3275 _cleanup_event_unref_ sd_event *event = NULL;
3276 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3277 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3278 char last_char = 0;
3279
3280 r = barrier_create(&barrier);
3281 if (r < 0) {
3282 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3283 goto finish;
3284 }
3285
3286 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3287 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3288 goto finish;
3289 }
3290
3291 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3292 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3293 goto finish;
3294 }
3295
3296 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3297 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3298 goto finish;
3299 }
3300
3301 if (arg_userns)
3302 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3303 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3304 goto finish;
3305 }
3306
3307 /* Child can be killed before execv(), so handle SIGCHLD
3308 * in order to interrupt parent's blocking calls and
3309 * give it a chance to call wait() and terminate. */
3310 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3311 if (r < 0) {
3312 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3313 goto finish;
3314 }
3315
3316 r = sigaction(SIGCHLD, &sa, NULL);
3317 if (r < 0) {
3318 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3319 goto finish;
3320 }
3321
3322 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3323 if (pid < 0) {
3324 if (errno == EINVAL)
3325 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3326 else
3327 r = log_error_errno(errno, "clone() failed: %m");
3328
3329 goto finish;
3330 }
3331
3332 if (pid == 0) {
3333 /* The outer child only has a file system namespace. */
3334 barrier_set_role(&barrier, BARRIER_CHILD);
3335
3336 master = safe_close(master);
3337
3338 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3339 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3340 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3341 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3342
3343 (void) reset_all_signal_handlers();
3344 (void) reset_signal_mask();
3345
3346 r = outer_child(&barrier,
3347 arg_directory,
3348 console,
3349 root_device, root_device_rw,
3350 home_device, home_device_rw,
3351 srv_device, srv_device_rw,
3352 interactive,
3353 secondary,
3354 pid_socket_pair[1],
3355 kmsg_socket_pair[1],
3356 rtnl_socket_pair[1],
3357 uid_shift_socket_pair[1],
3358 fds);
3359 if (r < 0)
3360 _exit(EXIT_FAILURE);
3361
3362 _exit(EXIT_SUCCESS);
3363 }
3364
3365 barrier_set_role(&barrier, BARRIER_PARENT);
3366
3367 fdset_free(fds);
3368 fds = NULL;
3369
3370 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3371 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3372 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3373
3374 /* Wait for the outer child. */
3375 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3376 if (r < 0)
3377 goto finish;
3378 if (r != 0) {
3379 r = -EIO;
3380 goto finish;
3381 }
3382 pid = 0;
3383
3384 /* And now retrieve the PID of the inner child. */
3385 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3386 if (l < 0) {
3387 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3388 goto finish;
3389 }
3390 if (l != sizeof(pid)) {
3391 log_error("Short read while reading inner child PID: %m");
3392 r = EIO;
3393 goto finish;
3394 }
3395
3396 log_debug("Init process invoked as PID " PID_FMT, pid);
3397
3398 if (arg_userns) {
3399 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3400 log_error("Child died too early.");
3401 r = -ESRCH;
3402 goto finish;
3403 }
3404
3405 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3406 if (l < 0) {
3407 r = log_error_errno(errno, "Failed to read UID shift: %m");
3408 goto finish;
3409 }
3410 if (l != sizeof(arg_uid_shift)) {
3411 log_error("Short read while reading UID shift: %m");
3412 r = EIO;
3413 goto finish;
3414 }
3415
3416 r = setup_uid_map(pid);
3417 if (r < 0)
3418 goto finish;
3419
3420 (void) barrier_place(&barrier); /* #2 */
3421 }
3422
3423 if (arg_private_network) {
3424
3425 r = move_network_interfaces(pid, arg_network_interfaces);
3426 if (r < 0)
3427 goto finish;
3428
3429 if (arg_network_veth) {
3430 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3431 if (r < 0)
3432 goto finish;
3433 else if (r > 0)
3434 ifi = r;
3435
3436 if (arg_network_bridge) {
3437 r = setup_bridge(veth_name, arg_network_bridge);
3438 if (r < 0)
3439 goto finish;
3440 if (r > 0)
3441 ifi = r;
3442 }
3443 }
3444
3445 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3446 if (r < 0)
3447 goto finish;
3448
3449 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3450 if (r < 0)
3451 goto finish;
3452 }
3453
3454 if (arg_register) {
3455 r = register_machine(
3456 arg_machine,
3457 pid,
3458 arg_directory,
3459 arg_uuid,
3460 ifi,
3461 arg_slice,
3462 arg_custom_mounts, arg_n_custom_mounts,
3463 arg_kill_signal,
3464 arg_property,
3465 arg_keep_unit);
3466 if (r < 0)
3467 goto finish;
3468 }
3469
3470 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3471 if (r < 0)
3472 goto finish;
3473
3474 if (arg_keep_unit) {
3475 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3476 if (r < 0)
3477 goto finish;
3478 }
3479
3480 r = chown_cgroup(pid, arg_uid_shift);
3481 if (r < 0)
3482 goto finish;
3483
3484 /* Notify the child that the parent is ready with all
3485 * its setup (including cgroup-ification), and that
3486 * the child can now hand over control to the code to
3487 * run inside the container. */
3488 (void) barrier_place(&barrier); /* #3 */
3489
3490 /* Block SIGCHLD here, before notifying child.
3491 * process_pty() will handle it with the other signals. */
3492 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3493
3494 /* Reset signal to default */
3495 r = default_signals(SIGCHLD, -1);
3496 if (r < 0) {
3497 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3498 goto finish;
3499 }
3500
3501 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3502 if (!barrier_place_and_sync(&barrier)) { /* #5 */
3503 log_error("Client died too early.");
3504 r = -ESRCH;
3505 goto finish;
3506 }
3507
3508 sd_notifyf(false,
3509 "READY=1\n"
3510 "STATUS=Container running.\n"
3511 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3512
3513 r = sd_event_new(&event);
3514 if (r < 0) {
3515 log_error_errno(r, "Failed to get default event source: %m");
3516 goto finish;
3517 }
3518
3519 if (arg_kill_signal > 0) {
3520 /* Try to kill the init system on SIGINT or SIGTERM */
3521 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3522 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3523 } else {
3524 /* Immediately exit */
3525 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3526 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3527 }
3528
3529 /* simply exit on sigchld */
3530 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3531
3532 if (arg_expose_ports) {
3533 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3534 if (r < 0)
3535 goto finish;
3536
3537 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3538 }
3539
3540 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3541
3542 r = pty_forward_new(event, master, true, !interactive, &forward);
3543 if (r < 0) {
3544 log_error_errno(r, "Failed to create PTY forwarder: %m");
3545 goto finish;
3546 }
3547
3548 r = sd_event_loop(event);
3549 if (r < 0) {
3550 log_error_errno(r, "Failed to run event loop: %m");
3551 goto finish;
3552 }
3553
3554 pty_forward_get_last_char(forward, &last_char);
3555
3556 forward = pty_forward_free(forward);
3557
3558 if (!arg_quiet && last_char != '\n')
3559 putc('\n', stdout);
3560
3561 /* Kill if it is not dead yet anyway */
3562 if (arg_register && !arg_keep_unit)
3563 terminate_machine(pid);
3564
3565 /* Normally redundant, but better safe than sorry */
3566 kill(pid, SIGKILL);
3567
3568 r = wait_for_container(pid, &container_status);
3569 pid = 0;
3570
3571 if (r < 0)
3572 /* We failed to wait for the container, or the
3573 * container exited abnormally */
3574 goto finish;
3575 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3576 /* The container exited with a non-zero
3577 * status, or with zero status and no reboot
3578 * was requested. */
3579 ret = r;
3580 break;
3581 }
3582
3583 /* CONTAINER_REBOOTED, loop again */
3584
3585 if (arg_keep_unit) {
3586 /* Special handling if we are running as a
3587 * service: instead of simply restarting the
3588 * machine we want to restart the entire
3589 * service, so let's inform systemd about this
3590 * with the special exit code 133. The service
3591 * file uses RestartForceExitStatus=133 so
3592 * that this results in a full nspawn
3593 * restart. This is necessary since we might
3594 * have cgroup parameters set we want to have
3595 * flushed out. */
3596 ret = 133;
3597 r = 0;
3598 break;
3599 }
3600
3601 expose_port_flush(arg_expose_ports, &exposed);
3602 }
3603
3604 finish:
3605 sd_notify(false,
3606 "STOPPING=1\n"
3607 "STATUS=Terminating...");
3608
3609 if (pid > 0)
3610 kill(pid, SIGKILL);
3611
3612 /* Try to flush whatever is still queued in the pty */
3613 if (master >= 0)
3614 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
3615
3616 loop_remove(loop_nr, &image_fd);
3617
3618 if (remove_subvol && arg_directory) {
3619 int k;
3620
3621 k = btrfs_subvol_remove(arg_directory, true);
3622 if (k < 0)
3623 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3624 }
3625
3626 if (arg_machine) {
3627 const char *p;
3628
3629 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3630 (void) rm_rf(p, REMOVE_ROOT);
3631 }
3632
3633 expose_port_flush(arg_expose_ports, &exposed);
3634
3635 free(arg_directory);
3636 free(arg_template);
3637 free(arg_image);
3638 free(arg_machine);
3639 free(arg_user);
3640 strv_free(arg_setenv);
3641 free(arg_network_bridge);
3642 strv_free(arg_network_interfaces);
3643 strv_free(arg_network_macvlan);
3644 strv_free(arg_network_ipvlan);
3645 strv_free(arg_parameters);
3646 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3647 expose_port_free_all(arg_expose_ports);
3648
3649 return r < 0 ? EXIT_FAILURE : ret;
3650 }