]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
util-lib: get_current_dir_name() can return errors other than ENOMEM
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #ifdef HAVE_BLKID
23 #include <blkid/blkid.h>
24 #endif
25 #include <errno.h>
26 #include <getopt.h>
27 #include <linux/loop.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "barrier.h"
50 #include "base-filesystem.h"
51 #include "blkid-util.h"
52 #include "btrfs-util.h"
53 #include "cap-list.h"
54 #include "capability.h"
55 #include "cgroup-util.h"
56 #include "copy.h"
57 #include "dev-setup.h"
58 #include "env-util.h"
59 #include "event-util.h"
60 #include "fdset.h"
61 #include "fileio.h"
62 #include "formats-util.h"
63 #include "gpt.h"
64 #include "hostname-util.h"
65 #include "log.h"
66 #include "loopback-setup.h"
67 #include "machine-image.h"
68 #include "macro.h"
69 #include "missing.h"
70 #include "mkdir.h"
71 #include "netlink-util.h"
72 #include "path-util.h"
73 #include "process-util.h"
74 #include "ptyfwd.h"
75 #include "random-util.h"
76 #include "rm-rf.h"
77 #ifdef HAVE_SECCOMP
78 #include "seccomp-util.h"
79 #endif
80 #include "signal-util.h"
81 #include "strv.h"
82 #include "terminal-util.h"
83 #include "udev-util.h"
84 #include "util.h"
85
86 #include "nspawn-cgroup.h"
87 #include "nspawn-expose-ports.h"
88 #include "nspawn-mount.h"
89 #include "nspawn-network.h"
90 #include "nspawn-register.h"
91 #include "nspawn-settings.h"
92 #include "nspawn-setuid.h"
93
/* Result states of a container run; the names distinguish a plain
 * termination from a reboot request. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
98
/* Journal linking modes, selected with --link-journal= ("no", "auto",
 * "host", "guest" and the "try-" variants of the latter two). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
105
106 static char *arg_directory = NULL;
107 static char *arg_template = NULL;
108 static char *arg_user = NULL;
109 static sd_id128_t arg_uuid = {};
110 static char *arg_machine = NULL;
111 static const char *arg_selinux_context = NULL;
112 static const char *arg_selinux_apifs_context = NULL;
113 static const char *arg_slice = NULL;
114 static bool arg_private_network = false;
115 static bool arg_read_only = false;
116 static bool arg_boot = false;
117 static bool arg_ephemeral = false;
118 static LinkJournal arg_link_journal = LINK_AUTO;
119 static bool arg_link_journal_try = false;
120 static uint64_t arg_retain =
121 (1ULL << CAP_CHOWN) |
122 (1ULL << CAP_DAC_OVERRIDE) |
123 (1ULL << CAP_DAC_READ_SEARCH) |
124 (1ULL << CAP_FOWNER) |
125 (1ULL << CAP_FSETID) |
126 (1ULL << CAP_IPC_OWNER) |
127 (1ULL << CAP_KILL) |
128 (1ULL << CAP_LEASE) |
129 (1ULL << CAP_LINUX_IMMUTABLE) |
130 (1ULL << CAP_NET_BIND_SERVICE) |
131 (1ULL << CAP_NET_BROADCAST) |
132 (1ULL << CAP_NET_RAW) |
133 (1ULL << CAP_SETGID) |
134 (1ULL << CAP_SETFCAP) |
135 (1ULL << CAP_SETPCAP) |
136 (1ULL << CAP_SETUID) |
137 (1ULL << CAP_SYS_ADMIN) |
138 (1ULL << CAP_SYS_CHROOT) |
139 (1ULL << CAP_SYS_NICE) |
140 (1ULL << CAP_SYS_PTRACE) |
141 (1ULL << CAP_SYS_TTY_CONFIG) |
142 (1ULL << CAP_SYS_RESOURCE) |
143 (1ULL << CAP_SYS_BOOT) |
144 (1ULL << CAP_AUDIT_WRITE) |
145 (1ULL << CAP_AUDIT_CONTROL) |
146 (1ULL << CAP_MKNOD);
147 static CustomMount *arg_custom_mounts = NULL;
148 static unsigned arg_n_custom_mounts = 0;
149 static char **arg_setenv = NULL;
150 static bool arg_quiet = false;
151 static bool arg_share_system = false;
152 static bool arg_register = true;
153 static bool arg_keep_unit = false;
154 static char **arg_network_interfaces = NULL;
155 static char **arg_network_macvlan = NULL;
156 static char **arg_network_ipvlan = NULL;
157 static bool arg_network_veth = false;
158 static char *arg_network_bridge = NULL;
159 static unsigned long arg_personality = PERSONALITY_INVALID;
160 static char *arg_image = NULL;
161 static VolatileMode arg_volatile_mode = VOLATILE_NO;
162 static ExposePort *arg_expose_ports = NULL;
163 static char **arg_property = NULL;
164 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
165 static bool arg_userns = false;
166 static int arg_kill_signal = 0;
167 static bool arg_unified_cgroup_hierarchy = false;
168 static SettingsMask arg_settings_mask = 0;
169 static int arg_settings_trusted = -1;
170 static char **arg_parameters = NULL;
171
172 static void help(void) {
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
175 " -h --help Show this help\n"
176 " --version Print version string\n"
177 " -q --quiet Do not show status information\n"
178 " -D --directory=PATH Root directory for the container\n"
179 " --template=PATH Initialize root directory from template directory,\n"
180 " if missing\n"
181 " -x --ephemeral Run container with snapshot of root directory, and\n"
182 " remove it after exit\n"
183 " -i --image=PATH File system device or disk image for the container\n"
184 " -b --boot Boot up full system (i.e. invoke init)\n"
185 " -u --user=USER Run the command under specified user or uid\n"
186 " -M --machine=NAME Set the machine name for the container\n"
187 " --uuid=UUID Set a specific machine UUID for the container\n"
188 " -S --slice=SLICE Place the container in the specified slice\n"
189 " --property=NAME=VALUE Set scope unit property\n"
190 " --private-users[=UIDBASE[:NUIDS]]\n"
191 " Run within user namespace\n"
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
195 " container\n"
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
199 " --network-ipvlan=INTERFACE\n"
200 " Create a ipvlan network interface based on an\n"
201 " existing network interface to the container\n"
202 " -n --network-veth Add a virtual ethernet connection between host\n"
203 " and container\n"
204 " --network-bridge=INTERFACE\n"
205 " Add a virtual ethernet connection between host\n"
206 " and container and add it to an existing bridge on\n"
207 " the host\n"
208 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
209 " Expose a container IP port on the host\n"
210 " -Z --selinux-context=SECLABEL\n"
211 " Set the SELinux security context to be used by\n"
212 " processes in the container\n"
213 " -L --selinux-apifs-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " API/tmpfs file systems in the container\n"
216 " --capability=CAP In addition to the default, retain specified\n"
217 " capability\n"
218 " --drop-capability=CAP Drop the specified capability from the default set\n"
219 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
220 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
221 " try-guest, try-host\n"
222 " -j Equivalent to --link-journal=try-guest\n"
223 " --read-only Mount the root directory read-only\n"
224 " --bind=PATH[:PATH[:OPTIONS]]\n"
225 " Bind mount a file or directory from the host into\n"
226 " the container\n"
227 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
228 " Similar, but creates a read-only bind mount\n"
229 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
230 " --overlay=PATH[:PATH...]:PATH\n"
231 " Create an overlay mount from the host to \n"
232 " the container\n"
233 " --overlay-ro=PATH[:PATH...]:PATH\n"
234 " Similar, but creates a read-only overlay mount\n"
235 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
236 " --share-system Share system namespaces with host\n"
237 " --register=BOOLEAN Register container as machine\n"
238 " --keep-unit Do not register a scope for the machine, reuse\n"
239 " the service unit nspawn is running in\n"
240 " --volatile[=MODE] Run the system in volatile mode\n"
241 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
242 , program_invocation_short_name);
243 }
244
245
246 static int custom_mounts_prepare(void) {
247 unsigned i;
248 int r;
249
250 /* Ensure the mounts are applied prefix first. */
251 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
252
253 /* Allocate working directories for the overlay file systems that need it */
254 for (i = 0; i < arg_n_custom_mounts; i++) {
255 CustomMount *m = &arg_custom_mounts[i];
256
257 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
258 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
259 return -EINVAL;
260 }
261
262 if (m->type != CUSTOM_MOUNT_OVERLAY)
263 continue;
264
265 if (m->work_dir)
266 continue;
267
268 if (m->read_only)
269 continue;
270
271 r = tempfn_random(m->source, NULL, &m->work_dir);
272 if (r < 0)
273 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
274 }
275
276 return 0;
277 }
278
/* Replaces *b with a cleaned-up version of 'path'.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT)
 * it is made absolute relative to the current working directory instead.
 * The previous value of *b is freed.
 *
 * Returns 0 on success, a negative errno-style value otherwise. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;
        int r;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* The path does not exist (yet), so settle for making it absolute. */
                r = path_make_absolute_cwd(path, &resolved);
                if (r < 0)
                        return r;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
300
301 static int detect_unified_cgroup_hierarchy(void) {
302 const char *e;
303 int r;
304
305 /* Allow the user to control whether the unified hierarchy is used */
306 e = getenv("UNIFIED_CGROUP_HIERARCHY");
307 if (e) {
308 r = parse_boolean(e);
309 if (r < 0)
310 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
311
312 arg_unified_cgroup_hierarchy = r;
313 return 0;
314 }
315
316 /* Otherwise inherit the default from the host system */
317 r = cg_unified();
318 if (r < 0)
319 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
320
321 arg_unified_cgroup_hierarchy = r;
322 return 0;
323 }
324
325 static int parse_argv(int argc, char *argv[]) {
326
327 enum {
328 ARG_VERSION = 0x100,
329 ARG_PRIVATE_NETWORK,
330 ARG_UUID,
331 ARG_READ_ONLY,
332 ARG_CAPABILITY,
333 ARG_DROP_CAPABILITY,
334 ARG_LINK_JOURNAL,
335 ARG_BIND,
336 ARG_BIND_RO,
337 ARG_TMPFS,
338 ARG_OVERLAY,
339 ARG_OVERLAY_RO,
340 ARG_SETENV,
341 ARG_SHARE_SYSTEM,
342 ARG_REGISTER,
343 ARG_KEEP_UNIT,
344 ARG_NETWORK_INTERFACE,
345 ARG_NETWORK_MACVLAN,
346 ARG_NETWORK_IPVLAN,
347 ARG_NETWORK_BRIDGE,
348 ARG_PERSONALITY,
349 ARG_VOLATILE,
350 ARG_TEMPLATE,
351 ARG_PROPERTY,
352 ARG_PRIVATE_USERS,
353 ARG_KILL_SIGNAL,
354 ARG_SETTINGS,
355 };
356
357 static const struct option options[] = {
358 { "help", no_argument, NULL, 'h' },
359 { "version", no_argument, NULL, ARG_VERSION },
360 { "directory", required_argument, NULL, 'D' },
361 { "template", required_argument, NULL, ARG_TEMPLATE },
362 { "ephemeral", no_argument, NULL, 'x' },
363 { "user", required_argument, NULL, 'u' },
364 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
365 { "boot", no_argument, NULL, 'b' },
366 { "uuid", required_argument, NULL, ARG_UUID },
367 { "read-only", no_argument, NULL, ARG_READ_ONLY },
368 { "capability", required_argument, NULL, ARG_CAPABILITY },
369 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
370 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
371 { "bind", required_argument, NULL, ARG_BIND },
372 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
373 { "tmpfs", required_argument, NULL, ARG_TMPFS },
374 { "overlay", required_argument, NULL, ARG_OVERLAY },
375 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
376 { "machine", required_argument, NULL, 'M' },
377 { "slice", required_argument, NULL, 'S' },
378 { "setenv", required_argument, NULL, ARG_SETENV },
379 { "selinux-context", required_argument, NULL, 'Z' },
380 { "selinux-apifs-context", required_argument, NULL, 'L' },
381 { "quiet", no_argument, NULL, 'q' },
382 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
383 { "register", required_argument, NULL, ARG_REGISTER },
384 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
385 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
386 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
387 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
388 { "network-veth", no_argument, NULL, 'n' },
389 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
390 { "personality", required_argument, NULL, ARG_PERSONALITY },
391 { "image", required_argument, NULL, 'i' },
392 { "volatile", optional_argument, NULL, ARG_VOLATILE },
393 { "port", required_argument, NULL, 'p' },
394 { "property", required_argument, NULL, ARG_PROPERTY },
395 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
396 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
397 { "settings", required_argument, NULL, ARG_SETTINGS },
398 {}
399 };
400
401 int c, r;
402 uint64_t plus = 0, minus = 0;
403 bool mask_all_settings = false, mask_no_settings = false;
404
405 assert(argc >= 0);
406 assert(argv);
407
408 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
409
410 switch (c) {
411
412 case 'h':
413 help();
414 return 0;
415
416 case ARG_VERSION:
417 return version();
418
419 case 'D':
420 r = set_sanitized_path(&arg_directory, optarg);
421 if (r < 0)
422 return log_error_errno(r, "Invalid root directory: %m");
423
424 break;
425
426 case ARG_TEMPLATE:
427 r = set_sanitized_path(&arg_template, optarg);
428 if (r < 0)
429 return log_error_errno(r, "Invalid template directory: %m");
430
431 break;
432
433 case 'i':
434 r = set_sanitized_path(&arg_image, optarg);
435 if (r < 0)
436 return log_error_errno(r, "Invalid image path: %m");
437
438 break;
439
440 case 'x':
441 arg_ephemeral = true;
442 break;
443
444 case 'u':
445 r = free_and_strdup(&arg_user, optarg);
446 if (r < 0)
447 return log_oom();
448
449 arg_settings_mask |= SETTING_USER;
450 break;
451
452 case ARG_NETWORK_BRIDGE:
453 r = free_and_strdup(&arg_network_bridge, optarg);
454 if (r < 0)
455 return log_oom();
456
457 /* fall through */
458
459 case 'n':
460 arg_network_veth = true;
461 arg_private_network = true;
462 arg_settings_mask |= SETTING_NETWORK;
463 break;
464
465 case ARG_NETWORK_INTERFACE:
466 if (strv_extend(&arg_network_interfaces, optarg) < 0)
467 return log_oom();
468
469 arg_private_network = true;
470 arg_settings_mask |= SETTING_NETWORK;
471 break;
472
473 case ARG_NETWORK_MACVLAN:
474 if (strv_extend(&arg_network_macvlan, optarg) < 0)
475 return log_oom();
476
477 arg_private_network = true;
478 arg_settings_mask |= SETTING_NETWORK;
479 break;
480
481 case ARG_NETWORK_IPVLAN:
482 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
483 return log_oom();
484
485 /* fall through */
486
487 case ARG_PRIVATE_NETWORK:
488 arg_private_network = true;
489 arg_settings_mask |= SETTING_NETWORK;
490 break;
491
492 case 'b':
493 arg_boot = true;
494 arg_settings_mask |= SETTING_BOOT;
495 break;
496
497 case ARG_UUID:
498 r = sd_id128_from_string(optarg, &arg_uuid);
499 if (r < 0) {
500 log_error("Invalid UUID: %s", optarg);
501 return r;
502 }
503
504 arg_settings_mask |= SETTING_MACHINE_ID;
505 break;
506
507 case 'S':
508 arg_slice = optarg;
509 break;
510
511 case 'M':
512 if (isempty(optarg))
513 arg_machine = mfree(arg_machine);
514 else {
515 if (!machine_name_is_valid(optarg)) {
516 log_error("Invalid machine name: %s", optarg);
517 return -EINVAL;
518 }
519
520 r = free_and_strdup(&arg_machine, optarg);
521 if (r < 0)
522 return log_oom();
523
524 break;
525 }
526
527 case 'Z':
528 arg_selinux_context = optarg;
529 break;
530
531 case 'L':
532 arg_selinux_apifs_context = optarg;
533 break;
534
535 case ARG_READ_ONLY:
536 arg_read_only = true;
537 arg_settings_mask |= SETTING_READ_ONLY;
538 break;
539
540 case ARG_CAPABILITY:
541 case ARG_DROP_CAPABILITY: {
542 const char *state, *word;
543 size_t length;
544
545 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
546 _cleanup_free_ char *t;
547
548 t = strndup(word, length);
549 if (!t)
550 return log_oom();
551
552 if (streq(t, "all")) {
553 if (c == ARG_CAPABILITY)
554 plus = (uint64_t) -1;
555 else
556 minus = (uint64_t) -1;
557 } else {
558 int cap;
559
560 cap = capability_from_name(t);
561 if (cap < 0) {
562 log_error("Failed to parse capability %s.", t);
563 return -EINVAL;
564 }
565
566 if (c == ARG_CAPABILITY)
567 plus |= 1ULL << (uint64_t) cap;
568 else
569 minus |= 1ULL << (uint64_t) cap;
570 }
571 }
572
573 arg_settings_mask |= SETTING_CAPABILITY;
574 break;
575 }
576
577 case 'j':
578 arg_link_journal = LINK_GUEST;
579 arg_link_journal_try = true;
580 break;
581
582 case ARG_LINK_JOURNAL:
583 if (streq(optarg, "auto")) {
584 arg_link_journal = LINK_AUTO;
585 arg_link_journal_try = false;
586 } else if (streq(optarg, "no")) {
587 arg_link_journal = LINK_NO;
588 arg_link_journal_try = false;
589 } else if (streq(optarg, "guest")) {
590 arg_link_journal = LINK_GUEST;
591 arg_link_journal_try = false;
592 } else if (streq(optarg, "host")) {
593 arg_link_journal = LINK_HOST;
594 arg_link_journal_try = false;
595 } else if (streq(optarg, "try-guest")) {
596 arg_link_journal = LINK_GUEST;
597 arg_link_journal_try = true;
598 } else if (streq(optarg, "try-host")) {
599 arg_link_journal = LINK_HOST;
600 arg_link_journal_try = true;
601 } else {
602 log_error("Failed to parse link journal mode %s", optarg);
603 return -EINVAL;
604 }
605
606 break;
607
608 case ARG_BIND:
609 case ARG_BIND_RO:
610 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
611 if (r < 0)
612 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
613
614 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
615 break;
616
617 case ARG_TMPFS:
618 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
619 if (r < 0)
620 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
621
622 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
623 break;
624
625 case ARG_OVERLAY:
626 case ARG_OVERLAY_RO: {
627 _cleanup_free_ char *upper = NULL, *destination = NULL;
628 _cleanup_strv_free_ char **lower = NULL;
629 CustomMount *m;
630 unsigned n = 0;
631 char **i;
632
633 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
634 if (r == -ENOMEM)
635 return log_oom();
636 else if (r < 0) {
637 log_error("Invalid overlay specification: %s", optarg);
638 return r;
639 }
640
641 STRV_FOREACH(i, lower) {
642 if (!path_is_absolute(*i)) {
643 log_error("Overlay path %s is not absolute.", *i);
644 return -EINVAL;
645 }
646
647 n++;
648 }
649
650 if (n < 2) {
651 log_error("--overlay= needs at least two colon-separated directories specified.");
652 return -EINVAL;
653 }
654
655 if (n == 2) {
656 /* If two parameters are specified,
657 * the first one is the lower, the
658 * second one the upper directory. And
659 * we'll also define the destination
660 * mount point the same as the upper. */
661 upper = lower[1];
662 lower[1] = NULL;
663
664 destination = strdup(upper);
665 if (!destination)
666 return log_oom();
667
668 } else {
669 upper = lower[n - 2];
670 destination = lower[n - 1];
671 lower[n - 2] = NULL;
672 }
673
674 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
675 if (!m)
676 return log_oom();
677
678 m->destination = destination;
679 m->source = upper;
680 m->lower = lower;
681 m->read_only = c == ARG_OVERLAY_RO;
682
683 upper = destination = NULL;
684 lower = NULL;
685
686 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
687 break;
688 }
689
690 case ARG_SETENV: {
691 char **n;
692
693 if (!env_assignment_is_valid(optarg)) {
694 log_error("Environment variable assignment '%s' is not valid.", optarg);
695 return -EINVAL;
696 }
697
698 n = strv_env_set(arg_setenv, optarg);
699 if (!n)
700 return log_oom();
701
702 strv_free(arg_setenv);
703 arg_setenv = n;
704
705 arg_settings_mask |= SETTING_ENVIRONMENT;
706 break;
707 }
708
709 case 'q':
710 arg_quiet = true;
711 break;
712
713 case ARG_SHARE_SYSTEM:
714 arg_share_system = true;
715 break;
716
717 case ARG_REGISTER:
718 r = parse_boolean(optarg);
719 if (r < 0) {
720 log_error("Failed to parse --register= argument: %s", optarg);
721 return r;
722 }
723
724 arg_register = r;
725 break;
726
727 case ARG_KEEP_UNIT:
728 arg_keep_unit = true;
729 break;
730
731 case ARG_PERSONALITY:
732
733 arg_personality = personality_from_string(optarg);
734 if (arg_personality == PERSONALITY_INVALID) {
735 log_error("Unknown or unsupported personality '%s'.", optarg);
736 return -EINVAL;
737 }
738
739 arg_settings_mask |= SETTING_PERSONALITY;
740 break;
741
742 case ARG_VOLATILE:
743
744 if (!optarg)
745 arg_volatile_mode = VOLATILE_YES;
746 else {
747 VolatileMode m;
748
749 m = volatile_mode_from_string(optarg);
750 if (m < 0) {
751 log_error("Failed to parse --volatile= argument: %s", optarg);
752 return -EINVAL;
753 } else
754 arg_volatile_mode = m;
755 }
756
757 arg_settings_mask |= SETTING_VOLATILE_MODE;
758 break;
759
760 case 'p':
761 r = expose_port_parse(&arg_expose_ports, optarg);
762 if (r == -EEXIST)
763 return log_error_errno(r, "Duplicate port specification: %s", optarg);
764 if (r < 0)
765 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
766
767 arg_settings_mask |= SETTING_EXPOSE_PORTS;
768 break;
769
770 case ARG_PROPERTY:
771 if (strv_extend(&arg_property, optarg) < 0)
772 return log_oom();
773
774 break;
775
776 case ARG_PRIVATE_USERS:
777 if (optarg) {
778 _cleanup_free_ char *buffer = NULL;
779 const char *range, *shift;
780
781 range = strchr(optarg, ':');
782 if (range) {
783 buffer = strndup(optarg, range - optarg);
784 if (!buffer)
785 return log_oom();
786 shift = buffer;
787
788 range++;
789 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
790 log_error("Failed to parse UID range: %s", range);
791 return -EINVAL;
792 }
793 } else
794 shift = optarg;
795
796 if (parse_uid(shift, &arg_uid_shift) < 0) {
797 log_error("Failed to parse UID: %s", optarg);
798 return -EINVAL;
799 }
800 }
801
802 arg_userns = true;
803 break;
804
805 case ARG_KILL_SIGNAL:
806 arg_kill_signal = signal_from_string_try_harder(optarg);
807 if (arg_kill_signal < 0) {
808 log_error("Cannot parse signal: %s", optarg);
809 return -EINVAL;
810 }
811
812 arg_settings_mask |= SETTING_KILL_SIGNAL;
813 break;
814
815 case ARG_SETTINGS:
816
817 /* no → do not read files
818 * yes → read files, do not override cmdline, trust only subset
819 * override → read files, override cmdline, trust only subset
820 * trusted → read files, do not override cmdline, trust all
821 */
822
823 r = parse_boolean(optarg);
824 if (r < 0) {
825 if (streq(optarg, "trusted")) {
826 mask_all_settings = false;
827 mask_no_settings = false;
828 arg_settings_trusted = true;
829
830 } else if (streq(optarg, "override")) {
831 mask_all_settings = false;
832 mask_no_settings = true;
833 arg_settings_trusted = -1;
834 } else
835 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
836 } else if (r > 0) {
837 /* yes */
838 mask_all_settings = false;
839 mask_no_settings = false;
840 arg_settings_trusted = -1;
841 } else {
842 /* no */
843 mask_all_settings = true;
844 mask_no_settings = false;
845 arg_settings_trusted = false;
846 }
847
848 break;
849
850 case '?':
851 return -EINVAL;
852
853 default:
854 assert_not_reached("Unhandled option");
855 }
856
857 if (arg_share_system)
858 arg_register = false;
859
860 if (arg_boot && arg_share_system) {
861 log_error("--boot and --share-system may not be combined.");
862 return -EINVAL;
863 }
864
865 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
866 log_error("--keep-unit may not be used when invoked from a user session.");
867 return -EINVAL;
868 }
869
870 if (arg_directory && arg_image) {
871 log_error("--directory= and --image= may not be combined.");
872 return -EINVAL;
873 }
874
875 if (arg_template && arg_image) {
876 log_error("--template= and --image= may not be combined.");
877 return -EINVAL;
878 }
879
880 if (arg_template && !(arg_directory || arg_machine)) {
881 log_error("--template= needs --directory= or --machine=.");
882 return -EINVAL;
883 }
884
885 if (arg_ephemeral && arg_template) {
886 log_error("--ephemeral and --template= may not be combined.");
887 return -EINVAL;
888 }
889
890 if (arg_ephemeral && arg_image) {
891 log_error("--ephemeral and --image= may not be combined.");
892 return -EINVAL;
893 }
894
895 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
896 log_error("--ephemeral and --link-journal= may not be combined.");
897 return -EINVAL;
898 }
899
900 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
901 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
902
903 if (argc > optind) {
904 arg_parameters = strv_copy(argv + optind);
905 if (!arg_parameters)
906 return log_oom();
907
908 arg_settings_mask |= SETTING_BOOT;
909 }
910
911 /* Load all settings from .nspawn files */
912 if (mask_no_settings)
913 arg_settings_mask = 0;
914
915 /* Don't load any settings from .nspawn files */
916 if (mask_all_settings)
917 arg_settings_mask = _SETTINGS_MASK_ALL;
918
919 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
920
921 r = detect_unified_cgroup_hierarchy();
922 if (r < 0)
923 return r;
924
925 return 1;
926 }
927
928 static int verify_arguments(void) {
929
930 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
931 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
932 return -EINVAL;
933 }
934
935 if (arg_expose_ports && !arg_private_network) {
936 log_error("Cannot use --port= without private networking.");
937 return -EINVAL;
938 }
939
940 if (arg_boot && arg_kill_signal <= 0)
941 arg_kill_signal = SIGRTMIN+3;
942
943 return 0;
944 }
945
946 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
947 assert(p);
948
949 if (!arg_userns)
950 return 0;
951
952 if (uid == UID_INVALID && gid == GID_INVALID)
953 return 0;
954
955 if (uid != UID_INVALID) {
956 uid += arg_uid_shift;
957
958 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
959 return -EOVERFLOW;
960 }
961
962 if (gid != GID_INVALID) {
963 gid += (gid_t) arg_uid_shift;
964
965 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
966 return -EOVERFLOW;
967 }
968
969 if (lchown(p, uid, gid) < 0)
970 return -errno;
971
972 return 0;
973 }
974
/* Creates 'path' below 'root' with the given mode and hands it to
 * userns_lchown() for the ownership change.
 *
 * A directory that already exists is left alone (no chown) and counts as
 * success. Returns 0 on success, a negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
987
988 static int setup_timezone(const char *dest) {
989 _cleanup_free_ char *p = NULL, *q = NULL;
990 const char *where, *check, *what;
991 char *z, *y;
992 int r;
993
994 assert(dest);
995
996 /* Fix the timezone, if possible */
997 r = readlink_malloc("/etc/localtime", &p);
998 if (r < 0) {
999 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1000 return 0;
1001 }
1002
1003 z = path_startswith(p, "../usr/share/zoneinfo/");
1004 if (!z)
1005 z = path_startswith(p, "/usr/share/zoneinfo/");
1006 if (!z) {
1007 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1008 return 0;
1009 }
1010
1011 where = prefix_roota(dest, "/etc/localtime");
1012 r = readlink_malloc(where, &q);
1013 if (r >= 0) {
1014 y = path_startswith(q, "../usr/share/zoneinfo/");
1015 if (!y)
1016 y = path_startswith(q, "/usr/share/zoneinfo/");
1017
1018 /* Already pointing to the right place? Then do nothing .. */
1019 if (y && streq(y, z))
1020 return 0;
1021 }
1022
1023 check = strjoina("/usr/share/zoneinfo/", z);
1024 check = prefix_root(dest, check);
1025 if (laccess(check, F_OK) < 0) {
1026 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1027 return 0;
1028 }
1029
1030 r = unlink(where);
1031 if (r < 0 && errno != ENOENT) {
1032 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1033 return 0;
1034 }
1035
1036 what = strjoina("../usr/share/zoneinfo/", z);
1037 if (symlink(what, where) < 0) {
1038 log_error_errno(errno, "Failed to correct timezone of container: %m");
1039 return 0;
1040 }
1041
1042 r = userns_lchown(where, 0, 0);
1043 if (r < 0)
1044 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1045
1046 return 0;
1047 }
1048
1049 static int setup_resolv_conf(const char *dest) {
1050 const char *where = NULL;
1051 int r;
1052
1053 assert(dest);
1054
1055 if (arg_private_network)
1056 return 0;
1057
1058 /* Fix resolv.conf, if possible */
1059 where = prefix_roota(dest, "/etc/resolv.conf");
1060
1061 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1062 if (r < 0) {
1063 /* If the file already exists as symlink, let's
1064 * suppress the warning, under the assumption that
1065 * resolved or something similar runs inside and the
1066 * symlink points there.
1067 *
1068 * If the disk image is read-only, there's also no
1069 * point in complaining.
1070 */
1071 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1072 "Failed to copy /etc/resolv.conf to %s: %m", where);
1073 return 0;
1074 }
1075
1076 r = userns_lchown(where, 0, 0);
1077 if (r < 0)
1078 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1079
1080 return 0;
1081 }
1082
1083 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1084 assert(s);
1085
1086 snprintf(s, 37,
1087 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1088 SD_ID128_FORMAT_VAL(id));
1089
1090 return s;
1091 }
1092
1093 static int setup_boot_id(const char *dest) {
1094 const char *from, *to;
1095 sd_id128_t rnd = {};
1096 char as_uuid[37];
1097 int r;
1098
1099 if (arg_share_system)
1100 return 0;
1101
1102 /* Generate a new randomized boot ID, so that each boot-up of
1103 * the container gets a new one */
1104
1105 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1106 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1107
1108 r = sd_id128_randomize(&rnd);
1109 if (r < 0)
1110 return log_error_errno(r, "Failed to generate random boot id: %m");
1111
1112 id128_format_as_uuid(rnd, as_uuid);
1113
1114 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1115 if (r < 0)
1116 return log_error_errno(r, "Failed to write boot id: %m");
1117
1118 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1119 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1120 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1121 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1122
1123 unlink(from);
1124 return r;
1125 }
1126
1127 static int copy_devnodes(const char *dest) {
1128
1129 static const char devnodes[] =
1130 "null\0"
1131 "zero\0"
1132 "full\0"
1133 "random\0"
1134 "urandom\0"
1135 "tty\0"
1136 "net/tun\0";
1137
1138 const char *d;
1139 int r = 0;
1140 _cleanup_umask_ mode_t u;
1141
1142 assert(dest);
1143
1144 u = umask(0000);
1145
1146 /* Create /dev/net, so that we can create /dev/net/tun in it */
1147 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1148 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1149
1150 NULSTR_FOREACH(d, devnodes) {
1151 _cleanup_free_ char *from = NULL, *to = NULL;
1152 struct stat st;
1153
1154 from = strappend("/dev/", d);
1155 to = prefix_root(dest, from);
1156
1157 if (stat(from, &st) < 0) {
1158
1159 if (errno != ENOENT)
1160 return log_error_errno(errno, "Failed to stat %s: %m", from);
1161
1162 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1163
1164 log_error("%s is not a char or block device, cannot copy.", from);
1165 return -EIO;
1166
1167 } else {
1168 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1169 if (errno != EPERM)
1170 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1171
1172 /* Some systems abusively restrict mknod but
1173 * allow bind mounts. */
1174 r = touch(to);
1175 if (r < 0)
1176 return log_error_errno(r, "touch (%s) failed: %m", to);
1177 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1178 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1179 }
1180
1181 r = userns_lchown(to, 0, 0);
1182 if (r < 0)
1183 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1184 }
1185 }
1186
1187 return r;
1188 }
1189
1190 static int setup_pts(const char *dest) {
1191 _cleanup_free_ char *options = NULL;
1192 const char *p;
1193
1194 #ifdef HAVE_SELINUX
1195 if (arg_selinux_apifs_context)
1196 (void) asprintf(&options,
1197 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1198 arg_uid_shift + TTY_GID,
1199 arg_selinux_apifs_context);
1200 else
1201 #endif
1202 (void) asprintf(&options,
1203 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1204 arg_uid_shift + TTY_GID);
1205
1206 if (!options)
1207 return log_oom();
1208
1209 /* Mount /dev/pts itself */
1210 p = prefix_roota(dest, "/dev/pts");
1211 if (mkdir(p, 0755) < 0)
1212 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1213 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1214 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1215 if (userns_lchown(p, 0, 0) < 0)
1216 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1217
1218 /* Create /dev/ptmx symlink */
1219 p = prefix_roota(dest, "/dev/ptmx");
1220 if (symlink("pts/ptmx", p) < 0)
1221 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1222 if (userns_lchown(p, 0, 0) < 0)
1223 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1224
1225 /* And fix /dev/pts/ptmx ownership */
1226 p = prefix_roota(dest, "/dev/pts/ptmx");
1227 if (userns_lchown(p, 0, 0) < 0)
1228 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1229
1230 return 0;
1231 }
1232
1233 static int setup_dev_console(const char *dest, const char *console) {
1234 _cleanup_umask_ mode_t u;
1235 const char *to;
1236 int r;
1237
1238 assert(dest);
1239 assert(console);
1240
1241 u = umask(0000);
1242
1243 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1244 if (r < 0)
1245 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1246
1247 /* We need to bind mount the right tty to /dev/console since
1248 * ptys can only exist on pts file systems. To have something
1249 * to bind mount things on we create a empty regular file. */
1250
1251 to = prefix_roota(dest, "/dev/console");
1252 r = touch(to);
1253 if (r < 0)
1254 return log_error_errno(r, "touch() for /dev/console failed: %m");
1255
1256 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1257 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1258
1259 return 0;
1260 }
1261
1262 static int setup_kmsg(const char *dest, int kmsg_socket) {
1263 const char *from, *to;
1264 _cleanup_umask_ mode_t u;
1265 int fd, r;
1266
1267 assert(kmsg_socket >= 0);
1268
1269 u = umask(0000);
1270
1271 /* We create the kmsg FIFO as /run/kmsg, but immediately
1272 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1273 * on the reading side behave very similar to /proc/kmsg,
1274 * their writing side behaves differently from /dev/kmsg in
1275 * that writing blocks when nothing is reading. In order to
1276 * avoid any problems with containers deadlocking due to this
1277 * we simply make /dev/kmsg unavailable to the container. */
1278 from = prefix_roota(dest, "/run/kmsg");
1279 to = prefix_roota(dest, "/proc/kmsg");
1280
1281 if (mkfifo(from, 0600) < 0)
1282 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1283 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1284 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1285
1286 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1287 if (fd < 0)
1288 return log_error_errno(errno, "Failed to open fifo: %m");
1289
1290 /* Store away the fd in the socket, so that it stays open as
1291 * long as we run the child */
1292 r = send_one_fd(kmsg_socket, fd, 0);
1293 safe_close(fd);
1294
1295 if (r < 0)
1296 return log_error_errno(r, "Failed to send FIFO fd: %m");
1297
1298 /* And now make the FIFO unavailable as /run/kmsg... */
1299 (void) unlink(from);
1300
1301 return 0;
1302 }
1303
1304 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1305 union in_addr_union *exposed = userdata;
1306
1307 assert(rtnl);
1308 assert(m);
1309 assert(exposed);
1310
1311 expose_port_execute(rtnl, arg_expose_ports, exposed);
1312 return 0;
1313 }
1314
1315 static int setup_hostname(void) {
1316
1317 if (arg_share_system)
1318 return 0;
1319
1320 if (sethostname_idempotent(arg_machine) < 0)
1321 return -errno;
1322
1323 return 0;
1324 }
1325
1326 static int setup_journal(const char *directory) {
1327 sd_id128_t machine_id, this_id;
1328 _cleanup_free_ char *b = NULL, *d = NULL;
1329 const char *etc_machine_id, *p, *q;
1330 char *id;
1331 int r;
1332
1333 /* Don't link journals in ephemeral mode */
1334 if (arg_ephemeral)
1335 return 0;
1336
1337 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
1338
1339 r = read_one_line_file(etc_machine_id, &b);
1340 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1341 return 0;
1342 else if (r < 0)
1343 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
1344
1345 id = strstrip(b);
1346 if (isempty(id) && arg_link_journal == LINK_AUTO)
1347 return 0;
1348
1349 /* Verify validity */
1350 r = sd_id128_from_string(id, &machine_id);
1351 if (r < 0)
1352 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
1353
1354 r = sd_id128_get_machine(&this_id);
1355 if (r < 0)
1356 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1357
1358 if (sd_id128_equal(machine_id, this_id)) {
1359 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1360 "Host and machine ids are equal (%s): refusing to link journals", id);
1361 if (arg_link_journal == LINK_AUTO)
1362 return 0;
1363 return -EEXIST;
1364 }
1365
1366 if (arg_link_journal == LINK_NO)
1367 return 0;
1368
1369 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1370 if (r < 0)
1371 return log_error_errno(r, "Failed to create /var: %m");
1372
1373 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1374 if (r < 0)
1375 return log_error_errno(r, "Failed to create /var/log: %m");
1376
1377 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1378 if (r < 0)
1379 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1380
1381 p = strjoina("/var/log/journal/", id);
1382 q = prefix_roota(directory, p);
1383
1384 if (path_is_mount_point(p, 0) > 0) {
1385 if (arg_link_journal != LINK_AUTO) {
1386 log_error("%s: already a mount point, refusing to use for journal", p);
1387 return -EEXIST;
1388 }
1389
1390 return 0;
1391 }
1392
1393 if (path_is_mount_point(q, 0) > 0) {
1394 if (arg_link_journal != LINK_AUTO) {
1395 log_error("%s: already a mount point, refusing to use for journal", q);
1396 return -EEXIST;
1397 }
1398
1399 return 0;
1400 }
1401
1402 r = readlink_and_make_absolute(p, &d);
1403 if (r >= 0) {
1404 if ((arg_link_journal == LINK_GUEST ||
1405 arg_link_journal == LINK_AUTO) &&
1406 path_equal(d, q)) {
1407
1408 r = userns_mkdir(directory, p, 0755, 0, 0);
1409 if (r < 0)
1410 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1411 return 0;
1412 }
1413
1414 if (unlink(p) < 0)
1415 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1416 } else if (r == -EINVAL) {
1417
1418 if (arg_link_journal == LINK_GUEST &&
1419 rmdir(p) < 0) {
1420
1421 if (errno == ENOTDIR) {
1422 log_error("%s already exists and is neither a symlink nor a directory", p);
1423 return r;
1424 } else {
1425 log_error_errno(errno, "Failed to remove %s: %m", p);
1426 return -errno;
1427 }
1428 }
1429 } else if (r != -ENOENT) {
1430 log_error_errno(errno, "readlink(%s) failed: %m", p);
1431 return r;
1432 }
1433
1434 if (arg_link_journal == LINK_GUEST) {
1435
1436 if (symlink(q, p) < 0) {
1437 if (arg_link_journal_try) {
1438 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1439 return 0;
1440 } else {
1441 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1442 return -errno;
1443 }
1444 }
1445
1446 r = userns_mkdir(directory, p, 0755, 0, 0);
1447 if (r < 0)
1448 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1449 return 0;
1450 }
1451
1452 if (arg_link_journal == LINK_HOST) {
1453 /* don't create parents here -- if the host doesn't have
1454 * permanent journal set up, don't force it here */
1455 r = mkdir(p, 0755);
1456 if (r < 0) {
1457 if (arg_link_journal_try) {
1458 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1459 return 0;
1460 } else {
1461 log_error_errno(errno, "Failed to create %s: %m", p);
1462 return r;
1463 }
1464 }
1465
1466 } else if (access(p, F_OK) < 0)
1467 return 0;
1468
1469 if (dir_is_empty(q) == 0)
1470 log_warning("%s is not empty, proceeding anyway.", q);
1471
1472 r = userns_mkdir(directory, p, 0755, 0, 0);
1473 if (r < 0) {
1474 log_error_errno(errno, "Failed to create %s: %m", q);
1475 return r;
1476 }
1477
1478 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1479 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1480
1481 return 0;
1482 }
1483
1484 static int drop_capabilities(void) {
1485 return capability_bounding_set_drop(~arg_retain, false);
1486 }
1487
1488 static int reset_audit_loginuid(void) {
1489 _cleanup_free_ char *p = NULL;
1490 int r;
1491
1492 if (arg_share_system)
1493 return 0;
1494
1495 r = read_one_line_file("/proc/self/loginuid", &p);
1496 if (r == -ENOENT)
1497 return 0;
1498 if (r < 0)
1499 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1500
1501 /* Already reset? */
1502 if (streq(p, "4294967295"))
1503 return 0;
1504
1505 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1506 if (r < 0) {
1507 log_error_errno(r,
1508 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1509 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1510 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1511 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1512 "using systemd-nspawn. Sleeping for 5s... (%m)");
1513
1514 sleep(5);
1515 }
1516
1517 return 0;
1518 }
1519
/* Installs the seccomp filter for the container payload.
 *
 * The filter allows everything by default. For each entry of the table
 * below whose capability is not part of arg_retain, the listed system call
 * is made to fail with EPERM. In addition, socket(AF_NETLINK, *,
 * NETLINK_AUDIT) is made to fail with EAFNOSUPPORT. The filter is loaded
 * with the NO_NEW_PRIVS attribute turned off.
 *
 * Returns 0 on success, if loading the filter fails with -EINVAL, or if
 * compiled without seccomp support; a negative errno-style value otherwise
 * (after logging). */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } gated[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(gated); k++) {
                /* System calls guarded by a retained capability stay available */
                if ((arg_retain & (1ULL << gated[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated[k].syscall_num, 0);

                /* -EFAULT means the system call is unknown, which is fine */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers, and much of the userspace audit
         * hookup fails when run inside one. We do not care and simply
         * prevent audit sockets from being created: socket(AF_NETLINK, *,
         * NETLINK_AUDIT) then fails with EAFNOSUPPORT, which audit userspace
         * takes as indication that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1614
1615 static int setup_propagate(const char *root) {
1616 const char *p, *q;
1617
1618 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1619 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1620 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1621 (void) mkdir_p(p, 0600);
1622
1623 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1624 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1625
1626 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1627 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1628
1629 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1630 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
1631
1632 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1633 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1634 return log_error_errno(errno, "Failed to install propagation bind mount.");
1635
1636 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1637 return log_error_errno(errno, "Failed to make propagation mount read-only");
1638
1639 return 0;
1640 }
1641
1642 static int setup_image(char **device_path, int *loop_nr) {
1643 struct loop_info64 info = {
1644 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1645 };
1646 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1647 _cleanup_free_ char* loopdev = NULL;
1648 struct stat st;
1649 int r, nr;
1650
1651 assert(device_path);
1652 assert(loop_nr);
1653 assert(arg_image);
1654
1655 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1656 if (fd < 0)
1657 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1658
1659 if (fstat(fd, &st) < 0)
1660 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1661
1662 if (S_ISBLK(st.st_mode)) {
1663 char *p;
1664
1665 p = strdup(arg_image);
1666 if (!p)
1667 return log_oom();
1668
1669 *device_path = p;
1670
1671 *loop_nr = -1;
1672
1673 r = fd;
1674 fd = -1;
1675
1676 return r;
1677 }
1678
1679 if (!S_ISREG(st.st_mode)) {
1680 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1681 return -EINVAL;
1682 }
1683
1684 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1685 if (control < 0)
1686 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1687
1688 nr = ioctl(control, LOOP_CTL_GET_FREE);
1689 if (nr < 0)
1690 return log_error_errno(errno, "Failed to allocate loop device: %m");
1691
1692 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1693 return log_oom();
1694
1695 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1696 if (loop < 0)
1697 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1698
1699 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1700 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1701
1702 if (arg_read_only)
1703 info.lo_flags |= LO_FLAGS_READ_ONLY;
1704
1705 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1706 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1707
1708 *device_path = loopdev;
1709 loopdev = NULL;
1710
1711 *loop_nr = nr;
1712
1713 r = loop;
1714 loop = -1;
1715
1716 return r;
1717 }
1718
/* Explanatory text appended to the error messages of dissect_image() */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Looks for the root, /home and /srv partitions on the block device fd.
 *
 * The partition table (GPT or MBR) is probed with libblkid, and the
 * partition devices are enumerated with udev, once the number of partition
 * devices known to the kernel matches what libblkid found. On GPT the
 * partitions are identified by their type UUIDs; if several partitions of
 * the same type exist the one with the lowest partition number wins. On
 * MBR, bootable partitions of type 0x83 are considered. A generic Linux
 * partition (GPT) or bootable Linux partition (MBR) is used as root only if
 * no dedicated root partition was found and if it is the only one of its
 * kind.
 *
 * On success 0 is returned; *root_device receives the device node of the
 * root partition, *root_device_rw whether it may be mounted writable, and
 * *secondary whether it is the root partition of the secondary
 * architecture. *home_device/*home_device_rw and *srv_device/*srv_device_rw
 * are only written if a matching partition was found. All returned strings
 * are newly allocated and must be freed by the caller.
 *
 * On failure a negative errno-style value is returned (after logging);
 * -EOPNOTSUPP if compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambiguous result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel knows as many partitions as blkid found */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The comparison
                 * is against m + 1 here and below. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* The partition with the lowest number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Track the partition as generic one, so
                                 * that a second bootable Linux partition is
                                 * detected by the check above, and so that
                                 * generic_rw is what gets reported. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2098
/* Mounts the block device "what" to "where", or to the concatenation of
 * "where" and "directory" if the latter is non-NULL.
 *
 * The file system type is determined with libblkid. LUKS volumes are
 * refused. The file system is mounted with MS_NODEV, and read-only if rw is
 * false or arg_read_only is set.
 *
 * Returns 0 on success, or a negative errno-style value on failure (after
 * logging); -EOPNOTSUPP if compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected. Everything else
                 * that is non-zero is a genuine probing error and handled
                 * below, the same way dissect_image() does it. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2162
2163 static int mount_devices(
2164 const char *where,
2165 const char *root_device, bool root_device_rw,
2166 const char *home_device, bool home_device_rw,
2167 const char *srv_device, bool srv_device_rw) {
2168 int r;
2169
2170 assert(where);
2171
2172 if (root_device) {
2173 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2174 if (r < 0)
2175 return log_error_errno(r, "Failed to mount root directory: %m");
2176 }
2177
2178 if (home_device) {
2179 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2180 if (r < 0)
2181 return log_error_errno(r, "Failed to mount home directory: %m");
2182 }
2183
2184 if (srv_device) {
2185 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2186 if (r < 0)
2187 return log_error_errno(r, "Failed to mount server data directory: %m");
2188 }
2189
2190 return 0;
2191 }
2192
2193 static void loop_remove(int nr, int *image_fd) {
2194 _cleanup_close_ int control = -1;
2195 int r;
2196
2197 if (nr < 0)
2198 return;
2199
2200 if (image_fd && *image_fd >= 0) {
2201 r = ioctl(*image_fd, LOOP_CLR_FD);
2202 if (r < 0)
2203 log_debug_errno(errno, "Failed to close loop image: %m");
2204 *image_fd = safe_close(*image_fd);
2205 }
2206
2207 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2208 if (control < 0) {
2209 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2210 return;
2211 }
2212
2213 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2214 if (r < 0)
2215 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2216 }
2217
2218 /*
2219 * Return values:
2220 * < 0 : wait_for_terminate() failed to get the state of the
2221 * container, the container was terminated by a signal, or
2222 * failed for an unknown reason. No change is made to the
2223 * container argument.
2224 * > 0 : The program executed in the container terminated with an
2225 * error. The exit code of the program executed in the
2226 * container is returned. The container argument has been set
2227 * to CONTAINER_TERMINATED.
2228 * 0 : The container is being rebooted, has been shut down or exited
2229 * successfully. The container argument has been set to either
2230 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2231 *
2232 * That is, success is indicated by a return value of zero, and an
2233 * error is indicated by a non-zero value.
2234 */
2235 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2236 siginfo_t status;
2237 int r;
2238
2239 r = wait_for_terminate(pid, &status);
2240 if (r < 0)
2241 return log_warning_errno(r, "Failed to wait for container: %m");
2242
2243 switch (status.si_code) {
2244
2245 case CLD_EXITED:
2246 if (status.si_status == 0) {
2247 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2248
2249 } else
2250 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2251
2252 *container = CONTAINER_TERMINATED;
2253 return status.si_status;
2254
2255 case CLD_KILLED:
2256 if (status.si_status == SIGINT) {
2257
2258 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2259 *container = CONTAINER_TERMINATED;
2260 return 0;
2261
2262 } else if (status.si_status == SIGHUP) {
2263
2264 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2265 *container = CONTAINER_REBOOTED;
2266 return 0;
2267 }
2268
2269 /* CLD_KILLED fallthrough */
2270
2271 case CLD_DUMPED:
2272 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2273 return -EIO;
2274
2275 default:
2276 log_error("Container %s failed due to unknown reason.", arg_machine);
2277 return -EIO;
2278 }
2279
2280 return r;
2281 }
2282
2283 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2284 pid_t pid;
2285
2286 pid = PTR_TO_UINT32(userdata);
2287 if (pid > 0) {
2288 if (kill(pid, arg_kill_signal) >= 0) {
2289 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2290 sd_event_source_set_userdata(s, NULL);
2291 return 0;
2292 }
2293 }
2294
2295 sd_event_exit(sd_event_source_get_event(s), 0);
2296 return 0;
2297 }
2298
2299 static int determine_names(void) {
2300 int r;
2301
2302 if (arg_template && !arg_directory && arg_machine) {
2303
2304 /* If --template= was specified then we should not
2305 * search for a machine, but instead create a new one
2306 * in /var/lib/machine. */
2307
2308 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2309 if (!arg_directory)
2310 return log_oom();
2311 }
2312
2313 if (!arg_image && !arg_directory) {
2314 if (arg_machine) {
2315 _cleanup_(image_unrefp) Image *i = NULL;
2316
2317 r = image_find(arg_machine, &i);
2318 if (r < 0)
2319 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2320 else if (r == 0) {
2321 log_error("No image for machine '%s': %m", arg_machine);
2322 return -ENOENT;
2323 }
2324
2325 if (i->type == IMAGE_RAW)
2326 r = set_sanitized_path(&arg_image, i->path);
2327 else
2328 r = set_sanitized_path(&arg_directory, i->path);
2329 if (r < 0)
2330 return log_error_errno(r, "Invalid image directory: %m");
2331
2332 if (!arg_ephemeral)
2333 arg_read_only = arg_read_only || i->read_only;
2334 } else
2335 arg_directory = get_current_dir_name();
2336
2337 if (!arg_directory && !arg_machine) {
2338 log_error("Failed to determine path, please use -D or -i.");
2339 return -EINVAL;
2340 }
2341 }
2342
2343 if (!arg_machine) {
2344 if (arg_directory && path_equal(arg_directory, "/"))
2345 arg_machine = gethostname_malloc();
2346 else
2347 arg_machine = strdup(basename(arg_image ?: arg_directory));
2348
2349 if (!arg_machine)
2350 return log_oom();
2351
2352 hostname_cleanup(arg_machine);
2353 if (!machine_name_is_valid(arg_machine)) {
2354 log_error("Failed to determine machine name automatically, please use -M.");
2355 return -EINVAL;
2356 }
2357
2358 if (arg_ephemeral) {
2359 char *b;
2360
2361 /* Add a random suffix when this is an
2362 * ephemeral machine, so that we can run many
2363 * instances at once without manually having
2364 * to specify -M each time. */
2365
2366 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2367 return log_oom();
2368
2369 free(arg_machine);
2370 arg_machine = b;
2371 }
2372 }
2373
2374 return 0;
2375 }
2376
2377 static int determine_uid_shift(const char *directory) {
2378 int r;
2379
2380 if (!arg_userns) {
2381 arg_uid_shift = 0;
2382 return 0;
2383 }
2384
2385 if (arg_uid_shift == UID_INVALID) {
2386 struct stat st;
2387
2388 r = stat(directory, &st);
2389 if (r < 0)
2390 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2391
2392 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2393
2394 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2395 log_error("UID and GID base of %s don't match.", directory);
2396 return -EINVAL;
2397 }
2398
2399 arg_uid_range = UINT32_C(0x10000);
2400 }
2401
2402 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2403 log_error("UID base too high for UID range.");
2404 return -EINVAL;
2405 }
2406
2407 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2408 return 0;
2409 }
2410
2411 static int inner_child(
2412 Barrier *barrier,
2413 const char *directory,
2414 bool secondary,
2415 int kmsg_socket,
2416 int rtnl_socket,
2417 FDSet *fds) {
2418
2419 _cleanup_free_ char *home = NULL;
2420 unsigned n_env = 2;
2421 const char *envp[] = {
2422 "PATH=" DEFAULT_PATH_SPLIT_USR,
2423 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2424 NULL, /* TERM */
2425 NULL, /* HOME */
2426 NULL, /* USER */
2427 NULL, /* LOGNAME */
2428 NULL, /* container_uuid */
2429 NULL, /* LISTEN_FDS */
2430 NULL, /* LISTEN_PID */
2431 NULL
2432 };
2433
2434 _cleanup_strv_free_ char **env_use = NULL;
2435 int r;
2436
2437 assert(barrier);
2438 assert(directory);
2439 assert(kmsg_socket >= 0);
2440
2441 cg_unified_flush();
2442
2443 if (arg_userns) {
2444 /* Tell the parent, that it now can write the UID map. */
2445 (void) barrier_place(barrier); /* #1 */
2446
2447 /* Wait until the parent wrote the UID map */
2448 if (!barrier_place_and_sync(barrier)) { /* #2 */
2449 log_error("Parent died too early");
2450 return -ESRCH;
2451 }
2452 }
2453
2454 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
2455 if (r < 0)
2456 return r;
2457
2458 r = mount_sysfs(NULL);
2459 if (r < 0)
2460 return r;
2461
2462 /* Wait until we are cgroup-ified, so that we
2463 * can mount the right cgroup path writable */
2464 if (!barrier_place_and_sync(barrier)) { /* #3 */
2465 log_error("Parent died too early");
2466 return -ESRCH;
2467 }
2468
2469 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2470 if (r < 0)
2471 return r;
2472
2473 r = reset_uid_gid();
2474 if (r < 0)
2475 return log_error_errno(r, "Couldn't become new root: %m");
2476
2477 r = setup_boot_id(NULL);
2478 if (r < 0)
2479 return r;
2480
2481 r = setup_kmsg(NULL, kmsg_socket);
2482 if (r < 0)
2483 return r;
2484 kmsg_socket = safe_close(kmsg_socket);
2485
2486 umask(0022);
2487
2488 if (setsid() < 0)
2489 return log_error_errno(errno, "setsid() failed: %m");
2490
2491 if (arg_private_network)
2492 loopback_setup();
2493
2494 if (arg_expose_ports) {
2495 r = expose_port_send_rtnl(rtnl_socket);
2496 if (r < 0)
2497 return r;
2498 rtnl_socket = safe_close(rtnl_socket);
2499 }
2500
2501 if (drop_capabilities() < 0)
2502 return log_error_errno(errno, "drop_capabilities() failed: %m");
2503
2504 setup_hostname();
2505
2506 if (arg_personality != PERSONALITY_INVALID) {
2507 if (personality(arg_personality) < 0)
2508 return log_error_errno(errno, "personality() failed: %m");
2509 } else if (secondary) {
2510 if (personality(PER_LINUX32) < 0)
2511 return log_error_errno(errno, "personality() failed: %m");
2512 }
2513
2514 #ifdef HAVE_SELINUX
2515 if (arg_selinux_context)
2516 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2517 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2518 #endif
2519
2520 r = change_uid_gid(arg_user, &home);
2521 if (r < 0)
2522 return r;
2523
2524 envp[n_env] = strv_find_prefix(environ, "TERM=");
2525 if (envp[n_env])
2526 n_env ++;
2527
2528 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2529 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2530 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2531 return log_oom();
2532
2533 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2534 char as_uuid[37];
2535
2536 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2537 return log_oom();
2538 }
2539
2540 if (fdset_size(fds) > 0) {
2541 r = fdset_cloexec(fds, false);
2542 if (r < 0)
2543 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2544
2545 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2546 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2547 return log_oom();
2548 }
2549
2550 env_use = strv_env_merge(2, envp, arg_setenv);
2551 if (!env_use)
2552 return log_oom();
2553
2554 /* Let the parent know that we are ready and
2555 * wait until the parent is ready with the
2556 * setup, too... */
2557 if (!barrier_place_and_sync(barrier)) { /* #4 */
2558 log_error("Parent died too early");
2559 return -ESRCH;
2560 }
2561
2562 /* Now, explicitly close the log, so that we
2563 * then can close all remaining fds. Closing
2564 * the log explicitly first has the benefit
2565 * that the logging subsystem knows about it,
2566 * and is thus ready to be reopened should we
2567 * need it again. Note that the other fds
2568 * closed here are at least the locking and
2569 * barrier fds. */
2570 log_close();
2571 (void) fdset_close_others(fds);
2572
2573 if (arg_boot) {
2574 char **a;
2575 size_t m;
2576
2577 /* Automatically search for the init system */
2578
2579 m = 1 + strv_length(arg_parameters);
2580 a = newa(char*, m + 1);
2581 if (strv_isempty(arg_parameters))
2582 a[1] = NULL;
2583 else
2584 memcpy(a + 1, arg_parameters, m * sizeof(char*));
2585
2586 a[0] = (char*) "/usr/lib/systemd/systemd";
2587 execve(a[0], a, env_use);
2588
2589 a[0] = (char*) "/lib/systemd/systemd";
2590 execve(a[0], a, env_use);
2591
2592 a[0] = (char*) "/sbin/init";
2593 execve(a[0], a, env_use);
2594 } else if (!strv_isempty(arg_parameters))
2595 execvpe(arg_parameters[0], arg_parameters, env_use);
2596 else {
2597 chdir(home ?: "/root");
2598 execle("/bin/bash", "-bash", NULL, env_use);
2599 execle("/bin/sh", "-sh", NULL, env_use);
2600 }
2601
2602 (void) log_open();
2603 return log_error_errno(errno, "execv() failed: %m");
2604 }
2605
2606 static int outer_child(
2607 Barrier *barrier,
2608 const char *directory,
2609 const char *console,
2610 const char *root_device, bool root_device_rw,
2611 const char *home_device, bool home_device_rw,
2612 const char *srv_device, bool srv_device_rw,
2613 bool interactive,
2614 bool secondary,
2615 int pid_socket,
2616 int kmsg_socket,
2617 int rtnl_socket,
2618 int uid_shift_socket,
2619 FDSet *fds) {
2620
2621 pid_t pid;
2622 ssize_t l;
2623 int r;
2624
2625 assert(barrier);
2626 assert(directory);
2627 assert(console);
2628 assert(pid_socket >= 0);
2629 assert(kmsg_socket >= 0);
2630
2631 cg_unified_flush();
2632
2633 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2634 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2635
2636 if (interactive) {
2637 close_nointr(STDIN_FILENO);
2638 close_nointr(STDOUT_FILENO);
2639 close_nointr(STDERR_FILENO);
2640
2641 r = open_terminal(console, O_RDWR);
2642 if (r != STDIN_FILENO) {
2643 if (r >= 0) {
2644 safe_close(r);
2645 r = -EINVAL;
2646 }
2647
2648 return log_error_errno(r, "Failed to open console: %m");
2649 }
2650
2651 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2652 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2653 return log_error_errno(errno, "Failed to duplicate console: %m");
2654 }
2655
2656 r = reset_audit_loginuid();
2657 if (r < 0)
2658 return r;
2659
2660 /* Mark everything as slave, so that we still
2661 * receive mounts from the real root, but don't
2662 * propagate mounts to the real root. */
2663 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2664 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2665
2666 r = mount_devices(directory,
2667 root_device, root_device_rw,
2668 home_device, home_device_rw,
2669 srv_device, srv_device_rw);
2670 if (r < 0)
2671 return r;
2672
2673 r = determine_uid_shift(directory);
2674 if (r < 0)
2675 return r;
2676
2677 if (arg_userns) {
2678 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2679 if (l < 0)
2680 return log_error_errno(errno, "Failed to send UID shift: %m");
2681 if (l != sizeof(arg_uid_shift)) {
2682 log_error("Short write while sending UID shift.");
2683 return -EIO;
2684 }
2685 }
2686
2687 /* Turn directory into bind mount */
2688 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2689 return log_error_errno(errno, "Failed to make bind mount: %m");
2690
2691 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2692 if (r < 0)
2693 return r;
2694
2695 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
2696 if (r < 0)
2697 return r;
2698
2699 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2700 if (r < 0)
2701 return r;
2702
2703 if (arg_read_only) {
2704 r = bind_remount_recursive(directory, true);
2705 if (r < 0)
2706 return log_error_errno(r, "Failed to make tree read-only: %m");
2707 }
2708
2709 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2710 if (r < 0)
2711 return r;
2712
2713 r = copy_devnodes(directory);
2714 if (r < 0)
2715 return r;
2716
2717 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2718
2719 r = setup_pts(directory);
2720 if (r < 0)
2721 return r;
2722
2723 r = setup_propagate(directory);
2724 if (r < 0)
2725 return r;
2726
2727 r = setup_dev_console(directory, console);
2728 if (r < 0)
2729 return r;
2730
2731 r = setup_seccomp();
2732 if (r < 0)
2733 return r;
2734
2735 r = setup_timezone(directory);
2736 if (r < 0)
2737 return r;
2738
2739 r = setup_resolv_conf(directory);
2740 if (r < 0)
2741 return r;
2742
2743 r = setup_journal(directory);
2744 if (r < 0)
2745 return r;
2746
2747 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2748 if (r < 0)
2749 return r;
2750
2751 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2752 if (r < 0)
2753 return r;
2754
2755 r = mount_move_root(directory);
2756 if (r < 0)
2757 return log_error_errno(r, "Failed to move root directory: %m");
2758
2759 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2760 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2761 (arg_private_network ? CLONE_NEWNET : 0) |
2762 (arg_userns ? CLONE_NEWUSER : 0),
2763 NULL);
2764 if (pid < 0)
2765 return log_error_errno(errno, "Failed to fork inner child: %m");
2766 if (pid == 0) {
2767 pid_socket = safe_close(pid_socket);
2768 uid_shift_socket = safe_close(uid_shift_socket);
2769
2770 /* The inner child has all namespaces that are
2771 * requested, so that we all are owned by the user if
2772 * user namespaces are turned on. */
2773
2774 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
2775 if (r < 0)
2776 _exit(EXIT_FAILURE);
2777
2778 _exit(EXIT_SUCCESS);
2779 }
2780
2781 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2782 if (l < 0)
2783 return log_error_errno(errno, "Failed to send PID: %m");
2784 if (l != sizeof(pid)) {
2785 log_error("Short write while sending PID.");
2786 return -EIO;
2787 }
2788
2789 pid_socket = safe_close(pid_socket);
2790 kmsg_socket = safe_close(kmsg_socket);
2791 rtnl_socket = safe_close(rtnl_socket);
2792
2793 return 0;
2794 }
2795
2796 static int setup_uid_map(pid_t pid) {
2797 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2798 int r;
2799
2800 assert(pid > 1);
2801
2802 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2803 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
2804 r = write_string_file(uid_map, line, 0);
2805 if (r < 0)
2806 return log_error_errno(r, "Failed to write UID map: %m");
2807
2808 /* We always assign the same UID and GID ranges */
2809 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
2810 r = write_string_file(uid_map, line, 0);
2811 if (r < 0)
2812 return log_error_errno(r, "Failed to write GID map: %m");
2813
2814 return 0;
2815 }
2816
2817 static int load_settings(void) {
2818 _cleanup_(settings_freep) Settings *settings = NULL;
2819 _cleanup_fclose_ FILE *f = NULL;
2820 _cleanup_free_ char *p = NULL;
2821 const char *fn, *i;
2822 int r;
2823
2824 /* If all settings are masked, there's no point in looking for
2825 * the settings file */
2826 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2827 return 0;
2828
2829 fn = strjoina(arg_machine, ".nspawn");
2830
2831 /* We first look in the admin's directories in /etc and /run */
2832 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2833 _cleanup_free_ char *j = NULL;
2834
2835 j = strjoin(i, "/", fn, NULL);
2836 if (!j)
2837 return log_oom();
2838
2839 f = fopen(j, "re");
2840 if (f) {
2841 p = j;
2842 j = NULL;
2843
2844 /* By default we trust configuration from /etc and /run */
2845 if (arg_settings_trusted < 0)
2846 arg_settings_trusted = true;
2847
2848 break;
2849 }
2850
2851 if (errno != ENOENT)
2852 return log_error_errno(errno, "Failed to open %s: %m", j);
2853 }
2854
2855 if (!f) {
2856 /* After that, let's look for a file next to the
2857 * actual image we shall boot. */
2858
2859 if (arg_image) {
2860 p = file_in_same_dir(arg_image, fn);
2861 if (!p)
2862 return log_oom();
2863 } else if (arg_directory) {
2864 p = file_in_same_dir(arg_directory, fn);
2865 if (!p)
2866 return log_oom();
2867 }
2868
2869 if (p) {
2870 f = fopen(p, "re");
2871 if (!f && errno != ENOENT)
2872 return log_error_errno(errno, "Failed to open %s: %m", p);
2873
2874 /* By default we do not trust configuration from /var/lib/machines */
2875 if (arg_settings_trusted < 0)
2876 arg_settings_trusted = false;
2877 }
2878 }
2879
2880 if (!f)
2881 return 0;
2882
2883 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2884
2885 r = settings_load(f, p, &settings);
2886 if (r < 0)
2887 return r;
2888
2889 /* Copy over bits from the settings, unless they have been
2890 * explicitly masked by command line switches. */
2891
2892 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2893 settings->boot >= 0) {
2894 arg_boot = settings->boot;
2895
2896 strv_free(arg_parameters);
2897 arg_parameters = settings->parameters;
2898 settings->parameters = NULL;
2899 }
2900
2901 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2902 settings->environment) {
2903 strv_free(arg_setenv);
2904 arg_setenv = settings->environment;
2905 settings->environment = NULL;
2906 }
2907
2908 if ((arg_settings_mask & SETTING_USER) == 0 &&
2909 settings->user) {
2910 free(arg_user);
2911 arg_user = settings->user;
2912 settings->user = NULL;
2913 }
2914
2915 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2916 uint64_t plus;
2917
2918 plus = settings->capability;
2919 if (settings_private_network(settings))
2920 plus |= (1ULL << CAP_NET_ADMIN);
2921
2922 if (!arg_settings_trusted && plus != 0) {
2923 if (settings->capability != 0)
2924 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2925 } else
2926 arg_retain |= plus;
2927
2928 arg_retain &= ~settings->drop_capability;
2929 }
2930
2931 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2932 settings->kill_signal > 0)
2933 arg_kill_signal = settings->kill_signal;
2934
2935 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2936 settings->personality != PERSONALITY_INVALID)
2937 arg_personality = settings->personality;
2938
2939 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2940 !sd_id128_is_null(settings->machine_id)) {
2941
2942 if (!arg_settings_trusted)
2943 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2944 else
2945 arg_uuid = settings->machine_id;
2946 }
2947
2948 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2949 settings->read_only >= 0)
2950 arg_read_only = settings->read_only;
2951
2952 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2953 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2954 arg_volatile_mode = settings->volatile_mode;
2955
2956 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2957 settings->n_custom_mounts > 0) {
2958
2959 if (!arg_settings_trusted)
2960 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2961 else {
2962 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2963 arg_custom_mounts = settings->custom_mounts;
2964 arg_n_custom_mounts = settings->n_custom_mounts;
2965
2966 settings->custom_mounts = NULL;
2967 settings->n_custom_mounts = 0;
2968 }
2969 }
2970
2971 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2972 (settings->private_network >= 0 ||
2973 settings->network_veth >= 0 ||
2974 settings->network_bridge ||
2975 settings->network_interfaces ||
2976 settings->network_macvlan ||
2977 settings->network_ipvlan)) {
2978
2979 if (!arg_settings_trusted)
2980 log_warning("Ignoring network settings, file %s is not trusted.", p);
2981 else {
2982 arg_network_veth = settings_private_network(settings);
2983 arg_private_network = settings_private_network(settings);
2984
2985 strv_free(arg_network_interfaces);
2986 arg_network_interfaces = settings->network_interfaces;
2987 settings->network_interfaces = NULL;
2988
2989 strv_free(arg_network_macvlan);
2990 arg_network_macvlan = settings->network_macvlan;
2991 settings->network_macvlan = NULL;
2992
2993 strv_free(arg_network_ipvlan);
2994 arg_network_ipvlan = settings->network_ipvlan;
2995 settings->network_ipvlan = NULL;
2996
2997 free(arg_network_bridge);
2998 arg_network_bridge = settings->network_bridge;
2999 settings->network_bridge = NULL;
3000 }
3001 }
3002
3003 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3004 settings->expose_ports) {
3005
3006 if (!arg_settings_trusted)
3007 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3008 else {
3009 expose_port_free_all(arg_expose_ports);
3010 arg_expose_ports = settings->expose_ports;
3011 settings->expose_ports = NULL;
3012 }
3013 }
3014
3015 return 0;
3016 }
3017
3018 int main(int argc, char *argv[]) {
3019
3020 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3021 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3022 _cleanup_close_ int master = -1, image_fd = -1;
3023 _cleanup_fdset_free_ FDSet *fds = NULL;
3024 int r, n_fd_passed, loop_nr = -1;
3025 char veth_name[IFNAMSIZ];
3026 bool secondary = false, remove_subvol = false;
3027 sigset_t mask_chld;
3028 pid_t pid = 0;
3029 int ret = EXIT_SUCCESS;
3030 union in_addr_union exposed = {};
3031 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3032 bool interactive;
3033
3034 log_parse_environment();
3035 log_open();
3036
3037 r = parse_argv(argc, argv);
3038 if (r <= 0)
3039 goto finish;
3040
3041 if (geteuid() != 0) {
3042 log_error("Need to be root.");
3043 r = -EPERM;
3044 goto finish;
3045 }
3046 r = determine_names();
3047 if (r < 0)
3048 goto finish;
3049
3050 r = load_settings();
3051 if (r < 0)
3052 goto finish;
3053
3054 r = verify_arguments();
3055 if (r < 0)
3056 goto finish;
3057
3058 n_fd_passed = sd_listen_fds(false);
3059 if (n_fd_passed > 0) {
3060 r = fdset_new_listen_fds(&fds, false);
3061 if (r < 0) {
3062 log_error_errno(r, "Failed to collect file descriptors: %m");
3063 goto finish;
3064 }
3065 }
3066
3067 if (arg_directory) {
3068 assert(!arg_image);
3069
3070 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3071 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3072 r = -EINVAL;
3073 goto finish;
3074 }
3075
3076 if (arg_ephemeral) {
3077 _cleanup_free_ char *np = NULL;
3078
3079 /* If the specified path is a mount point we
3080 * generate the new snapshot immediately
3081 * inside it under a random name. However if
3082 * the specified is not a mount point we
3083 * create the new snapshot in the parent
3084 * directory, just next to it. */
3085 r = path_is_mount_point(arg_directory, 0);
3086 if (r < 0) {
3087 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3088 goto finish;
3089 }
3090 if (r > 0)
3091 r = tempfn_random_child(arg_directory, "machine.", &np);
3092 else
3093 r = tempfn_random(arg_directory, "machine.", &np);
3094 if (r < 0) {
3095 log_error_errno(r, "Failed to generate name for snapshot: %m");
3096 goto finish;
3097 }
3098
3099 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3100 if (r < 0) {
3101 log_error_errno(r, "Failed to lock %s: %m", np);
3102 goto finish;
3103 }
3104
3105 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3106 if (r < 0) {
3107 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3108 goto finish;
3109 }
3110
3111 free(arg_directory);
3112 arg_directory = np;
3113 np = NULL;
3114
3115 remove_subvol = true;
3116
3117 } else {
3118 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3119 if (r == -EBUSY) {
3120 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3121 goto finish;
3122 }
3123 if (r < 0) {
3124 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3125 return r;
3126 }
3127
3128 if (arg_template) {
3129 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3130 if (r == -EEXIST) {
3131 if (!arg_quiet)
3132 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3133 } else if (r < 0) {
3134 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3135 goto finish;
3136 } else {
3137 if (!arg_quiet)
3138 log_info("Populated %s from template %s.", arg_directory, arg_template);
3139 }
3140 }
3141 }
3142
3143 if (arg_boot) {
3144 if (path_is_os_tree(arg_directory) <= 0) {
3145 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3146 r = -EINVAL;
3147 goto finish;
3148 }
3149 } else {
3150 const char *p;
3151
3152 p = strjoina(arg_directory, "/usr/");
3153 if (laccess(p, F_OK) < 0) {
3154 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3155 r = -EINVAL;
3156 goto finish;
3157 }
3158 }
3159
3160 } else {
3161 char template[] = "/tmp/nspawn-root-XXXXXX";
3162
3163 assert(arg_image);
3164 assert(!arg_template);
3165
3166 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3167 if (r == -EBUSY) {
3168 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3169 goto finish;
3170 }
3171 if (r < 0) {
3172 r = log_error_errno(r, "Failed to create image lock: %m");
3173 goto finish;
3174 }
3175
3176 if (!mkdtemp(template)) {
3177 log_error_errno(errno, "Failed to create temporary directory: %m");
3178 r = -errno;
3179 goto finish;
3180 }
3181
3182 arg_directory = strdup(template);
3183 if (!arg_directory) {
3184 r = log_oom();
3185 goto finish;
3186 }
3187
3188 image_fd = setup_image(&device_path, &loop_nr);
3189 if (image_fd < 0) {
3190 r = image_fd;
3191 goto finish;
3192 }
3193
3194 r = dissect_image(image_fd,
3195 &root_device, &root_device_rw,
3196 &home_device, &home_device_rw,
3197 &srv_device, &srv_device_rw,
3198 &secondary);
3199 if (r < 0)
3200 goto finish;
3201 }
3202
3203 r = custom_mounts_prepare();
3204 if (r < 0)
3205 goto finish;
3206
3207 interactive =
3208 isatty(STDIN_FILENO) > 0 &&
3209 isatty(STDOUT_FILENO) > 0;
3210
3211 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3212 if (master < 0) {
3213 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3214 goto finish;
3215 }
3216
3217 r = ptsname_malloc(master, &console);
3218 if (r < 0) {
3219 r = log_error_errno(r, "Failed to determine tty name: %m");
3220 goto finish;
3221 }
3222
3223 if (unlockpt(master) < 0) {
3224 r = log_error_errno(errno, "Failed to unlock tty: %m");
3225 goto finish;
3226 }
3227
3228 if (!arg_quiet)
3229 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3230 arg_machine, arg_image ?: arg_directory);
3231
3232 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3233
3234 assert_se(sigemptyset(&mask_chld) == 0);
3235 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3236
3237 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3238 r = log_error_errno(errno, "Failed to become subreaper: %m");
3239 goto finish;
3240 }
3241
3242 for (;;) {
3243 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3244 uid_shift_socket_pair[2] = { -1, -1 };
3245 ContainerStatus container_status;
3246 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3247 static const struct sigaction sa = {
3248 .sa_handler = nop_signal_handler,
3249 .sa_flags = SA_NOCLDSTOP,
3250 };
3251 int ifi = 0;
3252 ssize_t l;
3253 _cleanup_event_unref_ sd_event *event = NULL;
3254 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3255 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3256 char last_char = 0;
3257
3258 r = barrier_create(&barrier);
3259 if (r < 0) {
3260 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3261 goto finish;
3262 }
3263
3264 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3265 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3266 goto finish;
3267 }
3268
3269 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3270 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3271 goto finish;
3272 }
3273
3274 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3275 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3276 goto finish;
3277 }
3278
3279 if (arg_userns)
3280 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3281 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3282 goto finish;
3283 }
3284
3285 /* Child can be killed before execv(), so handle SIGCHLD
3286 * in order to interrupt parent's blocking calls and
3287 * give it a chance to call wait() and terminate. */
3288 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3289 if (r < 0) {
3290 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3291 goto finish;
3292 }
3293
3294 r = sigaction(SIGCHLD, &sa, NULL);
3295 if (r < 0) {
3296 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3297 goto finish;
3298 }
3299
3300 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3301 if (pid < 0) {
3302 if (errno == EINVAL)
3303 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3304 else
3305 r = log_error_errno(errno, "clone() failed: %m");
3306
3307 goto finish;
3308 }
3309
3310 if (pid == 0) {
3311 /* The outer child only has a file system namespace. */
3312 barrier_set_role(&barrier, BARRIER_CHILD);
3313
3314 master = safe_close(master);
3315
3316 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3317 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3318 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3319 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3320
3321 (void) reset_all_signal_handlers();
3322 (void) reset_signal_mask();
3323
3324 r = outer_child(&barrier,
3325 arg_directory,
3326 console,
3327 root_device, root_device_rw,
3328 home_device, home_device_rw,
3329 srv_device, srv_device_rw,
3330 interactive,
3331 secondary,
3332 pid_socket_pair[1],
3333 kmsg_socket_pair[1],
3334 rtnl_socket_pair[1],
3335 uid_shift_socket_pair[1],
3336 fds);
3337 if (r < 0)
3338 _exit(EXIT_FAILURE);
3339
3340 _exit(EXIT_SUCCESS);
3341 }
3342
3343 barrier_set_role(&barrier, BARRIER_PARENT);
3344
3345 fds = fdset_free(fds);
3346
3347 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3348 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3349 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3350 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3351
3352 /* Wait for the outer child. */
3353 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3354 if (r < 0)
3355 goto finish;
3356 if (r != 0) {
3357 r = -EIO;
3358 goto finish;
3359 }
3360 pid = 0;
3361
3362 /* And now retrieve the PID of the inner child. */
3363 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3364 if (l < 0) {
3365 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3366 goto finish;
3367 }
3368 if (l != sizeof(pid)) {
3369 log_error("Short read while reading inner child PID.");
3370 r = EIO;
3371 goto finish;
3372 }
3373
3374 log_debug("Init process invoked as PID " PID_FMT, pid);
3375
3376 if (arg_userns) {
3377 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3378 log_error("Child died too early.");
3379 r = -ESRCH;
3380 goto finish;
3381 }
3382
3383 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3384 if (l < 0) {
3385 r = log_error_errno(errno, "Failed to read UID shift: %m");
3386 goto finish;
3387 }
3388 if (l != sizeof(arg_uid_shift)) {
3389 log_error("Short read while reading UID shift.");
3390 r = EIO;
3391 goto finish;
3392 }
3393
3394 r = setup_uid_map(pid);
3395 if (r < 0)
3396 goto finish;
3397
3398 (void) barrier_place(&barrier); /* #2 */
3399 }
3400
3401 if (arg_private_network) {
3402
3403 r = move_network_interfaces(pid, arg_network_interfaces);
3404 if (r < 0)
3405 goto finish;
3406
3407 if (arg_network_veth) {
3408 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3409 if (r < 0)
3410 goto finish;
3411 else if (r > 0)
3412 ifi = r;
3413
3414 if (arg_network_bridge) {
3415 r = setup_bridge(veth_name, arg_network_bridge);
3416 if (r < 0)
3417 goto finish;
3418 if (r > 0)
3419 ifi = r;
3420 }
3421 }
3422
3423 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3424 if (r < 0)
3425 goto finish;
3426
3427 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3428 if (r < 0)
3429 goto finish;
3430 }
3431
3432 if (arg_register) {
3433 r = register_machine(
3434 arg_machine,
3435 pid,
3436 arg_directory,
3437 arg_uuid,
3438 ifi,
3439 arg_slice,
3440 arg_custom_mounts, arg_n_custom_mounts,
3441 arg_kill_signal,
3442 arg_property,
3443 arg_keep_unit);
3444 if (r < 0)
3445 goto finish;
3446 }
3447
3448 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3449 if (r < 0)
3450 goto finish;
3451
3452 if (arg_keep_unit) {
3453 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3454 if (r < 0)
3455 goto finish;
3456 }
3457
3458 r = chown_cgroup(pid, arg_uid_shift);
3459 if (r < 0)
3460 goto finish;
3461
3462 /* Notify the child that the parent is ready with all
3463 * its setup (including cgroup-ification), and that
3464 * the child can now hand over control to the code to
3465 * run inside the container. */
3466 (void) barrier_place(&barrier); /* #3 */
3467
3468 /* Block SIGCHLD here, before notifying child.
3469 * process_pty() will handle it with the other signals. */
3470 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3471
3472 /* Reset signal to default */
3473 r = default_signals(SIGCHLD, -1);
3474 if (r < 0) {
3475 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3476 goto finish;
3477 }
3478
3479 /* Let the child know that we are ready, and wait until the child is completely ready as well. */
3480 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3481 log_error("Child died too early.");
3482 r = -ESRCH;
3483 goto finish;
3484 }
3485
3486 sd_notifyf(false,
3487 "READY=1\n"
3488 "STATUS=Container running.\n"
3489 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3490
3491 r = sd_event_new(&event);
3492 if (r < 0) {
3493 log_error_errno(r, "Failed to get default event source: %m");
3494 goto finish;
3495 }
3496
3497 if (arg_kill_signal > 0) {
3498 /* Try to kill the init system on SIGINT or SIGTERM */
3499 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3500 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3501 } else {
3502 /* Immediately exit */
3503 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3504 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3505 }
3506
3507 /* simply exit on sigchld */
3508 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3509
3510 if (arg_expose_ports) {
3511 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3512 if (r < 0)
3513 goto finish;
3514
3515 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3516 }
3517
3518 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3519
3520 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3521 if (r < 0) {
3522 log_error_errno(r, "Failed to create PTY forwarder: %m");
3523 goto finish;
3524 }
3525
3526 r = sd_event_loop(event);
3527 if (r < 0) {
3528 log_error_errno(r, "Failed to run event loop: %m");
3529 goto finish;
3530 }
3531
3532 pty_forward_get_last_char(forward, &last_char);
3533
3534 forward = pty_forward_free(forward);
3535
3536 if (!arg_quiet && last_char != '\n')
3537 putc('\n', stdout);
3538
3539 /* Kill if it is not dead yet anyway */
3540 if (arg_register && !arg_keep_unit)
3541 terminate_machine(pid);
3542
3543 /* Normally redundant, but better safe than sorry */
3544 kill(pid, SIGKILL);
3545
3546 r = wait_for_container(pid, &container_status);
3547 pid = 0;
3548
3549 if (r < 0)
3550 /* We failed to wait for the container, or the
3551 * container exited abnormally */
3552 goto finish;
3553 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3554 /* The container exited with a non-zero
3555 * status, or with zero status and no reboot
3556 * was requested. */
3557 ret = r;
3558 break;
3559 }
3560
3561 /* CONTAINER_REBOOTED, loop again */
3562
3563 if (arg_keep_unit) {
3564 /* Special handling if we are running as a
3565 * service: instead of simply restarting the
3566 * machine we want to restart the entire
3567 * service, so let's inform systemd about this
3568 * with the special exit code 133. The service
3569 * file uses RestartForceExitStatus=133 so
3570 * that this results in a full nspawn
3571 * restart. This is necessary since we might
3572 * have cgroup parameters set we want to have
3573 * flushed out. */
3574 ret = 133;
3575 r = 0;
3576 break;
3577 }
3578
3579 expose_port_flush(arg_expose_ports, &exposed);
3580 }
3581
3582 finish:
3583 sd_notify(false,
3584 "STOPPING=1\n"
3585 "STATUS=Terminating...");
3586
3587 if (pid > 0)
3588 kill(pid, SIGKILL);
3589
3590 /* Try to flush whatever is still queued in the pty */
3591 if (master >= 0)
3592 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3593
3594 loop_remove(loop_nr, &image_fd);
3595
3596 if (remove_subvol && arg_directory) {
3597 int k;
3598
3599 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
3600 if (k < 0)
3601 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3602 }
3603
3604 if (arg_machine) {
3605 const char *p;
3606
3607 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3608 (void) rm_rf(p, REMOVE_ROOT);
3609 }
3610
3611 expose_port_flush(arg_expose_ports, &exposed);
3612
3613 free(arg_directory);
3614 free(arg_template);
3615 free(arg_image);
3616 free(arg_machine);
3617 free(arg_user);
3618 strv_free(arg_setenv);
3619 free(arg_network_bridge);
3620 strv_free(arg_network_interfaces);
3621 strv_free(arg_network_macvlan);
3622 strv_free(arg_network_ipvlan);
3623 strv_free(arg_parameters);
3624 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3625 expose_port_free_all(arg_expose_ports);
3626
3627 return r < 0 ? EXIT_FAILURE : ret;
3628 }