git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: allow configuration of user namespaces in .nspawn files
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #ifdef HAVE_BLKID
21 #include <blkid/blkid.h>
22 #endif
23 #include <errno.h>
24 #include <getopt.h>
25 #include <grp.h>
26 #include <linux/loop.h>
27 #include <pwd.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "alloc-util.h"
50 #include "barrier.h"
51 #include "base-filesystem.h"
52 #include "blkid-util.h"
53 #include "btrfs-util.h"
54 #include "cap-list.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
57 #include "copy.h"
58 #include "dev-setup.h"
59 #include "env-util.h"
60 #include "fd-util.h"
61 #include "fdset.h"
62 #include "fileio.h"
63 #include "formats-util.h"
64 #include "fs-util.h"
65 #include "gpt.h"
66 #include "hostname-util.h"
67 #include "log.h"
68 #include "loopback-setup.h"
69 #include "machine-id-setup.h"
70 #include "machine-image.h"
71 #include "macro.h"
72 #include "missing.h"
73 #include "mkdir.h"
74 #include "mount-util.h"
75 #include "netlink-util.h"
76 #include "nspawn-cgroup.h"
77 #include "nspawn-expose-ports.h"
78 #include "nspawn-mount.h"
79 #include "nspawn-network.h"
80 #include "nspawn-patch-uid.h"
81 #include "nspawn-register.h"
82 #include "nspawn-settings.h"
83 #include "nspawn-setuid.h"
84 #include "nspawn-stub-pid1.h"
85 #include "parse-util.h"
86 #include "path-util.h"
87 #include "process-util.h"
88 #include "ptyfwd.h"
89 #include "random-util.h"
90 #include "rm-rf.h"
91 #ifdef HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94 #include "selinux-util.h"
95 #include "signal-util.h"
96 #include "socket-util.h"
97 #include "stat-util.h"
98 #include "stdio-util.h"
99 #include "string-util.h"
100 #include "strv.h"
101 #include "terminal-util.h"
102 #include "udev-util.h"
103 #include "umask-util.h"
104 #include "user-util.h"
105 #include "util.h"
106
/* Bounds for UID shift picking. The gid= parameter of devpts parses GIDs as signed values, which is why the
 * upper half of the 32bit UID range is avoided here. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
111
/* Container end states. NOTE(review): the consumers of this enum are outside this excerpt; presumably it
 * distinguishes a final exit from a reboot request — confirm against the callers. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED,
} ContainerStatus;
116
/* Journal linking modes, selected with --link-journal= (and -j) in parse_argv(). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host and try-host */
        LINK_GUEST,     /* --link-journal=guest and try-guest, also -j */
} LinkJournal;
123
124 static char *arg_directory = NULL;
125 static char *arg_template = NULL;
126 static char *arg_chdir = NULL;
127 static char *arg_user = NULL;
128 static sd_id128_t arg_uuid = {};
129 static char *arg_machine = NULL;
130 static const char *arg_selinux_context = NULL;
131 static const char *arg_selinux_apifs_context = NULL;
132 static const char *arg_slice = NULL;
133 static bool arg_private_network = false;
134 static bool arg_read_only = false;
135 static StartMode arg_start_mode = START_PID1;
136 static bool arg_ephemeral = false;
137 static LinkJournal arg_link_journal = LINK_AUTO;
138 static bool arg_link_journal_try = false;
139 static uint64_t arg_retain =
140 (1ULL << CAP_CHOWN) |
141 (1ULL << CAP_DAC_OVERRIDE) |
142 (1ULL << CAP_DAC_READ_SEARCH) |
143 (1ULL << CAP_FOWNER) |
144 (1ULL << CAP_FSETID) |
145 (1ULL << CAP_IPC_OWNER) |
146 (1ULL << CAP_KILL) |
147 (1ULL << CAP_LEASE) |
148 (1ULL << CAP_LINUX_IMMUTABLE) |
149 (1ULL << CAP_NET_BIND_SERVICE) |
150 (1ULL << CAP_NET_BROADCAST) |
151 (1ULL << CAP_NET_RAW) |
152 (1ULL << CAP_SETGID) |
153 (1ULL << CAP_SETFCAP) |
154 (1ULL << CAP_SETPCAP) |
155 (1ULL << CAP_SETUID) |
156 (1ULL << CAP_SYS_ADMIN) |
157 (1ULL << CAP_SYS_CHROOT) |
158 (1ULL << CAP_SYS_NICE) |
159 (1ULL << CAP_SYS_PTRACE) |
160 (1ULL << CAP_SYS_TTY_CONFIG) |
161 (1ULL << CAP_SYS_RESOURCE) |
162 (1ULL << CAP_SYS_BOOT) |
163 (1ULL << CAP_AUDIT_WRITE) |
164 (1ULL << CAP_AUDIT_CONTROL) |
165 (1ULL << CAP_MKNOD);
166 static CustomMount *arg_custom_mounts = NULL;
167 static unsigned arg_n_custom_mounts = 0;
168 static char **arg_setenv = NULL;
169 static bool arg_quiet = false;
170 static bool arg_share_system = false;
171 static bool arg_register = true;
172 static bool arg_keep_unit = false;
173 static char **arg_network_interfaces = NULL;
174 static char **arg_network_macvlan = NULL;
175 static char **arg_network_ipvlan = NULL;
176 static bool arg_network_veth = false;
177 static char **arg_network_veth_extra = NULL;
178 static char *arg_network_bridge = NULL;
179 static unsigned long arg_personality = PERSONALITY_INVALID;
180 static char *arg_image = NULL;
181 static VolatileMode arg_volatile_mode = VOLATILE_NO;
182 static ExposePort *arg_expose_ports = NULL;
183 static char **arg_property = NULL;
184 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
185 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
186 static bool arg_userns_chown = false;
187 static int arg_kill_signal = 0;
188 static bool arg_unified_cgroup_hierarchy = false;
189 static SettingsMask arg_settings_mask = 0;
190 static int arg_settings_trusted = -1;
191 static char **arg_parameters = NULL;
192 static const char *arg_container_service_name = "systemd-nspawn";
193
194 static void help(void) {
195 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
196 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
197 " -h --help Show this help\n"
198 " --version Print version string\n"
199 " -q --quiet Do not show status information\n"
200 " -D --directory=PATH Root directory for the container\n"
201 " --template=PATH Initialize root directory from template directory,\n"
202 " if missing\n"
203 " -x --ephemeral Run container with snapshot of root directory, and\n"
204 " remove it after exit\n"
205 " -i --image=PATH File system device or disk image for the container\n"
206 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
207 " -b --boot Boot up full system (i.e. invoke init)\n"
208 " --chdir=PATH Set working directory in the container\n"
209 " -u --user=USER Run the command under specified user or uid\n"
210 " -M --machine=NAME Set the machine name for the container\n"
211 " --uuid=UUID Set a specific machine UUID for the container\n"
212 " -S --slice=SLICE Place the container in the specified slice\n"
213 " --property=NAME=VALUE Set scope unit property\n"
214 " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n"
215 " --private-users[=UIDBASE[:NUIDS]]\n"
216 " Run within user namespace, user configured UID/GID range\n"
217 " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n"
218 " --private-network Disable network in container\n"
219 " --network-interface=INTERFACE\n"
220 " Assign an existing network interface to the\n"
221 " container\n"
222 " --network-macvlan=INTERFACE\n"
223 " Create a macvlan network interface based on an\n"
224 " existing network interface to the container\n"
225 " --network-ipvlan=INTERFACE\n"
226 " Create a ipvlan network interface based on an\n"
227 " existing network interface to the container\n"
228 " -n --network-veth Add a virtual Ethernet connection between host\n"
229 " and container\n"
230 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
231 " Add an additional virtual Ethernet link between\n"
232 " host and container\n"
233 " --network-bridge=INTERFACE\n"
234 " Add a virtual Ethernet connection between host\n"
235 " and container and add it to an existing bridge on\n"
236 " the host\n"
237 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
238 " Expose a container IP port on the host\n"
239 " -Z --selinux-context=SECLABEL\n"
240 " Set the SELinux security context to be used by\n"
241 " processes in the container\n"
242 " -L --selinux-apifs-context=SECLABEL\n"
243 " Set the SELinux security context to be used by\n"
244 " API/tmpfs file systems in the container\n"
245 " --capability=CAP In addition to the default, retain specified\n"
246 " capability\n"
247 " --drop-capability=CAP Drop the specified capability from the default set\n"
248 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
249 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
250 " host, try-guest, try-host\n"
251 " -j Equivalent to --link-journal=try-guest\n"
252 " --read-only Mount the root directory read-only\n"
253 " --bind=PATH[:PATH[:OPTIONS]]\n"
254 " Bind mount a file or directory from the host into\n"
255 " the container\n"
256 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
257 " Similar, but creates a read-only bind mount\n"
258 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
259 " --overlay=PATH[:PATH...]:PATH\n"
260 " Create an overlay mount from the host to \n"
261 " the container\n"
262 " --overlay-ro=PATH[:PATH...]:PATH\n"
263 " Similar, but creates a read-only overlay mount\n"
264 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
265 " --share-system Share system namespaces with host\n"
266 " --register=BOOLEAN Register container as machine\n"
267 " --keep-unit Do not register a scope for the machine, reuse\n"
268 " the service unit nspawn is running in\n"
269 " --volatile[=MODE] Run the system in volatile mode\n"
270 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
271 , program_invocation_short_name);
272 }
273
274
275 static int custom_mounts_prepare(void) {
276 unsigned i;
277 int r;
278
279 /* Ensure the mounts are applied prefix first. */
280 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
281
282 /* Allocate working directories for the overlay file systems that need it */
283 for (i = 0; i < arg_n_custom_mounts; i++) {
284 CustomMount *m = &arg_custom_mounts[i];
285
286 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
287
288 if (arg_userns_chown) {
289 log_error("--private-users-chown may not be combined with custom root mounts.");
290 return -EINVAL;
291 } else if (arg_uid_shift == UID_INVALID) {
292 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
293 return -EINVAL;
294 }
295 }
296
297 if (m->type != CUSTOM_MOUNT_OVERLAY)
298 continue;
299
300 if (m->work_dir)
301 continue;
302
303 if (m->read_only)
304 continue;
305
306 r = tempfn_random(m->source, NULL, &m->work_dir);
307 if (r < 0)
308 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
309 }
310
311 return 0;
312 }
313
314 static int detect_unified_cgroup_hierarchy(void) {
315 const char *e;
316 int r;
317
318 /* Allow the user to control whether the unified hierarchy is used */
319 e = getenv("UNIFIED_CGROUP_HIERARCHY");
320 if (e) {
321 r = parse_boolean(e);
322 if (r < 0)
323 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
324
325 arg_unified_cgroup_hierarchy = r;
326 return 0;
327 }
328
329 /* Otherwise inherit the default from the host system */
330 r = cg_unified();
331 if (r < 0)
332 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
333
334 arg_unified_cgroup_hierarchy = r;
335 return 0;
336 }
337
338 static int parse_argv(int argc, char *argv[]) {
339
340 enum {
341 ARG_VERSION = 0x100,
342 ARG_PRIVATE_NETWORK,
343 ARG_UUID,
344 ARG_READ_ONLY,
345 ARG_CAPABILITY,
346 ARG_DROP_CAPABILITY,
347 ARG_LINK_JOURNAL,
348 ARG_BIND,
349 ARG_BIND_RO,
350 ARG_TMPFS,
351 ARG_OVERLAY,
352 ARG_OVERLAY_RO,
353 ARG_SHARE_SYSTEM,
354 ARG_REGISTER,
355 ARG_KEEP_UNIT,
356 ARG_NETWORK_INTERFACE,
357 ARG_NETWORK_MACVLAN,
358 ARG_NETWORK_IPVLAN,
359 ARG_NETWORK_BRIDGE,
360 ARG_NETWORK_VETH_EXTRA,
361 ARG_PERSONALITY,
362 ARG_VOLATILE,
363 ARG_TEMPLATE,
364 ARG_PROPERTY,
365 ARG_PRIVATE_USERS,
366 ARG_KILL_SIGNAL,
367 ARG_SETTINGS,
368 ARG_CHDIR,
369 ARG_PRIVATE_USERS_CHOWN,
370 };
371
372 static const struct option options[] = {
373 { "help", no_argument, NULL, 'h' },
374 { "version", no_argument, NULL, ARG_VERSION },
375 { "directory", required_argument, NULL, 'D' },
376 { "template", required_argument, NULL, ARG_TEMPLATE },
377 { "ephemeral", no_argument, NULL, 'x' },
378 { "user", required_argument, NULL, 'u' },
379 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
380 { "as-pid2", no_argument, NULL, 'a' },
381 { "boot", no_argument, NULL, 'b' },
382 { "uuid", required_argument, NULL, ARG_UUID },
383 { "read-only", no_argument, NULL, ARG_READ_ONLY },
384 { "capability", required_argument, NULL, ARG_CAPABILITY },
385 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
386 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
387 { "bind", required_argument, NULL, ARG_BIND },
388 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
389 { "tmpfs", required_argument, NULL, ARG_TMPFS },
390 { "overlay", required_argument, NULL, ARG_OVERLAY },
391 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
392 { "machine", required_argument, NULL, 'M' },
393 { "slice", required_argument, NULL, 'S' },
394 { "setenv", required_argument, NULL, 'E' },
395 { "selinux-context", required_argument, NULL, 'Z' },
396 { "selinux-apifs-context", required_argument, NULL, 'L' },
397 { "quiet", no_argument, NULL, 'q' },
398 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
399 { "register", required_argument, NULL, ARG_REGISTER },
400 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
401 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
402 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
403 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
404 { "network-veth", no_argument, NULL, 'n' },
405 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
406 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
407 { "personality", required_argument, NULL, ARG_PERSONALITY },
408 { "image", required_argument, NULL, 'i' },
409 { "volatile", optional_argument, NULL, ARG_VOLATILE },
410 { "port", required_argument, NULL, 'p' },
411 { "property", required_argument, NULL, ARG_PROPERTY },
412 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
413 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
414 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
415 { "settings", required_argument, NULL, ARG_SETTINGS },
416 { "chdir", required_argument, NULL, ARG_CHDIR },
417 {}
418 };
419
420 int c, r;
421 const char *p, *e;
422 uint64_t plus = 0, minus = 0;
423 bool mask_all_settings = false, mask_no_settings = false;
424
425 assert(argc >= 0);
426 assert(argv);
427
428 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
429
430 switch (c) {
431
432 case 'h':
433 help();
434 return 0;
435
436 case ARG_VERSION:
437 return version();
438
439 case 'D':
440 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
441 if (r < 0)
442 return r;
443 break;
444
445 case ARG_TEMPLATE:
446 r = parse_path_argument_and_warn(optarg, false, &arg_template);
447 if (r < 0)
448 return r;
449 break;
450
451 case 'i':
452 r = parse_path_argument_and_warn(optarg, false, &arg_image);
453 if (r < 0)
454 return r;
455 break;
456
457 case 'x':
458 arg_ephemeral = true;
459 break;
460
461 case 'u':
462 r = free_and_strdup(&arg_user, optarg);
463 if (r < 0)
464 return log_oom();
465
466 arg_settings_mask |= SETTING_USER;
467 break;
468
469 case ARG_NETWORK_BRIDGE:
470 r = free_and_strdup(&arg_network_bridge, optarg);
471 if (r < 0)
472 return log_oom();
473
474 /* fall through */
475
476 case 'n':
477 arg_network_veth = true;
478 arg_private_network = true;
479 arg_settings_mask |= SETTING_NETWORK;
480 break;
481
482 case ARG_NETWORK_VETH_EXTRA:
483 r = veth_extra_parse(&arg_network_veth_extra, optarg);
484 if (r < 0)
485 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
486
487 arg_private_network = true;
488 arg_settings_mask |= SETTING_NETWORK;
489 break;
490
491 case ARG_NETWORK_INTERFACE:
492 if (strv_extend(&arg_network_interfaces, optarg) < 0)
493 return log_oom();
494
495 arg_private_network = true;
496 arg_settings_mask |= SETTING_NETWORK;
497 break;
498
499 case ARG_NETWORK_MACVLAN:
500 if (strv_extend(&arg_network_macvlan, optarg) < 0)
501 return log_oom();
502
503 arg_private_network = true;
504 arg_settings_mask |= SETTING_NETWORK;
505 break;
506
507 case ARG_NETWORK_IPVLAN:
508 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
509 return log_oom();
510
511 /* fall through */
512
513 case ARG_PRIVATE_NETWORK:
514 arg_private_network = true;
515 arg_settings_mask |= SETTING_NETWORK;
516 break;
517
518 case 'b':
519 if (arg_start_mode == START_PID2) {
520 log_error("--boot and --as-pid2 may not be combined.");
521 return -EINVAL;
522 }
523
524 arg_start_mode = START_BOOT;
525 arg_settings_mask |= SETTING_START_MODE;
526 break;
527
528 case 'a':
529 if (arg_start_mode == START_BOOT) {
530 log_error("--boot and --as-pid2 may not be combined.");
531 return -EINVAL;
532 }
533
534 arg_start_mode = START_PID2;
535 arg_settings_mask |= SETTING_START_MODE;
536 break;
537
538 case ARG_UUID:
539 r = sd_id128_from_string(optarg, &arg_uuid);
540 if (r < 0) {
541 log_error("Invalid UUID: %s", optarg);
542 return r;
543 }
544
545 arg_settings_mask |= SETTING_MACHINE_ID;
546 break;
547
548 case 'S':
549 arg_slice = optarg;
550 break;
551
552 case 'M':
553 if (isempty(optarg))
554 arg_machine = mfree(arg_machine);
555 else {
556 if (!machine_name_is_valid(optarg)) {
557 log_error("Invalid machine name: %s", optarg);
558 return -EINVAL;
559 }
560
561 r = free_and_strdup(&arg_machine, optarg);
562 if (r < 0)
563 return log_oom();
564
565 break;
566 }
567
568 case 'Z':
569 arg_selinux_context = optarg;
570 break;
571
572 case 'L':
573 arg_selinux_apifs_context = optarg;
574 break;
575
576 case ARG_READ_ONLY:
577 arg_read_only = true;
578 arg_settings_mask |= SETTING_READ_ONLY;
579 break;
580
581 case ARG_CAPABILITY:
582 case ARG_DROP_CAPABILITY: {
583 p = optarg;
584 for (;;) {
585 _cleanup_free_ char *t = NULL;
586
587 r = extract_first_word(&p, &t, ",", 0);
588 if (r < 0)
589 return log_error_errno(r, "Failed to parse capability %s.", t);
590
591 if (r == 0)
592 break;
593
594 if (streq(t, "all")) {
595 if (c == ARG_CAPABILITY)
596 plus = (uint64_t) -1;
597 else
598 minus = (uint64_t) -1;
599 } else {
600 int cap;
601
602 cap = capability_from_name(t);
603 if (cap < 0) {
604 log_error("Failed to parse capability %s.", t);
605 return -EINVAL;
606 }
607
608 if (c == ARG_CAPABILITY)
609 plus |= 1ULL << (uint64_t) cap;
610 else
611 minus |= 1ULL << (uint64_t) cap;
612 }
613 }
614
615 arg_settings_mask |= SETTING_CAPABILITY;
616 break;
617 }
618
619 case 'j':
620 arg_link_journal = LINK_GUEST;
621 arg_link_journal_try = true;
622 break;
623
624 case ARG_LINK_JOURNAL:
625 if (streq(optarg, "auto")) {
626 arg_link_journal = LINK_AUTO;
627 arg_link_journal_try = false;
628 } else if (streq(optarg, "no")) {
629 arg_link_journal = LINK_NO;
630 arg_link_journal_try = false;
631 } else if (streq(optarg, "guest")) {
632 arg_link_journal = LINK_GUEST;
633 arg_link_journal_try = false;
634 } else if (streq(optarg, "host")) {
635 arg_link_journal = LINK_HOST;
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "try-guest")) {
638 arg_link_journal = LINK_GUEST;
639 arg_link_journal_try = true;
640 } else if (streq(optarg, "try-host")) {
641 arg_link_journal = LINK_HOST;
642 arg_link_journal_try = true;
643 } else {
644 log_error("Failed to parse link journal mode %s", optarg);
645 return -EINVAL;
646 }
647
648 break;
649
650 case ARG_BIND:
651 case ARG_BIND_RO:
652 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
653 if (r < 0)
654 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
655
656 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
657 break;
658
659 case ARG_TMPFS:
660 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
661 if (r < 0)
662 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
663
664 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
665 break;
666
667 case ARG_OVERLAY:
668 case ARG_OVERLAY_RO: {
669 _cleanup_free_ char *upper = NULL, *destination = NULL;
670 _cleanup_strv_free_ char **lower = NULL;
671 CustomMount *m;
672 unsigned n = 0;
673 char **i;
674
675 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
676 if (r == -ENOMEM)
677 return log_oom();
678 else if (r < 0) {
679 log_error("Invalid overlay specification: %s", optarg);
680 return r;
681 }
682
683 STRV_FOREACH(i, lower) {
684 if (!path_is_absolute(*i)) {
685 log_error("Overlay path %s is not absolute.", *i);
686 return -EINVAL;
687 }
688
689 n++;
690 }
691
692 if (n < 2) {
693 log_error("--overlay= needs at least two colon-separated directories specified.");
694 return -EINVAL;
695 }
696
697 if (n == 2) {
698 /* If two parameters are specified,
699 * the first one is the lower, the
700 * second one the upper directory. And
701 * we'll also define the destination
702 * mount point the same as the upper. */
703 upper = lower[1];
704 lower[1] = NULL;
705
706 destination = strdup(upper);
707 if (!destination)
708 return log_oom();
709
710 } else {
711 upper = lower[n - 2];
712 destination = lower[n - 1];
713 lower[n - 2] = NULL;
714 }
715
716 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
717 if (!m)
718 return log_oom();
719
720 m->destination = destination;
721 m->source = upper;
722 m->lower = lower;
723 m->read_only = c == ARG_OVERLAY_RO;
724
725 upper = destination = NULL;
726 lower = NULL;
727
728 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
729 break;
730 }
731
732 case 'E': {
733 char **n;
734
735 if (!env_assignment_is_valid(optarg)) {
736 log_error("Environment variable assignment '%s' is not valid.", optarg);
737 return -EINVAL;
738 }
739
740 n = strv_env_set(arg_setenv, optarg);
741 if (!n)
742 return log_oom();
743
744 strv_free(arg_setenv);
745 arg_setenv = n;
746
747 arg_settings_mask |= SETTING_ENVIRONMENT;
748 break;
749 }
750
751 case 'q':
752 arg_quiet = true;
753 break;
754
755 case ARG_SHARE_SYSTEM:
756 arg_share_system = true;
757 break;
758
759 case ARG_REGISTER:
760 r = parse_boolean(optarg);
761 if (r < 0) {
762 log_error("Failed to parse --register= argument: %s", optarg);
763 return r;
764 }
765
766 arg_register = r;
767 break;
768
769 case ARG_KEEP_UNIT:
770 arg_keep_unit = true;
771 break;
772
773 case ARG_PERSONALITY:
774
775 arg_personality = personality_from_string(optarg);
776 if (arg_personality == PERSONALITY_INVALID) {
777 log_error("Unknown or unsupported personality '%s'.", optarg);
778 return -EINVAL;
779 }
780
781 arg_settings_mask |= SETTING_PERSONALITY;
782 break;
783
784 case ARG_VOLATILE:
785
786 if (!optarg)
787 arg_volatile_mode = VOLATILE_YES;
788 else {
789 VolatileMode m;
790
791 m = volatile_mode_from_string(optarg);
792 if (m < 0) {
793 log_error("Failed to parse --volatile= argument: %s", optarg);
794 return -EINVAL;
795 } else
796 arg_volatile_mode = m;
797 }
798
799 arg_settings_mask |= SETTING_VOLATILE_MODE;
800 break;
801
802 case 'p':
803 r = expose_port_parse(&arg_expose_ports, optarg);
804 if (r == -EEXIST)
805 return log_error_errno(r, "Duplicate port specification: %s", optarg);
806 if (r < 0)
807 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
808
809 arg_settings_mask |= SETTING_EXPOSE_PORTS;
810 break;
811
812 case ARG_PROPERTY:
813 if (strv_extend(&arg_property, optarg) < 0)
814 return log_oom();
815
816 break;
817
818 case ARG_PRIVATE_USERS:
819
820 r = optarg ? parse_boolean(optarg) : 1;
821 if (r == 0) {
822 /* no: User namespacing off */
823 arg_userns_mode = USER_NAMESPACE_NO;
824 arg_uid_shift = UID_INVALID;
825 arg_uid_range = UINT32_C(0x10000);
826 } else if (r > 0) {
827 /* yes: User namespacing on, UID range is read from root dir */
828 arg_userns_mode = USER_NAMESPACE_FIXED;
829 arg_uid_shift = UID_INVALID;
830 arg_uid_range = UINT32_C(0x10000);
831 } else if (streq(optarg, "pick")) {
832 /* pick: User namespacing on, UID range is picked randomly */
833 arg_userns_mode = USER_NAMESPACE_PICK;
834 arg_uid_shift = UID_INVALID;
835 arg_uid_range = UINT32_C(0x10000);
836 } else {
837 _cleanup_free_ char *buffer = NULL;
838 const char *range, *shift;
839
840 /* anything else: User namespacing on, UID range is explicitly configured */
841
842 range = strchr(optarg, ':');
843 if (range) {
844 buffer = strndup(optarg, range - optarg);
845 if (!buffer)
846 return log_oom();
847 shift = buffer;
848
849 range++;
850 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
851 log_error("Failed to parse UID range: %s", range);
852 return -EINVAL;
853 }
854 } else
855 shift = optarg;
856
857 if (parse_uid(shift, &arg_uid_shift) < 0) {
858 log_error("Failed to parse UID: %s", optarg);
859 return -EINVAL;
860 }
861
862 arg_userns_mode = USER_NAMESPACE_FIXED;
863 }
864
865 arg_settings_mask |= SETTING_USERNS;
866 break;
867
868 case 'U':
869 arg_userns_mode = USER_NAMESPACE_PICK;
870 arg_uid_shift = UID_INVALID;
871 arg_uid_range = UINT32_C(0x10000);
872
873 arg_settings_mask |= SETTING_USERNS;
874 break;
875
876 case ARG_PRIVATE_USERS_CHOWN:
877 arg_userns_chown = true;
878
879 arg_settings_mask |= SETTING_USERNS;
880 break;
881
882 case ARG_KILL_SIGNAL:
883 arg_kill_signal = signal_from_string_try_harder(optarg);
884 if (arg_kill_signal < 0) {
885 log_error("Cannot parse signal: %s", optarg);
886 return -EINVAL;
887 }
888
889 arg_settings_mask |= SETTING_KILL_SIGNAL;
890 break;
891
892 case ARG_SETTINGS:
893
894 /* no → do not read files
895 * yes → read files, do not override cmdline, trust only subset
896 * override → read files, override cmdline, trust only subset
897 * trusted → read files, do not override cmdline, trust all
898 */
899
900 r = parse_boolean(optarg);
901 if (r < 0) {
902 if (streq(optarg, "trusted")) {
903 mask_all_settings = false;
904 mask_no_settings = false;
905 arg_settings_trusted = true;
906
907 } else if (streq(optarg, "override")) {
908 mask_all_settings = false;
909 mask_no_settings = true;
910 arg_settings_trusted = -1;
911 } else
912 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
913 } else if (r > 0) {
914 /* yes */
915 mask_all_settings = false;
916 mask_no_settings = false;
917 arg_settings_trusted = -1;
918 } else {
919 /* no */
920 mask_all_settings = true;
921 mask_no_settings = false;
922 arg_settings_trusted = false;
923 }
924
925 break;
926
927 case ARG_CHDIR:
928 if (!path_is_absolute(optarg)) {
929 log_error("Working directory %s is not an absolute path.", optarg);
930 return -EINVAL;
931 }
932
933 r = free_and_strdup(&arg_chdir, optarg);
934 if (r < 0)
935 return log_oom();
936
937 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
938 break;
939
940 case '?':
941 return -EINVAL;
942
943 default:
944 assert_not_reached("Unhandled option");
945 }
946
947 if (arg_share_system)
948 arg_register = false;
949
950 if (arg_userns_mode == USER_NAMESPACE_PICK)
951 arg_userns_chown = true;
952
953 if (arg_start_mode != START_PID1 && arg_share_system) {
954 log_error("--boot and --share-system may not be combined.");
955 return -EINVAL;
956 }
957
958 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
959 log_error("--keep-unit may not be used when invoked from a user session.");
960 return -EINVAL;
961 }
962
963 if (arg_directory && arg_image) {
964 log_error("--directory= and --image= may not be combined.");
965 return -EINVAL;
966 }
967
968 if (arg_template && arg_image) {
969 log_error("--template= and --image= may not be combined.");
970 return -EINVAL;
971 }
972
973 if (arg_template && !(arg_directory || arg_machine)) {
974 log_error("--template= needs --directory= or --machine=.");
975 return -EINVAL;
976 }
977
978 if (arg_ephemeral && arg_template) {
979 log_error("--ephemeral and --template= may not be combined.");
980 return -EINVAL;
981 }
982
983 if (arg_ephemeral && arg_image) {
984 log_error("--ephemeral and --image= may not be combined.");
985 return -EINVAL;
986 }
987
988 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
989 log_error("--ephemeral and --link-journal= may not be combined.");
990 return -EINVAL;
991 }
992
993 if (arg_userns_mode != USER_NAMESPACE_NO && access("/proc/self/uid_map", F_OK) < 0) {
994 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
995 return -EOPNOTSUPP;
996 }
997
998 if (arg_userns_chown && arg_read_only) {
999 log_error("--read-only and --private-users-chown may not be combined.");
1000 return -EINVAL;
1001 }
1002
1003 if (argc > optind) {
1004 arg_parameters = strv_copy(argv + optind);
1005 if (!arg_parameters)
1006 return log_oom();
1007
1008 arg_settings_mask |= SETTING_START_MODE;
1009 }
1010
1011 /* Load all settings from .nspawn files */
1012 if (mask_no_settings)
1013 arg_settings_mask = 0;
1014
1015 /* Don't load any settings from .nspawn files */
1016 if (mask_all_settings)
1017 arg_settings_mask = _SETTINGS_MASK_ALL;
1018
1019 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1020
1021 r = detect_unified_cgroup_hierarchy();
1022 if (r < 0)
1023 return r;
1024
1025 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1026 if (e)
1027 arg_container_service_name = e;
1028
1029 return 1;
1030 }
1031
1032 static int verify_arguments(void) {
1033
1034 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
1035 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1036 return -EINVAL;
1037 }
1038
1039 if (arg_expose_ports && !arg_private_network) {
1040 log_error("Cannot use --port= without private networking.");
1041 return -EINVAL;
1042 }
1043
1044 #ifndef HAVE_LIBIPTC
1045 if (arg_expose_ports) {
1046 log_error("--port= is not supported, compiled without libiptc support.");
1047 return -EOPNOTSUPP;
1048 }
1049 #endif
1050
1051 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1052 arg_kill_signal = SIGRTMIN+3;
1053
1054 return 0;
1055 }
1056
1057 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1058 assert(p);
1059
1060 if (arg_userns_mode == USER_NAMESPACE_NO)
1061 return 0;
1062
1063 if (uid == UID_INVALID && gid == GID_INVALID)
1064 return 0;
1065
1066 if (uid != UID_INVALID) {
1067 uid += arg_uid_shift;
1068
1069 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1070 return -EOVERFLOW;
1071 }
1072
1073 if (gid != GID_INVALID) {
1074 gid += (gid_t) arg_uid_shift;
1075
1076 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1077 return -EOVERFLOW;
1078 }
1079
1080 if (lchown(p, uid, gid) < 0)
1081 return -errno;
1082
1083 return 0;
1084 }
1085
/* Creates the directory path (prefixed with root) with the given mode and, if it was newly created, passes
 * it to userns_lchown() with the given UID/GID.
 *
 * An already existing path is left untouched and counts as success. Returns 0 on success, -errno if mkdir()
 * fails for any other reason, or the error returned by userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        /* Something is there already: keep it as is, including its ownership */
        return errno == EEXIST ? 0 : -errno;
}
1098
1099 static int setup_timezone(const char *dest) {
1100 _cleanup_free_ char *p = NULL, *q = NULL;
1101 const char *where, *check, *what;
1102 char *z, *y;
1103 int r;
1104
1105 assert(dest);
1106
1107 /* Fix the timezone, if possible */
1108 r = readlink_malloc("/etc/localtime", &p);
1109 if (r < 0) {
1110 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1111 return 0;
1112 }
1113
1114 z = path_startswith(p, "../usr/share/zoneinfo/");
1115 if (!z)
1116 z = path_startswith(p, "/usr/share/zoneinfo/");
1117 if (!z) {
1118 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1119 return 0;
1120 }
1121
1122 where = prefix_roota(dest, "/etc/localtime");
1123 r = readlink_malloc(where, &q);
1124 if (r >= 0) {
1125 y = path_startswith(q, "../usr/share/zoneinfo/");
1126 if (!y)
1127 y = path_startswith(q, "/usr/share/zoneinfo/");
1128
1129 /* Already pointing to the right place? Then do nothing .. */
1130 if (y && streq(y, z))
1131 return 0;
1132 }
1133
1134 check = strjoina("/usr/share/zoneinfo/", z);
1135 check = prefix_roota(dest, check);
1136 if (laccess(check, F_OK) < 0) {
1137 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1138 return 0;
1139 }
1140
1141 r = unlink(where);
1142 if (r < 0 && errno != ENOENT) {
1143 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1144 return 0;
1145 }
1146
1147 what = strjoina("../usr/share/zoneinfo/", z);
1148 if (symlink(what, where) < 0) {
1149 log_error_errno(errno, "Failed to correct timezone of container: %m");
1150 return 0;
1151 }
1152
1153 r = userns_lchown(where, 0, 0);
1154 if (r < 0)
1155 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1156
1157 return 0;
1158 }
1159
1160 static int setup_resolv_conf(const char *dest) {
1161 const char *where = NULL;
1162 int r;
1163
1164 assert(dest);
1165
1166 if (arg_private_network)
1167 return 0;
1168
1169 /* Fix resolv.conf, if possible */
1170 where = prefix_roota(dest, "/etc/resolv.conf");
1171
1172 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1173 if (r < 0) {
1174 /* If the file already exists as symlink, let's
1175 * suppress the warning, under the assumption that
1176 * resolved or something similar runs inside and the
1177 * symlink points there.
1178 *
1179 * If the disk image is read-only, there's also no
1180 * point in complaining.
1181 */
1182 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1183 "Failed to copy /etc/resolv.conf to %s: %m", where);
1184 return 0;
1185 }
1186
1187 r = userns_lchown(where, 0, 0);
1188 if (r < 0)
1189 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1190
1191 return 0;
1192 }
1193
1194 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1195 assert(s);
1196
1197 snprintf(s, 37,
1198 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1199 SD_ID128_FORMAT_VAL(id));
1200
1201 return s;
1202 }
1203
1204 static int setup_boot_id(const char *dest) {
1205 const char *from, *to;
1206 sd_id128_t rnd = {};
1207 char as_uuid[37];
1208 int r;
1209
1210 if (arg_share_system)
1211 return 0;
1212
1213 /* Generate a new randomized boot ID, so that each boot-up of
1214 * the container gets a new one */
1215
1216 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1217 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1218
1219 r = sd_id128_randomize(&rnd);
1220 if (r < 0)
1221 return log_error_errno(r, "Failed to generate random boot id: %m");
1222
1223 id128_format_as_uuid(rnd, as_uuid);
1224
1225 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1226 if (r < 0)
1227 return log_error_errno(r, "Failed to write boot id: %m");
1228
1229 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1230 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1231 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1232 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1233
1234 unlink(from);
1235 return r;
1236 }
1237
1238 static int copy_devnodes(const char *dest) {
1239
1240 static const char devnodes[] =
1241 "null\0"
1242 "zero\0"
1243 "full\0"
1244 "random\0"
1245 "urandom\0"
1246 "tty\0"
1247 "net/tun\0";
1248
1249 const char *d;
1250 int r = 0;
1251 _cleanup_umask_ mode_t u;
1252
1253 assert(dest);
1254
1255 u = umask(0000);
1256
1257 /* Create /dev/net, so that we can create /dev/net/tun in it */
1258 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1259 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1260
1261 NULSTR_FOREACH(d, devnodes) {
1262 _cleanup_free_ char *from = NULL, *to = NULL;
1263 struct stat st;
1264
1265 from = strappend("/dev/", d);
1266 to = prefix_root(dest, from);
1267
1268 if (stat(from, &st) < 0) {
1269
1270 if (errno != ENOENT)
1271 return log_error_errno(errno, "Failed to stat %s: %m", from);
1272
1273 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1274
1275 log_error("%s is not a char or block device, cannot copy.", from);
1276 return -EIO;
1277
1278 } else {
1279 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1280 if (errno != EPERM)
1281 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1282
1283 /* Some systems abusively restrict mknod but
1284 * allow bind mounts. */
1285 r = touch(to);
1286 if (r < 0)
1287 return log_error_errno(r, "touch (%s) failed: %m", to);
1288 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1289 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1290 }
1291
1292 r = userns_lchown(to, 0, 0);
1293 if (r < 0)
1294 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1295 }
1296 }
1297
1298 return r;
1299 }
1300
1301 static int setup_pts(const char *dest) {
1302 _cleanup_free_ char *options = NULL;
1303 const char *p;
1304 int r;
1305
1306 #ifdef HAVE_SELINUX
1307 if (arg_selinux_apifs_context)
1308 (void) asprintf(&options,
1309 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1310 arg_uid_shift + TTY_GID,
1311 arg_selinux_apifs_context);
1312 else
1313 #endif
1314 (void) asprintf(&options,
1315 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1316 arg_uid_shift + TTY_GID);
1317
1318 if (!options)
1319 return log_oom();
1320
1321 /* Mount /dev/pts itself */
1322 p = prefix_roota(dest, "/dev/pts");
1323 if (mkdir(p, 0755) < 0)
1324 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1325 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1326 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1327 r = userns_lchown(p, 0, 0);
1328 if (r < 0)
1329 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1330
1331 /* Create /dev/ptmx symlink */
1332 p = prefix_roota(dest, "/dev/ptmx");
1333 if (symlink("pts/ptmx", p) < 0)
1334 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1335 r = userns_lchown(p, 0, 0);
1336 if (r < 0)
1337 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1338
1339 /* And fix /dev/pts/ptmx ownership */
1340 p = prefix_roota(dest, "/dev/pts/ptmx");
1341 r = userns_lchown(p, 0, 0);
1342 if (r < 0)
1343 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1344
1345 return 0;
1346 }
1347
1348 static int setup_dev_console(const char *dest, const char *console) {
1349 _cleanup_umask_ mode_t u;
1350 const char *to;
1351 int r;
1352
1353 assert(dest);
1354 assert(console);
1355
1356 u = umask(0000);
1357
1358 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1359 if (r < 0)
1360 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1361
1362 /* We need to bind mount the right tty to /dev/console since
1363 * ptys can only exist on pts file systems. To have something
1364 * to bind mount things on we create a empty regular file. */
1365
1366 to = prefix_roota(dest, "/dev/console");
1367 r = touch(to);
1368 if (r < 0)
1369 return log_error_errno(r, "touch() for /dev/console failed: %m");
1370
1371 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1372 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1373
1374 return 0;
1375 }
1376
1377 static int setup_kmsg(const char *dest, int kmsg_socket) {
1378 const char *from, *to;
1379 _cleanup_umask_ mode_t u;
1380 int fd, r;
1381
1382 assert(kmsg_socket >= 0);
1383
1384 u = umask(0000);
1385
1386 /* We create the kmsg FIFO as /run/kmsg, but immediately
1387 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1388 * on the reading side behave very similar to /proc/kmsg,
1389 * their writing side behaves differently from /dev/kmsg in
1390 * that writing blocks when nothing is reading. In order to
1391 * avoid any problems with containers deadlocking due to this
1392 * we simply make /dev/kmsg unavailable to the container. */
1393 from = prefix_roota(dest, "/run/kmsg");
1394 to = prefix_roota(dest, "/proc/kmsg");
1395
1396 if (mkfifo(from, 0600) < 0)
1397 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1398 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1399 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1400
1401 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1402 if (fd < 0)
1403 return log_error_errno(errno, "Failed to open fifo: %m");
1404
1405 /* Store away the fd in the socket, so that it stays open as
1406 * long as we run the child */
1407 r = send_one_fd(kmsg_socket, fd, 0);
1408 safe_close(fd);
1409
1410 if (r < 0)
1411 return log_error_errno(r, "Failed to send FIFO fd: %m");
1412
1413 /* And now make the FIFO unavailable as /run/kmsg... */
1414 (void) unlink(from);
1415
1416 return 0;
1417 }
1418
1419 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1420 union in_addr_union *exposed = userdata;
1421
1422 assert(rtnl);
1423 assert(m);
1424 assert(exposed);
1425
1426 expose_port_execute(rtnl, arg_expose_ports, exposed);
1427 return 0;
1428 }
1429
1430 static int setup_hostname(void) {
1431
1432 if (arg_share_system)
1433 return 0;
1434
1435 if (sethostname_idempotent(arg_machine) < 0)
1436 return -errno;
1437
1438 return 0;
1439 }
1440
1441 static int setup_journal(const char *directory) {
1442 sd_id128_t this_id;
1443 _cleanup_free_ char *d = NULL;
1444 const char *p, *q;
1445 bool try;
1446 char id[33];
1447 int r;
1448
1449 /* Don't link journals in ephemeral mode */
1450 if (arg_ephemeral)
1451 return 0;
1452
1453 if (arg_link_journal == LINK_NO)
1454 return 0;
1455
1456 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1457
1458 r = sd_id128_get_machine(&this_id);
1459 if (r < 0)
1460 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1461
1462 if (sd_id128_equal(arg_uuid, this_id)) {
1463 log_full(try ? LOG_WARNING : LOG_ERR,
1464 "Host and machine ids are equal (%s): refusing to link journals", id);
1465 if (try)
1466 return 0;
1467 return -EEXIST;
1468 }
1469
1470 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1471 if (r < 0)
1472 return log_error_errno(r, "Failed to create /var: %m");
1473
1474 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1475 if (r < 0)
1476 return log_error_errno(r, "Failed to create /var/log: %m");
1477
1478 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1479 if (r < 0)
1480 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1481
1482 (void) sd_id128_to_string(arg_uuid, id);
1483
1484 p = strjoina("/var/log/journal/", id);
1485 q = prefix_roota(directory, p);
1486
1487 if (path_is_mount_point(p, 0) > 0) {
1488 if (try)
1489 return 0;
1490
1491 log_error("%s: already a mount point, refusing to use for journal", p);
1492 return -EEXIST;
1493 }
1494
1495 if (path_is_mount_point(q, 0) > 0) {
1496 if (try)
1497 return 0;
1498
1499 log_error("%s: already a mount point, refusing to use for journal", q);
1500 return -EEXIST;
1501 }
1502
1503 r = readlink_and_make_absolute(p, &d);
1504 if (r >= 0) {
1505 if ((arg_link_journal == LINK_GUEST ||
1506 arg_link_journal == LINK_AUTO) &&
1507 path_equal(d, q)) {
1508
1509 r = userns_mkdir(directory, p, 0755, 0, 0);
1510 if (r < 0)
1511 log_warning_errno(r, "Failed to create directory %s: %m", q);
1512 return 0;
1513 }
1514
1515 if (unlink(p) < 0)
1516 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1517 } else if (r == -EINVAL) {
1518
1519 if (arg_link_journal == LINK_GUEST &&
1520 rmdir(p) < 0) {
1521
1522 if (errno == ENOTDIR) {
1523 log_error("%s already exists and is neither a symlink nor a directory", p);
1524 return r;
1525 } else
1526 return log_error_errno(errno, "Failed to remove %s: %m", p);
1527 }
1528 } else if (r != -ENOENT)
1529 return log_error_errno(r, "readlink(%s) failed: %m", p);
1530
1531 if (arg_link_journal == LINK_GUEST) {
1532
1533 if (symlink(q, p) < 0) {
1534 if (try) {
1535 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1536 return 0;
1537 } else
1538 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1539 }
1540
1541 r = userns_mkdir(directory, p, 0755, 0, 0);
1542 if (r < 0)
1543 log_warning_errno(r, "Failed to create directory %s: %m", q);
1544 return 0;
1545 }
1546
1547 if (arg_link_journal == LINK_HOST) {
1548 /* don't create parents here — if the host doesn't have
1549 * permanent journal set up, don't force it here */
1550
1551 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
1552 if (try) {
1553 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1554 return 0;
1555 } else
1556 return log_error_errno(errno, "Failed to create %s: %m", p);
1557 }
1558
1559 } else if (access(p, F_OK) < 0)
1560 return 0;
1561
1562 if (dir_is_empty(q) == 0)
1563 log_warning("%s is not empty, proceeding anyway.", q);
1564
1565 r = userns_mkdir(directory, p, 0755, 0, 0);
1566 if (r < 0)
1567 return log_error_errno(r, "Failed to create %s: %m", q);
1568
1569 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1570 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1571
1572 return 0;
1573 }
1574
1575 static int drop_capabilities(void) {
1576 return capability_bounding_set_drop(arg_retain, false);
1577 }
1578
1579 static int reset_audit_loginuid(void) {
1580 _cleanup_free_ char *p = NULL;
1581 int r;
1582
1583 if (arg_share_system)
1584 return 0;
1585
1586 r = read_one_line_file("/proc/self/loginuid", &p);
1587 if (r == -ENOENT)
1588 return 0;
1589 if (r < 0)
1590 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1591
1592 /* Already reset? */
1593 if (streq(p, "4294967295"))
1594 return 0;
1595
1596 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1597 if (r < 0) {
1598 log_error_errno(r,
1599 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1600 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1601 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1602 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1603 "using systemd-nspawn. Sleeping for 5s... (%m)");
1604
1605 sleep(5);
1606 }
1607
1608 return 0;
1609 }
1610
/* Builds and loads a seccomp filter for the calling process.
 *
 * The filter allows everything by default. Each system call in the blacklist
 * is made to fail with EPERM unless the capability listed next to it is part
 * of arg_retain. In addition, creating NETLINK_AUDIT sockets is made to fail
 * with EAFNOSUPPORT.
 *
 * Returns 0 on success, when seccomp_load() reports -EINVAL, or when built
 * without seccomp support; otherwise a negative errno-style value after
 * logging. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                uint64_t bit = 1ULL << blacklist[k].capability;

                /* The capability is retained, so the system call stays available */
                if ((arg_retain & bit) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers, and much of the userspace
         * audit hookup fails when run inside one. Rather than caring about
         * that, creation of audit sockets is simply turned off:
         * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with
         * EAFNOSUPPORT, which audit userspace takes as the sign that audit
         * is disabled in the kernel. */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto finish;
        }
        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1705
1706 static int setup_propagate(const char *root) {
1707 const char *p, *q;
1708 int r;
1709
1710 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1711 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1712 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1713 (void) mkdir_p(p, 0600);
1714
1715 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1716 if (r < 0)
1717 return log_error_errno(r, "Failed to create /run/systemd: %m");
1718
1719 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1720 if (r < 0)
1721 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1722
1723 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1724 if (r < 0)
1725 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1726
1727 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1728 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1729 return log_error_errno(errno, "Failed to install propagation bind mount.");
1730
1731 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1732 return log_error_errno(errno, "Failed to make propagation mount read-only");
1733
1734 return 0;
1735 }
1736
1737 static int setup_image(char **device_path, int *loop_nr) {
1738 struct loop_info64 info = {
1739 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1740 };
1741 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1742 _cleanup_free_ char* loopdev = NULL;
1743 struct stat st;
1744 int r, nr;
1745
1746 assert(device_path);
1747 assert(loop_nr);
1748 assert(arg_image);
1749
1750 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1751 if (fd < 0)
1752 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1753
1754 if (fstat(fd, &st) < 0)
1755 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1756
1757 if (S_ISBLK(st.st_mode)) {
1758 char *p;
1759
1760 p = strdup(arg_image);
1761 if (!p)
1762 return log_oom();
1763
1764 *device_path = p;
1765
1766 *loop_nr = -1;
1767
1768 r = fd;
1769 fd = -1;
1770
1771 return r;
1772 }
1773
1774 if (!S_ISREG(st.st_mode)) {
1775 log_error("%s is not a regular file or block device.", arg_image);
1776 return -EINVAL;
1777 }
1778
1779 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1780 if (control < 0)
1781 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1782
1783 nr = ioctl(control, LOOP_CTL_GET_FREE);
1784 if (nr < 0)
1785 return log_error_errno(errno, "Failed to allocate loop device: %m");
1786
1787 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1788 return log_oom();
1789
1790 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1791 if (loop < 0)
1792 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1793
1794 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1795 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1796
1797 if (arg_read_only)
1798 info.lo_flags |= LO_FLAGS_READ_ONLY;
1799
1800 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1801 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1802
1803 *device_path = loopdev;
1804 loopdev = NULL;
1805
1806 *loop_nr = nr;
1807
1808 r = loop;
1809 loop = -1;
1810
1811 return r;
1812 }
1813
/* Explanatory text appended to the error messages about unusable partition
 * tables. It is concatenated directly after a format string, hence no
 * leading newline and no trailing one. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1820
/* Inspects the partition table of the block device open as "fd" and picks
 * the partitions to use for the root, /home and /srv file systems.
 *
 * On GPT disks partitions are selected by type UUID (home, srv, native root,
 * secondary root, generic Linux); partitions flagged GPT_FLAG_NO_AUTO are
 * ignored, and for each typed role the partition with the lowest number
 * wins. On MBR disks only partitions of type 0x83 whose flags equal 0x80
 * (bootable) are considered, and they are treated as generic partitions.
 *
 * The root device is chosen in this order: native root, secondary root, then
 * a generic partition -- but only if exactly one generic candidate exists.
 *
 * On success *root_device (and *home_device / *srv_device if such partitions
 * were found) receive newly allocated device node paths owned by the caller,
 * the matching *_rw flags tell whether the partition may be mounted
 * writable, and *secondary tells whether the secondary root was picked.
 *
 * Returns 0 on success, a negative errno-style value after logging on
 * failure, and -EOPNOTSUPP when built without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has created as many partition devices as blkid found */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The comparison
                 * is against m + 1, i.e. the enumeration is expected to
                 * yield one entry more than blkid lists partitions. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized so that the cleanup handler never sees an indeterminate pointer */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Of several candidates the lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the candidate in "generic" (not in
                                 * "root"), so that a second bootable Linux
                                 * partition is noticed by the check above
                                 * and rejected below. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2190
/* Mounts the block device "what" on "where", or on "where" with "directory"
 * appended if "directory" is non-NULL.
 *
 * The file system type is probed with blkid. The mount is done with
 * MS_NODEV, and read-only unless "rw" is true and arg_read_only is unset.
 * LUKS containers are refused.
 *
 * Returns 0 on success, a negative errno-style value after logging on
 * failure, and -EOPNOTSUPP when built without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Per the libblkid documentation: 1 means nothing was
                 * detected, -2 means the result is ambivalent. Either way
                 * the type is unknown. A real probing error (-1) is handled
                 * by the branch below, which reports errno. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2252
2253 static int setup_machine_id(const char *directory) {
2254 int r;
2255 const char *etc_machine_id, *t;
2256 _cleanup_free_ char *s = NULL;
2257
2258 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2259
2260 r = read_one_line_file(etc_machine_id, &s);
2261 if (r < 0)
2262 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2263
2264 t = strstrip(s);
2265
2266 if (!isempty(t)) {
2267 r = sd_id128_from_string(t, &arg_uuid);
2268 if (r < 0)
2269 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2270 } else {
2271 if (sd_id128_is_null(arg_uuid)) {
2272 r = sd_id128_randomize(&arg_uuid);
2273 if (r < 0)
2274 return log_error_errno(r, "Failed to generate random machine ID: %m");
2275 }
2276 }
2277
2278 r = machine_id_setup(directory, arg_uuid);
2279 if (r < 0)
2280 return log_error_errno(r, "Failed to setup machine ID: %m");
2281
2282 return 0;
2283 }
2284
2285 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2286 int r;
2287
2288 assert(directory);
2289
2290 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
2291 return 0;
2292
2293 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2294 if (r == -EOPNOTSUPP)
2295 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2296 if (r == -EBADE)
2297 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2298 if (r < 0)
2299 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2300 if (r == 0)
2301 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2302 else
2303 log_debug("Patched directory tree to match UID/GID range.");
2304
2305 return r;
2306 }
2307
2308 static int mount_devices(
2309 const char *where,
2310 const char *root_device, bool root_device_rw,
2311 const char *home_device, bool home_device_rw,
2312 const char *srv_device, bool srv_device_rw) {
2313 int r;
2314
2315 assert(where);
2316
2317 if (root_device) {
2318 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2319 if (r < 0)
2320 return log_error_errno(r, "Failed to mount root directory: %m");
2321 }
2322
2323 if (home_device) {
2324 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2325 if (r < 0)
2326 return log_error_errno(r, "Failed to mount home directory: %m");
2327 }
2328
2329 if (srv_device) {
2330 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2331 if (r < 0)
2332 return log_error_errno(r, "Failed to mount server data directory: %m");
2333 }
2334
2335 return 0;
2336 }
2337
2338 static void loop_remove(int nr, int *image_fd) {
2339 _cleanup_close_ int control = -1;
2340 int r;
2341
2342 if (nr < 0)
2343 return;
2344
2345 if (image_fd && *image_fd >= 0) {
2346 r = ioctl(*image_fd, LOOP_CLR_FD);
2347 if (r < 0)
2348 log_debug_errno(errno, "Failed to close loop image: %m");
2349 *image_fd = safe_close(*image_fd);
2350 }
2351
2352 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2353 if (control < 0) {
2354 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2355 return;
2356 }
2357
2358 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2359 if (r < 0)
2360 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2361 }
2362
2363 /*
2364 * Return values:
2365 * < 0 : wait_for_terminate() failed to get the state of the
2366 * container, the container was terminated by a signal, or
2367 * failed for an unknown reason. No change is made to the
2368 * container argument.
2369 * > 0 : The program executed in the container terminated with an
2370 * error. The exit code of the program executed in the
2371 * container is returned. The container argument has been set
2372 * to CONTAINER_TERMINATED.
2373 * 0 : The container is being rebooted, has been shut down or exited
2374 * successfully. The container argument has been set to either
2375 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2376 *
2377 * That is, success is indicated by a return value of zero, and an
2378 * error is indicated by a non-zero value.
2379 */
2380 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2381 siginfo_t status;
2382 int r;
2383
2384 r = wait_for_terminate(pid, &status);
2385 if (r < 0)
2386 return log_warning_errno(r, "Failed to wait for container: %m");
2387
2388 switch (status.si_code) {
2389
2390 case CLD_EXITED:
2391 if (status.si_status == 0) {
2392 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2393
2394 } else
2395 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2396
2397 *container = CONTAINER_TERMINATED;
2398 return status.si_status;
2399
2400 case CLD_KILLED:
2401 if (status.si_status == SIGINT) {
2402
2403 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2404 *container = CONTAINER_TERMINATED;
2405 return 0;
2406
2407 } else if (status.si_status == SIGHUP) {
2408
2409 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2410 *container = CONTAINER_REBOOTED;
2411 return 0;
2412 }
2413
2414 /* CLD_KILLED fallthrough */
2415
2416 case CLD_DUMPED:
2417 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2418 return -EIO;
2419
2420 default:
2421 log_error("Container %s failed due to unknown reason.", arg_machine);
2422 return -EIO;
2423 }
2424
2425 return r;
2426 }
2427
2428 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2429 pid_t pid;
2430
2431 pid = PTR_TO_PID(userdata);
2432 if (pid > 0) {
2433 if (kill(pid, arg_kill_signal) >= 0) {
2434 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2435 sd_event_source_set_userdata(s, NULL);
2436 return 0;
2437 }
2438 }
2439
2440 sd_event_exit(sd_event_source_get_event(s), 0);
2441 return 0;
2442 }
2443
2444 static int determine_names(void) {
2445 int r;
2446
2447 if (arg_template && !arg_directory && arg_machine) {
2448
2449 /* If --template= was specified then we should not
2450 * search for a machine, but instead create a new one
2451 * in /var/lib/machine. */
2452
2453 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2454 if (!arg_directory)
2455 return log_oom();
2456 }
2457
2458 if (!arg_image && !arg_directory) {
2459 if (arg_machine) {
2460 _cleanup_(image_unrefp) Image *i = NULL;
2461
2462 r = image_find(arg_machine, &i);
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2465 else if (r == 0) {
2466 log_error("No image for machine '%s': %m", arg_machine);
2467 return -ENOENT;
2468 }
2469
2470 if (i->type == IMAGE_RAW)
2471 r = free_and_strdup(&arg_image, i->path);
2472 else
2473 r = free_and_strdup(&arg_directory, i->path);
2474 if (r < 0)
2475 return log_error_errno(r, "Invalid image directory: %m");
2476
2477 if (!arg_ephemeral)
2478 arg_read_only = arg_read_only || i->read_only;
2479 } else
2480 arg_directory = get_current_dir_name();
2481
2482 if (!arg_directory && !arg_machine) {
2483 log_error("Failed to determine path, please use -D or -i.");
2484 return -EINVAL;
2485 }
2486 }
2487
2488 if (!arg_machine) {
2489 if (arg_directory && path_equal(arg_directory, "/"))
2490 arg_machine = gethostname_malloc();
2491 else
2492 arg_machine = strdup(basename(arg_image ?: arg_directory));
2493
2494 if (!arg_machine)
2495 return log_oom();
2496
2497 hostname_cleanup(arg_machine);
2498 if (!machine_name_is_valid(arg_machine)) {
2499 log_error("Failed to determine machine name automatically, please use -M.");
2500 return -EINVAL;
2501 }
2502
2503 if (arg_ephemeral) {
2504 char *b;
2505
2506 /* Add a random suffix when this is an
2507 * ephemeral machine, so that we can run many
2508 * instances at once without manually having
2509 * to specify -M each time. */
2510
2511 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2512 return log_oom();
2513
2514 free(arg_machine);
2515 arg_machine = b;
2516 }
2517 }
2518
2519 return 0;
2520 }
2521
2522 static int determine_uid_shift(const char *directory) {
2523 int r;
2524
2525 if (arg_userns_mode == USER_NAMESPACE_NO) {
2526 arg_uid_shift = 0;
2527 return 0;
2528 }
2529
2530 if (arg_uid_shift == UID_INVALID) {
2531 struct stat st;
2532
2533 r = stat(directory, &st);
2534 if (r < 0)
2535 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2536
2537 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2538
2539 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2540 log_error("UID and GID base of %s don't match.", directory);
2541 return -EINVAL;
2542 }
2543
2544 arg_uid_range = UINT32_C(0x10000);
2545 }
2546
2547 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2548 log_error("UID base too high for UID range.");
2549 return -EINVAL;
2550 }
2551
2552 return 0;
2553 }
2554
2555 static int inner_child(
2556 Barrier *barrier,
2557 const char *directory,
2558 bool secondary,
2559 int kmsg_socket,
2560 int rtnl_socket,
2561 FDSet *fds) {
2562
2563 _cleanup_free_ char *home = NULL;
2564 char as_uuid[37];
2565 unsigned n_env = 1;
2566 const char *envp[] = {
2567 "PATH=" DEFAULT_PATH_SPLIT_USR,
2568 NULL, /* container */
2569 NULL, /* TERM */
2570 NULL, /* HOME */
2571 NULL, /* USER */
2572 NULL, /* LOGNAME */
2573 NULL, /* container_uuid */
2574 NULL, /* LISTEN_FDS */
2575 NULL, /* LISTEN_PID */
2576 NULL
2577 };
2578
2579 _cleanup_strv_free_ char **env_use = NULL;
2580 int r;
2581
2582 assert(barrier);
2583 assert(directory);
2584 assert(kmsg_socket >= 0);
2585
2586 cg_unified_flush();
2587
2588 if (arg_userns_mode != USER_NAMESPACE_NO) {
2589 /* Tell the parent, that it now can write the UID map. */
2590 (void) barrier_place(barrier); /* #1 */
2591
2592 /* Wait until the parent wrote the UID map */
2593 if (!barrier_place_and_sync(barrier)) { /* #2 */
2594 log_error("Parent died too early");
2595 return -ESRCH;
2596 }
2597 }
2598
2599 r = mount_all(NULL,
2600 arg_userns_mode != USER_NAMESPACE_NO,
2601 true,
2602 arg_private_network,
2603 arg_uid_shift,
2604 arg_uid_range,
2605 arg_selinux_apifs_context);
2606
2607 if (r < 0)
2608 return r;
2609
2610 r = mount_sysfs(NULL);
2611 if (r < 0)
2612 return r;
2613
2614 /* Wait until we are cgroup-ified, so that we
2615 * can mount the right cgroup path writable */
2616 if (!barrier_place_and_sync(barrier)) { /* #3 */
2617 log_error("Parent died too early");
2618 return -ESRCH;
2619 }
2620
2621 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2622 if (r < 0)
2623 return r;
2624
2625 r = reset_uid_gid();
2626 if (r < 0)
2627 return log_error_errno(r, "Couldn't become new root: %m");
2628
2629 r = setup_boot_id(NULL);
2630 if (r < 0)
2631 return r;
2632
2633 r = setup_kmsg(NULL, kmsg_socket);
2634 if (r < 0)
2635 return r;
2636 kmsg_socket = safe_close(kmsg_socket);
2637
2638 umask(0022);
2639
2640 if (setsid() < 0)
2641 return log_error_errno(errno, "setsid() failed: %m");
2642
2643 if (arg_private_network)
2644 loopback_setup();
2645
2646 if (arg_expose_ports) {
2647 r = expose_port_send_rtnl(rtnl_socket);
2648 if (r < 0)
2649 return r;
2650 rtnl_socket = safe_close(rtnl_socket);
2651 }
2652
2653 r = drop_capabilities();
2654 if (r < 0)
2655 return log_error_errno(r, "drop_capabilities() failed: %m");
2656
2657 setup_hostname();
2658
2659 if (arg_personality != PERSONALITY_INVALID) {
2660 if (personality(arg_personality) < 0)
2661 return log_error_errno(errno, "personality() failed: %m");
2662 } else if (secondary) {
2663 if (personality(PER_LINUX32) < 0)
2664 return log_error_errno(errno, "personality() failed: %m");
2665 }
2666
2667 #ifdef HAVE_SELINUX
2668 if (arg_selinux_context)
2669 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2670 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2671 #endif
2672
2673 r = change_uid_gid(arg_user, &home);
2674 if (r < 0)
2675 return r;
2676
2677 /* LXC sets container=lxc, so follow the scheme here */
2678 envp[n_env++] = strjoina("container=", arg_container_service_name);
2679
2680 envp[n_env] = strv_find_prefix(environ, "TERM=");
2681 if (envp[n_env])
2682 n_env++;
2683
2684 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2685 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2686 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2687 return log_oom();
2688
2689 assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL));
2690
2691 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2692 return log_oom();
2693
2694 if (fdset_size(fds) > 0) {
2695 r = fdset_cloexec(fds, false);
2696 if (r < 0)
2697 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2698
2699 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2700 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2701 return log_oom();
2702 }
2703
2704 env_use = strv_env_merge(2, envp, arg_setenv);
2705 if (!env_use)
2706 return log_oom();
2707
2708 /* Let the parent know that we are ready and
2709 * wait until the parent is ready with the
2710 * setup, too... */
2711 if (!barrier_place_and_sync(barrier)) { /* #4 */
2712 log_error("Parent died too early");
2713 return -ESRCH;
2714 }
2715
2716 if (arg_chdir)
2717 if (chdir(arg_chdir) < 0)
2718 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2719
2720 if (arg_start_mode == START_PID2) {
2721 r = stub_pid1();
2722 if (r < 0)
2723 return r;
2724 }
2725
2726 /* Now, explicitly close the log, so that we
2727 * then can close all remaining fds. Closing
2728 * the log explicitly first has the benefit
2729 * that the logging subsystem knows about it,
2730 * and is thus ready to be reopened should we
2731 * need it again. Note that the other fds
2732 * closed here are at least the locking and
2733 * barrier fds. */
2734 log_close();
2735 (void) fdset_close_others(fds);
2736
2737 if (arg_start_mode == START_BOOT) {
2738 char **a;
2739 size_t m;
2740
2741 /* Automatically search for the init system */
2742
2743 m = strv_length(arg_parameters);
2744 a = newa(char*, m + 2);
2745 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2746 a[1 + m] = NULL;
2747
2748 a[0] = (char*) "/usr/lib/systemd/systemd";
2749 execve(a[0], a, env_use);
2750
2751 a[0] = (char*) "/lib/systemd/systemd";
2752 execve(a[0], a, env_use);
2753
2754 a[0] = (char*) "/sbin/init";
2755 execve(a[0], a, env_use);
2756 } else if (!strv_isempty(arg_parameters))
2757 execvpe(arg_parameters[0], arg_parameters, env_use);
2758 else {
2759 if (!arg_chdir)
2760 /* If we cannot change the directory, we'll end up in /, that is expected. */
2761 (void) chdir(home ?: "/root");
2762
2763 execle("/bin/bash", "-bash", NULL, env_use);
2764 execle("/bin/sh", "-sh", NULL, env_use);
2765 }
2766
2767 r = -errno;
2768 (void) log_open();
2769 return log_error_errno(r, "execv() failed: %m");
2770 }
2771
2772 static int outer_child(
2773 Barrier *barrier,
2774 const char *directory,
2775 const char *console,
2776 const char *root_device, bool root_device_rw,
2777 const char *home_device, bool home_device_rw,
2778 const char *srv_device, bool srv_device_rw,
2779 bool interactive,
2780 bool secondary,
2781 int pid_socket,
2782 int uuid_socket,
2783 int kmsg_socket,
2784 int rtnl_socket,
2785 int uid_shift_socket,
2786 FDSet *fds) {
2787
2788 pid_t pid;
2789 ssize_t l;
2790 int r;
2791
2792 assert(barrier);
2793 assert(directory);
2794 assert(console);
2795 assert(pid_socket >= 0);
2796 assert(uuid_socket >= 0);
2797 assert(kmsg_socket >= 0);
2798
2799 cg_unified_flush();
2800
2801 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2802 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2803
2804 if (interactive) {
2805 close_nointr(STDIN_FILENO);
2806 close_nointr(STDOUT_FILENO);
2807 close_nointr(STDERR_FILENO);
2808
2809 r = open_terminal(console, O_RDWR);
2810 if (r != STDIN_FILENO) {
2811 if (r >= 0) {
2812 safe_close(r);
2813 r = -EINVAL;
2814 }
2815
2816 return log_error_errno(r, "Failed to open console: %m");
2817 }
2818
2819 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2820 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2821 return log_error_errno(errno, "Failed to duplicate console: %m");
2822 }
2823
2824 r = reset_audit_loginuid();
2825 if (r < 0)
2826 return r;
2827
2828 /* Mark everything as slave, so that we still
2829 * receive mounts from the real root, but don't
2830 * propagate mounts to the real root. */
2831 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2832 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2833
2834 r = mount_devices(directory,
2835 root_device, root_device_rw,
2836 home_device, home_device_rw,
2837 srv_device, srv_device_rw);
2838 if (r < 0)
2839 return r;
2840
2841 r = determine_uid_shift(directory);
2842 if (r < 0)
2843 return r;
2844
2845 if (arg_userns_mode != USER_NAMESPACE_NO) {
2846 /* Let the parent know which UID shift we read from the image */
2847 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2848 if (l < 0)
2849 return log_error_errno(errno, "Failed to send UID shift: %m");
2850 if (l != sizeof(arg_uid_shift)) {
2851 log_error("Short write while sending UID shift.");
2852 return -EIO;
2853 }
2854
2855 if (arg_userns_mode == USER_NAMESPACE_PICK) {
2856 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2857 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2858 * not it will pick a different one, and send it back to us. */
2859
2860 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2861 if (l < 0)
2862 return log_error_errno(errno, "Failed to recv UID shift: %m");
2863 if (l != sizeof(arg_uid_shift)) {
2864 log_error("Short read while recieving UID shift.");
2865 return -EIO;
2866 }
2867 }
2868
2869 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2870 }
2871
2872 /* Turn directory into bind mount */
2873 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2874 return log_error_errno(errno, "Failed to make bind mount: %m");
2875
2876 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2877 if (r < 0)
2878 return r;
2879
2880 r = setup_volatile(
2881 directory,
2882 arg_volatile_mode,
2883 arg_userns_mode != USER_NAMESPACE_NO,
2884 arg_uid_shift,
2885 arg_uid_range,
2886 arg_selinux_context);
2887 if (r < 0)
2888 return r;
2889
2890 r = setup_volatile_state(
2891 directory,
2892 arg_volatile_mode,
2893 arg_userns_mode != USER_NAMESPACE_NO,
2894 arg_uid_shift,
2895 arg_uid_range,
2896 arg_selinux_context);
2897 if (r < 0)
2898 return r;
2899
2900 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2901 if (r < 0)
2902 return r;
2903
2904 if (arg_read_only) {
2905 r = bind_remount_recursive(directory, true);
2906 if (r < 0)
2907 return log_error_errno(r, "Failed to make tree read-only: %m");
2908 }
2909
2910 r = mount_all(directory,
2911 arg_userns_mode != USER_NAMESPACE_NO,
2912 false,
2913 arg_private_network,
2914 arg_uid_shift,
2915 arg_uid_range,
2916 arg_selinux_apifs_context);
2917 if (r < 0)
2918 return r;
2919
2920 r = copy_devnodes(directory);
2921 if (r < 0)
2922 return r;
2923
2924 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2925
2926 r = setup_pts(directory);
2927 if (r < 0)
2928 return r;
2929
2930 r = setup_propagate(directory);
2931 if (r < 0)
2932 return r;
2933
2934 r = setup_dev_console(directory, console);
2935 if (r < 0)
2936 return r;
2937
2938 r = setup_seccomp();
2939 if (r < 0)
2940 return r;
2941
2942 r = setup_timezone(directory);
2943 if (r < 0)
2944 return r;
2945
2946 r = setup_resolv_conf(directory);
2947 if (r < 0)
2948 return r;
2949
2950 r = setup_machine_id(directory);
2951 if (r < 0)
2952 return r;
2953
2954 r = setup_journal(directory);
2955 if (r < 0)
2956 return r;
2957
2958 r = mount_custom(
2959 directory,
2960 arg_custom_mounts,
2961 arg_n_custom_mounts,
2962 arg_userns_mode != USER_NAMESPACE_NO,
2963 arg_uid_shift,
2964 arg_uid_range,
2965 arg_selinux_apifs_context);
2966 if (r < 0)
2967 return r;
2968
2969 r = mount_cgroups(
2970 directory,
2971 arg_unified_cgroup_hierarchy,
2972 arg_userns_mode != USER_NAMESPACE_NO,
2973 arg_uid_shift,
2974 arg_uid_range,
2975 arg_selinux_apifs_context);
2976 if (r < 0)
2977 return r;
2978
2979 r = mount_move_root(directory);
2980 if (r < 0)
2981 return log_error_errno(r, "Failed to move root directory: %m");
2982
2983 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2984 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2985 (arg_private_network ? CLONE_NEWNET : 0) |
2986 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0),
2987 NULL);
2988 if (pid < 0)
2989 return log_error_errno(errno, "Failed to fork inner child: %m");
2990 if (pid == 0) {
2991 pid_socket = safe_close(pid_socket);
2992 uuid_socket = safe_close(uuid_socket);
2993 uid_shift_socket = safe_close(uid_shift_socket);
2994
2995 /* The inner child has all namespaces that are
2996 * requested, so that we all are owned by the user if
2997 * user namespaces are turned on. */
2998
2999 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
3000 if (r < 0)
3001 _exit(EXIT_FAILURE);
3002
3003 _exit(EXIT_SUCCESS);
3004 }
3005
3006 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3007 if (l < 0)
3008 return log_error_errno(errno, "Failed to send PID: %m");
3009 if (l != sizeof(pid)) {
3010 log_error("Short write while sending PID.");
3011 return -EIO;
3012 }
3013
3014 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3015 if (l < 0)
3016 return log_error_errno(errno, "Failed to send machine ID: %m");
3017 if (l != sizeof(arg_uuid)) {
3018 log_error("Short write while sending machine ID.");
3019 return -EIO;
3020 }
3021
3022 pid_socket = safe_close(pid_socket);
3023 uuid_socket = safe_close(uuid_socket);
3024 kmsg_socket = safe_close(kmsg_socket);
3025 rtnl_socket = safe_close(rtnl_socket);
3026
3027 return 0;
3028 }
3029
3030 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3031 unsigned n_tries = 100;
3032 uid_t candidate;
3033 int r;
3034
3035 assert(shift);
3036 assert(ret_lock_file);
3037 assert(arg_userns_mode == USER_NAMESPACE_PICK);
3038 assert(arg_uid_range == 0x10000U);
3039
3040 candidate = *shift;
3041
3042 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3043
3044 for (;;) {
3045 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3046 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3047
3048 if (--n_tries <= 0)
3049 return -EBUSY;
3050
3051 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3052 goto next;
3053 if ((candidate & UINT32_C(0xFFFF)) != 0)
3054 goto next;
3055
3056 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3057 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3058 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3059 goto next;
3060 if (r < 0)
3061 return r;
3062
3063 /* Make some superficial checks whether the range is currently known in the user database */
3064 if (getpwuid(candidate))
3065 goto next;
3066 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3067 goto next;
3068 if (getgrgid(candidate))
3069 goto next;
3070 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3071 goto next;
3072
3073 *ret_lock_file = lf;
3074 lf = (struct LockFile) LOCK_FILE_INIT;
3075 *shift = candidate;
3076 return 0;
3077
3078 next:
3079 random_bytes(&candidate, sizeof(candidate));
3080 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3081 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3082 }
3083 }
3084
3085 static int setup_uid_map(pid_t pid) {
3086 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3087 int r;
3088
3089 assert(pid > 1);
3090
3091 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3092 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3093 r = write_string_file(uid_map, line, 0);
3094 if (r < 0)
3095 return log_error_errno(r, "Failed to write UID map: %m");
3096
3097 /* We always assign the same UID and GID ranges */
3098 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3099 r = write_string_file(uid_map, line, 0);
3100 if (r < 0)
3101 return log_error_errno(r, "Failed to write GID map: %m");
3102
3103 return 0;
3104 }
3105
3106 static int load_settings(void) {
3107 _cleanup_(settings_freep) Settings *settings = NULL;
3108 _cleanup_fclose_ FILE *f = NULL;
3109 _cleanup_free_ char *p = NULL;
3110 const char *fn, *i;
3111 int r;
3112
3113 /* If all settings are masked, there's no point in looking for
3114 * the settings file */
3115 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3116 return 0;
3117
3118 fn = strjoina(arg_machine, ".nspawn");
3119
3120 /* We first look in the admin's directories in /etc and /run */
3121 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3122 _cleanup_free_ char *j = NULL;
3123
3124 j = strjoin(i, "/", fn, NULL);
3125 if (!j)
3126 return log_oom();
3127
3128 f = fopen(j, "re");
3129 if (f) {
3130 p = j;
3131 j = NULL;
3132
3133 /* By default, we trust configuration from /etc and /run */
3134 if (arg_settings_trusted < 0)
3135 arg_settings_trusted = true;
3136
3137 break;
3138 }
3139
3140 if (errno != ENOENT)
3141 return log_error_errno(errno, "Failed to open %s: %m", j);
3142 }
3143
3144 if (!f) {
3145 /* After that, let's look for a file next to the
3146 * actual image we shall boot. */
3147
3148 if (arg_image) {
3149 p = file_in_same_dir(arg_image, fn);
3150 if (!p)
3151 return log_oom();
3152 } else if (arg_directory) {
3153 p = file_in_same_dir(arg_directory, fn);
3154 if (!p)
3155 return log_oom();
3156 }
3157
3158 if (p) {
3159 f = fopen(p, "re");
3160 if (!f && errno != ENOENT)
3161 return log_error_errno(errno, "Failed to open %s: %m", p);
3162
3163 /* By default, we do not trust configuration from /var/lib/machines */
3164 if (arg_settings_trusted < 0)
3165 arg_settings_trusted = false;
3166 }
3167 }
3168
3169 if (!f)
3170 return 0;
3171
3172 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3173
3174 r = settings_load(f, p, &settings);
3175 if (r < 0)
3176 return r;
3177
3178 /* Copy over bits from the settings, unless they have been
3179 * explicitly masked by command line switches. */
3180
3181 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3182 settings->start_mode >= 0) {
3183 arg_start_mode = settings->start_mode;
3184
3185 strv_free(arg_parameters);
3186 arg_parameters = settings->parameters;
3187 settings->parameters = NULL;
3188 }
3189
3190 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3191 settings->working_directory) {
3192 free(arg_chdir);
3193 arg_chdir = settings->working_directory;
3194 settings->working_directory = NULL;
3195 }
3196
3197 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3198 settings->environment) {
3199 strv_free(arg_setenv);
3200 arg_setenv = settings->environment;
3201 settings->environment = NULL;
3202 }
3203
3204 if ((arg_settings_mask & SETTING_USER) == 0 &&
3205 settings->user) {
3206 free(arg_user);
3207 arg_user = settings->user;
3208 settings->user = NULL;
3209 }
3210
3211 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3212 uint64_t plus;
3213
3214 plus = settings->capability;
3215 if (settings_private_network(settings))
3216 plus |= (1ULL << CAP_NET_ADMIN);
3217
3218 if (!arg_settings_trusted && plus != 0) {
3219 if (settings->capability != 0)
3220 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3221 } else
3222 arg_retain |= plus;
3223
3224 arg_retain &= ~settings->drop_capability;
3225 }
3226
3227 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3228 settings->kill_signal > 0)
3229 arg_kill_signal = settings->kill_signal;
3230
3231 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3232 settings->personality != PERSONALITY_INVALID)
3233 arg_personality = settings->personality;
3234
3235 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3236 !sd_id128_is_null(settings->machine_id)) {
3237
3238 if (!arg_settings_trusted)
3239 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3240 else
3241 arg_uuid = settings->machine_id;
3242 }
3243
3244 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3245 settings->read_only >= 0)
3246 arg_read_only = settings->read_only;
3247
3248 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3249 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3250 arg_volatile_mode = settings->volatile_mode;
3251
3252 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3253 settings->n_custom_mounts > 0) {
3254
3255 if (!arg_settings_trusted)
3256 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3257 else {
3258 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3259 arg_custom_mounts = settings->custom_mounts;
3260 arg_n_custom_mounts = settings->n_custom_mounts;
3261
3262 settings->custom_mounts = NULL;
3263 settings->n_custom_mounts = 0;
3264 }
3265 }
3266
3267 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3268 (settings->private_network >= 0 ||
3269 settings->network_veth >= 0 ||
3270 settings->network_bridge ||
3271 settings->network_interfaces ||
3272 settings->network_macvlan ||
3273 settings->network_ipvlan ||
3274 settings->network_veth_extra)) {
3275
3276 if (!arg_settings_trusted)
3277 log_warning("Ignoring network settings, file %s is not trusted.", p);
3278 else {
3279 arg_network_veth = settings_network_veth(settings);
3280 arg_private_network = settings_private_network(settings);
3281
3282 strv_free(arg_network_interfaces);
3283 arg_network_interfaces = settings->network_interfaces;
3284 settings->network_interfaces = NULL;
3285
3286 strv_free(arg_network_macvlan);
3287 arg_network_macvlan = settings->network_macvlan;
3288 settings->network_macvlan = NULL;
3289
3290 strv_free(arg_network_ipvlan);
3291 arg_network_ipvlan = settings->network_ipvlan;
3292 settings->network_ipvlan = NULL;
3293
3294 strv_free(arg_network_veth_extra);
3295 arg_network_veth_extra = settings->network_veth_extra;
3296 settings->network_veth_extra = NULL;
3297
3298 free(arg_network_bridge);
3299 arg_network_bridge = settings->network_bridge;
3300 settings->network_bridge = NULL;
3301 }
3302 }
3303
3304 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3305 settings->expose_ports) {
3306
3307 if (!arg_settings_trusted)
3308 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3309 else {
3310 expose_port_free_all(arg_expose_ports);
3311 arg_expose_ports = settings->expose_ports;
3312 settings->expose_ports = NULL;
3313 }
3314 }
3315
3316 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3317 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3318
3319 if (!arg_settings_trusted)
3320 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3321 else {
3322 arg_userns_mode = settings->userns_mode;
3323 arg_uid_shift = settings->uid_shift;
3324 arg_uid_range = settings->uid_range;
3325 arg_userns_chown = settings->userns_chown;
3326 }
3327 }
3328
3329 return 0;
3330 }
3331
3332 int main(int argc, char *argv[]) {
3333
3334 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3335 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3336 _cleanup_close_ int master = -1, image_fd = -1;
3337 _cleanup_fdset_free_ FDSet *fds = NULL;
3338 int r, n_fd_passed, loop_nr = -1;
3339 char veth_name[IFNAMSIZ];
3340 bool secondary = false, remove_subvol = false;
3341 sigset_t mask_chld;
3342 pid_t pid = 0;
3343 int ret = EXIT_SUCCESS;
3344 union in_addr_union exposed = {};
3345 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3346 bool interactive;
3347
3348 log_parse_environment();
3349 log_open();
3350
3351 /* Make sure rename_process() in the stub init process can work */
3352 saved_argv = argv;
3353 saved_argc = argc;
3354
3355 r = parse_argv(argc, argv);
3356 if (r <= 0)
3357 goto finish;
3358
3359 if (geteuid() != 0) {
3360 log_error("Need to be root.");
3361 r = -EPERM;
3362 goto finish;
3363 }
3364 r = determine_names();
3365 if (r < 0)
3366 goto finish;
3367
3368 r = load_settings();
3369 if (r < 0)
3370 goto finish;
3371
3372 r = verify_arguments();
3373 if (r < 0)
3374 goto finish;
3375
3376 n_fd_passed = sd_listen_fds(false);
3377 if (n_fd_passed > 0) {
3378 r = fdset_new_listen_fds(&fds, false);
3379 if (r < 0) {
3380 log_error_errno(r, "Failed to collect file descriptors: %m");
3381 goto finish;
3382 }
3383 }
3384
3385 if (arg_directory) {
3386 assert(!arg_image);
3387
3388 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3389 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3390 r = -EINVAL;
3391 goto finish;
3392 }
3393
3394 if (arg_ephemeral) {
3395 _cleanup_free_ char *np = NULL;
3396
3397 /* If the specified path is a mount point we
3398 * generate the new snapshot immediately
3399 * inside it under a random name. However if
3400 * the specified is not a mount point we
3401 * create the new snapshot in the parent
3402 * directory, just next to it. */
3403 r = path_is_mount_point(arg_directory, 0);
3404 if (r < 0) {
3405 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3406 goto finish;
3407 }
3408 if (r > 0)
3409 r = tempfn_random_child(arg_directory, "machine.", &np);
3410 else
3411 r = tempfn_random(arg_directory, "machine.", &np);
3412 if (r < 0) {
3413 log_error_errno(r, "Failed to generate name for snapshot: %m");
3414 goto finish;
3415 }
3416
3417 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3418 if (r < 0) {
3419 log_error_errno(r, "Failed to lock %s: %m", np);
3420 goto finish;
3421 }
3422
3423 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3424 if (r < 0) {
3425 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3426 goto finish;
3427 }
3428
3429 free(arg_directory);
3430 arg_directory = np;
3431 np = NULL;
3432
3433 remove_subvol = true;
3434
3435 } else {
3436 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3437 if (r == -EBUSY) {
3438 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3439 goto finish;
3440 }
3441 if (r < 0) {
3442 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3443 return r;
3444 }
3445
3446 if (arg_template) {
3447 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3448 if (r == -EEXIST) {
3449 if (!arg_quiet)
3450 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3451 } else if (r < 0) {
3452 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3453 goto finish;
3454 } else {
3455 if (!arg_quiet)
3456 log_info("Populated %s from template %s.", arg_directory, arg_template);
3457 }
3458 }
3459 }
3460
3461 if (arg_start_mode == START_BOOT) {
3462 if (path_is_os_tree(arg_directory) <= 0) {
3463 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3464 r = -EINVAL;
3465 goto finish;
3466 }
3467 } else {
3468 const char *p;
3469
3470 p = strjoina(arg_directory, "/usr/");
3471 if (laccess(p, F_OK) < 0) {
3472 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3473 r = -EINVAL;
3474 goto finish;
3475 }
3476 }
3477
3478 } else {
3479 char template[] = "/tmp/nspawn-root-XXXXXX";
3480
3481 assert(arg_image);
3482 assert(!arg_template);
3483
3484 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3485 if (r == -EBUSY) {
3486 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3487 goto finish;
3488 }
3489 if (r < 0) {
3490 r = log_error_errno(r, "Failed to create image lock: %m");
3491 goto finish;
3492 }
3493
3494 if (!mkdtemp(template)) {
3495 log_error_errno(errno, "Failed to create temporary directory: %m");
3496 r = -errno;
3497 goto finish;
3498 }
3499
3500 arg_directory = strdup(template);
3501 if (!arg_directory) {
3502 r = log_oom();
3503 goto finish;
3504 }
3505
3506 image_fd = setup_image(&device_path, &loop_nr);
3507 if (image_fd < 0) {
3508 r = image_fd;
3509 goto finish;
3510 }
3511
3512 r = dissect_image(image_fd,
3513 &root_device, &root_device_rw,
3514 &home_device, &home_device_rw,
3515 &srv_device, &srv_device_rw,
3516 &secondary);
3517 if (r < 0)
3518 goto finish;
3519 }
3520
3521 r = custom_mounts_prepare();
3522 if (r < 0)
3523 goto finish;
3524
3525 interactive =
3526 isatty(STDIN_FILENO) > 0 &&
3527 isatty(STDOUT_FILENO) > 0;
3528
3529 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3530 if (master < 0) {
3531 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3532 goto finish;
3533 }
3534
3535 r = ptsname_malloc(master, &console);
3536 if (r < 0) {
3537 r = log_error_errno(r, "Failed to determine tty name: %m");
3538 goto finish;
3539 }
3540
3541 if (arg_selinux_apifs_context) {
3542 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3543 if (r < 0)
3544 goto finish;
3545 }
3546
3547 if (unlockpt(master) < 0) {
3548 r = log_error_errno(errno, "Failed to unlock tty: %m");
3549 goto finish;
3550 }
3551
3552 if (!arg_quiet)
3553 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3554 arg_machine, arg_image ?: arg_directory);
3555
3556 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3557
3558 assert_se(sigemptyset(&mask_chld) == 0);
3559 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3560
3561 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3562 r = log_error_errno(errno, "Failed to become subreaper: %m");
3563 goto finish;
3564 }
3565
3566 for (;;) {
3567 static const struct sigaction sa = {
3568 .sa_handler = nop_signal_handler,
3569 .sa_flags = SA_NOCLDSTOP,
3570 };
3571
3572 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3573 _cleanup_close_ int etc_passwd_lock = -1;
3574 _cleanup_close_pair_ int
3575 kmsg_socket_pair[2] = { -1, -1 },
3576 rtnl_socket_pair[2] = { -1, -1 },
3577 pid_socket_pair[2] = { -1, -1 },
3578 uuid_socket_pair[2] = { -1, -1 },
3579 uid_shift_socket_pair[2] = { -1, -1 };
3580 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3581 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3582 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3583 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3584 ContainerStatus container_status;
3585 char last_char = 0;
3586 int ifi = 0;
3587 ssize_t l;
3588
3589 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3590 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3591 * check with getpwuid() if the specific user already exists. Note that /etc might be
3592 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3593 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3594 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3595 * really ours. */
3596
3597 etc_passwd_lock = take_etc_passwd_lock(NULL);
3598 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
3599 log_error_errno(r, "Failed to take /etc/passwd lock: %m");
3600 goto finish;
3601 }
3602 }
3603
3604 r = barrier_create(&barrier);
3605 if (r < 0) {
3606 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3607 goto finish;
3608 }
3609
3610 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3611 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3612 goto finish;
3613 }
3614
3615 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3616 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3617 goto finish;
3618 }
3619
3620 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3621 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3622 goto finish;
3623 }
3624
3625 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
3626 r = log_error_errno(errno, "Failed to create id socket pair: %m");
3627 goto finish;
3628 }
3629
3630 if (arg_userns_mode != USER_NAMESPACE_NO)
3631 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3632 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3633 goto finish;
3634 }
3635
3636 /* Child can be killed before execv(), so handle SIGCHLD
3637 * in order to interrupt parent's blocking calls and
3638 * give it a chance to call wait() and terminate. */
3639 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3640 if (r < 0) {
3641 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3642 goto finish;
3643 }
3644
3645 r = sigaction(SIGCHLD, &sa, NULL);
3646 if (r < 0) {
3647 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3648 goto finish;
3649 }
3650
3651 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3652 if (pid < 0) {
3653 if (errno == EINVAL)
3654 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3655 else
3656 r = log_error_errno(errno, "clone() failed: %m");
3657
3658 goto finish;
3659 }
3660
3661 if (pid == 0) {
3662 /* The outer child only has a file system namespace. */
3663 barrier_set_role(&barrier, BARRIER_CHILD);
3664
3665 master = safe_close(master);
3666
3667 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3668 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3669 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3670 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3671 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3672
3673 (void) reset_all_signal_handlers();
3674 (void) reset_signal_mask();
3675
3676 r = outer_child(&barrier,
3677 arg_directory,
3678 console,
3679 root_device, root_device_rw,
3680 home_device, home_device_rw,
3681 srv_device, srv_device_rw,
3682 interactive,
3683 secondary,
3684 pid_socket_pair[1],
3685 uuid_socket_pair[1],
3686 kmsg_socket_pair[1],
3687 rtnl_socket_pair[1],
3688 uid_shift_socket_pair[1],
3689 fds);
3690 if (r < 0)
3691 _exit(EXIT_FAILURE);
3692
3693 _exit(EXIT_SUCCESS);
3694 }
3695
3696 barrier_set_role(&barrier, BARRIER_PARENT);
3697
3698 fds = fdset_free(fds);
3699
3700 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3701 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3702 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3703 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3704 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3705
3706 if (arg_userns_mode != USER_NAMESPACE_NO) {
3707 /* The child just let us know the UID shift it might have read from the image. */
3708 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3709 if (l < 0) {
3710 r = log_error_errno(errno, "Failed to read UID shift: %m");
3711 goto finish;
3712 }
3713 if (l != sizeof(arg_uid_shift)) {
3714 log_error("Short read while reading UID shift.");
3715 r = EIO;
3716 goto finish;
3717 }
3718
3719 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3720 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3721 * image, but if that's already in use, pick a new one, and report back to the child,
3722 * which one we now picked. */
3723
3724 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3725 if (r < 0) {
3726 log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3727 goto finish;
3728 }
3729
3730 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3731 if (l < 0) {
3732 r = log_error_errno(errno, "Failed to send UID shift: %m");
3733 goto finish;
3734 }
3735 if (l != sizeof(arg_uid_shift)) {
3736 log_error("Short write while writing UID shift.");
3737 r = -EIO;
3738 goto finish;
3739 }
3740 }
3741 }
3742
3743 /* Wait for the outer child. */
3744 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3745 if (r < 0)
3746 goto finish;
3747 if (r != 0) {
3748 r = -EIO;
3749 goto finish;
3750 }
3751 pid = 0;
3752
3753 /* And now retrieve the PID of the inner child. */
3754 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3755 if (l < 0) {
3756 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3757 goto finish;
3758 }
3759 if (l != sizeof(pid)) {
3760 log_error("Short read while reading inner child PID.");
3761 r = EIO;
3762 goto finish;
3763 }
3764
3765 /* We also retrieve container UUID in case it was generated by outer child */
3766 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
3767 if (l < 0) {
3768 r = log_error_errno(errno, "Failed to read container machine ID: %m");
3769 goto finish;
3770 }
3771 if (l != sizeof(arg_uuid)) {
3772 log_error("Short read while reading container machined ID.");
3773 r = EIO;
3774 goto finish;
3775 }
3776
3777 log_debug("Init process invoked as PID " PID_FMT, pid);
3778
3779 if (arg_userns_mode != USER_NAMESPACE_NO) {
3780 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3781 log_error("Child died too early.");
3782 r = -ESRCH;
3783 goto finish;
3784 }
3785
3786 r = setup_uid_map(pid);
3787 if (r < 0)
3788 goto finish;
3789
3790 (void) barrier_place(&barrier); /* #2 */
3791 }
3792
3793 if (arg_private_network) {
3794
3795 r = move_network_interfaces(pid, arg_network_interfaces);
3796 if (r < 0)
3797 goto finish;
3798
3799 if (arg_network_veth) {
3800 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3801 if (r < 0)
3802 goto finish;
3803 else if (r > 0)
3804 ifi = r;
3805
3806 if (arg_network_bridge) {
3807 r = setup_bridge(veth_name, arg_network_bridge);
3808 if (r < 0)
3809 goto finish;
3810 if (r > 0)
3811 ifi = r;
3812 }
3813 }
3814
3815 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3816 if (r < 0)
3817 goto finish;
3818
3819 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3820 if (r < 0)
3821 goto finish;
3822
3823 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3824 if (r < 0)
3825 goto finish;
3826 }
3827
3828 if (arg_register) {
3829 r = register_machine(
3830 arg_machine,
3831 pid,
3832 arg_directory,
3833 arg_uuid,
3834 ifi,
3835 arg_slice,
3836 arg_custom_mounts, arg_n_custom_mounts,
3837 arg_kill_signal,
3838 arg_property,
3839 arg_keep_unit,
3840 arg_container_service_name);
3841 if (r < 0)
3842 goto finish;
3843 }
3844
3845 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3846 if (r < 0)
3847 goto finish;
3848
3849 if (arg_keep_unit) {
3850 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3851 if (r < 0)
3852 goto finish;
3853 }
3854
3855 r = chown_cgroup(pid, arg_uid_shift);
3856 if (r < 0)
3857 goto finish;
3858
3859 /* Notify the child that the parent is ready with all
3860 * its setup (including cgroup-ification), and that
3861 * the child can now hand over control to the code to
3862 * run inside the container. */
3863 (void) barrier_place(&barrier); /* #3 */
3864
3865 /* Block SIGCHLD here, before notifying child.
3866 * process_pty() will handle it with the other signals. */
3867 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3868
3869 /* Reset signal to default */
3870 r = default_signals(SIGCHLD, -1);
3871 if (r < 0) {
3872 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3873 goto finish;
3874 }
3875
3876 /* Let the child know that we are ready and wait that the child is completely ready now. */
3877 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3878 log_error("Child died too early.");
3879 r = -ESRCH;
3880 goto finish;
3881 }
3882
3883 /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
3884 * in getpwuid(), thus we can release the /etc/passwd lock. */
3885 etc_passwd_lock = safe_close(etc_passwd_lock);
3886
3887 sd_notifyf(false,
3888 "READY=1\n"
3889 "STATUS=Container running.\n"
3890 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3891
3892 r = sd_event_new(&event);
3893 if (r < 0) {
3894 log_error_errno(r, "Failed to get default event source: %m");
3895 goto finish;
3896 }
3897
3898 if (arg_kill_signal > 0) {
3899 /* Try to kill the init system on SIGINT or SIGTERM */
3900 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3901 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
3902 } else {
3903 /* Immediately exit */
3904 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3905 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3906 }
3907
3908 /* simply exit on sigchld */
3909 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3910
3911 if (arg_expose_ports) {
3912 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3913 if (r < 0)
3914 goto finish;
3915
3916 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3917 }
3918
3919 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3920
3921 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3922 if (r < 0) {
3923 log_error_errno(r, "Failed to create PTY forwarder: %m");
3924 goto finish;
3925 }
3926
3927 r = sd_event_loop(event);
3928 if (r < 0) {
3929 log_error_errno(r, "Failed to run event loop: %m");
3930 goto finish;
3931 }
3932
3933 pty_forward_get_last_char(forward, &last_char);
3934
3935 forward = pty_forward_free(forward);
3936
3937 if (!arg_quiet && last_char != '\n')
3938 putc('\n', stdout);
3939
3940 /* Kill if it is not dead yet anyway */
3941 if (arg_register && !arg_keep_unit)
3942 terminate_machine(pid);
3943
3944 /* Normally redundant, but better safe than sorry */
3945 kill(pid, SIGKILL);
3946
3947 r = wait_for_container(pid, &container_status);
3948 pid = 0;
3949
3950 if (r < 0)
3951 /* We failed to wait for the container, or the
3952 * container exited abnormally */
3953 goto finish;
3954 else if (r > 0 || container_status == CONTAINER_TERMINATED) {
3955 /* The container exited with a non-zero
3956 * status, or with zero status and no reboot
3957 * was requested. */
3958 ret = r;
3959 break;
3960 }
3961
3962 /* CONTAINER_REBOOTED, loop again */
3963
3964 if (arg_keep_unit) {
3965 /* Special handling if we are running as a
3966 * service: instead of simply restarting the
3967 * machine we want to restart the entire
3968 * service, so let's inform systemd about this
3969 * with the special exit code 133. The service
3970 * file uses RestartForceExitStatus=133 so
3971 * that this results in a full nspawn
3972 * restart. This is necessary since we might
3973 * have cgroup parameters set we want to have
3974 * flushed out. */
3975 ret = 133;
3976 r = 0;
3977 break;
3978 }
3979
3980 expose_port_flush(arg_expose_ports, &exposed);
3981 }
3982
3983 finish:
3984 sd_notify(false,
3985 "STOPPING=1\n"
3986 "STATUS=Terminating...");
3987
3988 if (pid > 0)
3989 kill(pid, SIGKILL);
3990
3991 /* Try to flush whatever is still queued in the pty */
3992 if (master >= 0)
3993 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3994
3995 loop_remove(loop_nr, &image_fd);
3996
3997 if (remove_subvol && arg_directory) {
3998 int k;
3999
4000 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
4001 if (k < 0)
4002 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4003 }
4004
4005 if (arg_machine) {
4006 const char *p;
4007
4008 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4009 (void) rm_rf(p, REMOVE_ROOT);
4010 }
4011
4012 expose_port_flush(arg_expose_ports, &exposed);
4013
4014 free(arg_directory);
4015 free(arg_template);
4016 free(arg_image);
4017 free(arg_machine);
4018 free(arg_user);
4019 free(arg_chdir);
4020 strv_free(arg_setenv);
4021 free(arg_network_bridge);
4022 strv_free(arg_network_interfaces);
4023 strv_free(arg_network_macvlan);
4024 strv_free(arg_network_ipvlan);
4025 strv_free(arg_network_veth_extra);
4026 strv_free(arg_parameters);
4027 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4028 expose_port_free_all(arg_expose_ports);
4029
4030 return r < 0 ? EXIT_FAILURE : ret;
4031 }