]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #3093 from poettering/nspawn-userns-magic
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #ifdef HAVE_BLKID
21 #include <blkid/blkid.h>
22 #endif
23 #include <errno.h>
24 #include <getopt.h>
25 #include <grp.h>
26 #include <linux/loop.h>
27 #include <pwd.h>
28 #include <sched.h>
29 #ifdef HAVE_SECCOMP
30 #include <seccomp.h>
31 #endif
32 #ifdef HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35 #include <signal.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/file.h>
40 #include <sys/mount.h>
41 #include <sys/personality.h>
42 #include <sys/prctl.h>
43 #include <sys/types.h>
44 #include <unistd.h>
45
46 #include "sd-daemon.h"
47 #include "sd-id128.h"
48
49 #include "alloc-util.h"
50 #include "barrier.h"
51 #include "base-filesystem.h"
52 #include "blkid-util.h"
53 #include "btrfs-util.h"
54 #include "cap-list.h"
55 #include "capability-util.h"
56 #include "cgroup-util.h"
57 #include "copy.h"
58 #include "dev-setup.h"
59 #include "env-util.h"
60 #include "fd-util.h"
61 #include "fdset.h"
62 #include "fileio.h"
63 #include "formats-util.h"
64 #include "fs-util.h"
65 #include "gpt.h"
66 #include "hostname-util.h"
67 #include "log.h"
68 #include "loopback-setup.h"
69 #include "machine-id-setup.h"
70 #include "machine-image.h"
71 #include "macro.h"
72 #include "missing.h"
73 #include "mkdir.h"
74 #include "mount-util.h"
75 #include "netlink-util.h"
76 #include "nspawn-cgroup.h"
77 #include "nspawn-expose-ports.h"
78 #include "nspawn-mount.h"
79 #include "nspawn-network.h"
80 #include "nspawn-patch-uid.h"
81 #include "nspawn-register.h"
82 #include "nspawn-settings.h"
83 #include "nspawn-setuid.h"
84 #include "nspawn-stub-pid1.h"
85 #include "parse-util.h"
86 #include "path-util.h"
87 #include "process-util.h"
88 #include "ptyfwd.h"
89 #include "random-util.h"
90 #include "rm-rf.h"
91 #ifdef HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94 #include "selinux-util.h"
95 #include "signal-util.h"
96 #include "socket-util.h"
97 #include "stat-util.h"
98 #include "stdio-util.h"
99 #include "string-util.h"
100 #include "strv.h"
101 #include "terminal-util.h"
102 #include "udev-util.h"
103 #include "umask-util.h"
104 #include "user-util.h"
105 #include "util.h"
106
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the
 * 32bit UID range here.
 *
 * NOTE(review): presumably these are the inclusive/exclusive bounds between which a UID shift is chosen when
 * --private-users=pick is used; the code consuming them is not visible here — confirm at the use site. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
111
/* Distinguishes the two ways a container run can end, going by the enumerator names: it terminated, or it
 * asked to be rebooted. The numeric values are the ones the compiler would assign implicitly. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
116
/* Journal linking mode selected with --link-journal= (or -j). The "try-" variants of the command line option
 * map to LINK_GUEST/LINK_HOST plus a separate boolean flag, hence they have no enumerator of their own. */
typedef enum LinkJournal {
        LINK_NO    = 0,         /* --link-journal=no */
        LINK_AUTO  = 1,         /* --link-journal=auto, the default */
        LINK_HOST  = 2,         /* --link-journal=host / try-host */
        LINK_GUEST = 3,         /* --link-journal=guest / try-guest, -j */
} LinkJournal;
123
/* Configuration of this nspawn invocation. The values below are the defaults; parse_argv() overwrites them
 * according to the command line. For the options that have a SETTING_* bit, parse_argv() additionally records
 * in arg_settings_mask that the option was given explicitly. */
static char *arg_directory = NULL;                      /* -D, --directory= */
static char *arg_template = NULL;                       /* --template= */
static char *arg_chdir = NULL;                          /* --chdir=, always an absolute path */
static char *arg_user = NULL;                           /* -u, --user= */
static sd_id128_t arg_uuid = {};                        /* --uuid= */
static char *arg_machine = NULL;                        /* -M, --machine= */
static const char *arg_selinux_context = NULL;          /* -Z, --selinux-context=; points into argv[] */
static const char *arg_selinux_apifs_context = NULL;    /* -L, --selinux-apifs-context=; points into argv[] */
static const char *arg_slice = NULL;                    /* -S, --slice=; points into argv[] */
static bool arg_private_network = false;                /* --private-network, implied by all --network-* options */
static bool arg_read_only = false;                      /* --read-only */
static StartMode arg_start_mode = START_PID1;           /* -b selects START_BOOT, -a selects START_PID2 */
static bool arg_ephemeral = false;                      /* -x, --ephemeral */
static LinkJournal arg_link_journal = LINK_AUTO;        /* --link-journal=, -j */
static bool arg_link_journal_try = false;               /* set for the "try-" modes of --link-journal= and for -j */
/* Capabilities retained by default. parse_argv() adds what --capability= lists, adds CAP_NET_ADMIN if private
 * networking is enabled, and finally removes what --drop-capability= lists. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
static CustomMount *arg_custom_mounts = NULL;           /* --bind=, --bind-ro=, --tmpfs=, --overlay=, --overlay-ro= */
static unsigned arg_n_custom_mounts = 0;                /* number of entries in arg_custom_mounts */
static char **arg_setenv = NULL;                        /* -E, --setenv= */
static bool arg_quiet = false;                          /* -q, --quiet */
static bool arg_share_system = false;                   /* --share-system; forces arg_register off */
static bool arg_register = true;                        /* --register= */
static bool arg_keep_unit = false;                      /* --keep-unit */
static char **arg_network_interfaces = NULL;            /* --network-interface= */
static char **arg_network_macvlan = NULL;               /* --network-macvlan= */
static char **arg_network_ipvlan = NULL;                /* --network-ipvlan= */
static bool arg_network_veth = false;                   /* -n, --network-veth, implied by --network-bridge= */
static char **arg_network_veth_extra = NULL;            /* --network-veth-extra= */
static char *arg_network_bridge = NULL;                 /* --network-bridge= */
static unsigned long arg_personality = PERSONALITY_INVALID; /* --personality= */
static char *arg_image = NULL;                          /* -i, --image= */
static VolatileMode arg_volatile_mode = VOLATILE_NO;    /* --volatile[=MODE] */
static ExposePort *arg_expose_ports = NULL;             /* -p, --port= */
static char **arg_property = NULL;                      /* --property= */
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO; /* --private-users[=...], -U */
/* UID/GID base and number of UIDs/GIDs for user namespacing; the shift stays UID_INVALID unless it is given
 * explicitly with --private-users=UIDBASE[:NUIDS]. */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns_chown = false;                   /* --private-users-chown, implied by USER_NAMESPACE_PICK */
static int arg_kill_signal = 0;                         /* --kill-signal=; verify_arguments() picks a default for --boot */
static bool arg_unified_cgroup_hierarchy = false;       /* set by detect_unified_cgroup_hierarchy() */
static SettingsMask arg_settings_mask = 0;              /* SETTING_* bits, see parse_argv() */
static int arg_settings_trusted = -1;                   /* --settings=: true for "trusted", false for "no", otherwise -1 */
static char **arg_parameters = NULL;                    /* copy of the non-option arguments following the options */
static const char *arg_container_service_name = "systemd-nspawn"; /* overridable via $SYSTEMD_NSPAWN_CONTAINER_SERVICE */
193
194 static void help(void) {
195 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
196 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
197 " -h --help Show this help\n"
198 " --version Print version string\n"
199 " -q --quiet Do not show status information\n"
200 " -D --directory=PATH Root directory for the container\n"
201 " --template=PATH Initialize root directory from template directory,\n"
202 " if missing\n"
203 " -x --ephemeral Run container with snapshot of root directory, and\n"
204 " remove it after exit\n"
205 " -i --image=PATH File system device or disk image for the container\n"
206 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
207 " -b --boot Boot up full system (i.e. invoke init)\n"
208 " --chdir=PATH Set working directory in the container\n"
209 " -u --user=USER Run the command under specified user or uid\n"
210 " -M --machine=NAME Set the machine name for the container\n"
211 " --uuid=UUID Set a specific machine UUID for the container\n"
212 " -S --slice=SLICE Place the container in the specified slice\n"
213 " --property=NAME=VALUE Set scope unit property\n"
214 " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n"
215 " --private-users[=UIDBASE[:NUIDS]]\n"
216 " Run within user namespace, user configured UID/GID range\n"
217 " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n"
218 " --private-network Disable network in container\n"
219 " --network-interface=INTERFACE\n"
220 " Assign an existing network interface to the\n"
221 " container\n"
222 " --network-macvlan=INTERFACE\n"
223 " Create a macvlan network interface based on an\n"
224 " existing network interface to the container\n"
225 " --network-ipvlan=INTERFACE\n"
226 " Create a ipvlan network interface based on an\n"
227 " existing network interface to the container\n"
228 " -n --network-veth Add a virtual Ethernet connection between host\n"
229 " and container\n"
230 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
231 " Add an additional virtual Ethernet link between\n"
232 " host and container\n"
233 " --network-bridge=INTERFACE\n"
234 " Add a virtual Ethernet connection between host\n"
235 " and container and add it to an existing bridge on\n"
236 " the host\n"
237 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
238 " Expose a container IP port on the host\n"
239 " -Z --selinux-context=SECLABEL\n"
240 " Set the SELinux security context to be used by\n"
241 " processes in the container\n"
242 " -L --selinux-apifs-context=SECLABEL\n"
243 " Set the SELinux security context to be used by\n"
244 " API/tmpfs file systems in the container\n"
245 " --capability=CAP In addition to the default, retain specified\n"
246 " capability\n"
247 " --drop-capability=CAP Drop the specified capability from the default set\n"
248 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
249 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
250 " host, try-guest, try-host\n"
251 " -j Equivalent to --link-journal=try-guest\n"
252 " --read-only Mount the root directory read-only\n"
253 " --bind=PATH[:PATH[:OPTIONS]]\n"
254 " Bind mount a file or directory from the host into\n"
255 " the container\n"
256 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
257 " Similar, but creates a read-only bind mount\n"
258 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
259 " --overlay=PATH[:PATH...]:PATH\n"
260 " Create an overlay mount from the host to \n"
261 " the container\n"
262 " --overlay-ro=PATH[:PATH...]:PATH\n"
263 " Similar, but creates a read-only overlay mount\n"
264 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
265 " --share-system Share system namespaces with host\n"
266 " --register=BOOLEAN Register container as machine\n"
267 " --keep-unit Do not register a scope for the machine, reuse\n"
268 " the service unit nspawn is running in\n"
269 " --volatile[=MODE] Run the system in volatile mode\n"
270 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
271 , program_invocation_short_name);
272 }
273
274
275 static int custom_mounts_prepare(void) {
276 unsigned i;
277 int r;
278
279 /* Ensure the mounts are applied prefix first. */
280 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
281
282 /* Allocate working directories for the overlay file systems that need it */
283 for (i = 0; i < arg_n_custom_mounts; i++) {
284 CustomMount *m = &arg_custom_mounts[i];
285
286 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
287
288 if (arg_userns_chown) {
289 log_error("--private-users-chown may not be combined with custom root mounts.");
290 return -EINVAL;
291 } else if (arg_uid_shift == UID_INVALID) {
292 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
293 return -EINVAL;
294 }
295 }
296
297 if (m->type != CUSTOM_MOUNT_OVERLAY)
298 continue;
299
300 if (m->work_dir)
301 continue;
302
303 if (m->read_only)
304 continue;
305
306 r = tempfn_random(m->source, NULL, &m->work_dir);
307 if (r < 0)
308 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
309 }
310
311 return 0;
312 }
313
314 static int detect_unified_cgroup_hierarchy(void) {
315 const char *e;
316 int r;
317
318 /* Allow the user to control whether the unified hierarchy is used */
319 e = getenv("UNIFIED_CGROUP_HIERARCHY");
320 if (e) {
321 r = parse_boolean(e);
322 if (r < 0)
323 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
324
325 arg_unified_cgroup_hierarchy = r;
326 return 0;
327 }
328
329 /* Otherwise inherit the default from the host system */
330 r = cg_unified();
331 if (r < 0)
332 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
333
334 arg_unified_cgroup_hierarchy = r;
335 return 0;
336 }
337
338 static int parse_argv(int argc, char *argv[]) {
339
340 enum {
341 ARG_VERSION = 0x100,
342 ARG_PRIVATE_NETWORK,
343 ARG_UUID,
344 ARG_READ_ONLY,
345 ARG_CAPABILITY,
346 ARG_DROP_CAPABILITY,
347 ARG_LINK_JOURNAL,
348 ARG_BIND,
349 ARG_BIND_RO,
350 ARG_TMPFS,
351 ARG_OVERLAY,
352 ARG_OVERLAY_RO,
353 ARG_SHARE_SYSTEM,
354 ARG_REGISTER,
355 ARG_KEEP_UNIT,
356 ARG_NETWORK_INTERFACE,
357 ARG_NETWORK_MACVLAN,
358 ARG_NETWORK_IPVLAN,
359 ARG_NETWORK_BRIDGE,
360 ARG_NETWORK_VETH_EXTRA,
361 ARG_PERSONALITY,
362 ARG_VOLATILE,
363 ARG_TEMPLATE,
364 ARG_PROPERTY,
365 ARG_PRIVATE_USERS,
366 ARG_KILL_SIGNAL,
367 ARG_SETTINGS,
368 ARG_CHDIR,
369 ARG_PRIVATE_USERS_CHOWN,
370 };
371
372 static const struct option options[] = {
373 { "help", no_argument, NULL, 'h' },
374 { "version", no_argument, NULL, ARG_VERSION },
375 { "directory", required_argument, NULL, 'D' },
376 { "template", required_argument, NULL, ARG_TEMPLATE },
377 { "ephemeral", no_argument, NULL, 'x' },
378 { "user", required_argument, NULL, 'u' },
379 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
380 { "as-pid2", no_argument, NULL, 'a' },
381 { "boot", no_argument, NULL, 'b' },
382 { "uuid", required_argument, NULL, ARG_UUID },
383 { "read-only", no_argument, NULL, ARG_READ_ONLY },
384 { "capability", required_argument, NULL, ARG_CAPABILITY },
385 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
386 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
387 { "bind", required_argument, NULL, ARG_BIND },
388 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
389 { "tmpfs", required_argument, NULL, ARG_TMPFS },
390 { "overlay", required_argument, NULL, ARG_OVERLAY },
391 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
392 { "machine", required_argument, NULL, 'M' },
393 { "slice", required_argument, NULL, 'S' },
394 { "setenv", required_argument, NULL, 'E' },
395 { "selinux-context", required_argument, NULL, 'Z' },
396 { "selinux-apifs-context", required_argument, NULL, 'L' },
397 { "quiet", no_argument, NULL, 'q' },
398 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
399 { "register", required_argument, NULL, ARG_REGISTER },
400 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
401 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
402 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
403 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
404 { "network-veth", no_argument, NULL, 'n' },
405 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
406 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
407 { "personality", required_argument, NULL, ARG_PERSONALITY },
408 { "image", required_argument, NULL, 'i' },
409 { "volatile", optional_argument, NULL, ARG_VOLATILE },
410 { "port", required_argument, NULL, 'p' },
411 { "property", required_argument, NULL, ARG_PROPERTY },
412 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
413 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
414 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
415 { "settings", required_argument, NULL, ARG_SETTINGS },
416 { "chdir", required_argument, NULL, ARG_CHDIR },
417 {}
418 };
419
420 int c, r;
421 const char *p, *e;
422 uint64_t plus = 0, minus = 0;
423 bool mask_all_settings = false, mask_no_settings = false;
424
425 assert(argc >= 0);
426 assert(argv);
427
428 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
429
430 switch (c) {
431
432 case 'h':
433 help();
434 return 0;
435
436 case ARG_VERSION:
437 return version();
438
439 case 'D':
440 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
441 if (r < 0)
442 return r;
443 break;
444
445 case ARG_TEMPLATE:
446 r = parse_path_argument_and_warn(optarg, false, &arg_template);
447 if (r < 0)
448 return r;
449 break;
450
451 case 'i':
452 r = parse_path_argument_and_warn(optarg, false, &arg_image);
453 if (r < 0)
454 return r;
455 break;
456
457 case 'x':
458 arg_ephemeral = true;
459 break;
460
461 case 'u':
462 r = free_and_strdup(&arg_user, optarg);
463 if (r < 0)
464 return log_oom();
465
466 arg_settings_mask |= SETTING_USER;
467 break;
468
469 case ARG_NETWORK_BRIDGE:
470 r = free_and_strdup(&arg_network_bridge, optarg);
471 if (r < 0)
472 return log_oom();
473
474 /* fall through */
475
476 case 'n':
477 arg_network_veth = true;
478 arg_private_network = true;
479 arg_settings_mask |= SETTING_NETWORK;
480 break;
481
482 case ARG_NETWORK_VETH_EXTRA:
483 r = veth_extra_parse(&arg_network_veth_extra, optarg);
484 if (r < 0)
485 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
486
487 arg_private_network = true;
488 arg_settings_mask |= SETTING_NETWORK;
489 break;
490
491 case ARG_NETWORK_INTERFACE:
492 if (strv_extend(&arg_network_interfaces, optarg) < 0)
493 return log_oom();
494
495 arg_private_network = true;
496 arg_settings_mask |= SETTING_NETWORK;
497 break;
498
499 case ARG_NETWORK_MACVLAN:
500 if (strv_extend(&arg_network_macvlan, optarg) < 0)
501 return log_oom();
502
503 arg_private_network = true;
504 arg_settings_mask |= SETTING_NETWORK;
505 break;
506
507 case ARG_NETWORK_IPVLAN:
508 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
509 return log_oom();
510
511 /* fall through */
512
513 case ARG_PRIVATE_NETWORK:
514 arg_private_network = true;
515 arg_settings_mask |= SETTING_NETWORK;
516 break;
517
518 case 'b':
519 if (arg_start_mode == START_PID2) {
520 log_error("--boot and --as-pid2 may not be combined.");
521 return -EINVAL;
522 }
523
524 arg_start_mode = START_BOOT;
525 arg_settings_mask |= SETTING_START_MODE;
526 break;
527
528 case 'a':
529 if (arg_start_mode == START_BOOT) {
530 log_error("--boot and --as-pid2 may not be combined.");
531 return -EINVAL;
532 }
533
534 arg_start_mode = START_PID2;
535 arg_settings_mask |= SETTING_START_MODE;
536 break;
537
538 case ARG_UUID:
539 r = sd_id128_from_string(optarg, &arg_uuid);
540 if (r < 0) {
541 log_error("Invalid UUID: %s", optarg);
542 return r;
543 }
544
545 arg_settings_mask |= SETTING_MACHINE_ID;
546 break;
547
548 case 'S':
549 arg_slice = optarg;
550 break;
551
552 case 'M':
553 if (isempty(optarg))
554 arg_machine = mfree(arg_machine);
555 else {
556 if (!machine_name_is_valid(optarg)) {
557 log_error("Invalid machine name: %s", optarg);
558 return -EINVAL;
559 }
560
561 r = free_and_strdup(&arg_machine, optarg);
562 if (r < 0)
563 return log_oom();
564
565 break;
566 }
567
568 case 'Z':
569 arg_selinux_context = optarg;
570 break;
571
572 case 'L':
573 arg_selinux_apifs_context = optarg;
574 break;
575
576 case ARG_READ_ONLY:
577 arg_read_only = true;
578 arg_settings_mask |= SETTING_READ_ONLY;
579 break;
580
581 case ARG_CAPABILITY:
582 case ARG_DROP_CAPABILITY: {
583 p = optarg;
584 for (;;) {
585 _cleanup_free_ char *t = NULL;
586
587 r = extract_first_word(&p, &t, ",", 0);
588 if (r < 0)
589 return log_error_errno(r, "Failed to parse capability %s.", t);
590
591 if (r == 0)
592 break;
593
594 if (streq(t, "all")) {
595 if (c == ARG_CAPABILITY)
596 plus = (uint64_t) -1;
597 else
598 minus = (uint64_t) -1;
599 } else {
600 int cap;
601
602 cap = capability_from_name(t);
603 if (cap < 0) {
604 log_error("Failed to parse capability %s.", t);
605 return -EINVAL;
606 }
607
608 if (c == ARG_CAPABILITY)
609 plus |= 1ULL << (uint64_t) cap;
610 else
611 minus |= 1ULL << (uint64_t) cap;
612 }
613 }
614
615 arg_settings_mask |= SETTING_CAPABILITY;
616 break;
617 }
618
619 case 'j':
620 arg_link_journal = LINK_GUEST;
621 arg_link_journal_try = true;
622 break;
623
624 case ARG_LINK_JOURNAL:
625 if (streq(optarg, "auto")) {
626 arg_link_journal = LINK_AUTO;
627 arg_link_journal_try = false;
628 } else if (streq(optarg, "no")) {
629 arg_link_journal = LINK_NO;
630 arg_link_journal_try = false;
631 } else if (streq(optarg, "guest")) {
632 arg_link_journal = LINK_GUEST;
633 arg_link_journal_try = false;
634 } else if (streq(optarg, "host")) {
635 arg_link_journal = LINK_HOST;
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "try-guest")) {
638 arg_link_journal = LINK_GUEST;
639 arg_link_journal_try = true;
640 } else if (streq(optarg, "try-host")) {
641 arg_link_journal = LINK_HOST;
642 arg_link_journal_try = true;
643 } else {
644 log_error("Failed to parse link journal mode %s", optarg);
645 return -EINVAL;
646 }
647
648 break;
649
650 case ARG_BIND:
651 case ARG_BIND_RO:
652 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
653 if (r < 0)
654 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
655
656 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
657 break;
658
659 case ARG_TMPFS:
660 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
661 if (r < 0)
662 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
663
664 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
665 break;
666
667 case ARG_OVERLAY:
668 case ARG_OVERLAY_RO: {
669 _cleanup_free_ char *upper = NULL, *destination = NULL;
670 _cleanup_strv_free_ char **lower = NULL;
671 CustomMount *m;
672 unsigned n = 0;
673 char **i;
674
675 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
676 if (r == -ENOMEM)
677 return log_oom();
678 else if (r < 0) {
679 log_error("Invalid overlay specification: %s", optarg);
680 return r;
681 }
682
683 STRV_FOREACH(i, lower) {
684 if (!path_is_absolute(*i)) {
685 log_error("Overlay path %s is not absolute.", *i);
686 return -EINVAL;
687 }
688
689 n++;
690 }
691
692 if (n < 2) {
693 log_error("--overlay= needs at least two colon-separated directories specified.");
694 return -EINVAL;
695 }
696
697 if (n == 2) {
698 /* If two parameters are specified,
699 * the first one is the lower, the
700 * second one the upper directory. And
701 * we'll also define the destination
702 * mount point the same as the upper. */
703 upper = lower[1];
704 lower[1] = NULL;
705
706 destination = strdup(upper);
707 if (!destination)
708 return log_oom();
709
710 } else {
711 upper = lower[n - 2];
712 destination = lower[n - 1];
713 lower[n - 2] = NULL;
714 }
715
716 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
717 if (!m)
718 return log_oom();
719
720 m->destination = destination;
721 m->source = upper;
722 m->lower = lower;
723 m->read_only = c == ARG_OVERLAY_RO;
724
725 upper = destination = NULL;
726 lower = NULL;
727
728 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
729 break;
730 }
731
732 case 'E': {
733 char **n;
734
735 if (!env_assignment_is_valid(optarg)) {
736 log_error("Environment variable assignment '%s' is not valid.", optarg);
737 return -EINVAL;
738 }
739
740 n = strv_env_set(arg_setenv, optarg);
741 if (!n)
742 return log_oom();
743
744 strv_free(arg_setenv);
745 arg_setenv = n;
746
747 arg_settings_mask |= SETTING_ENVIRONMENT;
748 break;
749 }
750
751 case 'q':
752 arg_quiet = true;
753 break;
754
755 case ARG_SHARE_SYSTEM:
756 arg_share_system = true;
757 break;
758
759 case ARG_REGISTER:
760 r = parse_boolean(optarg);
761 if (r < 0) {
762 log_error("Failed to parse --register= argument: %s", optarg);
763 return r;
764 }
765
766 arg_register = r;
767 break;
768
769 case ARG_KEEP_UNIT:
770 arg_keep_unit = true;
771 break;
772
773 case ARG_PERSONALITY:
774
775 arg_personality = personality_from_string(optarg);
776 if (arg_personality == PERSONALITY_INVALID) {
777 log_error("Unknown or unsupported personality '%s'.", optarg);
778 return -EINVAL;
779 }
780
781 arg_settings_mask |= SETTING_PERSONALITY;
782 break;
783
784 case ARG_VOLATILE:
785
786 if (!optarg)
787 arg_volatile_mode = VOLATILE_YES;
788 else {
789 VolatileMode m;
790
791 m = volatile_mode_from_string(optarg);
792 if (m < 0) {
793 log_error("Failed to parse --volatile= argument: %s", optarg);
794 return -EINVAL;
795 } else
796 arg_volatile_mode = m;
797 }
798
799 arg_settings_mask |= SETTING_VOLATILE_MODE;
800 break;
801
802 case 'p':
803 r = expose_port_parse(&arg_expose_ports, optarg);
804 if (r == -EEXIST)
805 return log_error_errno(r, "Duplicate port specification: %s", optarg);
806 if (r < 0)
807 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
808
809 arg_settings_mask |= SETTING_EXPOSE_PORTS;
810 break;
811
812 case ARG_PROPERTY:
813 if (strv_extend(&arg_property, optarg) < 0)
814 return log_oom();
815
816 break;
817
818 case ARG_PRIVATE_USERS:
819
820 r = optarg ? parse_boolean(optarg) : 1;
821 if (r == 0) {
822 /* no: User namespacing off */
823 arg_userns_mode = USER_NAMESPACE_NO;
824 arg_uid_shift = UID_INVALID;
825 arg_uid_range = UINT32_C(0x10000);
826 } else if (r > 0) {
827 /* yes: User namespacing on, UID range is read from root dir */
828 arg_userns_mode = USER_NAMESPACE_FIXED;
829 arg_uid_shift = UID_INVALID;
830 arg_uid_range = UINT32_C(0x10000);
831 } else if (streq(optarg, "pick")) {
832 /* pick: User namespacing on, UID range is picked randomly */
833 arg_userns_mode = USER_NAMESPACE_PICK;
834 arg_uid_shift = UID_INVALID;
835 arg_uid_range = UINT32_C(0x10000);
836 } else {
837 _cleanup_free_ char *buffer = NULL;
838 const char *range, *shift;
839
840 /* anything else: User namespacing on, UID range is explicitly configured */
841
842 range = strchr(optarg, ':');
843 if (range) {
844 buffer = strndup(optarg, range - optarg);
845 if (!buffer)
846 return log_oom();
847 shift = buffer;
848
849 range++;
850 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
851 log_error("Failed to parse UID range: %s", range);
852 return -EINVAL;
853 }
854 } else
855 shift = optarg;
856
857 if (parse_uid(shift, &arg_uid_shift) < 0) {
858 log_error("Failed to parse UID: %s", optarg);
859 return -EINVAL;
860 }
861
862 arg_userns_mode = USER_NAMESPACE_FIXED;
863 }
864
865 arg_settings_mask |= SETTING_USERNS;
866 break;
867
868 case 'U':
869 if (userns_supported()) {
870 arg_userns_mode = USER_NAMESPACE_PICK;
871 arg_uid_shift = UID_INVALID;
872 arg_uid_range = UINT32_C(0x10000);
873
874 arg_settings_mask |= SETTING_USERNS;
875 }
876
877 break;
878
879 case ARG_PRIVATE_USERS_CHOWN:
880 arg_userns_chown = true;
881
882 arg_settings_mask |= SETTING_USERNS;
883 break;
884
885 case ARG_KILL_SIGNAL:
886 arg_kill_signal = signal_from_string_try_harder(optarg);
887 if (arg_kill_signal < 0) {
888 log_error("Cannot parse signal: %s", optarg);
889 return -EINVAL;
890 }
891
892 arg_settings_mask |= SETTING_KILL_SIGNAL;
893 break;
894
895 case ARG_SETTINGS:
896
897 /* no → do not read files
898 * yes → read files, do not override cmdline, trust only subset
899 * override → read files, override cmdline, trust only subset
900 * trusted → read files, do not override cmdline, trust all
901 */
902
903 r = parse_boolean(optarg);
904 if (r < 0) {
905 if (streq(optarg, "trusted")) {
906 mask_all_settings = false;
907 mask_no_settings = false;
908 arg_settings_trusted = true;
909
910 } else if (streq(optarg, "override")) {
911 mask_all_settings = false;
912 mask_no_settings = true;
913 arg_settings_trusted = -1;
914 } else
915 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
916 } else if (r > 0) {
917 /* yes */
918 mask_all_settings = false;
919 mask_no_settings = false;
920 arg_settings_trusted = -1;
921 } else {
922 /* no */
923 mask_all_settings = true;
924 mask_no_settings = false;
925 arg_settings_trusted = false;
926 }
927
928 break;
929
930 case ARG_CHDIR:
931 if (!path_is_absolute(optarg)) {
932 log_error("Working directory %s is not an absolute path.", optarg);
933 return -EINVAL;
934 }
935
936 r = free_and_strdup(&arg_chdir, optarg);
937 if (r < 0)
938 return log_oom();
939
940 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
941 break;
942
943 case '?':
944 return -EINVAL;
945
946 default:
947 assert_not_reached("Unhandled option");
948 }
949
950 if (arg_share_system)
951 arg_register = false;
952
953 if (arg_userns_mode == USER_NAMESPACE_PICK)
954 arg_userns_chown = true;
955
956 if (arg_start_mode != START_PID1 && arg_share_system) {
957 log_error("--boot and --share-system may not be combined.");
958 return -EINVAL;
959 }
960
961 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
962 log_error("--keep-unit may not be used when invoked from a user session.");
963 return -EINVAL;
964 }
965
966 if (arg_directory && arg_image) {
967 log_error("--directory= and --image= may not be combined.");
968 return -EINVAL;
969 }
970
971 if (arg_template && arg_image) {
972 log_error("--template= and --image= may not be combined.");
973 return -EINVAL;
974 }
975
976 if (arg_template && !(arg_directory || arg_machine)) {
977 log_error("--template= needs --directory= or --machine=.");
978 return -EINVAL;
979 }
980
981 if (arg_ephemeral && arg_template) {
982 log_error("--ephemeral and --template= may not be combined.");
983 return -EINVAL;
984 }
985
986 if (arg_ephemeral && arg_image) {
987 log_error("--ephemeral and --image= may not be combined.");
988 return -EINVAL;
989 }
990
991 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
992 log_error("--ephemeral and --link-journal= may not be combined.");
993 return -EINVAL;
994 }
995
996 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
997 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
998 return -EOPNOTSUPP;
999 }
1000
1001 if (arg_userns_chown && arg_read_only) {
1002 log_error("--read-only and --private-users-chown may not be combined.");
1003 return -EINVAL;
1004 }
1005
1006 if (argc > optind) {
1007 arg_parameters = strv_copy(argv + optind);
1008 if (!arg_parameters)
1009 return log_oom();
1010
1011 arg_settings_mask |= SETTING_START_MODE;
1012 }
1013
1014 /* Load all settings from .nspawn files */
1015 if (mask_no_settings)
1016 arg_settings_mask = 0;
1017
1018 /* Don't load any settings from .nspawn files */
1019 if (mask_all_settings)
1020 arg_settings_mask = _SETTINGS_MASK_ALL;
1021
1022 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1023
1024 r = detect_unified_cgroup_hierarchy();
1025 if (r < 0)
1026 return r;
1027
1028 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1029 if (e)
1030 arg_container_service_name = e;
1031
1032 return 1;
1033 }
1034
1035 static int verify_arguments(void) {
1036
1037 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
1038 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1039 return -EINVAL;
1040 }
1041
1042 if (arg_expose_ports && !arg_private_network) {
1043 log_error("Cannot use --port= without private networking.");
1044 return -EINVAL;
1045 }
1046
1047 #ifndef HAVE_LIBIPTC
1048 if (arg_expose_ports) {
1049 log_error("--port= is not supported, compiled without libiptc support.");
1050 return -EOPNOTSUPP;
1051 }
1052 #endif
1053
1054 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1055 arg_kill_signal = SIGRTMIN+3;
1056
1057 return 0;
1058 }
1059
1060 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1061 assert(p);
1062
1063 if (arg_userns_mode == USER_NAMESPACE_NO)
1064 return 0;
1065
1066 if (uid == UID_INVALID && gid == GID_INVALID)
1067 return 0;
1068
1069 if (uid != UID_INVALID) {
1070 uid += arg_uid_shift;
1071
1072 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1073 return -EOVERFLOW;
1074 }
1075
1076 if (gid != GID_INVALID) {
1077 gid += (gid_t) arg_uid_shift;
1078
1079 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1080 return -EOVERFLOW;
1081 }
1082
1083 if (lchown(p, uid, gid) < 0)
1084 return -errno;
1085
1086 return 0;
1087 }
1088
/* Creates the directory path below root with the given mode and hands it to userns_lchown() for the
 * ownership adjustment. An already existing directory is left untouched, including its ownership.
 *
 * Returns 0 if the directory already existed, otherwise the result of userns_lchown(), or the negative errno
 * of mkdir(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1101
1102 static int setup_timezone(const char *dest) {
1103 _cleanup_free_ char *p = NULL, *q = NULL;
1104 const char *where, *check, *what;
1105 char *z, *y;
1106 int r;
1107
1108 assert(dest);
1109
1110 /* Fix the timezone, if possible */
1111 r = readlink_malloc("/etc/localtime", &p);
1112 if (r < 0) {
1113 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1114 return 0;
1115 }
1116
1117 z = path_startswith(p, "../usr/share/zoneinfo/");
1118 if (!z)
1119 z = path_startswith(p, "/usr/share/zoneinfo/");
1120 if (!z) {
1121 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1122 return 0;
1123 }
1124
1125 where = prefix_roota(dest, "/etc/localtime");
1126 r = readlink_malloc(where, &q);
1127 if (r >= 0) {
1128 y = path_startswith(q, "../usr/share/zoneinfo/");
1129 if (!y)
1130 y = path_startswith(q, "/usr/share/zoneinfo/");
1131
1132 /* Already pointing to the right place? Then do nothing .. */
1133 if (y && streq(y, z))
1134 return 0;
1135 }
1136
1137 check = strjoina("/usr/share/zoneinfo/", z);
1138 check = prefix_roota(dest, check);
1139 if (laccess(check, F_OK) < 0) {
1140 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1141 return 0;
1142 }
1143
1144 r = unlink(where);
1145 if (r < 0 && errno != ENOENT) {
1146 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1147 return 0;
1148 }
1149
1150 what = strjoina("../usr/share/zoneinfo/", z);
1151 if (symlink(what, where) < 0) {
1152 log_error_errno(errno, "Failed to correct timezone of container: %m");
1153 return 0;
1154 }
1155
1156 r = userns_lchown(where, 0, 0);
1157 if (r < 0)
1158 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1159
1160 return 0;
1161 }
1162
1163 static int setup_resolv_conf(const char *dest) {
1164 const char *where = NULL;
1165 int r;
1166
1167 assert(dest);
1168
1169 if (arg_private_network)
1170 return 0;
1171
1172 /* Fix resolv.conf, if possible */
1173 where = prefix_roota(dest, "/etc/resolv.conf");
1174
1175 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1176 if (r < 0) {
1177 /* If the file already exists as symlink, let's
1178 * suppress the warning, under the assumption that
1179 * resolved or something similar runs inside and the
1180 * symlink points there.
1181 *
1182 * If the disk image is read-only, there's also no
1183 * point in complaining.
1184 */
1185 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1186 "Failed to copy /etc/resolv.conf to %s: %m", where);
1187 return 0;
1188 }
1189
1190 r = userns_lchown(where, 0, 0);
1191 if (r < 0)
1192 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1193
1194 return 0;
1195 }
1196
1197 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1198 assert(s);
1199
1200 snprintf(s, 37,
1201 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1202 SD_ID128_FORMAT_VAL(id));
1203
1204 return s;
1205 }
1206
1207 static int setup_boot_id(const char *dest) {
1208 const char *from, *to;
1209 sd_id128_t rnd = {};
1210 char as_uuid[37];
1211 int r;
1212
1213 if (arg_share_system)
1214 return 0;
1215
1216 /* Generate a new randomized boot ID, so that each boot-up of
1217 * the container gets a new one */
1218
1219 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1220 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1221
1222 r = sd_id128_randomize(&rnd);
1223 if (r < 0)
1224 return log_error_errno(r, "Failed to generate random boot id: %m");
1225
1226 id128_format_as_uuid(rnd, as_uuid);
1227
1228 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1229 if (r < 0)
1230 return log_error_errno(r, "Failed to write boot id: %m");
1231
1232 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1233 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1234 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1235 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1236
1237 unlink(from);
1238 return r;
1239 }
1240
1241 static int copy_devnodes(const char *dest) {
1242
1243 static const char devnodes[] =
1244 "null\0"
1245 "zero\0"
1246 "full\0"
1247 "random\0"
1248 "urandom\0"
1249 "tty\0"
1250 "net/tun\0";
1251
1252 const char *d;
1253 int r = 0;
1254 _cleanup_umask_ mode_t u;
1255
1256 assert(dest);
1257
1258 u = umask(0000);
1259
1260 /* Create /dev/net, so that we can create /dev/net/tun in it */
1261 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1262 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1263
1264 NULSTR_FOREACH(d, devnodes) {
1265 _cleanup_free_ char *from = NULL, *to = NULL;
1266 struct stat st;
1267
1268 from = strappend("/dev/", d);
1269 to = prefix_root(dest, from);
1270
1271 if (stat(from, &st) < 0) {
1272
1273 if (errno != ENOENT)
1274 return log_error_errno(errno, "Failed to stat %s: %m", from);
1275
1276 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1277
1278 log_error("%s is not a char or block device, cannot copy.", from);
1279 return -EIO;
1280
1281 } else {
1282 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1283 if (errno != EPERM)
1284 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1285
1286 /* Some systems abusively restrict mknod but
1287 * allow bind mounts. */
1288 r = touch(to);
1289 if (r < 0)
1290 return log_error_errno(r, "touch (%s) failed: %m", to);
1291 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1292 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1293 }
1294
1295 r = userns_lchown(to, 0, 0);
1296 if (r < 0)
1297 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1298 }
1299 }
1300
1301 return r;
1302 }
1303
1304 static int setup_pts(const char *dest) {
1305 _cleanup_free_ char *options = NULL;
1306 const char *p;
1307 int r;
1308
1309 #ifdef HAVE_SELINUX
1310 if (arg_selinux_apifs_context)
1311 (void) asprintf(&options,
1312 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1313 arg_uid_shift + TTY_GID,
1314 arg_selinux_apifs_context);
1315 else
1316 #endif
1317 (void) asprintf(&options,
1318 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1319 arg_uid_shift + TTY_GID);
1320
1321 if (!options)
1322 return log_oom();
1323
1324 /* Mount /dev/pts itself */
1325 p = prefix_roota(dest, "/dev/pts");
1326 if (mkdir(p, 0755) < 0)
1327 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1328 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1329 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1330 r = userns_lchown(p, 0, 0);
1331 if (r < 0)
1332 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1333
1334 /* Create /dev/ptmx symlink */
1335 p = prefix_roota(dest, "/dev/ptmx");
1336 if (symlink("pts/ptmx", p) < 0)
1337 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1338 r = userns_lchown(p, 0, 0);
1339 if (r < 0)
1340 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1341
1342 /* And fix /dev/pts/ptmx ownership */
1343 p = prefix_roota(dest, "/dev/pts/ptmx");
1344 r = userns_lchown(p, 0, 0);
1345 if (r < 0)
1346 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1347
1348 return 0;
1349 }
1350
1351 static int setup_dev_console(const char *dest, const char *console) {
1352 _cleanup_umask_ mode_t u;
1353 const char *to;
1354 int r;
1355
1356 assert(dest);
1357 assert(console);
1358
1359 u = umask(0000);
1360
1361 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1362 if (r < 0)
1363 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1364
1365 /* We need to bind mount the right tty to /dev/console since
1366 * ptys can only exist on pts file systems. To have something
1367 * to bind mount things on we create a empty regular file. */
1368
1369 to = prefix_roota(dest, "/dev/console");
1370 r = touch(to);
1371 if (r < 0)
1372 return log_error_errno(r, "touch() for /dev/console failed: %m");
1373
1374 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1375 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1376
1377 return 0;
1378 }
1379
1380 static int setup_kmsg(const char *dest, int kmsg_socket) {
1381 const char *from, *to;
1382 _cleanup_umask_ mode_t u;
1383 int fd, r;
1384
1385 assert(kmsg_socket >= 0);
1386
1387 u = umask(0000);
1388
1389 /* We create the kmsg FIFO as /run/kmsg, but immediately
1390 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1391 * on the reading side behave very similar to /proc/kmsg,
1392 * their writing side behaves differently from /dev/kmsg in
1393 * that writing blocks when nothing is reading. In order to
1394 * avoid any problems with containers deadlocking due to this
1395 * we simply make /dev/kmsg unavailable to the container. */
1396 from = prefix_roota(dest, "/run/kmsg");
1397 to = prefix_roota(dest, "/proc/kmsg");
1398
1399 if (mkfifo(from, 0600) < 0)
1400 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1401 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1402 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1403
1404 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1405 if (fd < 0)
1406 return log_error_errno(errno, "Failed to open fifo: %m");
1407
1408 /* Store away the fd in the socket, so that it stays open as
1409 * long as we run the child */
1410 r = send_one_fd(kmsg_socket, fd, 0);
1411 safe_close(fd);
1412
1413 if (r < 0)
1414 return log_error_errno(r, "Failed to send FIFO fd: %m");
1415
1416 /* And now make the FIFO unavailable as /run/kmsg... */
1417 (void) unlink(from);
1418
1419 return 0;
1420 }
1421
1422 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1423 union in_addr_union *exposed = userdata;
1424
1425 assert(rtnl);
1426 assert(m);
1427 assert(exposed);
1428
1429 expose_port_execute(rtnl, arg_expose_ports, exposed);
1430 return 0;
1431 }
1432
1433 static int setup_hostname(void) {
1434
1435 if (arg_share_system)
1436 return 0;
1437
1438 if (sethostname_idempotent(arg_machine) < 0)
1439 return -errno;
1440
1441 return 0;
1442 }
1443
1444 static int setup_journal(const char *directory) {
1445 sd_id128_t this_id;
1446 _cleanup_free_ char *d = NULL;
1447 const char *p, *q;
1448 bool try;
1449 char id[33];
1450 int r;
1451
1452 /* Don't link journals in ephemeral mode */
1453 if (arg_ephemeral)
1454 return 0;
1455
1456 if (arg_link_journal == LINK_NO)
1457 return 0;
1458
1459 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1460
1461 r = sd_id128_get_machine(&this_id);
1462 if (r < 0)
1463 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1464
1465 if (sd_id128_equal(arg_uuid, this_id)) {
1466 log_full(try ? LOG_WARNING : LOG_ERR,
1467 "Host and machine ids are equal (%s): refusing to link journals", id);
1468 if (try)
1469 return 0;
1470 return -EEXIST;
1471 }
1472
1473 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1474 if (r < 0)
1475 return log_error_errno(r, "Failed to create /var: %m");
1476
1477 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1478 if (r < 0)
1479 return log_error_errno(r, "Failed to create /var/log: %m");
1480
1481 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1482 if (r < 0)
1483 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1484
1485 (void) sd_id128_to_string(arg_uuid, id);
1486
1487 p = strjoina("/var/log/journal/", id);
1488 q = prefix_roota(directory, p);
1489
1490 if (path_is_mount_point(p, 0) > 0) {
1491 if (try)
1492 return 0;
1493
1494 log_error("%s: already a mount point, refusing to use for journal", p);
1495 return -EEXIST;
1496 }
1497
1498 if (path_is_mount_point(q, 0) > 0) {
1499 if (try)
1500 return 0;
1501
1502 log_error("%s: already a mount point, refusing to use for journal", q);
1503 return -EEXIST;
1504 }
1505
1506 r = readlink_and_make_absolute(p, &d);
1507 if (r >= 0) {
1508 if ((arg_link_journal == LINK_GUEST ||
1509 arg_link_journal == LINK_AUTO) &&
1510 path_equal(d, q)) {
1511
1512 r = userns_mkdir(directory, p, 0755, 0, 0);
1513 if (r < 0)
1514 log_warning_errno(r, "Failed to create directory %s: %m", q);
1515 return 0;
1516 }
1517
1518 if (unlink(p) < 0)
1519 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1520 } else if (r == -EINVAL) {
1521
1522 if (arg_link_journal == LINK_GUEST &&
1523 rmdir(p) < 0) {
1524
1525 if (errno == ENOTDIR) {
1526 log_error("%s already exists and is neither a symlink nor a directory", p);
1527 return r;
1528 } else
1529 return log_error_errno(errno, "Failed to remove %s: %m", p);
1530 }
1531 } else if (r != -ENOENT)
1532 return log_error_errno(r, "readlink(%s) failed: %m", p);
1533
1534 if (arg_link_journal == LINK_GUEST) {
1535
1536 if (symlink(q, p) < 0) {
1537 if (try) {
1538 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1539 return 0;
1540 } else
1541 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1542 }
1543
1544 r = userns_mkdir(directory, p, 0755, 0, 0);
1545 if (r < 0)
1546 log_warning_errno(r, "Failed to create directory %s: %m", q);
1547 return 0;
1548 }
1549
1550 if (arg_link_journal == LINK_HOST) {
1551 /* don't create parents here — if the host doesn't have
1552 * permanent journal set up, don't force it here */
1553
1554 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
1555 if (try) {
1556 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1557 return 0;
1558 } else
1559 return log_error_errno(errno, "Failed to create %s: %m", p);
1560 }
1561
1562 } else if (access(p, F_OK) < 0)
1563 return 0;
1564
1565 if (dir_is_empty(q) == 0)
1566 log_warning("%s is not empty, proceeding anyway.", q);
1567
1568 r = userns_mkdir(directory, p, 0755, 0, 0);
1569 if (r < 0)
1570 return log_error_errno(r, "Failed to create %s: %m", q);
1571
1572 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1573 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1574
1575 return 0;
1576 }
1577
1578 static int drop_capabilities(void) {
1579 return capability_bounding_set_drop(arg_retain, false);
1580 }
1581
1582 static int reset_audit_loginuid(void) {
1583 _cleanup_free_ char *p = NULL;
1584 int r;
1585
1586 if (arg_share_system)
1587 return 0;
1588
1589 r = read_one_line_file("/proc/self/loginuid", &p);
1590 if (r == -ENOENT)
1591 return 0;
1592 if (r < 0)
1593 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1594
1595 /* Already reset? */
1596 if (streq(p, "4294967295"))
1597 return 0;
1598
1599 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1600 if (r < 0) {
1601 log_error_errno(r,
1602 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1603 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1604 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1605 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1606 "using systemd-nspawn. Sleeping for 5s... (%m)");
1607
1608 sleep(5);
1609 }
1610
1611 return 0;
1612 }
1613
/* Installs the container's seccomp filter. Compiled to a no-op returning 0 without HAVE_SECCOMP.
 *
 * The filter allows everything by default and then
 *   - makes each blacklisted syscall fail with EPERM unless the capability listed next to it is
 *     part of arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * -EINVAL from seccomp_load() is logged at debug level and treated as success. Returns 0 on
 * success, a negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                /* A retained capability keeps its syscalls available */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1708
1709 static int setup_propagate(const char *root) {
1710 const char *p, *q;
1711 int r;
1712
1713 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1714 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1715 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1716 (void) mkdir_p(p, 0600);
1717
1718 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1719 if (r < 0)
1720 return log_error_errno(r, "Failed to create /run/systemd: %m");
1721
1722 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1723 if (r < 0)
1724 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1725
1726 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1727 if (r < 0)
1728 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1729
1730 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1731 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1732 return log_error_errno(errno, "Failed to install propagation bind mount.");
1733
1734 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1735 return log_error_errno(errno, "Failed to make propagation mount read-only");
1736
1737 return 0;
1738 }
1739
1740 static int setup_image(char **device_path, int *loop_nr) {
1741 struct loop_info64 info = {
1742 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1743 };
1744 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1745 _cleanup_free_ char* loopdev = NULL;
1746 struct stat st;
1747 int r, nr;
1748
1749 assert(device_path);
1750 assert(loop_nr);
1751 assert(arg_image);
1752
1753 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1754 if (fd < 0)
1755 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1756
1757 if (fstat(fd, &st) < 0)
1758 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1759
1760 if (S_ISBLK(st.st_mode)) {
1761 char *p;
1762
1763 p = strdup(arg_image);
1764 if (!p)
1765 return log_oom();
1766
1767 *device_path = p;
1768
1769 *loop_nr = -1;
1770
1771 r = fd;
1772 fd = -1;
1773
1774 return r;
1775 }
1776
1777 if (!S_ISREG(st.st_mode)) {
1778 log_error("%s is not a regular file or block device.", arg_image);
1779 return -EINVAL;
1780 }
1781
1782 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1783 if (control < 0)
1784 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1785
1786 nr = ioctl(control, LOOP_CTL_GET_FREE);
1787 if (nr < 0)
1788 return log_error_errno(errno, "Failed to allocate loop device: %m");
1789
1790 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1791 return log_oom();
1792
1793 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1794 if (loop < 0)
1795 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1796
1797 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1798 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1799
1800 if (arg_read_only)
1801 info.lo_flags |= LO_FLAGS_READ_ONLY;
1802
1803 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1804 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1805
1806 *device_path = loopdev;
1807 loopdev = NULL;
1808
1809 *loop_nr = nr;
1810
1811 r = loop;
1812 loop = -1;
1813
1814 return r;
1815 }
1816
/* Explanatory text appended to the error messages of dissect_image() whenever the partition
 * layout of the image is not one nspawn can boot from. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"     \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                 \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                      \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"      \
        "to be bootable with systemd-nspawn."
1823
/* Examines the partition table of the block device open at 'fd' and picks the partitions to use
 * as root, /home and /srv.
 *
 * GPT: partitions are matched by type UUID (home, srv, native root, secondary-architecture root,
 * generic Linux). Partitions flagged GPT_FLAG_NO_AUTO are ignored; of several partitions of the
 * same specific type the one with the lowest partition number wins.
 * MBR: only partitions of type 0x83 whose flags equal 0x80 (bootable) are considered, as
 * "generic" partitions.
 *
 * Root selection order: native root, then secondary root (*secondary set to true), then the
 * generic partition, which is only accepted if there is exactly one candidate.
 *
 * On success returns 0 and stores newly allocated device node paths plus their read-write flags
 * in the output parameters; the home and srv outputs are only written if such a partition was
 * found. On failure returns a negative errno-style value. Without HAVE_BLKID always fails with
 * -EOPNOTSUPP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* Failure without errno is taken to mean out-of-memory */
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has announced as many partition devices as blkid found, giving up
         * after 10 rounds. On leaving the loop 'e' holds the matching enumeration. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev side is expected to list one
                 * entry more than blkid, hence the "+ 1". */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the node as the generic candidate. It has to go into
                                 * 'generic', the variable tested above, or a second bootable
                                 * Linux partition would never be noticed. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2193
/* Mounts the block device 'what' at 'where', or at 'where' + 'directory' if 'directory' is
 * non-NULL, using the file system type detected by blkid.
 *
 * The mount is read-only if 'rw' is false or arg_read_only is set, and always uses MS_NODEV.
 * LUKS containers are refused with -EOPNOTSUPP. Returns 0 on success, a negative errno-style
 * value otherwise. Without HAVE_BLKID always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if the result is
         * ambivalent; both mean the type cannot be determined. -1 is a real probing error and
         * has to reach the errno-reporting branch below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2255
2256 static int setup_machine_id(const char *directory) {
2257 int r;
2258 const char *etc_machine_id, *t;
2259 _cleanup_free_ char *s = NULL;
2260
2261 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2262
2263 r = read_one_line_file(etc_machine_id, &s);
2264 if (r < 0)
2265 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2266
2267 t = strstrip(s);
2268
2269 if (!isempty(t)) {
2270 r = sd_id128_from_string(t, &arg_uuid);
2271 if (r < 0)
2272 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2273 } else {
2274 if (sd_id128_is_null(arg_uuid)) {
2275 r = sd_id128_randomize(&arg_uuid);
2276 if (r < 0)
2277 return log_error_errno(r, "Failed to generate random machine ID: %m");
2278 }
2279 }
2280
2281 r = machine_id_setup(directory, arg_uuid);
2282 if (r < 0)
2283 return log_error_errno(r, "Failed to setup machine ID: %m");
2284
2285 return 0;
2286 }
2287
2288 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2289 int r;
2290
2291 assert(directory);
2292
2293 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
2294 return 0;
2295
2296 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2297 if (r == -EOPNOTSUPP)
2298 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2299 if (r == -EBADE)
2300 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2301 if (r < 0)
2302 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2303 if (r == 0)
2304 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2305 else
2306 log_debug("Patched directory tree to match UID/GID range.");
2307
2308 return r;
2309 }
2310
2311 static int mount_devices(
2312 const char *where,
2313 const char *root_device, bool root_device_rw,
2314 const char *home_device, bool home_device_rw,
2315 const char *srv_device, bool srv_device_rw) {
2316 int r;
2317
2318 assert(where);
2319
2320 if (root_device) {
2321 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2322 if (r < 0)
2323 return log_error_errno(r, "Failed to mount root directory: %m");
2324 }
2325
2326 if (home_device) {
2327 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2328 if (r < 0)
2329 return log_error_errno(r, "Failed to mount home directory: %m");
2330 }
2331
2332 if (srv_device) {
2333 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2334 if (r < 0)
2335 return log_error_errno(r, "Failed to mount server data directory: %m");
2336 }
2337
2338 return 0;
2339 }
2340
2341 static void loop_remove(int nr, int *image_fd) {
2342 _cleanup_close_ int control = -1;
2343 int r;
2344
2345 if (nr < 0)
2346 return;
2347
2348 if (image_fd && *image_fd >= 0) {
2349 r = ioctl(*image_fd, LOOP_CLR_FD);
2350 if (r < 0)
2351 log_debug_errno(errno, "Failed to close loop image: %m");
2352 *image_fd = safe_close(*image_fd);
2353 }
2354
2355 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2356 if (control < 0) {
2357 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2358 return;
2359 }
2360
2361 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2362 if (r < 0)
2363 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2364 }
2365
2366 /*
2367 * Return values:
2368 * < 0 : wait_for_terminate() failed to get the state of the
2369 * container, the container was terminated by a signal, or
2370 * failed for an unknown reason. No change is made to the
2371 * container argument.
2372 * > 0 : The program executed in the container terminated with an
2373 * error. The exit code of the program executed in the
2374 * container is returned. The container argument has been set
2375 * to CONTAINER_TERMINATED.
2376 * 0 : The container is being rebooted, has been shut down or exited
2377 * successfully. The container argument has been set to either
2378 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2379 *
2380 * That is, success is indicated by a return value of zero, and an
2381 * error is indicated by a non-zero value.
2382 */
2383 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2384 siginfo_t status;
2385 int r;
2386
2387 r = wait_for_terminate(pid, &status);
2388 if (r < 0)
2389 return log_warning_errno(r, "Failed to wait for container: %m");
2390
2391 switch (status.si_code) {
2392
2393 case CLD_EXITED:
2394 if (status.si_status == 0) {
2395 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2396
2397 } else
2398 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2399
2400 *container = CONTAINER_TERMINATED;
2401 return status.si_status;
2402
2403 case CLD_KILLED:
2404 if (status.si_status == SIGINT) {
2405
2406 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2407 *container = CONTAINER_TERMINATED;
2408 return 0;
2409
2410 } else if (status.si_status == SIGHUP) {
2411
2412 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2413 *container = CONTAINER_REBOOTED;
2414 return 0;
2415 }
2416
2417 /* CLD_KILLED fallthrough */
2418
2419 case CLD_DUMPED:
2420 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2421 return -EIO;
2422
2423 default:
2424 log_error("Container %s failed due to unknown reason.", arg_machine);
2425 return -EIO;
2426 }
2427
2428 return r;
2429 }
2430
2431 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2432 pid_t pid;
2433
2434 pid = PTR_TO_PID(userdata);
2435 if (pid > 0) {
2436 if (kill(pid, arg_kill_signal) >= 0) {
2437 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2438 sd_event_source_set_userdata(s, NULL);
2439 return 0;
2440 }
2441 }
2442
2443 sd_event_exit(sd_event_source_get_event(s), 0);
2444 return 0;
2445 }
2446
2447 static int determine_names(void) {
2448 int r;
2449
2450 if (arg_template && !arg_directory && arg_machine) {
2451
2452 /* If --template= was specified then we should not
2453 * search for a machine, but instead create a new one
2454 * in /var/lib/machine. */
2455
2456 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2457 if (!arg_directory)
2458 return log_oom();
2459 }
2460
2461 if (!arg_image && !arg_directory) {
2462 if (arg_machine) {
2463 _cleanup_(image_unrefp) Image *i = NULL;
2464
2465 r = image_find(arg_machine, &i);
2466 if (r < 0)
2467 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2468 else if (r == 0) {
2469 log_error("No image for machine '%s': %m", arg_machine);
2470 return -ENOENT;
2471 }
2472
2473 if (i->type == IMAGE_RAW)
2474 r = free_and_strdup(&arg_image, i->path);
2475 else
2476 r = free_and_strdup(&arg_directory, i->path);
2477 if (r < 0)
2478 return log_error_errno(r, "Invalid image directory: %m");
2479
2480 if (!arg_ephemeral)
2481 arg_read_only = arg_read_only || i->read_only;
2482 } else
2483 arg_directory = get_current_dir_name();
2484
2485 if (!arg_directory && !arg_machine) {
2486 log_error("Failed to determine path, please use -D or -i.");
2487 return -EINVAL;
2488 }
2489 }
2490
2491 if (!arg_machine) {
2492 if (arg_directory && path_equal(arg_directory, "/"))
2493 arg_machine = gethostname_malloc();
2494 else
2495 arg_machine = strdup(basename(arg_image ?: arg_directory));
2496
2497 if (!arg_machine)
2498 return log_oom();
2499
2500 hostname_cleanup(arg_machine);
2501 if (!machine_name_is_valid(arg_machine)) {
2502 log_error("Failed to determine machine name automatically, please use -M.");
2503 return -EINVAL;
2504 }
2505
2506 if (arg_ephemeral) {
2507 char *b;
2508
2509 /* Add a random suffix when this is an
2510 * ephemeral machine, so that we can run many
2511 * instances at once without manually having
2512 * to specify -M each time. */
2513
2514 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2515 return log_oom();
2516
2517 free(arg_machine);
2518 arg_machine = b;
2519 }
2520 }
2521
2522 return 0;
2523 }
2524
2525 static int determine_uid_shift(const char *directory) {
2526 int r;
2527
2528 if (arg_userns_mode == USER_NAMESPACE_NO) {
2529 arg_uid_shift = 0;
2530 return 0;
2531 }
2532
2533 if (arg_uid_shift == UID_INVALID) {
2534 struct stat st;
2535
2536 r = stat(directory, &st);
2537 if (r < 0)
2538 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2539
2540 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2541
2542 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2543 log_error("UID and GID base of %s don't match.", directory);
2544 return -EINVAL;
2545 }
2546
2547 arg_uid_range = UINT32_C(0x10000);
2548 }
2549
2550 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2551 log_error("UID base too high for UID range.");
2552 return -EINVAL;
2553 }
2554
2555 return 0;
2556 }
2557
/* Body of the innermost child process, called from outer_child() right after raw_clone(). It
 * synchronizes with the parent through "barrier", sets up mounts, cgroup, boot ID and /dev/kmsg, drops
 * privileges, assembles the environment block and finally executes the container's init system or
 * payload.
 *
 * barrier      synchronization object shared with the parent
 * directory    container root (only asserted to be non-NULL here)
 * secondary    if true and no explicit personality is configured, PER_LINUX32 is selected
 * kmsg_socket  socket passed on to setup_kmsg(), closed afterwards
 * rtnl_socket  socket passed on to expose_port_send_rtnl() if ports are exposed, closed afterwards
 * fds          file descriptors to keep open and pass to the payload (LISTEN_FDS protocol)
 *
 * Returns a negative errno-style error on failure. On success it does not return, as the process image
 * is replaced by one of the exec calls at the end. */
static int inner_child(
                Barrier *barrier,
                const char *directory,
                bool secondary,
                int kmsg_socket,
                int rtnl_socket,
                FDSet *fds) {

        _cleanup_free_ char *home = NULL;
        char as_uuid[37];
        /* Slot 0 (PATH) is pre-filled, the remaining slots are filled in below as needed */
        unsigned n_env = 1;
        const char *envp[] = {
                "PATH=" DEFAULT_PATH_SPLIT_USR,
                NULL, /* container */
                NULL, /* TERM */
                NULL, /* HOME */
                NULL, /* USER */
                NULL, /* LOGNAME */
                NULL, /* container_uuid */
                NULL, /* LISTEN_FDS */
                NULL, /* LISTEN_PID */
                NULL
        };

        _cleanup_strv_free_ char **env_use = NULL;
        int r;

        assert(barrier);
        assert(directory);
        assert(kmsg_socket >= 0);

        cg_unified_flush();

        if (arg_userns_mode != USER_NAMESPACE_NO) {
                /* Tell the parent, that it now can write the UID map. */
                (void) barrier_place(barrier); /* #1 */

                /* Wait until the parent wrote the UID map */
                if (!barrier_place_and_sync(barrier)) { /* #2 */
                        log_error("Parent died too early");
                        return -ESRCH;
                }
        }

        /* Second mount_all() pass; the first one (with a "false" third argument) is done by
         * outer_child() before the root directory is moved */
        r = mount_all(NULL,
                      arg_userns_mode != USER_NAMESPACE_NO,
                      true,
                      arg_private_network,
                      arg_uid_shift,
                      arg_uid_range,
                      arg_selinux_apifs_context);

        if (r < 0)
                return r;

        r = mount_sysfs(NULL);
        if (r < 0)
                return r;

        /* Wait until we are cgroup-ified, so that we
         * can mount the right cgroup path writable */
        if (!barrier_place_and_sync(barrier)) { /* #3 */
                log_error("Parent died too early");
                return -ESRCH;
        }

        r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
        if (r < 0)
                return r;

        r = reset_uid_gid();
        if (r < 0)
                return log_error_errno(r, "Couldn't become new root: %m");

        r = setup_boot_id(NULL);
        if (r < 0)
                return r;

        r = setup_kmsg(NULL, kmsg_socket);
        if (r < 0)
                return r;
        kmsg_socket = safe_close(kmsg_socket);

        umask(0022);

        if (setsid() < 0)
                return log_error_errno(errno, "setsid() failed: %m");

        if (arg_private_network)
                loopback_setup();

        if (arg_expose_ports) {
                r = expose_port_send_rtnl(rtnl_socket);
                if (r < 0)
                        return r;
                rtnl_socket = safe_close(rtnl_socket);
        }

        r = drop_capabilities();
        if (r < 0)
                return log_error_errno(r, "drop_capabilities() failed: %m");

        setup_hostname();

        /* An explicitly configured personality takes precedence over the "secondary" flag */
        if (arg_personality != PERSONALITY_INVALID) {
                if (personality(arg_personality) < 0)
                        return log_error_errno(errno, "personality() failed: %m");
        } else if (secondary) {
                if (personality(PER_LINUX32) < 0)
                        return log_error_errno(errno, "personality() failed: %m");
        }

#ifdef HAVE_SELINUX
        if (arg_selinux_context)
                if (setexeccon((security_context_t) arg_selinux_context) < 0)
                        return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
#endif

        r = change_uid_gid(arg_user, &home);
        if (r < 0)
                return r;

        /* LXC sets container=lxc, so follow the scheme here */
        envp[n_env++] = strjoina("container=", arg_container_service_name);

        /* TERM is only passed on if it is set in our own environment */
        envp[n_env] = strv_find_prefix(environ, "TERM=");
        if (envp[n_env])
                n_env++;

        if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
            (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
            (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
                return log_oom();

        assert(!sd_id128_equal(arg_uuid, SD_ID128_NULL));

        if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
                return log_oom();

        /* If fds were passed to us, hand them on to the payload and announce them via LISTEN_FDS/LISTEN_PID */
        if (fdset_size(fds) > 0) {
                r = fdset_cloexec(fds, false);
                if (r < 0)
                        return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");

                if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
                    (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
                        return log_oom();
        }

        /* Combine our own environment block with the variables configured in arg_setenv */
        env_use = strv_env_merge(2, envp, arg_setenv);
        if (!env_use)
                return log_oom();

        /* Let the parent know that we are ready and
         * wait until the parent is ready with the
         * setup, too... */
        if (!barrier_place_and_sync(barrier)) { /* #4 */
                log_error("Parent died too early");
                return -ESRCH;
        }

        if (arg_chdir)
                if (chdir(arg_chdir) < 0)
                        return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);

        if (arg_start_mode == START_PID2) {
                r = stub_pid1();
                if (r < 0)
                        return r;
        }

        /* Now, explicitly close the log, so that we
         * then can close all remaining fds. Closing
         * the log explicitly first has the benefit
         * that the logging subsystem knows about it,
         * and is thus ready to be reopened should we
         * need it again. Note that the other fds
         * closed here are at least the locking and
         * barrier fds. */
        log_close();
        (void) fdset_close_others(fds);

        if (arg_start_mode == START_BOOT) {
                char **a;
                size_t m;

                /* Automatically search for the init system */

                /* Build an argv with one free slot in front for the init binary path, followed by
                 * the user-supplied parameters and a terminating NULL */
                m = strv_length(arg_parameters);
                a = newa(char*, m + 2);
                memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
                a[1 + m] = NULL;

                /* Each execve() only returns on failure, in which case the next candidate is tried */
                a[0] = (char*) "/usr/lib/systemd/systemd";
                execve(a[0], a, env_use);

                a[0] = (char*) "/lib/systemd/systemd";
                execve(a[0], a, env_use);

                a[0] = (char*) "/sbin/init";
                execve(a[0], a, env_use);
        } else if (!strv_isempty(arg_parameters))
                execvpe(arg_parameters[0], arg_parameters, env_use);
        else {
                if (!arg_chdir)
                        /* If we cannot change the directory, we'll end up in /, that is expected. */
                        (void) chdir(home ?: "/root");

                /* No command specified: start a login shell, preferring bash over sh */
                execle("/bin/bash", "-bash", NULL, env_use);
                execle("/bin/sh", "-sh", NULL, env_use);
        }

        /* We only get here if all exec attempts failed. Save errno before reopening the log. */
        r = -errno;
        (void) log_open();
        return log_error_errno(r, "execv() failed: %m");
}
2774
2775 static int outer_child(
2776 Barrier *barrier,
2777 const char *directory,
2778 const char *console,
2779 const char *root_device, bool root_device_rw,
2780 const char *home_device, bool home_device_rw,
2781 const char *srv_device, bool srv_device_rw,
2782 bool interactive,
2783 bool secondary,
2784 int pid_socket,
2785 int uuid_socket,
2786 int kmsg_socket,
2787 int rtnl_socket,
2788 int uid_shift_socket,
2789 FDSet *fds) {
2790
2791 pid_t pid;
2792 ssize_t l;
2793 int r;
2794
2795 assert(barrier);
2796 assert(directory);
2797 assert(console);
2798 assert(pid_socket >= 0);
2799 assert(uuid_socket >= 0);
2800 assert(kmsg_socket >= 0);
2801
2802 cg_unified_flush();
2803
2804 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2805 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2806
2807 if (interactive) {
2808 close_nointr(STDIN_FILENO);
2809 close_nointr(STDOUT_FILENO);
2810 close_nointr(STDERR_FILENO);
2811
2812 r = open_terminal(console, O_RDWR);
2813 if (r != STDIN_FILENO) {
2814 if (r >= 0) {
2815 safe_close(r);
2816 r = -EINVAL;
2817 }
2818
2819 return log_error_errno(r, "Failed to open console: %m");
2820 }
2821
2822 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2823 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2824 return log_error_errno(errno, "Failed to duplicate console: %m");
2825 }
2826
2827 r = reset_audit_loginuid();
2828 if (r < 0)
2829 return r;
2830
2831 /* Mark everything as slave, so that we still
2832 * receive mounts from the real root, but don't
2833 * propagate mounts to the real root. */
2834 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2835 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2836
2837 r = mount_devices(directory,
2838 root_device, root_device_rw,
2839 home_device, home_device_rw,
2840 srv_device, srv_device_rw);
2841 if (r < 0)
2842 return r;
2843
2844 r = determine_uid_shift(directory);
2845 if (r < 0)
2846 return r;
2847
2848 if (arg_userns_mode != USER_NAMESPACE_NO) {
2849 /* Let the parent know which UID shift we read from the image */
2850 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2851 if (l < 0)
2852 return log_error_errno(errno, "Failed to send UID shift: %m");
2853 if (l != sizeof(arg_uid_shift)) {
2854 log_error("Short write while sending UID shift.");
2855 return -EIO;
2856 }
2857
2858 if (arg_userns_mode == USER_NAMESPACE_PICK) {
2859 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
2860 * we just read from the image is available. If yes, it will send the UID shift back to us, if
2861 * not it will pick a different one, and send it back to us. */
2862
2863 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
2864 if (l < 0)
2865 return log_error_errno(errno, "Failed to recv UID shift: %m");
2866 if (l != sizeof(arg_uid_shift)) {
2867 log_error("Short read while recieving UID shift.");
2868 return -EIO;
2869 }
2870 }
2871
2872 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2873 }
2874
2875 /* Turn directory into bind mount */
2876 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2877 return log_error_errno(errno, "Failed to make bind mount: %m");
2878
2879 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
2880 if (r < 0)
2881 return r;
2882
2883 r = setup_volatile(
2884 directory,
2885 arg_volatile_mode,
2886 arg_userns_mode != USER_NAMESPACE_NO,
2887 arg_uid_shift,
2888 arg_uid_range,
2889 arg_selinux_context);
2890 if (r < 0)
2891 return r;
2892
2893 r = setup_volatile_state(
2894 directory,
2895 arg_volatile_mode,
2896 arg_userns_mode != USER_NAMESPACE_NO,
2897 arg_uid_shift,
2898 arg_uid_range,
2899 arg_selinux_context);
2900 if (r < 0)
2901 return r;
2902
2903 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2904 if (r < 0)
2905 return r;
2906
2907 if (arg_read_only) {
2908 r = bind_remount_recursive(directory, true);
2909 if (r < 0)
2910 return log_error_errno(r, "Failed to make tree read-only: %m");
2911 }
2912
2913 r = mount_all(directory,
2914 arg_userns_mode != USER_NAMESPACE_NO,
2915 false,
2916 arg_private_network,
2917 arg_uid_shift,
2918 arg_uid_range,
2919 arg_selinux_apifs_context);
2920 if (r < 0)
2921 return r;
2922
2923 r = copy_devnodes(directory);
2924 if (r < 0)
2925 return r;
2926
2927 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2928
2929 r = setup_pts(directory);
2930 if (r < 0)
2931 return r;
2932
2933 r = setup_propagate(directory);
2934 if (r < 0)
2935 return r;
2936
2937 r = setup_dev_console(directory, console);
2938 if (r < 0)
2939 return r;
2940
2941 r = setup_seccomp();
2942 if (r < 0)
2943 return r;
2944
2945 r = setup_timezone(directory);
2946 if (r < 0)
2947 return r;
2948
2949 r = setup_resolv_conf(directory);
2950 if (r < 0)
2951 return r;
2952
2953 r = setup_machine_id(directory);
2954 if (r < 0)
2955 return r;
2956
2957 r = setup_journal(directory);
2958 if (r < 0)
2959 return r;
2960
2961 r = mount_custom(
2962 directory,
2963 arg_custom_mounts,
2964 arg_n_custom_mounts,
2965 arg_userns_mode != USER_NAMESPACE_NO,
2966 arg_uid_shift,
2967 arg_uid_range,
2968 arg_selinux_apifs_context);
2969 if (r < 0)
2970 return r;
2971
2972 r = mount_cgroups(
2973 directory,
2974 arg_unified_cgroup_hierarchy,
2975 arg_userns_mode != USER_NAMESPACE_NO,
2976 arg_uid_shift,
2977 arg_uid_range,
2978 arg_selinux_apifs_context);
2979 if (r < 0)
2980 return r;
2981
2982 r = mount_move_root(directory);
2983 if (r < 0)
2984 return log_error_errno(r, "Failed to move root directory: %m");
2985
2986 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2987 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2988 (arg_private_network ? CLONE_NEWNET : 0) |
2989 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0),
2990 NULL);
2991 if (pid < 0)
2992 return log_error_errno(errno, "Failed to fork inner child: %m");
2993 if (pid == 0) {
2994 pid_socket = safe_close(pid_socket);
2995 uuid_socket = safe_close(uuid_socket);
2996 uid_shift_socket = safe_close(uid_shift_socket);
2997
2998 /* The inner child has all namespaces that are
2999 * requested, so that we all are owned by the user if
3000 * user namespaces are turned on. */
3001
3002 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
3003 if (r < 0)
3004 _exit(EXIT_FAILURE);
3005
3006 _exit(EXIT_SUCCESS);
3007 }
3008
3009 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3010 if (l < 0)
3011 return log_error_errno(errno, "Failed to send PID: %m");
3012 if (l != sizeof(pid)) {
3013 log_error("Short write while sending PID.");
3014 return -EIO;
3015 }
3016
3017 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3018 if (l < 0)
3019 return log_error_errno(errno, "Failed to send machine ID: %m");
3020 if (l != sizeof(arg_uuid)) {
3021 log_error("Short write while sending machine ID.");
3022 return -EIO;
3023 }
3024
3025 pid_socket = safe_close(pid_socket);
3026 uuid_socket = safe_close(uuid_socket);
3027 kmsg_socket = safe_close(kmsg_socket);
3028 rtnl_socket = safe_close(rtnl_socket);
3029
3030 return 0;
3031 }
3032
3033 static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3034 unsigned n_tries = 100;
3035 uid_t candidate;
3036 int r;
3037
3038 assert(shift);
3039 assert(ret_lock_file);
3040 assert(arg_userns_mode == USER_NAMESPACE_PICK);
3041 assert(arg_uid_range == 0x10000U);
3042
3043 candidate = *shift;
3044
3045 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3046
3047 for (;;) {
3048 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3049 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3050
3051 if (--n_tries <= 0)
3052 return -EBUSY;
3053
3054 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3055 goto next;
3056 if ((candidate & UINT32_C(0xFFFF)) != 0)
3057 goto next;
3058
3059 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3060 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3061 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3062 goto next;
3063 if (r < 0)
3064 return r;
3065
3066 /* Make some superficial checks whether the range is currently known in the user database */
3067 if (getpwuid(candidate))
3068 goto next;
3069 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3070 goto next;
3071 if (getgrgid(candidate))
3072 goto next;
3073 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3074 goto next;
3075
3076 *ret_lock_file = lf;
3077 lf = (struct LockFile) LOCK_FILE_INIT;
3078 *shift = candidate;
3079 return 0;
3080
3081 next:
3082 random_bytes(&candidate, sizeof(candidate));
3083 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3084 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3085 }
3086 }
3087
3088 static int setup_uid_map(pid_t pid) {
3089 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3090 int r;
3091
3092 assert(pid > 1);
3093
3094 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3095 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3096 r = write_string_file(uid_map, line, 0);
3097 if (r < 0)
3098 return log_error_errno(r, "Failed to write UID map: %m");
3099
3100 /* We always assign the same UID and GID ranges */
3101 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3102 r = write_string_file(uid_map, line, 0);
3103 if (r < 0)
3104 return log_error_errno(r, "Failed to write GID map: %m");
3105
3106 return 0;
3107 }
3108
3109 static int load_settings(void) {
3110 _cleanup_(settings_freep) Settings *settings = NULL;
3111 _cleanup_fclose_ FILE *f = NULL;
3112 _cleanup_free_ char *p = NULL;
3113 const char *fn, *i;
3114 int r;
3115
3116 /* If all settings are masked, there's no point in looking for
3117 * the settings file */
3118 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3119 return 0;
3120
3121 fn = strjoina(arg_machine, ".nspawn");
3122
3123 /* We first look in the admin's directories in /etc and /run */
3124 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3125 _cleanup_free_ char *j = NULL;
3126
3127 j = strjoin(i, "/", fn, NULL);
3128 if (!j)
3129 return log_oom();
3130
3131 f = fopen(j, "re");
3132 if (f) {
3133 p = j;
3134 j = NULL;
3135
3136 /* By default, we trust configuration from /etc and /run */
3137 if (arg_settings_trusted < 0)
3138 arg_settings_trusted = true;
3139
3140 break;
3141 }
3142
3143 if (errno != ENOENT)
3144 return log_error_errno(errno, "Failed to open %s: %m", j);
3145 }
3146
3147 if (!f) {
3148 /* After that, let's look for a file next to the
3149 * actual image we shall boot. */
3150
3151 if (arg_image) {
3152 p = file_in_same_dir(arg_image, fn);
3153 if (!p)
3154 return log_oom();
3155 } else if (arg_directory) {
3156 p = file_in_same_dir(arg_directory, fn);
3157 if (!p)
3158 return log_oom();
3159 }
3160
3161 if (p) {
3162 f = fopen(p, "re");
3163 if (!f && errno != ENOENT)
3164 return log_error_errno(errno, "Failed to open %s: %m", p);
3165
3166 /* By default, we do not trust configuration from /var/lib/machines */
3167 if (arg_settings_trusted < 0)
3168 arg_settings_trusted = false;
3169 }
3170 }
3171
3172 if (!f)
3173 return 0;
3174
3175 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3176
3177 r = settings_load(f, p, &settings);
3178 if (r < 0)
3179 return r;
3180
3181 /* Copy over bits from the settings, unless they have been
3182 * explicitly masked by command line switches. */
3183
3184 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3185 settings->start_mode >= 0) {
3186 arg_start_mode = settings->start_mode;
3187
3188 strv_free(arg_parameters);
3189 arg_parameters = settings->parameters;
3190 settings->parameters = NULL;
3191 }
3192
3193 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3194 settings->working_directory) {
3195 free(arg_chdir);
3196 arg_chdir = settings->working_directory;
3197 settings->working_directory = NULL;
3198 }
3199
3200 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3201 settings->environment) {
3202 strv_free(arg_setenv);
3203 arg_setenv = settings->environment;
3204 settings->environment = NULL;
3205 }
3206
3207 if ((arg_settings_mask & SETTING_USER) == 0 &&
3208 settings->user) {
3209 free(arg_user);
3210 arg_user = settings->user;
3211 settings->user = NULL;
3212 }
3213
3214 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3215 uint64_t plus;
3216
3217 plus = settings->capability;
3218 if (settings_private_network(settings))
3219 plus |= (1ULL << CAP_NET_ADMIN);
3220
3221 if (!arg_settings_trusted && plus != 0) {
3222 if (settings->capability != 0)
3223 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3224 } else
3225 arg_retain |= plus;
3226
3227 arg_retain &= ~settings->drop_capability;
3228 }
3229
3230 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3231 settings->kill_signal > 0)
3232 arg_kill_signal = settings->kill_signal;
3233
3234 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3235 settings->personality != PERSONALITY_INVALID)
3236 arg_personality = settings->personality;
3237
3238 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3239 !sd_id128_is_null(settings->machine_id)) {
3240
3241 if (!arg_settings_trusted)
3242 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3243 else
3244 arg_uuid = settings->machine_id;
3245 }
3246
3247 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3248 settings->read_only >= 0)
3249 arg_read_only = settings->read_only;
3250
3251 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3252 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3253 arg_volatile_mode = settings->volatile_mode;
3254
3255 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3256 settings->n_custom_mounts > 0) {
3257
3258 if (!arg_settings_trusted)
3259 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3260 else {
3261 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3262 arg_custom_mounts = settings->custom_mounts;
3263 arg_n_custom_mounts = settings->n_custom_mounts;
3264
3265 settings->custom_mounts = NULL;
3266 settings->n_custom_mounts = 0;
3267 }
3268 }
3269
3270 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3271 (settings->private_network >= 0 ||
3272 settings->network_veth >= 0 ||
3273 settings->network_bridge ||
3274 settings->network_interfaces ||
3275 settings->network_macvlan ||
3276 settings->network_ipvlan ||
3277 settings->network_veth_extra)) {
3278
3279 if (!arg_settings_trusted)
3280 log_warning("Ignoring network settings, file %s is not trusted.", p);
3281 else {
3282 arg_network_veth = settings_network_veth(settings);
3283 arg_private_network = settings_private_network(settings);
3284
3285 strv_free(arg_network_interfaces);
3286 arg_network_interfaces = settings->network_interfaces;
3287 settings->network_interfaces = NULL;
3288
3289 strv_free(arg_network_macvlan);
3290 arg_network_macvlan = settings->network_macvlan;
3291 settings->network_macvlan = NULL;
3292
3293 strv_free(arg_network_ipvlan);
3294 arg_network_ipvlan = settings->network_ipvlan;
3295 settings->network_ipvlan = NULL;
3296
3297 strv_free(arg_network_veth_extra);
3298 arg_network_veth_extra = settings->network_veth_extra;
3299 settings->network_veth_extra = NULL;
3300
3301 free(arg_network_bridge);
3302 arg_network_bridge = settings->network_bridge;
3303 settings->network_bridge = NULL;
3304 }
3305 }
3306
3307 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3308 settings->expose_ports) {
3309
3310 if (!arg_settings_trusted)
3311 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3312 else {
3313 expose_port_free_all(arg_expose_ports);
3314 arg_expose_ports = settings->expose_ports;
3315 settings->expose_ports = NULL;
3316 }
3317 }
3318
3319 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3320 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3321
3322 if (!arg_settings_trusted)
3323 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3324 else {
3325 arg_userns_mode = settings->userns_mode;
3326 arg_uid_shift = settings->uid_shift;
3327 arg_uid_range = settings->uid_range;
3328 arg_userns_chown = settings->userns_chown;
3329 }
3330 }
3331
3332 return 0;
3333 }
3334
3335 int main(int argc, char *argv[]) {
3336
3337 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3338 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3339 _cleanup_close_ int master = -1, image_fd = -1;
3340 _cleanup_fdset_free_ FDSet *fds = NULL;
3341 int r, n_fd_passed, loop_nr = -1;
3342 char veth_name[IFNAMSIZ];
3343 bool secondary = false, remove_subvol = false;
3344 sigset_t mask_chld;
3345 pid_t pid = 0;
3346 int ret = EXIT_SUCCESS;
3347 union in_addr_union exposed = {};
3348 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3349 bool interactive;
3350
3351 log_parse_environment();
3352 log_open();
3353
3354 /* Make sure rename_process() in the stub init process can work */
3355 saved_argv = argv;
3356 saved_argc = argc;
3357
3358 r = parse_argv(argc, argv);
3359 if (r <= 0)
3360 goto finish;
3361
3362 if (geteuid() != 0) {
3363 log_error("Need to be root.");
3364 r = -EPERM;
3365 goto finish;
3366 }
3367 r = determine_names();
3368 if (r < 0)
3369 goto finish;
3370
3371 r = load_settings();
3372 if (r < 0)
3373 goto finish;
3374
3375 r = verify_arguments();
3376 if (r < 0)
3377 goto finish;
3378
3379 n_fd_passed = sd_listen_fds(false);
3380 if (n_fd_passed > 0) {
3381 r = fdset_new_listen_fds(&fds, false);
3382 if (r < 0) {
3383 log_error_errno(r, "Failed to collect file descriptors: %m");
3384 goto finish;
3385 }
3386 }
3387
3388 if (arg_directory) {
3389 assert(!arg_image);
3390
3391 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3392 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3393 r = -EINVAL;
3394 goto finish;
3395 }
3396
3397 if (arg_ephemeral) {
3398 _cleanup_free_ char *np = NULL;
3399
3400 /* If the specified path is a mount point we
3401 * generate the new snapshot immediately
3402 * inside it under a random name. However if
3403 * the specified is not a mount point we
3404 * create the new snapshot in the parent
3405 * directory, just next to it. */
3406 r = path_is_mount_point(arg_directory, 0);
3407 if (r < 0) {
3408 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3409 goto finish;
3410 }
3411 if (r > 0)
3412 r = tempfn_random_child(arg_directory, "machine.", &np);
3413 else
3414 r = tempfn_random(arg_directory, "machine.", &np);
3415 if (r < 0) {
3416 log_error_errno(r, "Failed to generate name for snapshot: %m");
3417 goto finish;
3418 }
3419
3420 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3421 if (r < 0) {
3422 log_error_errno(r, "Failed to lock %s: %m", np);
3423 goto finish;
3424 }
3425
3426 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3427 if (r < 0) {
3428 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3429 goto finish;
3430 }
3431
3432 free(arg_directory);
3433 arg_directory = np;
3434 np = NULL;
3435
3436 remove_subvol = true;
3437
3438 } else {
3439 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3440 if (r == -EBUSY) {
3441 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3442 goto finish;
3443 }
3444 if (r < 0) {
3445 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3446 return r;
3447 }
3448
3449 if (arg_template) {
3450 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
3451 if (r == -EEXIST) {
3452 if (!arg_quiet)
3453 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3454 } else if (r < 0) {
3455 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3456 goto finish;
3457 } else {
3458 if (!arg_quiet)
3459 log_info("Populated %s from template %s.", arg_directory, arg_template);
3460 }
3461 }
3462 }
3463
3464 if (arg_start_mode == START_BOOT) {
3465 if (path_is_os_tree(arg_directory) <= 0) {
3466 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3467 r = -EINVAL;
3468 goto finish;
3469 }
3470 } else {
3471 const char *p;
3472
3473 p = strjoina(arg_directory, "/usr/");
3474 if (laccess(p, F_OK) < 0) {
3475 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
3476 r = -EINVAL;
3477 goto finish;
3478 }
3479 }
3480
3481 } else {
3482 char template[] = "/tmp/nspawn-root-XXXXXX";
3483
3484 assert(arg_image);
3485 assert(!arg_template);
3486
3487 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3488 if (r == -EBUSY) {
3489 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3490 goto finish;
3491 }
3492 if (r < 0) {
3493 r = log_error_errno(r, "Failed to create image lock: %m");
3494 goto finish;
3495 }
3496
3497 if (!mkdtemp(template)) {
3498 log_error_errno(errno, "Failed to create temporary directory: %m");
3499 r = -errno;
3500 goto finish;
3501 }
3502
3503 arg_directory = strdup(template);
3504 if (!arg_directory) {
3505 r = log_oom();
3506 goto finish;
3507 }
3508
3509 image_fd = setup_image(&device_path, &loop_nr);
3510 if (image_fd < 0) {
3511 r = image_fd;
3512 goto finish;
3513 }
3514
3515 r = dissect_image(image_fd,
3516 &root_device, &root_device_rw,
3517 &home_device, &home_device_rw,
3518 &srv_device, &srv_device_rw,
3519 &secondary);
3520 if (r < 0)
3521 goto finish;
3522 }
3523
3524 r = custom_mounts_prepare();
3525 if (r < 0)
3526 goto finish;
3527
3528 interactive =
3529 isatty(STDIN_FILENO) > 0 &&
3530 isatty(STDOUT_FILENO) > 0;
3531
3532 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3533 if (master < 0) {
3534 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3535 goto finish;
3536 }
3537
3538 r = ptsname_malloc(master, &console);
3539 if (r < 0) {
3540 r = log_error_errno(r, "Failed to determine tty name: %m");
3541 goto finish;
3542 }
3543
3544 if (arg_selinux_apifs_context) {
3545 r = mac_selinux_apply(console, arg_selinux_apifs_context);
3546 if (r < 0)
3547 goto finish;
3548 }
3549
3550 if (unlockpt(master) < 0) {
3551 r = log_error_errno(errno, "Failed to unlock tty: %m");
3552 goto finish;
3553 }
3554
3555 if (!arg_quiet)
3556 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3557 arg_machine, arg_image ?: arg_directory);
3558
3559 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
3560
3561 assert_se(sigemptyset(&mask_chld) == 0);
3562 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3563
3564 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3565 r = log_error_errno(errno, "Failed to become subreaper: %m");
3566 goto finish;
3567 }
3568
3569 for (;;) {
3570 static const struct sigaction sa = {
3571 .sa_handler = nop_signal_handler,
3572 .sa_flags = SA_NOCLDSTOP,
3573 };
3574
3575 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3576 _cleanup_close_ int etc_passwd_lock = -1;
3577 _cleanup_close_pair_ int
3578 kmsg_socket_pair[2] = { -1, -1 },
3579 rtnl_socket_pair[2] = { -1, -1 },
3580 pid_socket_pair[2] = { -1, -1 },
3581 uuid_socket_pair[2] = { -1, -1 },
3582 uid_shift_socket_pair[2] = { -1, -1 };
3583 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3584 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3585 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3586 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3587 ContainerStatus container_status;
3588 char last_char = 0;
3589 int ifi = 0;
3590 ssize_t l;
3591
3592 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3593 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3594 * check with getpwuid() if the specific user already exists. Note that /etc might be
3595 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3596 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3597 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3598 * really ours. */
3599
3600 etc_passwd_lock = take_etc_passwd_lock(NULL);
3601 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
3602 log_error_errno(r, "Failed to take /etc/passwd lock: %m");
3603 goto finish;
3604 }
3605 }
3606
3607 r = barrier_create(&barrier);
3608 if (r < 0) {
3609 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3610 goto finish;
3611 }
3612
3613 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3614 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3615 goto finish;
3616 }
3617
3618 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3619 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3620 goto finish;
3621 }
3622
3623 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
3624 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3625 goto finish;
3626 }
3627
3628 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0) {
3629 r = log_error_errno(errno, "Failed to create id socket pair: %m");
3630 goto finish;
3631 }
3632
3633 if (arg_userns_mode != USER_NAMESPACE_NO)
3634 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
3635 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3636 goto finish;
3637 }
3638
3639 /* Child can be killed before execv(), so handle SIGCHLD
3640 * in order to interrupt parent's blocking calls and
3641 * give it a chance to call wait() and terminate. */
3642 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3643 if (r < 0) {
3644 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3645 goto finish;
3646 }
3647
3648 r = sigaction(SIGCHLD, &sa, NULL);
3649 if (r < 0) {
3650 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3651 goto finish;
3652 }
3653
3654 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
3655 if (pid < 0) {
3656 if (errno == EINVAL)
3657 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3658 else
3659 r = log_error_errno(errno, "clone() failed: %m");
3660
3661 goto finish;
3662 }
3663
3664 if (pid == 0) {
3665 /* The outer child only has a file system namespace. */
3666 barrier_set_role(&barrier, BARRIER_CHILD);
3667
3668 master = safe_close(master);
3669
3670 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3671 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3672 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3673 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3674 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3675
3676 (void) reset_all_signal_handlers();
3677 (void) reset_signal_mask();
3678
3679 r = outer_child(&barrier,
3680 arg_directory,
3681 console,
3682 root_device, root_device_rw,
3683 home_device, home_device_rw,
3684 srv_device, srv_device_rw,
3685 interactive,
3686 secondary,
3687 pid_socket_pair[1],
3688 uuid_socket_pair[1],
3689 kmsg_socket_pair[1],
3690 rtnl_socket_pair[1],
3691 uid_shift_socket_pair[1],
3692 fds);
3693 if (r < 0)
3694 _exit(EXIT_FAILURE);
3695
3696 _exit(EXIT_SUCCESS);
3697 }
3698
3699 barrier_set_role(&barrier, BARRIER_PARENT);
3700
3701 fds = fdset_free(fds);
3702
3703 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3704 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3705 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3706 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3707 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3708
3709 if (arg_userns_mode != USER_NAMESPACE_NO) {
3710 /* The child just let us know the UID shift it might have read from the image. */
3711 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3712 if (l < 0) {
3713 r = log_error_errno(errno, "Failed to read UID shift: %m");
3714 goto finish;
3715 }
3716 if (l != sizeof(arg_uid_shift)) {
3717 log_error("Short read while reading UID shift.");
3718 r = EIO;
3719 goto finish;
3720 }
3721
3722 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3723 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3724 * image, but if that's already in use, pick a new one, and report back to the child,
3725 * which one we now picked. */
3726
3727 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3728 if (r < 0) {
3729 log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3730 goto finish;
3731 }
3732
3733 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3734 if (l < 0) {
3735 r = log_error_errno(errno, "Failed to send UID shift: %m");
3736 goto finish;
3737 }
3738 if (l != sizeof(arg_uid_shift)) {
3739 log_error("Short write while writing UID shift.");
3740 r = -EIO;
3741 goto finish;
3742 }
3743 }
3744 }
3745
3746 /* Wait for the outer child. */
3747 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3748 if (r < 0)
3749 goto finish;
3750 if (r != 0) {
3751 r = -EIO;
3752 goto finish;
3753 }
3754 pid = 0;
3755
3756 /* And now retrieve the PID of the inner child. */
3757 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3758 if (l < 0) {
3759 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3760 goto finish;
3761 }
3762 if (l != sizeof(pid)) {
3763 log_error("Short read while reading inner child PID.");
3764 r = EIO;
3765 goto finish;
3766 }
3767
3768 /* We also retrieve container UUID in case it was generated by outer child */
3769 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof(arg_uuid), 0);
3770 if (l < 0) {
3771 r = log_error_errno(errno, "Failed to read container machine ID: %m");
3772 goto finish;
3773 }
3774 if (l != sizeof(arg_uuid)) {
3775 log_error("Short read while reading container machined ID.");
3776 r = EIO;
3777 goto finish;
3778 }
3779
3780 log_debug("Init process invoked as PID " PID_FMT, pid);
3781
3782 if (arg_userns_mode != USER_NAMESPACE_NO) {
3783 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3784 log_error("Child died too early.");
3785 r = -ESRCH;
3786 goto finish;
3787 }
3788
3789 r = setup_uid_map(pid);
3790 if (r < 0)
3791 goto finish;
3792
3793 (void) barrier_place(&barrier); /* #2 */
3794 }
3795
3796 if (arg_private_network) {
3797
3798 r = move_network_interfaces(pid, arg_network_interfaces);
3799 if (r < 0)
3800 goto finish;
3801
3802 if (arg_network_veth) {
3803 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3804 if (r < 0)
3805 goto finish;
3806 else if (r > 0)
3807 ifi = r;
3808
3809 if (arg_network_bridge) {
3810 r = setup_bridge(veth_name, arg_network_bridge);
3811 if (r < 0)
3812 goto finish;
3813 if (r > 0)
3814 ifi = r;
3815 }
3816 }
3817
3818 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3819 if (r < 0)
3820 goto finish;
3821
3822 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3823 if (r < 0)
3824 goto finish;
3825
3826 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3827 if (r < 0)
3828 goto finish;
3829 }
3830
3831 if (arg_register) {
3832 r = register_machine(
3833 arg_machine,
3834 pid,
3835 arg_directory,
3836 arg_uuid,
3837 ifi,
3838 arg_slice,
3839 arg_custom_mounts, arg_n_custom_mounts,
3840 arg_kill_signal,
3841 arg_property,
3842 arg_keep_unit,
3843 arg_container_service_name);
3844 if (r < 0)
3845 goto finish;
3846 }
3847
3848 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
3849 if (r < 0)
3850 goto finish;
3851
3852 if (arg_keep_unit) {
3853 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3854 if (r < 0)
3855 goto finish;
3856 }
3857
3858 r = chown_cgroup(pid, arg_uid_shift);
3859 if (r < 0)
3860 goto finish;
3861
3862 /* Notify the child that the parent is ready with all
3863 * its setup (including cgroup-ification), and that
3864 * the child can now hand over control to the code to
3865 * run inside the container. */
3866 (void) barrier_place(&barrier); /* #3 */
3867
3868 /* Block SIGCHLD here, before notifying child.
3869 * process_pty() will handle it with the other signals. */
3870 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3871
3872 /* Reset signal to default */
3873 r = default_signals(SIGCHLD, -1);
3874 if (r < 0) {
3875 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3876 goto finish;
3877 }
3878
3879 /* Let the child know that we are ready and wait that the child is completely ready now. */
3880 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3881 log_error("Child died too early.");
3882 r = -ESRCH;
3883 goto finish;
3884 }
3885
3886 /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
3887 * in getpwuid(), thus we can release the /etc/passwd lock. */
3888 etc_passwd_lock = safe_close(etc_passwd_lock);
3889
3890 sd_notifyf(false,
3891 "READY=1\n"
3892 "STATUS=Container running.\n"
3893 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3894
3895 r = sd_event_new(&event);
3896 if (r < 0) {
3897 log_error_errno(r, "Failed to get default event source: %m");
3898 goto finish;
3899 }
3900
3901 if (arg_kill_signal > 0) {
3902 /* Try to kill the init system on SIGINT or SIGTERM */
3903 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3904 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
3905 } else {
3906 /* Immediately exit */
3907 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3908 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3909 }
3910
3911 /* simply exit on sigchld */
3912 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3913
3914 if (arg_expose_ports) {
3915 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
3916 if (r < 0)
3917 goto finish;
3918
3919 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
3920 }
3921
3922 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3923
3924 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
3925 if (r < 0) {
3926 log_error_errno(r, "Failed to create PTY forwarder: %m");
3927 goto finish;
3928 }
3929
3930 r = sd_event_loop(event);
3931 if (r < 0) {
3932 log_error_errno(r, "Failed to run event loop: %m");
3933 goto finish;
3934 }
3935
3936 pty_forward_get_last_char(forward, &last_char);
3937
3938 forward = pty_forward_free(forward);
3939
3940 if (!arg_quiet && last_char != '\n')
3941 putc('\n', stdout);
3942
3943 /* Kill if it is not dead yet anyway */
3944 if (arg_register && !arg_keep_unit)
3945 terminate_machine(pid);
3946
3947 /* Normally redundant, but better safe than sorry */
3948 kill(pid, SIGKILL);
3949
3950 r = wait_for_container(pid, &container_status);
3951 pid = 0;
3952
3953 if (r < 0)
3954 /* We failed to wait for the container, or the
3955 * container exited abnormally */
3956 goto finish;
3957 else if (r > 0 || container_status == CONTAINER_TERMINATED) {
3958 /* The container exited with a non-zero
3959 * status, or with zero status and no reboot
3960 * was requested. */
3961 ret = r;
3962 break;
3963 }
3964
3965 /* CONTAINER_REBOOTED, loop again */
3966
3967 if (arg_keep_unit) {
3968 /* Special handling if we are running as a
3969 * service: instead of simply restarting the
3970 * machine we want to restart the entire
3971 * service, so let's inform systemd about this
3972 * with the special exit code 133. The service
3973 * file uses RestartForceExitStatus=133 so
3974 * that this results in a full nspawn
3975 * restart. This is necessary since we might
3976 * have cgroup parameters set we want to have
3977 * flushed out. */
3978 ret = 133;
3979 r = 0;
3980 break;
3981 }
3982
3983 expose_port_flush(arg_expose_ports, &exposed);
3984 (void) remove_veth_links(veth_name, arg_network_veth_extra);
3985 }
3986
3987 finish:
3988 sd_notify(false,
3989 "STOPPING=1\n"
3990 "STATUS=Terminating...");
3991
3992 if (pid > 0)
3993 kill(pid, SIGKILL);
3994
3995 /* Try to flush whatever is still queued in the pty */
3996 if (master >= 0)
3997 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
3998
3999 loop_remove(loop_nr, &image_fd);
4000
4001 if (remove_subvol && arg_directory) {
4002 int k;
4003
4004 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
4005 if (k < 0)
4006 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4007 }
4008
4009 if (arg_machine) {
4010 const char *p;
4011
4012 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4013 (void) rm_rf(p, REMOVE_ROOT);
4014 }
4015
4016 expose_port_flush(arg_expose_ports, &exposed);
4017 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4018
4019 free(arg_directory);
4020 free(arg_template);
4021 free(arg_image);
4022 free(arg_machine);
4023 free(arg_user);
4024 free(arg_chdir);
4025 strv_free(arg_setenv);
4026 free(arg_network_bridge);
4027 strv_free(arg_network_interfaces);
4028 strv_free(arg_network_macvlan);
4029 strv_free(arg_network_ipvlan);
4030 strv_free(arg_network_veth_extra);
4031 strv_free(arg_parameters);
4032 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4033 expose_port_free_all(arg_expose_ports);
4034
4035 return r < 0 ? EXIT_FAILURE : ret;
4036 }