]> git.ipfire.org Git - thirdparty/systemd.git/blame_incremental - src/nspawn/nspawn.c
Rename formats-util.h to format-util.h
[thirdparty/systemd.git] / src / nspawn / nspawn.c
... / ...
CommitLineData
1/***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
20#ifdef HAVE_BLKID
21#include <blkid/blkid.h>
22#endif
23#include <errno.h>
24#include <getopt.h>
25#include <grp.h>
26#include <linux/loop.h>
27#include <pwd.h>
28#include <sched.h>
29#ifdef HAVE_SELINUX
30#include <selinux/selinux.h>
31#endif
32#include <signal.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <sys/file.h>
37#include <sys/mount.h>
38#include <sys/personality.h>
39#include <sys/prctl.h>
40#include <sys/types.h>
41#include <unistd.h>
42
43#include "sd-daemon.h"
44#include "sd-id128.h"
45
46#include "alloc-util.h"
47#include "barrier.h"
48#include "base-filesystem.h"
49#include "blkid-util.h"
50#include "btrfs-util.h"
51#include "cap-list.h"
52#include "capability-util.h"
53#include "cgroup-util.h"
54#include "copy.h"
55#include "dev-setup.h"
56#include "env-util.h"
57#include "fd-util.h"
58#include "fdset.h"
59#include "fileio.h"
60#include "format-util.h"
61#include "fs-util.h"
62#include "gpt.h"
63#include "hostname-util.h"
64#include "id128-util.h"
65#include "log.h"
66#include "loopback-setup.h"
67#include "machine-image.h"
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
71#include "mount-util.h"
72#include "netlink-util.h"
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
77#include "nspawn-patch-uid.h"
78#include "nspawn-register.h"
79#include "nspawn-seccomp.h"
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
82#include "nspawn-stub-pid1.h"
83#include "parse-util.h"
84#include "path-util.h"
85#include "process-util.h"
86#include "ptyfwd.h"
87#include "random-util.h"
88#include "raw-clone.h"
89#include "rm-rf.h"
90#include "selinux-util.h"
91#include "signal-util.h"
92#include "socket-util.h"
93#include "stat-util.h"
94#include "stdio-util.h"
95#include "string-util.h"
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
99#include "umask-util.h"
100#include "user-util.h"
101#include "util.h"
102
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
 * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
 * may have their own allocation ranges too. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))

/* nspawn listens on a socket located at the path given by this constant. The path is relative to the container.
 * The init process inside the container can send messages to nspawn over this socket, following the sd_notify(3)
 * protocol. */
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
113
/* Status values for a container. The numeric values are spelled out here; they are the same ones the compiler
 * would assign implicitly. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
118
/* Journal linking modes selectable with --link-journal= ("no", "auto", "host", "guest"). The "try-" variants of
 * that option map onto LINK_HOST/LINK_GUEST as well and are told apart by a separate flag. The numeric values
 * are the ones the compiler would assign implicitly. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
125
/* Process-wide configuration. Each variable is initialized to its default here; many of them are subsequently
 * set from command line options in parse_argv(). */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_chdir = NULL;
static char *arg_user = NULL;
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL;
/* The following three are assigned directly from optarg in parse_argv(), i.e. they are not copies. */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static const char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
/* Set for -j and for the "try-guest"/"try-host" values of --link-journal=. */
static bool arg_link_journal_try = false;
/* Default capability set, as a bit mask with one bit per CAP_* number. The help text describes --capability=
 * and --drop-capability= as adding to and removing from this default. */
static uint64_t arg_caps_retain =
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_MKNOD) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_TTY_CONFIG);
/* Array of custom mounts (--bind=, --bind-ro=, --tmpfs=, --overlay=, --overlay-ro=) and its element count. */
static CustomMount *arg_custom_mounts = NULL;
static unsigned arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static char **arg_network_veth_extra = NULL;
static char *arg_network_bridge = NULL;
/* Stored with a "vz-" prefix prepended to the name given on the command line. */
static char *arg_network_zone = NULL;
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
/* UID_INVALID as shift means no explicit UID base was configured; the default range is 0x10000 UIDs. */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns_chown = false;
static int arg_kill_signal = 0;
static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
/* Bit mask of SETTING_* flags; parse_argv() sets the bit of each setting it configures from the command line. */
static SettingsMask arg_settings_mask = 0;
/* Tri-state: stays -1 unless --settings= sets it to true ("trusted") or false (boolean "no"). */
static int arg_settings_trusted = -1;
static char **arg_parameters = NULL;
static const char *arg_container_service_name = "systemd-nspawn";
static bool arg_notify_ready = false;
static bool arg_use_cgns = true;
/* CLONE_NEW* namespace flags; bits are cleared by --share-system and by the $SYSTEMD_NSPAWN_SHARE_* environment
 * variables (see parse_share_ns_env()). */
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
198
199static void help(void) {
200 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
201 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
202 " -h --help Show this help\n"
203 " --version Print version string\n"
204 " -q --quiet Do not show status information\n"
205 " -D --directory=PATH Root directory for the container\n"
206 " --template=PATH Initialize root directory from template directory,\n"
207 " if missing\n"
208 " -x --ephemeral Run container with snapshot of root directory, and\n"
209 " remove it after exit\n"
210 " -i --image=PATH File system device or disk image for the container\n"
211 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
212 " -b --boot Boot up full system (i.e. invoke init)\n"
213 " --chdir=PATH Set working directory in the container\n"
214 " -u --user=USER Run the command under specified user or uid\n"
215 " -M --machine=NAME Set the machine name for the container\n"
216 " --uuid=UUID Set a specific machine UUID for the container\n"
217 " -S --slice=SLICE Place the container in the specified slice\n"
218 " --property=NAME=VALUE Set scope unit property\n"
219 " -U --private-users=pick Run within user namespace, autoselect UID/GID range\n"
220 " --private-users[=UIDBASE[:NUIDS]]\n"
221 " Similar, but with user configured UID/GID range\n"
222 " --private-users-chown Adjust OS tree ownership to private UID/GID range\n"
223 " --private-network Disable network in container\n"
224 " --network-interface=INTERFACE\n"
225 " Assign an existing network interface to the\n"
226 " container\n"
227 " --network-macvlan=INTERFACE\n"
228 " Create a macvlan network interface based on an\n"
229 " existing network interface to the container\n"
230 " --network-ipvlan=INTERFACE\n"
231 " Create a ipvlan network interface based on an\n"
232 " existing network interface to the container\n"
233 " -n --network-veth Add a virtual Ethernet connection between host\n"
234 " and container\n"
235 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
236 " Add an additional virtual Ethernet link between\n"
237 " host and container\n"
238 " --network-bridge=INTERFACE\n"
239 " Add a virtual Ethernet connection to the container\n"
240 " and attach it to an existing bridge on the host\n"
241 " --network-zone=NAME Similar, but attach the new interface to an\n"
242 " an automatically managed bridge interface\n"
243 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
244 " Expose a container IP port on the host\n"
245 " -Z --selinux-context=SECLABEL\n"
246 " Set the SELinux security context to be used by\n"
247 " processes in the container\n"
248 " -L --selinux-apifs-context=SECLABEL\n"
249 " Set the SELinux security context to be used by\n"
250 " API/tmpfs file systems in the container\n"
251 " --capability=CAP In addition to the default, retain specified\n"
252 " capability\n"
253 " --drop-capability=CAP Drop the specified capability from the default set\n"
254 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
255 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
256 " host, try-guest, try-host\n"
257 " -j Equivalent to --link-journal=try-guest\n"
258 " --read-only Mount the root directory read-only\n"
259 " --bind=PATH[:PATH[:OPTIONS]]\n"
260 " Bind mount a file or directory from the host into\n"
261 " the container\n"
262 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
263 " Similar, but creates a read-only bind mount\n"
264 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
265 " --overlay=PATH[:PATH...]:PATH\n"
266 " Create an overlay mount from the host to \n"
267 " the container\n"
268 " --overlay-ro=PATH[:PATH...]:PATH\n"
269 " Similar, but creates a read-only overlay mount\n"
270 " -E --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
276 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
277 , program_invocation_short_name);
278}
279
280static int custom_mounts_prepare(void) {
281 unsigned i;
282 int r;
283
284 /* Ensure the mounts are applied prefix first. */
285 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
286
287 /* Allocate working directories for the overlay file systems that need it */
288 for (i = 0; i < arg_n_custom_mounts; i++) {
289 CustomMount *m = &arg_custom_mounts[i];
290
291 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
292
293 if (arg_userns_chown) {
294 log_error("--private-users-chown may not be combined with custom root mounts.");
295 return -EINVAL;
296 } else if (arg_uid_shift == UID_INVALID) {
297 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
298 return -EINVAL;
299 }
300 }
301
302 if (m->type != CUSTOM_MOUNT_OVERLAY)
303 continue;
304
305 if (m->work_dir)
306 continue;
307
308 if (m->read_only)
309 continue;
310
311 r = tempfn_random(m->source, NULL, &m->work_dir);
312 if (r < 0)
313 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
314 }
315
316 return 0;
317}
318
319static int detect_unified_cgroup_hierarchy(const char *directory) {
320 const char *e;
321 int r, all_unified, systemd_unified;
322
323 /* Allow the user to control whether the unified hierarchy is used */
324 e = getenv("UNIFIED_CGROUP_HIERARCHY");
325 if (e) {
326 r = parse_boolean(e);
327 if (r < 0)
328 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
329 if (r > 0)
330 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
331 else
332 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
333
334 return 0;
335 }
336
337 all_unified = cg_all_unified();
338 systemd_unified = cg_unified(SYSTEMD_CGROUP_CONTROLLER);
339
340 if (all_unified < 0 || systemd_unified < 0)
341 return log_error_errno(all_unified < 0 ? all_unified : systemd_unified,
342 "Failed to determine whether the unified cgroups hierarchy is used: %m");
343
344 /* Otherwise inherit the default from the host system */
345 if (all_unified > 0) {
346 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
347 * routine only detects 231, so we'll have a false negative here for 230. */
348 r = systemd_installation_has_version(directory, 230);
349 if (r < 0)
350 return log_error_errno(r, "Failed to determine systemd version in container: %m");
351 if (r > 0)
352 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
353 else
354 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
355 } else if (systemd_unified > 0) {
356 /* Mixed cgroup hierarchy support was added in 232 */
357 r = systemd_installation_has_version(directory, 232);
358 if (r < 0)
359 return log_error_errno(r, "Failed to determine systemd version in container: %m");
360 if (r > 0)
361 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
362 else
363 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
364 } else
365 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
366
367 return 0;
368}
369
370static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
371 int r;
372
373 r = getenv_bool(name);
374 if (r == -ENXIO)
375 return;
376 if (r < 0)
377 log_warning_errno(r, "Failed to parse %s from environment, defaulting to false.", name);
378 arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
379}
380
381static int parse_argv(int argc, char *argv[]) {
382
383 enum {
384 ARG_VERSION = 0x100,
385 ARG_PRIVATE_NETWORK,
386 ARG_UUID,
387 ARG_READ_ONLY,
388 ARG_CAPABILITY,
389 ARG_DROP_CAPABILITY,
390 ARG_LINK_JOURNAL,
391 ARG_BIND,
392 ARG_BIND_RO,
393 ARG_TMPFS,
394 ARG_OVERLAY,
395 ARG_OVERLAY_RO,
396 ARG_SHARE_SYSTEM,
397 ARG_REGISTER,
398 ARG_KEEP_UNIT,
399 ARG_NETWORK_INTERFACE,
400 ARG_NETWORK_MACVLAN,
401 ARG_NETWORK_IPVLAN,
402 ARG_NETWORK_BRIDGE,
403 ARG_NETWORK_ZONE,
404 ARG_NETWORK_VETH_EXTRA,
405 ARG_PERSONALITY,
406 ARG_VOLATILE,
407 ARG_TEMPLATE,
408 ARG_PROPERTY,
409 ARG_PRIVATE_USERS,
410 ARG_KILL_SIGNAL,
411 ARG_SETTINGS,
412 ARG_CHDIR,
413 ARG_PRIVATE_USERS_CHOWN,
414 ARG_NOTIFY_READY,
415 };
416
417 static const struct option options[] = {
418 { "help", no_argument, NULL, 'h' },
419 { "version", no_argument, NULL, ARG_VERSION },
420 { "directory", required_argument, NULL, 'D' },
421 { "template", required_argument, NULL, ARG_TEMPLATE },
422 { "ephemeral", no_argument, NULL, 'x' },
423 { "user", required_argument, NULL, 'u' },
424 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
425 { "as-pid2", no_argument, NULL, 'a' },
426 { "boot", no_argument, NULL, 'b' },
427 { "uuid", required_argument, NULL, ARG_UUID },
428 { "read-only", no_argument, NULL, ARG_READ_ONLY },
429 { "capability", required_argument, NULL, ARG_CAPABILITY },
430 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
431 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
432 { "bind", required_argument, NULL, ARG_BIND },
433 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
434 { "tmpfs", required_argument, NULL, ARG_TMPFS },
435 { "overlay", required_argument, NULL, ARG_OVERLAY },
436 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
437 { "machine", required_argument, NULL, 'M' },
438 { "slice", required_argument, NULL, 'S' },
439 { "setenv", required_argument, NULL, 'E' },
440 { "selinux-context", required_argument, NULL, 'Z' },
441 { "selinux-apifs-context", required_argument, NULL, 'L' },
442 { "quiet", no_argument, NULL, 'q' },
443 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */
444 { "register", required_argument, NULL, ARG_REGISTER },
445 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
446 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
447 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
448 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
449 { "network-veth", no_argument, NULL, 'n' },
450 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA },
451 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
452 { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE },
453 { "personality", required_argument, NULL, ARG_PERSONALITY },
454 { "image", required_argument, NULL, 'i' },
455 { "volatile", optional_argument, NULL, ARG_VOLATILE },
456 { "port", required_argument, NULL, 'p' },
457 { "property", required_argument, NULL, ARG_PROPERTY },
458 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
459 { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN },
460 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
461 { "settings", required_argument, NULL, ARG_SETTINGS },
462 { "chdir", required_argument, NULL, ARG_CHDIR },
463 { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
464 {}
465 };
466
467 int c, r;
468 const char *p, *e;
469 uint64_t plus = 0, minus = 0;
470 bool mask_all_settings = false, mask_no_settings = false;
471
472 assert(argc >= 0);
473 assert(argv);
474
475 while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
476
477 switch (c) {
478
479 case 'h':
480 help();
481 return 0;
482
483 case ARG_VERSION:
484 return version();
485
486 case 'D':
487 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
488 if (r < 0)
489 return r;
490 break;
491
492 case ARG_TEMPLATE:
493 r = parse_path_argument_and_warn(optarg, false, &arg_template);
494 if (r < 0)
495 return r;
496 break;
497
498 case 'i':
499 r = parse_path_argument_and_warn(optarg, false, &arg_image);
500 if (r < 0)
501 return r;
502 break;
503
504 case 'x':
505 arg_ephemeral = true;
506 break;
507
508 case 'u':
509 r = free_and_strdup(&arg_user, optarg);
510 if (r < 0)
511 return log_oom();
512
513 arg_settings_mask |= SETTING_USER;
514 break;
515
516 case ARG_NETWORK_ZONE: {
517 char *j;
518
519 j = strappend("vz-", optarg);
520 if (!j)
521 return log_oom();
522
523 if (!ifname_valid(j)) {
524 log_error("Network zone name not valid: %s", j);
525 free(j);
526 return -EINVAL;
527 }
528
529 free(arg_network_zone);
530 arg_network_zone = j;
531
532 arg_network_veth = true;
533 arg_private_network = true;
534 arg_settings_mask |= SETTING_NETWORK;
535 break;
536 }
537
538 case ARG_NETWORK_BRIDGE:
539
540 if (!ifname_valid(optarg)) {
541 log_error("Bridge interface name not valid: %s", optarg);
542 return -EINVAL;
543 }
544
545 r = free_and_strdup(&arg_network_bridge, optarg);
546 if (r < 0)
547 return log_oom();
548
549 /* fall through */
550
551 case 'n':
552 arg_network_veth = true;
553 arg_private_network = true;
554 arg_settings_mask |= SETTING_NETWORK;
555 break;
556
557 case ARG_NETWORK_VETH_EXTRA:
558 r = veth_extra_parse(&arg_network_veth_extra, optarg);
559 if (r < 0)
560 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
561
562 arg_private_network = true;
563 arg_settings_mask |= SETTING_NETWORK;
564 break;
565
566 case ARG_NETWORK_INTERFACE:
567
568 if (!ifname_valid(optarg)) {
569 log_error("Network interface name not valid: %s", optarg);
570 return -EINVAL;
571 }
572
573 if (strv_extend(&arg_network_interfaces, optarg) < 0)
574 return log_oom();
575
576 arg_private_network = true;
577 arg_settings_mask |= SETTING_NETWORK;
578 break;
579
580 case ARG_NETWORK_MACVLAN:
581
582 if (!ifname_valid(optarg)) {
583 log_error("MACVLAN network interface name not valid: %s", optarg);
584 return -EINVAL;
585 }
586
587 if (strv_extend(&arg_network_macvlan, optarg) < 0)
588 return log_oom();
589
590 arg_private_network = true;
591 arg_settings_mask |= SETTING_NETWORK;
592 break;
593
594 case ARG_NETWORK_IPVLAN:
595
596 if (!ifname_valid(optarg)) {
597 log_error("IPVLAN network interface name not valid: %s", optarg);
598 return -EINVAL;
599 }
600
601 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
602 return log_oom();
603
604 /* fall through */
605
606 case ARG_PRIVATE_NETWORK:
607 arg_private_network = true;
608 arg_settings_mask |= SETTING_NETWORK;
609 break;
610
611 case 'b':
612 if (arg_start_mode == START_PID2) {
613 log_error("--boot and --as-pid2 may not be combined.");
614 return -EINVAL;
615 }
616
617 arg_start_mode = START_BOOT;
618 arg_settings_mask |= SETTING_START_MODE;
619 break;
620
621 case 'a':
622 if (arg_start_mode == START_BOOT) {
623 log_error("--boot and --as-pid2 may not be combined.");
624 return -EINVAL;
625 }
626
627 arg_start_mode = START_PID2;
628 arg_settings_mask |= SETTING_START_MODE;
629 break;
630
631 case ARG_UUID:
632 r = sd_id128_from_string(optarg, &arg_uuid);
633 if (r < 0)
634 return log_error_errno(r, "Invalid UUID: %s", optarg);
635
636 if (sd_id128_is_null(arg_uuid)) {
637 log_error("Machine UUID may not be all zeroes.");
638 return -EINVAL;
639 }
640
641 arg_settings_mask |= SETTING_MACHINE_ID;
642 break;
643
644 case 'S':
645 arg_slice = optarg;
646 break;
647
648 case 'M':
649 if (isempty(optarg))
650 arg_machine = mfree(arg_machine);
651 else {
652 if (!machine_name_is_valid(optarg)) {
653 log_error("Invalid machine name: %s", optarg);
654 return -EINVAL;
655 }
656
657 r = free_and_strdup(&arg_machine, optarg);
658 if (r < 0)
659 return log_oom();
660
661 break;
662 }
663
664 case 'Z':
665 arg_selinux_context = optarg;
666 break;
667
668 case 'L':
669 arg_selinux_apifs_context = optarg;
670 break;
671
672 case ARG_READ_ONLY:
673 arg_read_only = true;
674 arg_settings_mask |= SETTING_READ_ONLY;
675 break;
676
677 case ARG_CAPABILITY:
678 case ARG_DROP_CAPABILITY: {
679 p = optarg;
680 for (;;) {
681 _cleanup_free_ char *t = NULL;
682
683 r = extract_first_word(&p, &t, ",", 0);
684 if (r < 0)
685 return log_error_errno(r, "Failed to parse capability %s.", t);
686
687 if (r == 0)
688 break;
689
690 if (streq(t, "all")) {
691 if (c == ARG_CAPABILITY)
692 plus = (uint64_t) -1;
693 else
694 minus = (uint64_t) -1;
695 } else {
696 int cap;
697
698 cap = capability_from_name(t);
699 if (cap < 0) {
700 log_error("Failed to parse capability %s.", t);
701 return -EINVAL;
702 }
703
704 if (c == ARG_CAPABILITY)
705 plus |= 1ULL << (uint64_t) cap;
706 else
707 minus |= 1ULL << (uint64_t) cap;
708 }
709 }
710
711 arg_settings_mask |= SETTING_CAPABILITY;
712 break;
713 }
714
715 case 'j':
716 arg_link_journal = LINK_GUEST;
717 arg_link_journal_try = true;
718 break;
719
720 case ARG_LINK_JOURNAL:
721 if (streq(optarg, "auto")) {
722 arg_link_journal = LINK_AUTO;
723 arg_link_journal_try = false;
724 } else if (streq(optarg, "no")) {
725 arg_link_journal = LINK_NO;
726 arg_link_journal_try = false;
727 } else if (streq(optarg, "guest")) {
728 arg_link_journal = LINK_GUEST;
729 arg_link_journal_try = false;
730 } else if (streq(optarg, "host")) {
731 arg_link_journal = LINK_HOST;
732 arg_link_journal_try = false;
733 } else if (streq(optarg, "try-guest")) {
734 arg_link_journal = LINK_GUEST;
735 arg_link_journal_try = true;
736 } else if (streq(optarg, "try-host")) {
737 arg_link_journal = LINK_HOST;
738 arg_link_journal_try = true;
739 } else {
740 log_error("Failed to parse link journal mode %s", optarg);
741 return -EINVAL;
742 }
743
744 break;
745
746 case ARG_BIND:
747 case ARG_BIND_RO:
748 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
749 if (r < 0)
750 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
751
752 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
753 break;
754
755 case ARG_TMPFS:
756 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
757 if (r < 0)
758 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
759
760 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
761 break;
762
763 case ARG_OVERLAY:
764 case ARG_OVERLAY_RO: {
765 _cleanup_free_ char *upper = NULL, *destination = NULL;
766 _cleanup_strv_free_ char **lower = NULL;
767 CustomMount *m;
768 unsigned n = 0;
769 char **i;
770
771 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
772 if (r == -ENOMEM)
773 return log_oom();
774 else if (r < 0) {
775 log_error("Invalid overlay specification: %s", optarg);
776 return r;
777 }
778
779 STRV_FOREACH(i, lower) {
780 if (!path_is_absolute(*i)) {
781 log_error("Overlay path %s is not absolute.", *i);
782 return -EINVAL;
783 }
784
785 n++;
786 }
787
788 if (n < 2) {
789 log_error("--overlay= needs at least two colon-separated directories specified.");
790 return -EINVAL;
791 }
792
793 if (n == 2) {
794 /* If two parameters are specified,
795 * the first one is the lower, the
796 * second one the upper directory. And
797 * we'll also define the destination
798 * mount point the same as the upper. */
799 upper = lower[1];
800 lower[1] = NULL;
801
802 destination = strdup(upper);
803 if (!destination)
804 return log_oom();
805
806 } else {
807 upper = lower[n - 2];
808 destination = lower[n - 1];
809 lower[n - 2] = NULL;
810 }
811
812 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
813 if (!m)
814 return log_oom();
815
816 m->destination = destination;
817 m->source = upper;
818 m->lower = lower;
819 m->read_only = c == ARG_OVERLAY_RO;
820
821 upper = destination = NULL;
822 lower = NULL;
823
824 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
825 break;
826 }
827
828 case 'E': {
829 char **n;
830
831 if (!env_assignment_is_valid(optarg)) {
832 log_error("Environment variable assignment '%s' is not valid.", optarg);
833 return -EINVAL;
834 }
835
836 n = strv_env_set(arg_setenv, optarg);
837 if (!n)
838 return log_oom();
839
840 strv_free(arg_setenv);
841 arg_setenv = n;
842
843 arg_settings_mask |= SETTING_ENVIRONMENT;
844 break;
845 }
846
847 case 'q':
848 arg_quiet = true;
849 break;
850
851 case ARG_SHARE_SYSTEM:
852 /* We don't officially support this anymore, except for compat reasons. People should use the
853 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
854 arg_clone_ns_flags = 0;
855 break;
856
857 case ARG_REGISTER:
858 r = parse_boolean(optarg);
859 if (r < 0) {
860 log_error("Failed to parse --register= argument: %s", optarg);
861 return r;
862 }
863
864 arg_register = r;
865 break;
866
867 case ARG_KEEP_UNIT:
868 arg_keep_unit = true;
869 break;
870
871 case ARG_PERSONALITY:
872
873 arg_personality = personality_from_string(optarg);
874 if (arg_personality == PERSONALITY_INVALID) {
875 log_error("Unknown or unsupported personality '%s'.", optarg);
876 return -EINVAL;
877 }
878
879 arg_settings_mask |= SETTING_PERSONALITY;
880 break;
881
882 case ARG_VOLATILE:
883
884 if (!optarg)
885 arg_volatile_mode = VOLATILE_YES;
886 else {
887 VolatileMode m;
888
889 m = volatile_mode_from_string(optarg);
890 if (m < 0) {
891 log_error("Failed to parse --volatile= argument: %s", optarg);
892 return -EINVAL;
893 } else
894 arg_volatile_mode = m;
895 }
896
897 arg_settings_mask |= SETTING_VOLATILE_MODE;
898 break;
899
900 case 'p':
901 r = expose_port_parse(&arg_expose_ports, optarg);
902 if (r == -EEXIST)
903 return log_error_errno(r, "Duplicate port specification: %s", optarg);
904 if (r < 0)
905 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
906
907 arg_settings_mask |= SETTING_EXPOSE_PORTS;
908 break;
909
910 case ARG_PROPERTY:
911 if (strv_extend(&arg_property, optarg) < 0)
912 return log_oom();
913
914 break;
915
916 case ARG_PRIVATE_USERS: {
917 int boolean = -1;
918
919 if (!optarg)
920 boolean = true;
921 else if (!in_charset(optarg, DIGITS))
922 /* do *not* parse numbers as booleans */
923 boolean = parse_boolean(optarg);
924
925 if (boolean == false) {
926 /* no: User namespacing off */
927 arg_userns_mode = USER_NAMESPACE_NO;
928 arg_uid_shift = UID_INVALID;
929 arg_uid_range = UINT32_C(0x10000);
930 } else if (boolean == true) {
931 /* yes: User namespacing on, UID range is read from root dir */
932 arg_userns_mode = USER_NAMESPACE_FIXED;
933 arg_uid_shift = UID_INVALID;
934 arg_uid_range = UINT32_C(0x10000);
935 } else if (streq(optarg, "pick")) {
936 /* pick: User namespacing on, UID range is picked randomly */
937 arg_userns_mode = USER_NAMESPACE_PICK;
938 arg_uid_shift = UID_INVALID;
939 arg_uid_range = UINT32_C(0x10000);
940 } else {
941 _cleanup_free_ char *buffer = NULL;
942 const char *range, *shift;
943
944 /* anything else: User namespacing on, UID range is explicitly configured */
945
946 range = strchr(optarg, ':');
947 if (range) {
948 buffer = strndup(optarg, range - optarg);
949 if (!buffer)
950 return log_oom();
951 shift = buffer;
952
953 range++;
954 r = safe_atou32(range, &arg_uid_range);
955 if (r < 0)
956 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
957 } else
958 shift = optarg;
959
960 r = parse_uid(shift, &arg_uid_shift);
961 if (r < 0)
962 return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
963
964 arg_userns_mode = USER_NAMESPACE_FIXED;
965 }
966
967 if (arg_uid_range <= 0) {
968 log_error("UID range cannot be 0.");
969 return -EINVAL;
970 }
971
972 arg_settings_mask |= SETTING_USERNS;
973 break;
974 }
975
976 case 'U':
977 if (userns_supported()) {
978 arg_userns_mode = USER_NAMESPACE_PICK;
979 arg_uid_shift = UID_INVALID;
980 arg_uid_range = UINT32_C(0x10000);
981
982 arg_settings_mask |= SETTING_USERNS;
983 }
984
985 break;
986
987 case ARG_PRIVATE_USERS_CHOWN:
988 arg_userns_chown = true;
989
990 arg_settings_mask |= SETTING_USERNS;
991 break;
992
993 case ARG_KILL_SIGNAL:
994 arg_kill_signal = signal_from_string_try_harder(optarg);
995 if (arg_kill_signal < 0) {
996 log_error("Cannot parse signal: %s", optarg);
997 return -EINVAL;
998 }
999
1000 arg_settings_mask |= SETTING_KILL_SIGNAL;
1001 break;
1002
1003 case ARG_SETTINGS:
1004
1005 /* no → do not read files
1006 * yes → read files, do not override cmdline, trust only subset
1007 * override → read files, override cmdline, trust only subset
1008 * trusted → read files, do not override cmdline, trust all
1009 */
1010
1011 r = parse_boolean(optarg);
1012 if (r < 0) {
1013 if (streq(optarg, "trusted")) {
1014 mask_all_settings = false;
1015 mask_no_settings = false;
1016 arg_settings_trusted = true;
1017
1018 } else if (streq(optarg, "override")) {
1019 mask_all_settings = false;
1020 mask_no_settings = true;
1021 arg_settings_trusted = -1;
1022 } else
1023 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1024 } else if (r > 0) {
1025 /* yes */
1026 mask_all_settings = false;
1027 mask_no_settings = false;
1028 arg_settings_trusted = -1;
1029 } else {
1030 /* no */
1031 mask_all_settings = true;
1032 mask_no_settings = false;
1033 arg_settings_trusted = false;
1034 }
1035
1036 break;
1037
1038 case ARG_CHDIR:
1039 if (!path_is_absolute(optarg)) {
1040 log_error("Working directory %s is not an absolute path.", optarg);
1041 return -EINVAL;
1042 }
1043
1044 r = free_and_strdup(&arg_chdir, optarg);
1045 if (r < 0)
1046 return log_oom();
1047
1048 arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1049 break;
1050
1051 case ARG_NOTIFY_READY:
1052 r = parse_boolean(optarg);
1053 if (r < 0) {
1054 log_error("%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1055 return -EINVAL;
1056 }
1057 arg_notify_ready = r;
1058 arg_settings_mask |= SETTING_NOTIFY_READY;
1059 break;
1060
1061 case '?':
1062 return -EINVAL;
1063
1064 default:
1065 assert_not_reached("Unhandled option");
1066 }
1067
1068 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
1069 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
1070 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
1071 parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
1072
1073 if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1074 !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1075 arg_register = false;
1076 if (arg_start_mode != START_PID1) {
1077 log_error("--boot cannot be used without namespacing.");
1078 return -EINVAL;
1079 }
1080 }
1081
1082 if (arg_userns_mode == USER_NAMESPACE_PICK)
1083 arg_userns_chown = true;
1084
1085 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1086 log_error("--keep-unit may not be used when invoked from a user session.");
1087 return -EINVAL;
1088 }
1089
1090 if (arg_directory && arg_image) {
1091 log_error("--directory= and --image= may not be combined.");
1092 return -EINVAL;
1093 }
1094
1095 if (arg_template && arg_image) {
1096 log_error("--template= and --image= may not be combined.");
1097 return -EINVAL;
1098 }
1099
1100 if (arg_template && !(arg_directory || arg_machine)) {
1101 log_error("--template= needs --directory= or --machine=.");
1102 return -EINVAL;
1103 }
1104
1105 if (arg_ephemeral && arg_template) {
1106 log_error("--ephemeral and --template= may not be combined.");
1107 return -EINVAL;
1108 }
1109
1110 if (arg_ephemeral && arg_image) {
1111 log_error("--ephemeral and --image= may not be combined.");
1112 return -EINVAL;
1113 }
1114
1115 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1116 log_error("--ephemeral and --link-journal= may not be combined.");
1117 return -EINVAL;
1118 }
1119
1120 if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
1121 log_error("--private-users= is not supported, kernel compiled without user namespace support.");
1122 return -EOPNOTSUPP;
1123 }
1124
1125 if (arg_userns_chown && arg_read_only) {
1126 log_error("--read-only and --private-users-chown may not be combined.");
1127 return -EINVAL;
1128 }
1129
1130 if (arg_network_bridge && arg_network_zone) {
1131 log_error("--network-bridge= and --network-zone= may not be combined.");
1132 return -EINVAL;
1133 }
1134
1135 if (argc > optind) {
1136 arg_parameters = strv_copy(argv + optind);
1137 if (!arg_parameters)
1138 return log_oom();
1139
1140 arg_settings_mask |= SETTING_START_MODE;
1141 }
1142
1143 /* Load all settings from .nspawn files */
1144 if (mask_no_settings)
1145 arg_settings_mask = 0;
1146
1147 /* Don't load any settings from .nspawn files */
1148 if (mask_all_settings)
1149 arg_settings_mask = _SETTINGS_MASK_ALL;
1150
1151 arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1152
1153 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
1154 if (e)
1155 arg_container_service_name = e;
1156
1157 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
1158 if (r < 0)
1159 arg_use_cgns = cg_ns_supported();
1160 else
1161 arg_use_cgns = r;
1162
1163 return 1;
1164}
1165
1166static int verify_arguments(void) {
1167
1168 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
1169 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1170 return -EINVAL;
1171 }
1172
1173 if (arg_expose_ports && !arg_private_network) {
1174 log_error("Cannot use --port= without private networking.");
1175 return -EINVAL;
1176 }
1177
1178#ifndef HAVE_LIBIPTC
1179 if (arg_expose_ports) {
1180 log_error("--port= is not supported, compiled without libiptc support.");
1181 return -EOPNOTSUPP;
1182 }
1183#endif
1184
1185 if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1186 arg_kill_signal = SIGRTMIN+3;
1187
1188 return 0;
1189}
1190
1191static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1192 assert(p);
1193
1194 if (arg_userns_mode == USER_NAMESPACE_NO)
1195 return 0;
1196
1197 if (uid == UID_INVALID && gid == GID_INVALID)
1198 return 0;
1199
1200 if (uid != UID_INVALID) {
1201 uid += arg_uid_shift;
1202
1203 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1204 return -EOVERFLOW;
1205 }
1206
1207 if (gid != GID_INVALID) {
1208 gid += (gid_t) arg_uid_shift;
1209
1210 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1211 return -EOVERFLOW;
1212 }
1213
1214 if (lchown(p, uid, gid) < 0)
1215 return -errno;
1216
1217 return 0;
1218}
1219
/* Creates the directory "path" below "root" with the given mode and, if it was newly
 * created, chowns it via userns_lchown().
 *
 * An already existing directory is left untouched (ownership included) and counts as
 * success. Returns 0 on success, a negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1232
1233static int setup_timezone(const char *dest) {
1234 _cleanup_free_ char *p = NULL, *q = NULL;
1235 const char *where, *check, *what;
1236 char *z, *y;
1237 int r;
1238
1239 assert(dest);
1240
1241 /* Fix the timezone, if possible */
1242 r = readlink_malloc("/etc/localtime", &p);
1243 if (r < 0) {
1244 log_warning("host's /etc/localtime is not a symlink, not updating container timezone.");
1245 /* to handle warning, delete /etc/localtime and replace it
1246 * with a symbolic link to a time zone data file.
1247 *
1248 * Example:
1249 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1250 */
1251 return 0;
1252 }
1253
1254 z = path_startswith(p, "../usr/share/zoneinfo/");
1255 if (!z)
1256 z = path_startswith(p, "/usr/share/zoneinfo/");
1257 if (!z) {
1258 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1259 return 0;
1260 }
1261
1262 where = prefix_roota(dest, "/etc/localtime");
1263 r = readlink_malloc(where, &q);
1264 if (r >= 0) {
1265 y = path_startswith(q, "../usr/share/zoneinfo/");
1266 if (!y)
1267 y = path_startswith(q, "/usr/share/zoneinfo/");
1268
1269 /* Already pointing to the right place? Then do nothing .. */
1270 if (y && streq(y, z))
1271 return 0;
1272 }
1273
1274 check = strjoina("/usr/share/zoneinfo/", z);
1275 check = prefix_roota(dest, check);
1276 if (laccess(check, F_OK) < 0) {
1277 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1278 return 0;
1279 }
1280
1281 r = unlink(where);
1282 if (r < 0 && errno != ENOENT) {
1283 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1284 return 0;
1285 }
1286
1287 what = strjoina("../usr/share/zoneinfo/", z);
1288 if (symlink(what, where) < 0) {
1289 log_error_errno(errno, "Failed to correct timezone of container: %m");
1290 return 0;
1291 }
1292
1293 r = userns_lchown(where, 0, 0);
1294 if (r < 0)
1295 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1296
1297 return 0;
1298}
1299
1300static int setup_resolv_conf(const char *dest) {
1301 const char *where = NULL;
1302 int r;
1303
1304 assert(dest);
1305
1306 if (arg_private_network)
1307 return 0;
1308
1309 /* Fix resolv.conf, if possible */
1310 where = prefix_roota(dest, "/etc/resolv.conf");
1311
1312 if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0) {
1313 /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the
1314 * container, so that the container can use the host's resolver. Given that network namespacing is
1315 * disabled it's only natural of the container also uses the host's resolver. It also has the big
1316 * advantage that the container will be able to follow the host's DNS server configuration changes
1317 * transparently. */
1318
1319 r = mount_verbose(LOG_WARNING, "/usr/lib/systemd/resolv.conf", where, NULL, MS_BIND, NULL);
1320 if (r >= 0)
1321 return mount_verbose(LOG_ERR, NULL, where, NULL,
1322 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1323 }
1324
1325 /* If that didn't work, let's copy the file */
1326 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1327 if (r < 0) {
1328 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
1329 * resolved or something similar runs inside and the symlink points there.
1330 *
1331 * If the disk image is read-only, there's also no point in complaining.
1332 */
1333 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1334 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
1335 return 0;
1336 }
1337
1338 r = userns_lchown(where, 0, 0);
1339 if (r < 0)
1340 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
1341
1342 return 0;
1343}
1344
1345static int setup_boot_id(const char *dest) {
1346 sd_id128_t rnd = SD_ID128_NULL;
1347 const char *from, *to;
1348 int r;
1349
1350 /* Generate a new randomized boot ID, so that each boot-up of
1351 * the container gets a new one */
1352
1353 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1354 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1355
1356 r = sd_id128_randomize(&rnd);
1357 if (r < 0)
1358 return log_error_errno(r, "Failed to generate random boot id: %m");
1359
1360 r = id128_write(from, ID128_UUID, rnd, false);
1361 if (r < 0)
1362 return log_error_errno(r, "Failed to write boot id: %m");
1363
1364 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1365 if (r >= 0)
1366 r = mount_verbose(LOG_ERR, NULL, to, NULL,
1367 MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
1368
1369 (void) unlink(from);
1370 return r;
1371}
1372
1373static int copy_devnodes(const char *dest) {
1374
1375 static const char devnodes[] =
1376 "null\0"
1377 "zero\0"
1378 "full\0"
1379 "random\0"
1380 "urandom\0"
1381 "tty\0"
1382 "net/tun\0";
1383
1384 const char *d;
1385 int r = 0;
1386 _cleanup_umask_ mode_t u;
1387
1388 assert(dest);
1389
1390 u = umask(0000);
1391
1392 /* Create /dev/net, so that we can create /dev/net/tun in it */
1393 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1394 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1395
1396 NULSTR_FOREACH(d, devnodes) {
1397 _cleanup_free_ char *from = NULL, *to = NULL;
1398 struct stat st;
1399
1400 from = strappend("/dev/", d);
1401 to = prefix_root(dest, from);
1402
1403 if (stat(from, &st) < 0) {
1404
1405 if (errno != ENOENT)
1406 return log_error_errno(errno, "Failed to stat %s: %m", from);
1407
1408 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1409
1410 log_error("%s is not a char or block device, cannot copy.", from);
1411 return -EIO;
1412
1413 } else {
1414 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1415 /*
1416 * This is some sort of protection too against
1417 * recursive userns chown on shared /dev/
1418 */
1419 if (errno == EEXIST)
1420 log_notice("%s/dev/ should be an empty directory", dest);
1421 if (errno != EPERM)
1422 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1423
1424 /* Some systems abusively restrict mknod but
1425 * allow bind mounts. */
1426 r = touch(to);
1427 if (r < 0)
1428 return log_error_errno(r, "touch (%s) failed: %m", to);
1429 r = mount_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
1430 if (r < 0)
1431 return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
1432 }
1433
1434 r = userns_lchown(to, 0, 0);
1435 if (r < 0)
1436 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1437 }
1438 }
1439
1440 return r;
1441}
1442
1443static int setup_pts(const char *dest) {
1444 _cleanup_free_ char *options = NULL;
1445 const char *p;
1446 int r;
1447
1448#ifdef HAVE_SELINUX
1449 if (arg_selinux_apifs_context)
1450 (void) asprintf(&options,
1451 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1452 arg_uid_shift + TTY_GID,
1453 arg_selinux_apifs_context);
1454 else
1455#endif
1456 (void) asprintf(&options,
1457 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1458 arg_uid_shift + TTY_GID);
1459
1460 if (!options)
1461 return log_oom();
1462
1463 /* Mount /dev/pts itself */
1464 p = prefix_roota(dest, "/dev/pts");
1465 if (mkdir(p, 0755) < 0)
1466 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1467 r = mount_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
1468 if (r < 0)
1469 return r;
1470 r = userns_lchown(p, 0, 0);
1471 if (r < 0)
1472 return log_error_errno(r, "Failed to chown /dev/pts: %m");
1473
1474 /* Create /dev/ptmx symlink */
1475 p = prefix_roota(dest, "/dev/ptmx");
1476 if (symlink("pts/ptmx", p) < 0)
1477 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1478 r = userns_lchown(p, 0, 0);
1479 if (r < 0)
1480 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
1481
1482 /* And fix /dev/pts/ptmx ownership */
1483 p = prefix_roota(dest, "/dev/pts/ptmx");
1484 r = userns_lchown(p, 0, 0);
1485 if (r < 0)
1486 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
1487
1488 return 0;
1489}
1490
1491static int setup_dev_console(const char *dest, const char *console) {
1492 _cleanup_umask_ mode_t u;
1493 const char *to;
1494 int r;
1495
1496 assert(dest);
1497 assert(console);
1498
1499 u = umask(0000);
1500
1501 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1502 if (r < 0)
1503 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1504
1505 /* We need to bind mount the right tty to /dev/console since
1506 * ptys can only exist on pts file systems. To have something
1507 * to bind mount things on we create a empty regular file. */
1508
1509 to = prefix_roota(dest, "/dev/console");
1510 r = touch(to);
1511 if (r < 0)
1512 return log_error_errno(r, "touch() for /dev/console failed: %m");
1513
1514 return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL);
1515}
1516
1517static int setup_kmsg(const char *dest, int kmsg_socket) {
1518 const char *from, *to;
1519 _cleanup_umask_ mode_t u;
1520 int fd, r;
1521
1522 assert(kmsg_socket >= 0);
1523
1524 u = umask(0000);
1525
1526 /* We create the kmsg FIFO as /run/kmsg, but immediately
1527 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1528 * on the reading side behave very similar to /proc/kmsg,
1529 * their writing side behaves differently from /dev/kmsg in
1530 * that writing blocks when nothing is reading. In order to
1531 * avoid any problems with containers deadlocking due to this
1532 * we simply make /dev/kmsg unavailable to the container. */
1533 from = prefix_roota(dest, "/run/kmsg");
1534 to = prefix_roota(dest, "/proc/kmsg");
1535
1536 if (mkfifo(from, 0600) < 0)
1537 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1538 r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
1539 if (r < 0)
1540 return r;
1541
1542 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1543 if (fd < 0)
1544 return log_error_errno(errno, "Failed to open fifo: %m");
1545
1546 /* Store away the fd in the socket, so that it stays open as
1547 * long as we run the child */
1548 r = send_one_fd(kmsg_socket, fd, 0);
1549 safe_close(fd);
1550
1551 if (r < 0)
1552 return log_error_errno(r, "Failed to send FIFO fd: %m");
1553
1554 /* And now make the FIFO unavailable as /run/kmsg... */
1555 (void) unlink(from);
1556
1557 return 0;
1558}
1559
1560static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
1561 union in_addr_union *exposed = userdata;
1562
1563 assert(rtnl);
1564 assert(m);
1565 assert(exposed);
1566
1567 expose_port_execute(rtnl, arg_expose_ports, exposed);
1568 return 0;
1569}
1570
1571static int setup_hostname(void) {
1572
1573 if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
1574 return 0;
1575
1576 if (sethostname_idempotent(arg_machine) < 0)
1577 return -errno;
1578
1579 return 0;
1580}
1581
1582static int setup_journal(const char *directory) {
1583 sd_id128_t this_id;
1584 _cleanup_free_ char *d = NULL;
1585 const char *p, *q;
1586 bool try;
1587 char id[33];
1588 int r;
1589
1590 /* Don't link journals in ephemeral mode */
1591 if (arg_ephemeral)
1592 return 0;
1593
1594 if (arg_link_journal == LINK_NO)
1595 return 0;
1596
1597 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1598
1599 r = sd_id128_get_machine(&this_id);
1600 if (r < 0)
1601 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1602
1603 if (sd_id128_equal(arg_uuid, this_id)) {
1604 log_full(try ? LOG_WARNING : LOG_ERR,
1605 "Host and machine ids are equal (%s): refusing to link journals", sd_id128_to_string(arg_uuid, id));
1606 if (try)
1607 return 0;
1608 return -EEXIST;
1609 }
1610
1611 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1612 if (r < 0)
1613 return log_error_errno(r, "Failed to create /var: %m");
1614
1615 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1616 if (r < 0)
1617 return log_error_errno(r, "Failed to create /var/log: %m");
1618
1619 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1620 if (r < 0)
1621 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1622
1623 (void) sd_id128_to_string(arg_uuid, id);
1624
1625 p = strjoina("/var/log/journal/", id);
1626 q = prefix_roota(directory, p);
1627
1628 if (path_is_mount_point(p, 0) > 0) {
1629 if (try)
1630 return 0;
1631
1632 log_error("%s: already a mount point, refusing to use for journal", p);
1633 return -EEXIST;
1634 }
1635
1636 if (path_is_mount_point(q, 0) > 0) {
1637 if (try)
1638 return 0;
1639
1640 log_error("%s: already a mount point, refusing to use for journal", q);
1641 return -EEXIST;
1642 }
1643
1644 r = readlink_and_make_absolute(p, &d);
1645 if (r >= 0) {
1646 if ((arg_link_journal == LINK_GUEST ||
1647 arg_link_journal == LINK_AUTO) &&
1648 path_equal(d, q)) {
1649
1650 r = userns_mkdir(directory, p, 0755, 0, 0);
1651 if (r < 0)
1652 log_warning_errno(r, "Failed to create directory %s: %m", q);
1653 return 0;
1654 }
1655
1656 if (unlink(p) < 0)
1657 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1658 } else if (r == -EINVAL) {
1659
1660 if (arg_link_journal == LINK_GUEST &&
1661 rmdir(p) < 0) {
1662
1663 if (errno == ENOTDIR) {
1664 log_error("%s already exists and is neither a symlink nor a directory", p);
1665 return r;
1666 } else
1667 return log_error_errno(errno, "Failed to remove %s: %m", p);
1668 }
1669 } else if (r != -ENOENT)
1670 return log_error_errno(r, "readlink(%s) failed: %m", p);
1671
1672 if (arg_link_journal == LINK_GUEST) {
1673
1674 if (symlink(q, p) < 0) {
1675 if (try) {
1676 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1677 return 0;
1678 } else
1679 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1680 }
1681
1682 r = userns_mkdir(directory, p, 0755, 0, 0);
1683 if (r < 0)
1684 log_warning_errno(r, "Failed to create directory %s: %m", q);
1685 return 0;
1686 }
1687
1688 if (arg_link_journal == LINK_HOST) {
1689 /* don't create parents here — if the host doesn't have
1690 * permanent journal set up, don't force it here */
1691
1692 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
1693 if (try) {
1694 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1695 return 0;
1696 } else
1697 return log_error_errno(errno, "Failed to create %s: %m", p);
1698 }
1699
1700 } else if (access(p, F_OK) < 0)
1701 return 0;
1702
1703 if (dir_is_empty(q) == 0)
1704 log_warning("%s is not empty, proceeding anyway.", q);
1705
1706 r = userns_mkdir(directory, p, 0755, 0, 0);
1707 if (r < 0)
1708 return log_error_errno(r, "Failed to create %s: %m", q);
1709
1710 r = mount_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
1711 if (r < 0)
1712 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1713
1714 return 0;
1715}
1716
1717static int drop_capabilities(void) {
1718 return capability_bounding_set_drop(arg_caps_retain, false);
1719}
1720
1721static int reset_audit_loginuid(void) {
1722 _cleanup_free_ char *p = NULL;
1723 int r;
1724
1725 if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
1726 return 0;
1727
1728 r = read_one_line_file("/proc/self/loginuid", &p);
1729 if (r == -ENOENT)
1730 return 0;
1731 if (r < 0)
1732 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1733
1734 /* Already reset? */
1735 if (streq(p, "4294967295"))
1736 return 0;
1737
1738 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
1739 if (r < 0) {
1740 log_error_errno(r,
1741 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1742 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1743 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1744 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1745 "using systemd-nspawn. Sleeping for 5s... (%m)");
1746
1747 sleep(5);
1748 }
1749
1750 return 0;
1751}
1752
1753
1754static int setup_propagate(const char *root) {
1755 const char *p, *q;
1756 int r;
1757
1758 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1759 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1760 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1761 (void) mkdir_p(p, 0600);
1762
1763 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1764 if (r < 0)
1765 return log_error_errno(r, "Failed to create /run/systemd: %m");
1766
1767 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1768 if (r < 0)
1769 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
1770
1771 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1772 if (r < 0)
1773 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
1774
1775 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
1776 r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
1777 if (r < 0)
1778 return r;
1779
1780 r = mount_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
1781 if (r < 0)
1782 return r;
1783
1784 /* machined will MS_MOVE into that directory, and that's only
1785 * supported for non-shared mounts. */
1786 return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
1787}
1788
1789static int setup_image(char **device_path, int *loop_nr) {
1790 struct loop_info64 info = {
1791 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1792 };
1793 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1794 _cleanup_free_ char* loopdev = NULL;
1795 struct stat st;
1796 int r, nr;
1797
1798 assert(device_path);
1799 assert(loop_nr);
1800 assert(arg_image);
1801
1802 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1803 if (fd < 0)
1804 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1805
1806 if (fstat(fd, &st) < 0)
1807 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1808
1809 if (S_ISBLK(st.st_mode)) {
1810 char *p;
1811
1812 p = strdup(arg_image);
1813 if (!p)
1814 return log_oom();
1815
1816 *device_path = p;
1817
1818 *loop_nr = -1;
1819
1820 r = fd;
1821 fd = -1;
1822
1823 return r;
1824 }
1825
1826 if (!S_ISREG(st.st_mode)) {
1827 log_error("%s is not a regular file or block device.", arg_image);
1828 return -EINVAL;
1829 }
1830
1831 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1832 if (control < 0)
1833 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1834
1835 nr = ioctl(control, LOOP_CTL_GET_FREE);
1836 if (nr < 0)
1837 return log_error_errno(errno, "Failed to allocate loop device: %m");
1838
1839 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1840 return log_oom();
1841
1842 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1843 if (loop < 0)
1844 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1845
1846 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1847 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1848
1849 if (arg_read_only)
1850 info.lo_flags |= LO_FLAGS_READ_ONLY;
1851
1852 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1853 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1854
1855 *device_path = loopdev;
1856 loopdev = NULL;
1857
1858 *loop_nr = nr;
1859
1860 r = loop;
1861 loop = -1;
1862
1863 return r;
1864}
1865
/* Explanatory text that gets appended to image dissection error messages, describing which
 * partition layouts are accepted. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a "                             \
        "single MBR partition of\n"                                                             \
        "type 0x83 that is marked bootable, or a single GPT partition "                        \
        "of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                              \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"        \
        "to be bootable with systemd-nspawn."
1872
1873static int dissect_image(
1874 int fd,
1875 char **root_device, bool *root_device_rw,
1876 char **home_device, bool *home_device_rw,
1877 char **srv_device, bool *srv_device_rw,
1878 char **esp_device,
1879 bool *secondary) {
1880
1881#ifdef HAVE_BLKID
1882 int home_nr = -1, srv_nr = -1, esp_nr = -1;
1883#ifdef GPT_ROOT_NATIVE
1884 int root_nr = -1;
1885#endif
1886#ifdef GPT_ROOT_SECONDARY
1887 int secondary_root_nr = -1;
1888#endif
1889 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *esp = NULL, *generic = NULL;
1890 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1891 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1892 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1893 _cleanup_udev_unref_ struct udev *udev = NULL;
1894 struct udev_list_entry *first, *item;
1895 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
1896 bool is_gpt, is_mbr, multiple_generic = false;
1897 const char *pttype = NULL;
1898 blkid_partlist pl;
1899 struct stat st;
1900 unsigned i;
1901 int r;
1902
1903 assert(fd >= 0);
1904 assert(root_device);
1905 assert(home_device);
1906 assert(srv_device);
1907 assert(esp_device);
1908 assert(secondary);
1909 assert(arg_image);
1910
1911 b = blkid_new_probe();
1912 if (!b)
1913 return log_oom();
1914
1915 errno = 0;
1916 r = blkid_probe_set_device(b, fd, 0, 0);
1917 if (r != 0) {
1918 if (errno == 0)
1919 return log_oom();
1920
1921 return log_error_errno(errno, "Failed to set device on blkid probe: %m");
1922 }
1923
1924 blkid_probe_enable_partitions(b, 1);
1925 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1926
1927 errno = 0;
1928 r = blkid_do_safeprobe(b);
1929 if (r == -2 || r == 1) {
1930 log_error("Failed to identify any partition table on\n"
1931 " %s\n"
1932 PARTITION_TABLE_BLURB, arg_image);
1933 return -EINVAL;
1934 } else if (r != 0) {
1935 if (errno == 0)
1936 errno = EIO;
1937 return log_error_errno(errno, "Failed to probe: %m");
1938 }
1939
1940 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
1941
1942 is_gpt = streq_ptr(pttype, "gpt");
1943 is_mbr = streq_ptr(pttype, "dos");
1944
1945 if (!is_gpt && !is_mbr) {
1946 log_error("No GPT or MBR partition table discovered on\n"
1947 " %s\n"
1948 PARTITION_TABLE_BLURB, arg_image);
1949 return -EINVAL;
1950 }
1951
1952 errno = 0;
1953 pl = blkid_probe_get_partitions(b);
1954 if (!pl) {
1955 if (errno == 0)
1956 return log_oom();
1957
1958 log_error("Failed to list partitions of %s", arg_image);
1959 return -errno;
1960 }
1961
1962 udev = udev_new();
1963 if (!udev)
1964 return log_oom();
1965
1966 if (fstat(fd, &st) < 0)
1967 return log_error_errno(errno, "Failed to stat block device: %m");
1968
1969 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1970 if (!d)
1971 return log_oom();
1972
1973 for (i = 0;; i++) {
1974 int n, m;
1975
1976 if (i >= 10) {
1977 log_error("Kernel partitions never appeared.");
1978 return -ENXIO;
1979 }
1980
1981 e = udev_enumerate_new(udev);
1982 if (!e)
1983 return log_oom();
1984
1985 r = udev_enumerate_add_match_parent(e, d);
1986 if (r < 0)
1987 return log_oom();
1988
1989 r = udev_enumerate_scan_devices(e);
1990 if (r < 0)
1991 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1992
1993 /* Count the partitions enumerated by the kernel */
1994 n = 0;
1995 first = udev_enumerate_get_list_entry(e);
1996 udev_list_entry_foreach(item, first)
1997 n++;
1998
1999 /* Count the partitions enumerated by blkid */
2000 m = blkid_partlist_numof_partitions(pl);
2001 if (n == m + 1)
2002 break;
2003 if (n > m + 1) {
2004 log_error("blkid and kernel partition list do not match.");
2005 return -EIO;
2006 }
2007 if (n < m + 1) {
2008 unsigned j;
2009
2010 /* The kernel has probed fewer partitions than
2011 * blkid? Maybe the kernel prober is still
2012 * running or it got EBUSY because udev
2013 * already opened the device. Let's reprobe
2014 * the device, which is a synchronous call
2015 * that waits until probing is complete. */
2016
2017 for (j = 0; j < 20; j++) {
2018
2019 r = ioctl(fd, BLKRRPART, 0);
2020 if (r < 0)
2021 r = -errno;
2022 if (r >= 0 || r != -EBUSY)
2023 break;
2024
2025 /* If something else has the device
2026 * open, such as an udev rule, the
2027 * ioctl will return EBUSY. Since
2028 * there's no way to wait until it
2029 * isn't busy anymore, let's just wait
2030 * a bit, and try again.
2031 *
2032 * This is really something they
2033 * should fix in the kernel! */
2034
2035 usleep(50 * USEC_PER_MSEC);
2036 }
2037
2038 if (r < 0)
2039 return log_error_errno(r, "Failed to reread partition table: %m");
2040 }
2041
2042 e = udev_enumerate_unref(e);
2043 }
2044
2045 first = udev_enumerate_get_list_entry(e);
2046 udev_list_entry_foreach(item, first) {
2047 _cleanup_udev_device_unref_ struct udev_device *q;
2048 const char *node;
2049 unsigned long long flags;
2050 blkid_partition pp;
2051 dev_t qn;
2052 int nr;
2053
2054 errno = 0;
2055 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2056 if (!q) {
2057 if (!errno)
2058 errno = ENOMEM;
2059
2060 return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2061 }
2062
2063 qn = udev_device_get_devnum(q);
2064 if (major(qn) == 0)
2065 continue;
2066
2067 if (st.st_rdev == qn)
2068 continue;
2069
2070 node = udev_device_get_devnode(q);
2071 if (!node)
2072 continue;
2073
2074 pp = blkid_partlist_devno_to_partition(pl, qn);
2075 if (!pp)
2076 continue;
2077
2078 flags = blkid_partition_get_flags(pp);
2079
2080 nr = blkid_partition_get_partno(pp);
2081 if (nr < 0)
2082 continue;
2083
2084 if (is_gpt) {
2085 sd_id128_t type_id;
2086 const char *stype;
2087
2088 if (flags & GPT_FLAG_NO_AUTO)
2089 continue;
2090
2091 stype = blkid_partition_get_type_string(pp);
2092 if (!stype)
2093 continue;
2094
2095 if (sd_id128_from_string(stype, &type_id) < 0)
2096 continue;
2097
2098 if (sd_id128_equal(type_id, GPT_HOME)) {
2099
2100 if (home && nr >= home_nr)
2101 continue;
2102
2103 home_nr = nr;
2104 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2105
2106 r = free_and_strdup(&home, node);
2107 if (r < 0)
2108 return log_oom();
2109
2110 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2111
2112 if (srv && nr >= srv_nr)
2113 continue;
2114
2115 srv_nr = nr;
2116 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2117
2118 r = free_and_strdup(&srv, node);
2119 if (r < 0)
2120 return log_oom();
2121 } else if (sd_id128_equal(type_id, GPT_ESP)) {
2122
2123 if (esp && nr >= esp_nr)
2124 continue;
2125
2126 esp_nr = nr;
2127
2128 r = free_and_strdup(&esp, node);
2129 if (r < 0)
2130 return log_oom();
2131 }
2132#ifdef GPT_ROOT_NATIVE
2133 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2134
2135 if (root && nr >= root_nr)
2136 continue;
2137
2138 root_nr = nr;
2139 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2140
2141 r = free_and_strdup(&root, node);
2142 if (r < 0)
2143 return log_oom();
2144 }
2145#endif
2146#ifdef GPT_ROOT_SECONDARY
2147 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2148
2149 if (secondary_root && nr >= secondary_root_nr)
2150 continue;
2151
2152 secondary_root_nr = nr;
2153 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2154
2155 r = free_and_strdup(&secondary_root, node);
2156 if (r < 0)
2157 return log_oom();
2158 }
2159#endif
2160 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
2161
2162 if (generic)
2163 multiple_generic = true;
2164 else {
2165 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
2166
2167 r = free_and_strdup(&generic, node);
2168 if (r < 0)
2169 return log_oom();
2170 }
2171 }
2172
2173 } else if (is_mbr) {
2174 int type;
2175
2176 if (flags != 0x80) /* Bootable flag */
2177 continue;
2178
2179 type = blkid_partition_get_type(pp);
2180 if (type != 0x83) /* Linux partition */
2181 continue;
2182
2183 if (generic)
2184 multiple_generic = true;
2185 else {
2186 generic_rw = true;
2187
2188 r = free_and_strdup(&root, node);
2189 if (r < 0)
2190 return log_oom();
2191 }
2192 }
2193 }
2194
2195 if (root) {
2196 *root_device = root;
2197 root = NULL;
2198
2199 *root_device_rw = root_rw;
2200 *secondary = false;
2201 } else if (secondary_root) {
2202 *root_device = secondary_root;
2203 secondary_root = NULL;
2204
2205 *root_device_rw = secondary_root_rw;
2206 *secondary = true;
2207 } else if (generic) {
2208
2209 /* There were no partitions with precise meanings
2210 * around, but we found generic partitions. In this
2211 * case, if there's only one, we can go ahead and boot
2212 * it, otherwise we bail out, because we really cannot
2213 * make any sense of it. */
2214
2215 if (multiple_generic) {
2216 log_error("Identified multiple bootable Linux partitions on\n"
2217 " %s\n"
2218 PARTITION_TABLE_BLURB, arg_image);
2219 return -EINVAL;
2220 }
2221
2222 *root_device = generic;
2223 generic = NULL;
2224
2225 *root_device_rw = generic_rw;
2226 *secondary = false;
2227 } else {
2228 log_error("Failed to identify root partition in disk image\n"
2229 " %s\n"
2230 PARTITION_TABLE_BLURB, arg_image);
2231 return -EINVAL;
2232 }
2233
2234 if (home) {
2235 *home_device = home;
2236 home = NULL;
2237
2238 *home_device_rw = home_rw;
2239 }
2240
2241 if (srv) {
2242 *srv_device = srv;
2243 srv = NULL;
2244
2245 *srv_device_rw = srv_rw;
2246 }
2247
2248 if (esp) {
2249 *esp_device = esp;
2250 esp = NULL;
2251 }
2252
2253 return 0;
2254#else
2255 log_error("--image= is not supported, compiled without blkid support.");
2256 return -EOPNOTSUPP;
2257#endif
2258}
2259
/* Probes the file system type of the block device "what" with libblkid and mounts it below "where"
 * (with "directory" appended if it is non-NULL). The mount is read-only unless "rw" is true and
 * --read-only was not requested. LUKS volumes are refused.
 *
 * Returns the result of mount_verbose() on success, or a negative errno-style error. When compiled
 * without blkid support this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        /* "options" must start out as NULL: it is only assigned for loop devices carrying one of
         * the file systems listed below, and is passed to mount_verbose() unconditionally. */
        const char *fstype, *p, *options = NULL;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* libblkid does not necessarily set errno; treat that case as OOM */
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -1 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        /* If this is a loopback device then let's mount the image with discard, so that the underlying file remains
         * sparse when possible. */
        if (STR_IN_SET(fstype, "btrfs", "ext4", "vfat", "xfs")) {
                const char *l;

                l = path_startswith(what, "/dev");
                if (l && startswith(l, "loop"))
                        options = "discard";
        }

        return mount_verbose(LOG_ERR, what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options);
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2328
2329static int setup_machine_id(const char *directory) {
2330 const char *etc_machine_id;
2331 sd_id128_t id;
2332 int r;
2333
2334 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2335 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2336 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2337 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2338 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2339 * container behaves nicely). */
2340
2341 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2342
2343 r = id128_read(etc_machine_id, ID128_PLAIN, &id);
2344 if (r < 0) {
2345 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2346 return log_error_errno(r, "Failed to read machine ID from container image: %m");
2347
2348 if (sd_id128_is_null(arg_uuid)) {
2349 r = sd_id128_randomize(&arg_uuid);
2350 if (r < 0)
2351 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2352 }
2353 } else {
2354 if (sd_id128_is_null(id)) {
2355 log_error("Machine ID in container image is zero, refusing.");
2356 return -EINVAL;
2357 }
2358
2359 arg_uuid = id;
2360 }
2361
2362 return 0;
2363}
2364
2365static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2366 int r;
2367
2368 assert(directory);
2369
2370 if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
2371 return 0;
2372
2373 r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2374 if (r == -EOPNOTSUPP)
2375 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2376 if (r == -EBADE)
2377 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2378 if (r < 0)
2379 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2380 if (r == 0)
2381 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2382 else
2383 log_debug("Patched directory tree to match UID/GID range.");
2384
2385 return r;
2386}
2387
2388static int mount_devices(
2389 const char *where,
2390 const char *root_device, bool root_device_rw,
2391 const char *home_device, bool home_device_rw,
2392 const char *srv_device, bool srv_device_rw,
2393 const char *esp_device) {
2394 int r;
2395
2396 assert(where);
2397
2398 if (root_device) {
2399 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2400 if (r < 0)
2401 return log_error_errno(r, "Failed to mount root directory: %m");
2402 }
2403
2404 if (home_device) {
2405 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2406 if (r < 0)
2407 return log_error_errno(r, "Failed to mount home directory: %m");
2408 }
2409
2410 if (srv_device) {
2411 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2412 if (r < 0)
2413 return log_error_errno(r, "Failed to mount server data directory: %m");
2414 }
2415
2416 if (esp_device) {
2417 const char *mp, *x;
2418
2419 /* Mount the ESP to /efi if it exists and is empty. If it doesn't exist, use /boot instead. */
2420
2421 mp = "/efi";
2422 x = strjoina(arg_directory, mp);
2423 r = dir_is_empty(x);
2424 if (r == -ENOENT) {
2425 mp = "/boot";
2426 x = strjoina(arg_directory, mp);
2427 r = dir_is_empty(x);
2428 }
2429
2430 if (r > 0) {
2431 r = mount_device(esp_device, arg_directory, mp, true);
2432 if (r < 0)
2433 return log_error_errno(r, "Failed to mount ESP: %m");
2434 }
2435 }
2436
2437 return 0;
2438}
2439
2440static void loop_remove(int nr, int *image_fd) {
2441 _cleanup_close_ int control = -1;
2442 int r;
2443
2444 if (nr < 0)
2445 return;
2446
2447 if (image_fd && *image_fd >= 0) {
2448 r = ioctl(*image_fd, LOOP_CLR_FD);
2449 if (r < 0)
2450 log_debug_errno(errno, "Failed to close loop image: %m");
2451 *image_fd = safe_close(*image_fd);
2452 }
2453
2454 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2455 if (control < 0) {
2456 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2457 return;
2458 }
2459
2460 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2461 if (r < 0)
2462 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2463}
2464
2465/*
2466 * Return values:
2467 * < 0 : wait_for_terminate() failed to get the state of the
2468 * container, the container was terminated by a signal, or
2469 * failed for an unknown reason. No change is made to the
2470 * container argument.
2471 * > 0 : The program executed in the container terminated with an
2472 * error. The exit code of the program executed in the
2473 * container is returned. The container argument has been set
2474 * to CONTAINER_TERMINATED.
2475 * 0 : The container is being rebooted, has been shut down or exited
2476 * successfully. The container argument has been set to either
2477 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2478 *
2479 * That is, success is indicated by a return value of zero, and an
2480 * error is indicated by a non-zero value.
2481 */
2482static int wait_for_container(pid_t pid, ContainerStatus *container) {
2483 siginfo_t status;
2484 int r;
2485
2486 r = wait_for_terminate(pid, &status);
2487 if (r < 0)
2488 return log_warning_errno(r, "Failed to wait for container: %m");
2489
2490 switch (status.si_code) {
2491
2492 case CLD_EXITED:
2493 if (status.si_status == 0)
2494 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2495 else
2496 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2497
2498 *container = CONTAINER_TERMINATED;
2499 return status.si_status;
2500
2501 case CLD_KILLED:
2502 if (status.si_status == SIGINT) {
2503 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2504 *container = CONTAINER_TERMINATED;
2505 return 0;
2506
2507 } else if (status.si_status == SIGHUP) {
2508 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2509 *container = CONTAINER_REBOOTED;
2510 return 0;
2511 }
2512
2513 /* CLD_KILLED fallthrough */
2514
2515 case CLD_DUMPED:
2516 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2517 return -EIO;
2518
2519 default:
2520 log_error("Container %s failed due to unknown reason.", arg_machine);
2521 return -EIO;
2522 }
2523}
2524
2525static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2526 pid_t pid;
2527
2528 pid = PTR_TO_PID(userdata);
2529 if (pid > 0) {
2530 if (kill(pid, arg_kill_signal) >= 0) {
2531 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2532 sd_event_source_set_userdata(s, NULL);
2533 return 0;
2534 }
2535 }
2536
2537 sd_event_exit(sd_event_source_get_event(s), 0);
2538 return 0;
2539}
2540
2541static int determine_names(void) {
2542 int r;
2543
2544 if (arg_template && !arg_directory && arg_machine) {
2545
2546 /* If --template= was specified then we should not
2547 * search for a machine, but instead create a new one
2548 * in /var/lib/machine. */
2549
2550 arg_directory = strjoin("/var/lib/machines/", arg_machine);
2551 if (!arg_directory)
2552 return log_oom();
2553 }
2554
2555 if (!arg_image && !arg_directory) {
2556 if (arg_machine) {
2557 _cleanup_(image_unrefp) Image *i = NULL;
2558
2559 r = image_find(arg_machine, &i);
2560 if (r < 0)
2561 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2562 else if (r == 0) {
2563 log_error("No image for machine '%s': %m", arg_machine);
2564 return -ENOENT;
2565 }
2566
2567 if (i->type == IMAGE_RAW)
2568 r = free_and_strdup(&arg_image, i->path);
2569 else
2570 r = free_and_strdup(&arg_directory, i->path);
2571 if (r < 0)
2572 return log_error_errno(r, "Invalid image directory: %m");
2573
2574 if (!arg_ephemeral)
2575 arg_read_only = arg_read_only || i->read_only;
2576 } else
2577 arg_directory = get_current_dir_name();
2578
2579 if (!arg_directory && !arg_machine) {
2580 log_error("Failed to determine path, please use -D or -i.");
2581 return -EINVAL;
2582 }
2583 }
2584
2585 if (!arg_machine) {
2586 if (arg_directory && path_equal(arg_directory, "/"))
2587 arg_machine = gethostname_malloc();
2588 else
2589 arg_machine = strdup(basename(arg_image ?: arg_directory));
2590
2591 if (!arg_machine)
2592 return log_oom();
2593
2594 hostname_cleanup(arg_machine);
2595 if (!machine_name_is_valid(arg_machine)) {
2596 log_error("Failed to determine machine name automatically, please use -M.");
2597 return -EINVAL;
2598 }
2599
2600 if (arg_ephemeral) {
2601 char *b;
2602
2603 /* Add a random suffix when this is an
2604 * ephemeral machine, so that we can run many
2605 * instances at once without manually having
2606 * to specify -M each time. */
2607
2608 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2609 return log_oom();
2610
2611 free(arg_machine);
2612 arg_machine = b;
2613 }
2614 }
2615
2616 return 0;
2617}
2618
2619static int determine_uid_shift(const char *directory) {
2620 int r;
2621
2622 if (arg_userns_mode == USER_NAMESPACE_NO) {
2623 arg_uid_shift = 0;
2624 return 0;
2625 }
2626
2627 if (arg_uid_shift == UID_INVALID) {
2628 struct stat st;
2629
2630 r = stat(directory, &st);
2631 if (r < 0)
2632 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
2633
2634 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2635
2636 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2637 log_error("UID and GID base of %s don't match.", directory);
2638 return -EINVAL;
2639 }
2640
2641 arg_uid_range = UINT32_C(0x10000);
2642 }
2643
2644 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2645 log_error("UID base too high for UID range.");
2646 return -EINVAL;
2647 }
2648
2649 return 0;
2650}
2651
2652static int inner_child(
2653 Barrier *barrier,
2654 const char *directory,
2655 bool secondary,
2656 int kmsg_socket,
2657 int rtnl_socket,
2658 FDSet *fds) {
2659
2660 _cleanup_free_ char *home = NULL;
2661 char as_uuid[37];
2662 unsigned n_env = 1;
2663 const char *envp[] = {
2664 "PATH=" DEFAULT_PATH_SPLIT_USR,
2665 NULL, /* container */
2666 NULL, /* TERM */
2667 NULL, /* HOME */
2668 NULL, /* USER */
2669 NULL, /* LOGNAME */
2670 NULL, /* container_uuid */
2671 NULL, /* LISTEN_FDS */
2672 NULL, /* LISTEN_PID */
2673 NULL, /* NOTIFY_SOCKET */
2674 NULL
2675 };
2676
2677 _cleanup_strv_free_ char **env_use = NULL;
2678 int r;
2679
2680 assert(barrier);
2681 assert(directory);
2682 assert(kmsg_socket >= 0);
2683
2684 cg_unified_flush();
2685
2686 if (arg_userns_mode != USER_NAMESPACE_NO) {
2687 /* Tell the parent, that it now can write the UID map. */
2688 (void) barrier_place(barrier); /* #1 */
2689
2690 /* Wait until the parent wrote the UID map */
2691 if (!barrier_place_and_sync(barrier)) { /* #2 */
2692 log_error("Parent died too early");
2693 return -ESRCH;
2694 }
2695 }
2696
2697 r = reset_uid_gid();
2698 if (r < 0)
2699 return log_error_errno(r, "Couldn't become new root: %m");
2700
2701 r = mount_all(NULL,
2702 arg_userns_mode != USER_NAMESPACE_NO,
2703 true,
2704 arg_private_network,
2705 arg_uid_shift,
2706 arg_uid_range,
2707 arg_selinux_apifs_context);
2708
2709 if (r < 0)
2710 return r;
2711
2712 r = mount_sysfs(NULL);
2713 if (r < 0)
2714 return r;
2715
2716 /* Wait until we are cgroup-ified, so that we
2717 * can mount the right cgroup path writable */
2718 if (!barrier_place_and_sync(barrier)) { /* #3 */
2719 log_error("Parent died too early");
2720 return -ESRCH;
2721 }
2722
2723 if (arg_use_cgns && cg_ns_supported()) {
2724 r = unshare(CLONE_NEWCGROUP);
2725 if (r < 0)
2726 return log_error_errno(errno, "Failed to unshare cgroup namespace");
2727 r = mount_cgroups(
2728 "",
2729 arg_unified_cgroup_hierarchy,
2730 arg_userns_mode != USER_NAMESPACE_NO,
2731 arg_uid_shift,
2732 arg_uid_range,
2733 arg_selinux_apifs_context,
2734 true);
2735 if (r < 0)
2736 return r;
2737 } else {
2738 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2739 if (r < 0)
2740 return r;
2741 }
2742
2743 r = setup_boot_id(NULL);
2744 if (r < 0)
2745 return r;
2746
2747 r = setup_kmsg(NULL, kmsg_socket);
2748 if (r < 0)
2749 return r;
2750 kmsg_socket = safe_close(kmsg_socket);
2751
2752 umask(0022);
2753
2754 if (setsid() < 0)
2755 return log_error_errno(errno, "setsid() failed: %m");
2756
2757 if (arg_private_network)
2758 loopback_setup();
2759
2760 if (arg_expose_ports) {
2761 r = expose_port_send_rtnl(rtnl_socket);
2762 if (r < 0)
2763 return r;
2764 rtnl_socket = safe_close(rtnl_socket);
2765 }
2766
2767 r = drop_capabilities();
2768 if (r < 0)
2769 return log_error_errno(r, "drop_capabilities() failed: %m");
2770
2771 setup_hostname();
2772
2773 if (arg_personality != PERSONALITY_INVALID) {
2774 if (personality(arg_personality) < 0)
2775 return log_error_errno(errno, "personality() failed: %m");
2776 } else if (secondary) {
2777 if (personality(PER_LINUX32) < 0)
2778 return log_error_errno(errno, "personality() failed: %m");
2779 }
2780
2781#ifdef HAVE_SELINUX
2782 if (arg_selinux_context)
2783 if (setexeccon(arg_selinux_context) < 0)
2784 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2785#endif
2786
2787 r = change_uid_gid(arg_user, &home);
2788 if (r < 0)
2789 return r;
2790
2791 /* LXC sets container=lxc, so follow the scheme here */
2792 envp[n_env++] = strjoina("container=", arg_container_service_name);
2793
2794 envp[n_env] = strv_find_prefix(environ, "TERM=");
2795 if (envp[n_env])
2796 n_env++;
2797
2798 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2799 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2800 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2801 return log_oom();
2802
2803 assert(!sd_id128_is_null(arg_uuid));
2804
2805 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_to_uuid_string(arg_uuid, as_uuid)) < 0)
2806 return log_oom();
2807
2808 if (fdset_size(fds) > 0) {
2809 r = fdset_cloexec(fds, false);
2810 if (r < 0)
2811 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2812
2813 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2814 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2815 return log_oom();
2816 }
2817 if (asprintf((char **)(envp + n_env++), "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
2818 return log_oom();
2819
2820 env_use = strv_env_merge(2, envp, arg_setenv);
2821 if (!env_use)
2822 return log_oom();
2823
2824 /* Let the parent know that we are ready and
2825 * wait until the parent is ready with the
2826 * setup, too... */
2827 if (!barrier_place_and_sync(barrier)) { /* #4 */
2828 log_error("Parent died too early");
2829 return -ESRCH;
2830 }
2831
2832 if (arg_chdir)
2833 if (chdir(arg_chdir) < 0)
2834 return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
2835
2836 if (arg_start_mode == START_PID2) {
2837 r = stub_pid1();
2838 if (r < 0)
2839 return r;
2840 }
2841
2842 /* Now, explicitly close the log, so that we
2843 * then can close all remaining fds. Closing
2844 * the log explicitly first has the benefit
2845 * that the logging subsystem knows about it,
2846 * and is thus ready to be reopened should we
2847 * need it again. Note that the other fds
2848 * closed here are at least the locking and
2849 * barrier fds. */
2850 log_close();
2851 (void) fdset_close_others(fds);
2852
2853 if (arg_start_mode == START_BOOT) {
2854 char **a;
2855 size_t m;
2856
2857 /* Automatically search for the init system */
2858
2859 m = strv_length(arg_parameters);
2860 a = newa(char*, m + 2);
2861 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
2862 a[1 + m] = NULL;
2863
2864 a[0] = (char*) "/usr/lib/systemd/systemd";
2865 execve(a[0], a, env_use);
2866
2867 a[0] = (char*) "/lib/systemd/systemd";
2868 execve(a[0], a, env_use);
2869
2870 a[0] = (char*) "/sbin/init";
2871 execve(a[0], a, env_use);
2872 } else if (!strv_isempty(arg_parameters))
2873 execvpe(arg_parameters[0], arg_parameters, env_use);
2874 else {
2875 if (!arg_chdir)
2876 /* If we cannot change the directory, we'll end up in /, that is expected. */
2877 (void) chdir(home ?: "/root");
2878
2879 execle("/bin/bash", "-bash", NULL, env_use);
2880 execle("/bin/sh", "-sh", NULL, env_use);
2881 }
2882
2883 r = -errno;
2884 (void) log_open();
2885 return log_error_errno(r, "execv() failed: %m");
2886}
2887
2888static int setup_sd_notify_child(void) {
2889 static const int one = 1;
2890 int fd = -1;
2891 union sockaddr_union sa = {
2892 .sa.sa_family = AF_UNIX,
2893 };
2894 int r;
2895
2896 fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2897 if (fd < 0)
2898 return log_error_errno(errno, "Failed to allocate notification socket: %m");
2899
2900 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
2901 (void) unlink(NSPAWN_NOTIFY_SOCKET_PATH);
2902
2903 strncpy(sa.un.sun_path, NSPAWN_NOTIFY_SOCKET_PATH, sizeof(sa.un.sun_path)-1);
2904 r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
2905 if (r < 0) {
2906 safe_close(fd);
2907 return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
2908 }
2909
2910 r = setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
2911 if (r < 0) {
2912 safe_close(fd);
2913 return log_error_errno(errno, "SO_PASSCRED failed: %m");
2914 }
2915
2916 return fd;
2917}
2918
2919static int outer_child(
2920 Barrier *barrier,
2921 const char *directory,
2922 const char *console,
2923 const char *root_device, bool root_device_rw,
2924 const char *home_device, bool home_device_rw,
2925 const char *srv_device, bool srv_device_rw,
2926 const char *esp_device,
2927 bool interactive,
2928 bool secondary,
2929 int pid_socket,
2930 int uuid_socket,
2931 int notify_socket,
2932 int kmsg_socket,
2933 int rtnl_socket,
2934 int uid_shift_socket,
2935 FDSet *fds) {
2936
2937 pid_t pid;
2938 ssize_t l;
2939 int r;
2940 _cleanup_close_ int fd = -1;
2941
2942 assert(barrier);
2943 assert(directory);
2944 assert(console);
2945 assert(pid_socket >= 0);
2946 assert(uuid_socket >= 0);
2947 assert(notify_socket >= 0);
2948 assert(kmsg_socket >= 0);
2949
2950 cg_unified_flush();
2951
2952 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2953 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2954
2955 if (interactive) {
2956 close_nointr(STDIN_FILENO);
2957 close_nointr(STDOUT_FILENO);
2958 close_nointr(STDERR_FILENO);
2959
2960 r = open_terminal(console, O_RDWR);
2961 if (r != STDIN_FILENO) {
2962 if (r >= 0) {
2963 safe_close(r);
2964 r = -EINVAL;
2965 }
2966
2967 return log_error_errno(r, "Failed to open console: %m");
2968 }
2969
2970 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2971 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2972 return log_error_errno(errno, "Failed to duplicate console: %m");
2973 }
2974
2975 r = reset_audit_loginuid();
2976 if (r < 0)
2977 return r;
2978
2979 /* Mark everything as slave, so that we still
2980 * receive mounts from the real root, but don't
2981 * propagate mounts to the real root. */
2982 r = mount_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
2983 if (r < 0)
2984 return r;
2985
2986 r = mount_devices(directory,
2987 root_device, root_device_rw,
2988 home_device, home_device_rw,
2989 srv_device, srv_device_rw,
2990 esp_device);
2991 if (r < 0)
2992 return r;
2993
2994 r = determine_uid_shift(directory);
2995 if (r < 0)
2996 return r;
2997
2998 r = detect_unified_cgroup_hierarchy(directory);
2999 if (r < 0)
3000 return r;
3001
3002 if (arg_userns_mode != USER_NAMESPACE_NO) {
3003 /* Let the parent know which UID shift we read from the image */
3004 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3005 if (l < 0)
3006 return log_error_errno(errno, "Failed to send UID shift: %m");
3007 if (l != sizeof(arg_uid_shift)) {
3008 log_error("Short write while sending UID shift.");
3009 return -EIO;
3010 }
3011
3012 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3013 /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
3014 * we just read from the image is available. If yes, it will send the UID shift back to us, if
3015 * not it will pick a different one, and send it back to us. */
3016
3017 l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3018 if (l < 0)
3019 return log_error_errno(errno, "Failed to recv UID shift: %m");
3020 if (l != sizeof(arg_uid_shift)) {
3021 log_error("Short read while receiving UID shift.");
3022 return -EIO;
3023 }
3024 }
3025
3026 log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3027 }
3028
3029 /* Turn directory into bind mount */
3030 r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
3031 if (r < 0)
3032 return r;
3033
3034 /* Mark everything as shared so our mounts get propagated down. This is
3035 * required to make new bind mounts available in systemd services
3036 * inside the containter that create a new mount namespace.
3037 * See https://github.com/systemd/systemd/issues/3860
3038 * Further submounts (such as /dev) done after this will inherit the
3039 * shared propagation mode.*/
3040 r = mount_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3041 if (r < 0)
3042 return r;
3043
3044 r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3045 if (r < 0)
3046 return r;
3047
3048 r = setup_volatile(
3049 directory,
3050 arg_volatile_mode,
3051 arg_userns_mode != USER_NAMESPACE_NO,
3052 arg_uid_shift,
3053 arg_uid_range,
3054 arg_selinux_context);
3055 if (r < 0)
3056 return r;
3057
3058 r = setup_volatile_state(
3059 directory,
3060 arg_volatile_mode,
3061 arg_userns_mode != USER_NAMESPACE_NO,
3062 arg_uid_shift,
3063 arg_uid_range,
3064 arg_selinux_context);
3065 if (r < 0)
3066 return r;
3067
3068 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3069 if (r < 0)
3070 return r;
3071
3072 if (arg_read_only) {
3073 r = bind_remount_recursive(directory, true, NULL);
3074 if (r < 0)
3075 return log_error_errno(r, "Failed to make tree read-only: %m");
3076 }
3077
3078 r = mount_all(directory,
3079 arg_userns_mode != USER_NAMESPACE_NO,
3080 false,
3081 arg_private_network,
3082 arg_uid_shift,
3083 arg_uid_range,
3084 arg_selinux_apifs_context);
3085 if (r < 0)
3086 return r;
3087
3088 r = copy_devnodes(directory);
3089 if (r < 0)
3090 return r;
3091
3092 dev_setup(directory, arg_uid_shift, arg_uid_shift);
3093
3094 r = setup_pts(directory);
3095 if (r < 0)
3096 return r;
3097
3098 r = setup_propagate(directory);
3099 if (r < 0)
3100 return r;
3101
3102 r = setup_dev_console(directory, console);
3103 if (r < 0)
3104 return r;
3105
3106 r = setup_seccomp(arg_caps_retain);
3107 if (r < 0)
3108 return r;
3109
3110 r = setup_timezone(directory);
3111 if (r < 0)
3112 return r;
3113
3114 r = setup_resolv_conf(directory);
3115 if (r < 0)
3116 return r;
3117
3118 r = setup_machine_id(directory);
3119 if (r < 0)
3120 return r;
3121
3122 r = setup_journal(directory);
3123 if (r < 0)
3124 return r;
3125
3126 r = mount_custom(
3127 directory,
3128 arg_custom_mounts,
3129 arg_n_custom_mounts,
3130 arg_userns_mode != USER_NAMESPACE_NO,
3131 arg_uid_shift,
3132 arg_uid_range,
3133 arg_selinux_apifs_context);
3134 if (r < 0)
3135 return r;
3136
3137 if (!arg_use_cgns || !cg_ns_supported()) {
3138 r = mount_cgroups(
3139 directory,
3140 arg_unified_cgroup_hierarchy,
3141 arg_userns_mode != USER_NAMESPACE_NO,
3142 arg_uid_shift,
3143 arg_uid_range,
3144 arg_selinux_apifs_context,
3145 false);
3146 if (r < 0)
3147 return r;
3148 }
3149
3150 r = mount_move_root(directory);
3151 if (r < 0)
3152 return log_error_errno(r, "Failed to move root directory: %m");
3153
3154 fd = setup_sd_notify_child();
3155 if (fd < 0)
3156 return fd;
3157
3158 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3159 arg_clone_ns_flags |
3160 (arg_private_network ? CLONE_NEWNET : 0) |
3161 (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3162 if (pid < 0)
3163 return log_error_errno(errno, "Failed to fork inner child: %m");
3164 if (pid == 0) {
3165 pid_socket = safe_close(pid_socket);
3166 uuid_socket = safe_close(uuid_socket);
3167 notify_socket = safe_close(notify_socket);
3168 uid_shift_socket = safe_close(uid_shift_socket);
3169
3170 /* The inner child has all namespaces that are
3171 * requested, so that we all are owned by the user if
3172 * user namespaces are turned on. */
3173
3174 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
3175 if (r < 0)
3176 _exit(EXIT_FAILURE);
3177
3178 _exit(EXIT_SUCCESS);
3179 }
3180
3181 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3182 if (l < 0)
3183 return log_error_errno(errno, "Failed to send PID: %m");
3184 if (l != sizeof(pid)) {
3185 log_error("Short write while sending PID.");
3186 return -EIO;
3187 }
3188
3189 l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
3190 if (l < 0)
3191 return log_error_errno(errno, "Failed to send machine ID: %m");
3192 if (l != sizeof(arg_uuid)) {
3193 log_error("Short write while sending machine ID.");
3194 return -EIO;
3195 }
3196
3197 l = send_one_fd(notify_socket, fd, 0);
3198 if (l < 0)
3199 return log_error_errno(errno, "Failed to send notify fd: %m");
3200
3201 pid_socket = safe_close(pid_socket);
3202 uuid_socket = safe_close(uuid_socket);
3203 notify_socket = safe_close(notify_socket);
3204 kmsg_socket = safe_close(kmsg_socket);
3205 rtnl_socket = safe_close(rtnl_socket);
3206
3207 return 0;
3208}
3209
3210static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
3211 unsigned n_tries = 100;
3212 uid_t candidate;
3213 int r;
3214
3215 assert(shift);
3216 assert(ret_lock_file);
3217 assert(arg_userns_mode == USER_NAMESPACE_PICK);
3218 assert(arg_uid_range == 0x10000U);
3219
3220 candidate = *shift;
3221
3222 (void) mkdir("/run/systemd/nspawn-uid", 0755);
3223
3224 for (;;) {
3225 char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
3226 _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT;
3227
3228 if (--n_tries <= 0)
3229 return -EBUSY;
3230
3231 if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX)
3232 goto next;
3233 if ((candidate & UINT32_C(0xFFFF)) != 0)
3234 goto next;
3235
3236 xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate);
3237 r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf);
3238 if (r == -EBUSY) /* Range already taken by another nspawn instance */
3239 goto next;
3240 if (r < 0)
3241 return r;
3242
3243 /* Make some superficial checks whether the range is currently known in the user database */
3244 if (getpwuid(candidate))
3245 goto next;
3246 if (getpwuid(candidate + UINT32_C(0xFFFE)))
3247 goto next;
3248 if (getgrgid(candidate))
3249 goto next;
3250 if (getgrgid(candidate + UINT32_C(0xFFFE)))
3251 goto next;
3252
3253 *ret_lock_file = lf;
3254 lf = (struct LockFile) LOCK_FILE_INIT;
3255 *shift = candidate;
3256 return 0;
3257
3258 next:
3259 random_bytes(&candidate, sizeof(candidate));
3260 candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
3261 candidate &= (uid_t) UINT32_C(0xFFFF0000);
3262 }
3263}
3264
3265static int setup_uid_map(pid_t pid) {
3266 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3267 int r;
3268
3269 assert(pid > 1);
3270
3271 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3272 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
3273 r = write_string_file(uid_map, line, 0);
3274 if (r < 0)
3275 return log_error_errno(r, "Failed to write UID map: %m");
3276
3277 /* We always assign the same UID and GID ranges */
3278 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
3279 r = write_string_file(uid_map, line, 0);
3280 if (r < 0)
3281 return log_error_errno(r, "Failed to write GID map: %m");
3282
3283 return 0;
3284}
3285
3286static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
3287 char buf[NOTIFY_BUFFER_MAX+1];
3288 char *p = NULL;
3289 struct iovec iovec = {
3290 .iov_base = buf,
3291 .iov_len = sizeof(buf)-1,
3292 };
3293 union {
3294 struct cmsghdr cmsghdr;
3295 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
3296 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
3297 } control = {};
3298 struct msghdr msghdr = {
3299 .msg_iov = &iovec,
3300 .msg_iovlen = 1,
3301 .msg_control = &control,
3302 .msg_controllen = sizeof(control),
3303 };
3304 struct cmsghdr *cmsg;
3305 struct ucred *ucred = NULL;
3306 ssize_t n;
3307 pid_t inner_child_pid;
3308 _cleanup_strv_free_ char **tags = NULL;
3309
3310 assert(userdata);
3311
3312 inner_child_pid = PTR_TO_PID(userdata);
3313
3314 if (revents != EPOLLIN) {
3315 log_warning("Got unexpected poll event for notify fd.");
3316 return 0;
3317 }
3318
3319 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
3320 if (n < 0) {
3321 if (errno == EAGAIN || errno == EINTR)
3322 return 0;
3323
3324 return log_warning_errno(errno, "Couldn't read notification socket: %m");
3325 }
3326 cmsg_close_all(&msghdr);
3327
3328 CMSG_FOREACH(cmsg, &msghdr) {
3329 if (cmsg->cmsg_level == SOL_SOCKET &&
3330 cmsg->cmsg_type == SCM_CREDENTIALS &&
3331 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
3332
3333 ucred = (struct ucred*) CMSG_DATA(cmsg);
3334 }
3335 }
3336
3337 if (!ucred || ucred->pid != inner_child_pid) {
3338 log_warning("Received notify message without valid credentials. Ignoring.");
3339 return 0;
3340 }
3341
3342 if ((size_t) n >= sizeof(buf)) {
3343 log_warning("Received notify message exceeded maximum size. Ignoring.");
3344 return 0;
3345 }
3346
3347 buf[n] = 0;
3348 tags = strv_split(buf, "\n\r");
3349 if (!tags)
3350 return log_oom();
3351
3352 if (strv_find(tags, "READY=1"))
3353 sd_notifyf(false, "READY=1\n");
3354
3355 p = strv_find_startswith(tags, "STATUS=");
3356 if (p)
3357 sd_notifyf(false, "STATUS=Container running: %s", p);
3358
3359 return 0;
3360}
3361
3362static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid) {
3363 int r;
3364 sd_event_source *notify_event_source;
3365
3366 r = sd_event_add_io(event, &notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
3367 if (r < 0)
3368 return log_error_errno(r, "Failed to allocate notify event source: %m");
3369
3370 (void) sd_event_source_set_description(notify_event_source, "nspawn-notify");
3371
3372 return 0;
3373}
3374
3375static int load_settings(void) {
3376 _cleanup_(settings_freep) Settings *settings = NULL;
3377 _cleanup_fclose_ FILE *f = NULL;
3378 _cleanup_free_ char *p = NULL;
3379 const char *fn, *i;
3380 int r;
3381
3382 /* If all settings are masked, there's no point in looking for
3383 * the settings file */
3384 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
3385 return 0;
3386
3387 fn = strjoina(arg_machine, ".nspawn");
3388
3389 /* We first look in the admin's directories in /etc and /run */
3390 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3391 _cleanup_free_ char *j = NULL;
3392
3393 j = strjoin(i, "/", fn);
3394 if (!j)
3395 return log_oom();
3396
3397 f = fopen(j, "re");
3398 if (f) {
3399 p = j;
3400 j = NULL;
3401
3402 /* By default, we trust configuration from /etc and /run */
3403 if (arg_settings_trusted < 0)
3404 arg_settings_trusted = true;
3405
3406 break;
3407 }
3408
3409 if (errno != ENOENT)
3410 return log_error_errno(errno, "Failed to open %s: %m", j);
3411 }
3412
3413 if (!f) {
3414 /* After that, let's look for a file next to the
3415 * actual image we shall boot. */
3416
3417 if (arg_image) {
3418 p = file_in_same_dir(arg_image, fn);
3419 if (!p)
3420 return log_oom();
3421 } else if (arg_directory) {
3422 p = file_in_same_dir(arg_directory, fn);
3423 if (!p)
3424 return log_oom();
3425 }
3426
3427 if (p) {
3428 f = fopen(p, "re");
3429 if (!f && errno != ENOENT)
3430 return log_error_errno(errno, "Failed to open %s: %m", p);
3431
3432 /* By default, we do not trust configuration from /var/lib/machines */
3433 if (arg_settings_trusted < 0)
3434 arg_settings_trusted = false;
3435 }
3436 }
3437
3438 if (!f)
3439 return 0;
3440
3441 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
3442
3443 r = settings_load(f, p, &settings);
3444 if (r < 0)
3445 return r;
3446
3447 /* Copy over bits from the settings, unless they have been
3448 * explicitly masked by command line switches. */
3449
3450 if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
3451 settings->start_mode >= 0) {
3452 arg_start_mode = settings->start_mode;
3453
3454 strv_free(arg_parameters);
3455 arg_parameters = settings->parameters;
3456 settings->parameters = NULL;
3457 }
3458
3459 if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
3460 settings->working_directory) {
3461 free(arg_chdir);
3462 arg_chdir = settings->working_directory;
3463 settings->working_directory = NULL;
3464 }
3465
3466 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
3467 settings->environment) {
3468 strv_free(arg_setenv);
3469 arg_setenv = settings->environment;
3470 settings->environment = NULL;
3471 }
3472
3473 if ((arg_settings_mask & SETTING_USER) == 0 &&
3474 settings->user) {
3475 free(arg_user);
3476 arg_user = settings->user;
3477 settings->user = NULL;
3478 }
3479
3480 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
3481 uint64_t plus;
3482
3483 plus = settings->capability;
3484 if (settings_private_network(settings))
3485 plus |= (1ULL << CAP_NET_ADMIN);
3486
3487 if (!arg_settings_trusted && plus != 0) {
3488 if (settings->capability != 0)
3489 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
3490 } else
3491 arg_caps_retain |= plus;
3492
3493 arg_caps_retain &= ~settings->drop_capability;
3494 }
3495
3496 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
3497 settings->kill_signal > 0)
3498 arg_kill_signal = settings->kill_signal;
3499
3500 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
3501 settings->personality != PERSONALITY_INVALID)
3502 arg_personality = settings->personality;
3503
3504 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
3505 !sd_id128_is_null(settings->machine_id)) {
3506
3507 if (!arg_settings_trusted)
3508 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
3509 else
3510 arg_uuid = settings->machine_id;
3511 }
3512
3513 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
3514 settings->read_only >= 0)
3515 arg_read_only = settings->read_only;
3516
3517 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
3518 settings->volatile_mode != _VOLATILE_MODE_INVALID)
3519 arg_volatile_mode = settings->volatile_mode;
3520
3521 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
3522 settings->n_custom_mounts > 0) {
3523
3524 if (!arg_settings_trusted)
3525 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
3526 else {
3527 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3528 arg_custom_mounts = settings->custom_mounts;
3529 arg_n_custom_mounts = settings->n_custom_mounts;
3530
3531 settings->custom_mounts = NULL;
3532 settings->n_custom_mounts = 0;
3533 }
3534 }
3535
3536 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
3537 (settings->private_network >= 0 ||
3538 settings->network_veth >= 0 ||
3539 settings->network_bridge ||
3540 settings->network_zone ||
3541 settings->network_interfaces ||
3542 settings->network_macvlan ||
3543 settings->network_ipvlan ||
3544 settings->network_veth_extra)) {
3545
3546 if (!arg_settings_trusted)
3547 log_warning("Ignoring network settings, file %s is not trusted.", p);
3548 else {
3549 arg_network_veth = settings_network_veth(settings);
3550 arg_private_network = settings_private_network(settings);
3551
3552 strv_free(arg_network_interfaces);
3553 arg_network_interfaces = settings->network_interfaces;
3554 settings->network_interfaces = NULL;
3555
3556 strv_free(arg_network_macvlan);
3557 arg_network_macvlan = settings->network_macvlan;
3558 settings->network_macvlan = NULL;
3559
3560 strv_free(arg_network_ipvlan);
3561 arg_network_ipvlan = settings->network_ipvlan;
3562 settings->network_ipvlan = NULL;
3563
3564 strv_free(arg_network_veth_extra);
3565 arg_network_veth_extra = settings->network_veth_extra;
3566 settings->network_veth_extra = NULL;
3567
3568 free(arg_network_bridge);
3569 arg_network_bridge = settings->network_bridge;
3570 settings->network_bridge = NULL;
3571
3572 free(arg_network_zone);
3573 arg_network_zone = settings->network_zone;
3574 settings->network_zone = NULL;
3575 }
3576 }
3577
3578 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3579 settings->expose_ports) {
3580
3581 if (!arg_settings_trusted)
3582 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3583 else {
3584 expose_port_free_all(arg_expose_ports);
3585 arg_expose_ports = settings->expose_ports;
3586 settings->expose_ports = NULL;
3587 }
3588 }
3589
3590 if ((arg_settings_mask & SETTING_USERNS) == 0 &&
3591 settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
3592
3593 if (!arg_settings_trusted)
3594 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
3595 else {
3596 arg_userns_mode = settings->userns_mode;
3597 arg_uid_shift = settings->uid_shift;
3598 arg_uid_range = settings->uid_range;
3599 arg_userns_chown = settings->userns_chown;
3600 }
3601 }
3602
3603 if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
3604 arg_notify_ready = settings->notify_ready;
3605
3606 return 0;
3607}
3608
3609static int run(int master,
3610 const char* console,
3611 const char *root_device, bool root_device_rw,
3612 const char *home_device, bool home_device_rw,
3613 const char *srv_device, bool srv_device_rw,
3614 const char *esp_device,
3615 bool interactive,
3616 bool secondary,
3617 FDSet *fds,
3618 char veth_name[IFNAMSIZ], bool *veth_created,
3619 union in_addr_union *exposed,
3620 pid_t *pid, int *ret) {
3621
3622 static const struct sigaction sa = {
3623 .sa_handler = nop_signal_handler,
3624 .sa_flags = SA_NOCLDSTOP,
3625 };
3626
3627 _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
3628 _cleanup_close_ int etc_passwd_lock = -1;
3629 _cleanup_close_pair_ int
3630 kmsg_socket_pair[2] = { -1, -1 },
3631 rtnl_socket_pair[2] = { -1, -1 },
3632 pid_socket_pair[2] = { -1, -1 },
3633 uuid_socket_pair[2] = { -1, -1 },
3634 notify_socket_pair[2] = { -1, -1 },
3635 uid_shift_socket_pair[2] = { -1, -1 };
3636 _cleanup_close_ int notify_socket= -1;
3637 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3638 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
3639 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3640 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
3641 ContainerStatus container_status = 0;
3642 char last_char = 0;
3643 int ifi = 0, r;
3644 ssize_t l;
3645 sigset_t mask_chld;
3646
3647 assert_se(sigemptyset(&mask_chld) == 0);
3648 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3649
3650 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3651 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
3652 * check with getpwuid() if the specific user already exists. Note that /etc might be
3653 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
3654 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
3655 * really just an extra safety net. We kinda assume that the UID range we allocate from is
3656 * really ours. */
3657
3658 etc_passwd_lock = take_etc_passwd_lock(NULL);
3659 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
3660 return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
3661 }
3662
3663 r = barrier_create(&barrier);
3664 if (r < 0)
3665 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
3666
3667 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
3668 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3669
3670 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
3671 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3672
3673 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
3674 return log_error_errno(errno, "Failed to create pid socket pair: %m");
3675
3676 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
3677 return log_error_errno(errno, "Failed to create id socket pair: %m");
3678
3679 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
3680 return log_error_errno(errno, "Failed to create notify socket pair: %m");
3681
3682 if (arg_userns_mode != USER_NAMESPACE_NO)
3683 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
3684 return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3685
3686 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
3687 * parent's blocking calls and give it a chance to call wait() and terminate. */
3688 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3689 if (r < 0)
3690 return log_error_errno(errno, "Failed to change the signal mask: %m");
3691
3692 r = sigaction(SIGCHLD, &sa, NULL);
3693 if (r < 0)
3694 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3695
3696 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
3697 if (*pid < 0)
3698 return log_error_errno(errno, "clone() failed%s: %m",
3699 errno == EINVAL ?
3700 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
3701
3702 if (*pid == 0) {
3703 /* The outer child only has a file system namespace. */
3704 barrier_set_role(&barrier, BARRIER_CHILD);
3705
3706 master = safe_close(master);
3707
3708 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3709 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3710 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
3711 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
3712 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
3713 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
3714
3715 (void) reset_all_signal_handlers();
3716 (void) reset_signal_mask();
3717
3718 r = outer_child(&barrier,
3719 arg_directory,
3720 console,
3721 root_device, root_device_rw,
3722 home_device, home_device_rw,
3723 srv_device, srv_device_rw,
3724 esp_device,
3725 interactive,
3726 secondary,
3727 pid_socket_pair[1],
3728 uuid_socket_pair[1],
3729 notify_socket_pair[1],
3730 kmsg_socket_pair[1],
3731 rtnl_socket_pair[1],
3732 uid_shift_socket_pair[1],
3733 fds);
3734 if (r < 0)
3735 _exit(EXIT_FAILURE);
3736
3737 _exit(EXIT_SUCCESS);
3738 }
3739
3740 barrier_set_role(&barrier, BARRIER_PARENT);
3741
3742 fds = fdset_free(fds);
3743
3744 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3745 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3746 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
3747 uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
3748 notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
3749 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
3750
3751 if (arg_userns_mode != USER_NAMESPACE_NO) {
3752 /* The child just let us know the UID shift it might have read from the image. */
3753 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
3754 if (l < 0)
3755 return log_error_errno(errno, "Failed to read UID shift: %m");
3756
3757 if (l != sizeof arg_uid_shift) {
3758 log_error("Short read while reading UID shift.");
3759 return -EIO;
3760 }
3761
3762 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3763 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
3764 * image, but if that's already in use, pick a new one, and report back to the child,
3765 * which one we now picked. */
3766
3767 r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
3768 if (r < 0)
3769 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
3770
3771 l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
3772 if (l < 0)
3773 return log_error_errno(errno, "Failed to send UID shift: %m");
3774 if (l != sizeof arg_uid_shift) {
3775 log_error("Short write while writing UID shift.");
3776 return -EIO;
3777 }
3778 }
3779 }
3780
3781 /* Wait for the outer child. */
3782 r = wait_for_terminate_and_warn("namespace helper", *pid, NULL);
3783 if (r != 0)
3784 return r < 0 ? r : -EIO;
3785
3786 /* And now retrieve the PID of the inner child. */
3787 l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
3788 if (l < 0)
3789 return log_error_errno(errno, "Failed to read inner child PID: %m");
3790 if (l != sizeof *pid) {
3791 log_error("Short read while reading inner child PID.");
3792 return -EIO;
3793 }
3794
3795 /* We also retrieve container UUID in case it was generated by outer child */
3796 l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
3797 if (l < 0)
3798 return log_error_errno(errno, "Failed to read container machine ID: %m");
3799 if (l != sizeof(arg_uuid)) {
3800 log_error("Short read while reading container machined ID.");
3801 return -EIO;
3802 }
3803
3804 /* We also retrieve the socket used for notifications generated by outer child */
3805 notify_socket = receive_one_fd(notify_socket_pair[0], 0);
3806 if (notify_socket < 0)
3807 return log_error_errno(notify_socket,
3808 "Failed to receive notification socket from the outer child: %m");
3809
3810 log_debug("Init process invoked as PID "PID_FMT, *pid);
3811
3812 if (arg_userns_mode != USER_NAMESPACE_NO) {
3813 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3814 log_error("Child died too early.");
3815 return -ESRCH;
3816 }
3817
3818 r = setup_uid_map(*pid);
3819 if (r < 0)
3820 return r;
3821
3822 (void) barrier_place(&barrier); /* #2 */
3823 }
3824
3825 if (arg_private_network) {
3826
3827 r = move_network_interfaces(*pid, arg_network_interfaces);
3828 if (r < 0)
3829 return r;
3830
3831 if (arg_network_veth) {
3832 r = setup_veth(arg_machine, *pid, veth_name,
3833 arg_network_bridge || arg_network_zone);
3834 if (r < 0)
3835 return r;
3836 else if (r > 0)
3837 ifi = r;
3838
3839 if (arg_network_bridge) {
3840 /* Add the interface to a bridge */
3841 r = setup_bridge(veth_name, arg_network_bridge, false);
3842 if (r < 0)
3843 return r;
3844 if (r > 0)
3845 ifi = r;
3846 } else if (arg_network_zone) {
3847 /* Add the interface to a bridge, possibly creating it */
3848 r = setup_bridge(veth_name, arg_network_zone, true);
3849 if (r < 0)
3850 return r;
3851 if (r > 0)
3852 ifi = r;
3853 }
3854 }
3855
3856 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
3857 if (r < 0)
3858 return r;
3859
3860 /* We created the primary and extra veth links now; let's remember this, so that we know to
3861 remove them later on. Note that we don't bother with removing veth links that were created
3862 here when their setup failed half-way, because in that case the kernel should be able to
3863 remove them on its own, since they cannot be referenced by anything yet. */
3864 *veth_created = true;
3865
3866 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
3867 if (r < 0)
3868 return r;
3869
3870 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
3871 if (r < 0)
3872 return r;
3873 }
3874
3875 if (arg_register) {
3876 r = register_machine(
3877 arg_machine,
3878 *pid,
3879 arg_directory,
3880 arg_uuid,
3881 ifi,
3882 arg_slice,
3883 arg_custom_mounts, arg_n_custom_mounts,
3884 arg_kill_signal,
3885 arg_property,
3886 arg_keep_unit,
3887 arg_container_service_name);
3888 if (r < 0)
3889 return r;
3890 }
3891
3892 r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
3893 if (r < 0)
3894 return r;
3895
3896 if (arg_keep_unit) {
3897 r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
3898 if (r < 0)
3899 return r;
3900 }
3901
3902 r = chown_cgroup(*pid, arg_uid_shift);
3903 if (r < 0)
3904 return r;
3905
3906 /* Notify the child that the parent is ready with all
3907 * its setup (including cgroup-ification), and that
3908 * the child can now hand over control to the code to
3909 * run inside the container. */
3910 (void) barrier_place(&barrier); /* #3 */
3911
3912 /* Block SIGCHLD here, before notifying child.
3913 * process_pty() will handle it with the other signals. */
3914 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
3915
3916 /* Reset signal to default */
3917 r = default_signals(SIGCHLD, -1);
3918 if (r < 0)
3919 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
3920
3921 r = sd_event_new(&event);
3922 if (r < 0)
3923 return log_error_errno(r, "Failed to get default event source: %m");
3924
3925 r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid));
3926 if (r < 0)
3927 return r;
3928
3929 /* Let the child know that we are ready and wait that the child is completely ready now. */
3930 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3931 log_error("Child died too early.");
3932 return -ESRCH;
3933 }
3934
3935 /* At this point we have made use of the UID we picked, and thus nss-mymachines
3936 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
3937 etc_passwd_lock = safe_close(etc_passwd_lock);
3938
3939 sd_notifyf(false,
3940 "STATUS=Container running.\n"
3941 "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
3942 if (!arg_notify_ready)
3943 sd_notify(false, "READY=1\n");
3944
3945 if (arg_kill_signal > 0) {
3946 /* Try to kill the init system on SIGINT or SIGTERM */
3947 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
3948 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
3949 } else {
3950 /* Immediately exit */
3951 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3952 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3953 }
3954
3955 /* simply exit on sigchld */
3956 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3957
3958 if (arg_expose_ports) {
3959 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, exposed, &rtnl);
3960 if (r < 0)
3961 return r;
3962
3963 (void) expose_port_execute(rtnl, arg_expose_ports, exposed);
3964 }
3965
3966 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3967
3968 r = pty_forward_new(event, master,
3969 PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY),
3970 &forward);
3971 if (r < 0)
3972 return log_error_errno(r, "Failed to create PTY forwarder: %m");
3973
3974 r = sd_event_loop(event);
3975 if (r < 0)
3976 return log_error_errno(r, "Failed to run event loop: %m");
3977
3978 pty_forward_get_last_char(forward, &last_char);
3979
3980 forward = pty_forward_free(forward);
3981
3982 if (!arg_quiet && last_char != '\n')
3983 putc('\n', stdout);
3984
3985 /* Kill if it is not dead yet anyway */
3986 if (arg_register && !arg_keep_unit)
3987 terminate_machine(*pid);
3988
3989 /* Normally redundant, but better safe than sorry */
3990 kill(*pid, SIGKILL);
3991
3992 r = wait_for_container(*pid, &container_status);
3993 *pid = 0;
3994
3995 if (r < 0)
3996 /* We failed to wait for the container, or the container exited abnormally. */
3997 return r;
3998 if (r > 0 || container_status == CONTAINER_TERMINATED) {
3999 /* r > 0 → The container exited with a non-zero status.
4000 * As a special case, we need to replace 133 with a different value,
4001 * because 133 is special-cased in the service file to reboot the container.
4002 * otherwise → The container exited with zero status and a reboot was not requested.
4003 */
4004 if (r == 133)
4005 r = EXIT_FAILURE; /* replace 133 with the general failure code */
4006 *ret = r;
4007 return 0; /* finito */
4008 }
4009
4010 /* CONTAINER_REBOOTED, loop again */
4011
4012 if (arg_keep_unit) {
4013 /* Special handling if we are running as a service: instead of simply
4014 * restarting the machine we want to restart the entire service, so let's
4015 * inform systemd about this with the special exit code 133. The service
4016 * file uses RestartForceExitStatus=133 so that this results in a full
4017 * nspawn restart. This is necessary since we might have cgroup parameters
4018 * set we want to have flushed out. */
4019 *ret = 0;
4020 return 133;
4021 }
4022
4023 expose_port_flush(arg_expose_ports, exposed);
4024
4025 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4026 *veth_created = false;
4027 return 1; /* loop again */
4028}
4029
4030int main(int argc, char *argv[]) {
4031
4032 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *esp_device = NULL, *console = NULL;
4033 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4034 _cleanup_close_ int master = -1, image_fd = -1;
4035 _cleanup_fdset_free_ FDSet *fds = NULL;
4036 int r, n_fd_passed, loop_nr = -1, ret = EXIT_FAILURE;
4037 char veth_name[IFNAMSIZ] = "";
4038 bool secondary = false, remove_subvol = false;
4039 pid_t pid = 0;
4040 union in_addr_union exposed = {};
4041 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4042 bool interactive, veth_created = false;
4043
4044 log_parse_environment();
4045 log_open();
4046
4047 /* Make sure rename_process() in the stub init process can work */
4048 saved_argv = argv;
4049 saved_argc = argc;
4050
4051 r = parse_argv(argc, argv);
4052 if (r <= 0)
4053 goto finish;
4054
4055 if (geteuid() != 0) {
4056 log_error("Need to be root.");
4057 r = -EPERM;
4058 goto finish;
4059 }
4060 r = determine_names();
4061 if (r < 0)
4062 goto finish;
4063
4064 r = load_settings();
4065 if (r < 0)
4066 goto finish;
4067
4068 r = verify_arguments();
4069 if (r < 0)
4070 goto finish;
4071
4072 n_fd_passed = sd_listen_fds(false);
4073 if (n_fd_passed > 0) {
4074 r = fdset_new_listen_fds(&fds, false);
4075 if (r < 0) {
4076 log_error_errno(r, "Failed to collect file descriptors: %m");
4077 goto finish;
4078 }
4079 }
4080
4081 if (arg_directory) {
4082 assert(!arg_image);
4083
4084 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4085 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4086 r = -EINVAL;
4087 goto finish;
4088 }
4089
4090 if (arg_ephemeral) {
4091 _cleanup_free_ char *np = NULL;
4092
4093 /* If the specified path is a mount point we
4094 * generate the new snapshot immediately
4095 * inside it under a random name. However if
4096 * the specified is not a mount point we
4097 * create the new snapshot in the parent
4098 * directory, just next to it. */
4099 r = path_is_mount_point(arg_directory, 0);
4100 if (r < 0) {
4101 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4102 goto finish;
4103 }
4104 if (r > 0)
4105 r = tempfn_random_child(arg_directory, "machine.", &np);
4106 else
4107 r = tempfn_random(arg_directory, "machine.", &np);
4108 if (r < 0) {
4109 log_error_errno(r, "Failed to generate name for snapshot: %m");
4110 goto finish;
4111 }
4112
4113 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4114 if (r < 0) {
4115 log_error_errno(r, "Failed to lock %s: %m", np);
4116 goto finish;
4117 }
4118
4119 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
4120 if (r < 0) {
4121 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4122 goto finish;
4123 }
4124
4125 free(arg_directory);
4126 arg_directory = np;
4127 np = NULL;
4128
4129 remove_subvol = true;
4130
4131 } else {
4132 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4133 if (r == -EBUSY) {
4134 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4135 goto finish;
4136 }
4137 if (r < 0) {
4138 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4139 goto finish;
4140 }
4141
4142 if (arg_template) {
4143 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
4144 if (r == -EEXIST) {
4145 if (!arg_quiet)
4146 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4147 } else if (r < 0) {
4148 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4149 goto finish;
4150 } else {
4151 if (!arg_quiet)
4152 log_info("Populated %s from template %s.", arg_directory, arg_template);
4153 }
4154 }
4155 }
4156
4157 if (arg_start_mode == START_BOOT) {
4158 if (path_is_os_tree(arg_directory) <= 0) {
4159 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4160 r = -EINVAL;
4161 goto finish;
4162 }
4163 } else {
4164 const char *p;
4165
4166 p = strjoina(arg_directory, "/usr/");
4167 if (laccess(p, F_OK) < 0) {
4168 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
4169 r = -EINVAL;
4170 goto finish;
4171 }
4172 }
4173
4174 } else {
4175 char template[] = "/tmp/nspawn-root-XXXXXX";
4176
4177 assert(arg_image);
4178 assert(!arg_template);
4179
4180 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4181 if (r == -EBUSY) {
4182 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4183 goto finish;
4184 }
4185 if (r < 0) {
4186 r = log_error_errno(r, "Failed to create image lock: %m");
4187 goto finish;
4188 }
4189
4190 if (!mkdtemp(template)) {
4191 log_error_errno(errno, "Failed to create temporary directory: %m");
4192 r = -errno;
4193 goto finish;
4194 }
4195
4196 arg_directory = strdup(template);
4197 if (!arg_directory) {
4198 r = log_oom();
4199 goto finish;
4200 }
4201
4202 image_fd = setup_image(&device_path, &loop_nr);
4203 if (image_fd < 0) {
4204 r = image_fd;
4205 goto finish;
4206 }
4207
4208 r = dissect_image(image_fd,
4209 &root_device, &root_device_rw,
4210 &home_device, &home_device_rw,
4211 &srv_device, &srv_device_rw,
4212 &esp_device,
4213 &secondary);
4214 if (r < 0)
4215 goto finish;
4216 }
4217
4218 r = custom_mounts_prepare();
4219 if (r < 0)
4220 goto finish;
4221
4222 interactive =
4223 isatty(STDIN_FILENO) > 0 &&
4224 isatty(STDOUT_FILENO) > 0;
4225
4226 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4227 if (master < 0) {
4228 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4229 goto finish;
4230 }
4231
4232 r = ptsname_malloc(master, &console);
4233 if (r < 0) {
4234 r = log_error_errno(r, "Failed to determine tty name: %m");
4235 goto finish;
4236 }
4237
4238 if (arg_selinux_apifs_context) {
4239 r = mac_selinux_apply(console, arg_selinux_apifs_context);
4240 if (r < 0)
4241 goto finish;
4242 }
4243
4244 if (unlockpt(master) < 0) {
4245 r = log_error_errno(errno, "Failed to unlock tty: %m");
4246 goto finish;
4247 }
4248
4249 if (!arg_quiet)
4250 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4251 arg_machine, arg_image ?: arg_directory);
4252
4253 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4254
4255 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4256 r = log_error_errno(errno, "Failed to become subreaper: %m");
4257 goto finish;
4258 }
4259
4260 for (;;) {
4261 r = run(master,
4262 console,
4263 root_device, root_device_rw,
4264 home_device, home_device_rw,
4265 srv_device, srv_device_rw,
4266 esp_device,
4267 interactive, secondary,
4268 fds,
4269 veth_name, &veth_created,
4270 &exposed,
4271 &pid, &ret);
4272 if (r <= 0)
4273 break;
4274 }
4275
4276finish:
4277 sd_notify(false,
4278 "STOPPING=1\n"
4279 "STATUS=Terminating...");
4280
4281 if (pid > 0)
4282 kill(pid, SIGKILL);
4283
4284 /* Try to flush whatever is still queued in the pty */
4285 if (master >= 0)
4286 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
4287
4288 loop_remove(loop_nr, &image_fd);
4289
4290 if (remove_subvol && arg_directory) {
4291 int k;
4292
4293 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
4294 if (k < 0)
4295 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4296 }
4297
4298 if (arg_machine) {
4299 const char *p;
4300
4301 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4302 (void) rm_rf(p, REMOVE_ROOT);
4303 }
4304
4305 expose_port_flush(arg_expose_ports, &exposed);
4306
4307 if (veth_created)
4308 (void) remove_veth_links(veth_name, arg_network_veth_extra);
4309 (void) remove_bridge(arg_network_zone);
4310
4311 free(arg_directory);
4312 free(arg_template);
4313 free(arg_image);
4314 free(arg_machine);
4315 free(arg_user);
4316 free(arg_chdir);
4317 strv_free(arg_setenv);
4318 free(arg_network_bridge);
4319 strv_free(arg_network_interfaces);
4320 strv_free(arg_network_macvlan);
4321 strv_free(arg_network_ipvlan);
4322 strv_free(arg_network_veth_extra);
4323 strv_free(arg_parameters);
4324 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4325 expose_port_free_all(arg_expose_ports);
4326
4327 return r < 0 ? EXIT_FAILURE : ret;
4328}