]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
tty-ask-password: Split out password sending
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
8fe0087e
LP
22#ifdef HAVE_BLKID
23#include <blkid/blkid.h>
24#endif
88213476 25#include <errno.h>
88213476 26#include <getopt.h>
1b9e5b12 27#include <linux/loop.h>
8fe0087e 28#include <sched.h>
24fb1112
LP
29#ifdef HAVE_SECCOMP
30#include <seccomp.h>
31#endif
8fe0087e
LP
32#ifdef HAVE_SELINUX
33#include <selinux/selinux.h>
1b9e5b12 34#endif
8fe0087e
LP
35#include <signal.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#include <sys/file.h>
40#include <sys/mount.h>
41#include <sys/personality.h>
42#include <sys/prctl.h>
43#include <sys/types.h>
44#include <unistd.h>
1b9e5b12 45
1f0cd86b 46#include "sd-daemon.h"
1f0cd86b 47#include "sd-id128.h"
8fe0087e 48
b5efdb8a 49#include "alloc-util.h"
8fe0087e
LP
50#include "barrier.h"
51#include "base-filesystem.h"
52#include "blkid-util.h"
53#include "btrfs-util.h"
8fe0087e 54#include "cap-list.h"
430f0182 55#include "capability-util.h"
04d391da 56#include "cgroup-util.h"
8fe0087e 57#include "copy.h"
4fc9982c 58#include "dev-setup.h"
8fe0087e 59#include "env-util.h"
3ffd4af2 60#include "fd-util.h"
842f3b0f 61#include "fdset.h"
a5c32cff 62#include "fileio.h"
8fe0087e 63#include "formats-util.h"
f4f15635 64#include "fs-util.h"
1b9e5b12 65#include "gpt.h"
8fe0087e
LP
66#include "hostname-util.h"
67#include "log.h"
68#include "loopback-setup.h"
1b9cebf6 69#include "machine-image.h"
8fe0087e
LP
70#include "macro.h"
71#include "missing.h"
72#include "mkdir.h"
4349cd7c 73#include "mount-util.h"
8fe0087e 74#include "netlink-util.h"
07630cea
LP
75#include "nspawn-cgroup.h"
76#include "nspawn-expose-ports.h"
77#include "nspawn-mount.h"
78#include "nspawn-network.h"
79#include "nspawn-register.h"
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
6bedfcbb 82#include "parse-util.h"
8fe0087e 83#include "path-util.h"
0b452006 84#include "process-util.h"
8fe0087e
LP
85#include "ptyfwd.h"
86#include "random-util.h"
87#include "rm-rf.h"
e9642be2
LP
88#ifdef HAVE_SECCOMP
89#include "seccomp-util.h"
90#endif
8fe0087e 91#include "signal-util.h"
2583fbea 92#include "socket-util.h"
8fcde012 93#include "stat-util.h"
15a5e950 94#include "stdio-util.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
113cea80
DH
/* Outcome of a container run. The code that produces and consumes these values
 * is outside this part of the file; judging by the names they distinguish a
 * container that terminated from one that asked to be rebooted (TODO confirm
 * against the users of this type). */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
107
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j) in parse_argv().
 * The "try-guest"/"try-host" spellings map to LINK_GUEST/LINK_HOST with
 * arg_link_journal_try set in addition. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default) */
        LINK_HOST,      /* --link-journal=host, --link-journal=try-host */
        LINK_GUEST      /* --link-journal=guest, --link-journal=try-guest, -j */
} LinkJournal;
88213476
LP
114
/* Container configuration. Unless noted otherwise these are filled in from the
 * command line by parse_argv(). */
static char *arg_directory = NULL;                      /* -D/--directory= */
static char *arg_template = NULL;                       /* --template= */
static char *arg_user = NULL;                           /* -u/--user= */
static sd_id128_t arg_uuid = {};                        /* --uuid= */
static char *arg_machine = NULL;                        /* -M/--machine= */
static const char *arg_selinux_context = NULL;          /* -Z; assigned straight from optarg, not copied */
static const char *arg_selinux_apifs_context = NULL;    /* -L; assigned straight from optarg, not copied */
static const char *arg_slice = NULL;                    /* -S; assigned straight from optarg, not copied */
static bool arg_private_network = false;                /* --private-network, also implied by the other --network-* options */
static bool arg_read_only = false;                      /* --read-only */
static bool arg_boot = false;                           /* -b/--boot */
static bool arg_ephemeral = false;                      /* -x/--ephemeral */
static LinkJournal arg_link_journal = LINK_AUTO;        /* --link-journal=, -j */
static bool arg_link_journal_try = false;               /* true for -j and the "try-*" journal modes */

/* Capability bit mask. This initializer is the default set; parse_argv() ORs in
 * what --capability= requested (plus CAP_NET_ADMIN when private networking is
 * on) and then clears what --drop-capability= requested. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
static CustomMount *arg_custom_mounts = NULL;           /* --bind=, --bind-ro=, --tmpfs=, --overlay=, --overlay-ro= */
static unsigned arg_n_custom_mounts = 0;                /* number of entries in arg_custom_mounts */
static char **arg_setenv = NULL;                        /* --setenv= */
static bool arg_quiet = false;                          /* -q/--quiet */
static bool arg_share_system = false;                   /* --share-system */
static bool arg_register = true;                        /* --register=; forced off by --share-system */
static bool arg_keep_unit = false;                      /* --keep-unit */
static char **arg_network_interfaces = NULL;            /* --network-interface= */
static char **arg_network_macvlan = NULL;               /* --network-macvlan= */
static char **arg_network_ipvlan = NULL;                /* --network-ipvlan= */
static bool arg_network_veth = false;                   /* -n/--network-veth, also set by --network-bridge= */
static char **arg_network_veth_extra = NULL;            /* --network-veth-extra= */
static char *arg_network_bridge = NULL;                 /* --network-bridge= */
static unsigned long arg_personality = PERSONALITY_INVALID; /* --personality= */
static char *arg_image = NULL;                          /* -i/--image= */
static VolatileMode arg_volatile_mode = VOLATILE_NO;    /* --volatile[=MODE] */
static ExposePort *arg_expose_ports = NULL;             /* -p/--port= */
static char **arg_property = NULL;                      /* --property= */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; /* --private-users=UIDBASE[:NUIDS] */
static bool arg_userns = false;                         /* set by --private-users */
static int arg_kill_signal = 0;                         /* --kill-signal=; verify_arguments() picks SIGRTMIN+3 when booting and unset */
static bool arg_unified_cgroup_hierarchy = false;       /* set by detect_unified_cgroup_hierarchy() */
static SettingsMask arg_settings_mask = 0;              /* one SETTING_* bit per setting given explicitly on the command line */
static int arg_settings_trusted = -1;                   /* --settings=: true for "trusted", false for "no", -1 otherwise */
static char **arg_parameters = NULL;                    /* copy of the arguments following the options */
static const char *arg_container_service_name = "systemd-nspawn"; /* overridden by $SYSTEMD_NSPAWN_CONTAINER_SERVICE */
88213476 182
601185b4 183static void help(void) {
88213476
LP
184 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
185 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
186 " -h --help Show this help\n"
187 " --version Print version string\n"
69c79d3c 188 " -q --quiet Do not show status information\n"
1b9e5b12 189 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
190 " --template=PATH Initialize root directory from template directory,\n"
191 " if missing\n"
192 " -x --ephemeral Run container with snapshot of root directory, and\n"
193 " remove it after exit\n"
194 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
195 " -b --boot Boot up full system (i.e. invoke init)\n"
196 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 197 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 198 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 199 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 200 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
201 " --private-users[=UIDBASE[:NUIDS]]\n"
202 " Run within user namespace\n"
69c79d3c
LP
203 " --private-network Disable network in container\n"
204 " --network-interface=INTERFACE\n"
205 " Assign an existing network interface to the\n"
206 " container\n"
c74e630d
LP
207 " --network-macvlan=INTERFACE\n"
208 " Create a macvlan network interface based on an\n"
209 " existing network interface to the container\n"
4bbfe7ad
TG
210 " --network-ipvlan=INTERFACE\n"
211 " Create a ipvlan network interface based on an\n"
212 " existing network interface to the container\n"
a8eaaee7 213 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 214 " and container\n"
f6d6bad1
LP
215 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
216 " Add an additional virtual Ethernet link between\n"
217 " host and container\n"
ab046dde 218 " --network-bridge=INTERFACE\n"
a8eaaee7 219 " Add a virtual Ethernet connection between host\n"
ab046dde
TG
220 " and container and add it to an existing bridge on\n"
221 " the host\n"
6d0b55c2 222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 223 " Expose a container IP port on the host\n"
82adf6af
LP
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
a8828ed9
DW
230 " --capability=CAP In addition to the default, retain specified\n"
231 " capability\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 237 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
238 " --bind=PATH[:PATH[:OPTIONS]]\n"
239 " Bind mount a file or directory from the host into\n"
a8828ed9 240 " the container\n"
5e5bfa6e
EY
241 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
242 " Similar, but creates a read-only bind mount\n"
06c17c39 243 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
244 " --overlay=PATH[:PATH...]:PATH\n"
245 " Create an overlay mount from the host to \n"
246 " the container\n"
247 " --overlay-ro=PATH[:PATH...]:PATH\n"
248 " Similar, but creates a read-only overlay mount\n"
284c0b91 249 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 250 " --share-system Share system namespaces with host\n"
eb91eb18 251 " --register=BOOLEAN Register container as machine\n"
89f7c846 252 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 253 " the service unit nspawn is running in\n"
6d0b55c2 254 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 255 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 256 , program_invocation_short_name);
88213476
LP
257}
258
5a8af538
LP
259
260static int custom_mounts_prepare(void) {
261 unsigned i;
262 int r;
263
264 /* Ensure the mounts are applied prefix first. */
265 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
266
267 /* Allocate working directories for the overlay file systems that need it */
268 for (i = 0; i < arg_n_custom_mounts; i++) {
269 CustomMount *m = &arg_custom_mounts[i];
270
825d5287
RM
271 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
272 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
273 return -EINVAL;
274 }
275
5a8af538
LP
276 if (m->type != CUSTOM_MOUNT_OVERLAY)
277 continue;
278
279 if (m->work_dir)
280 continue;
281
282 if (m->read_only)
283 continue;
284
14bcf25c 285 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
286 if (r < 0)
287 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
288 }
289
290 return 0;
291}
292
efdb0237
LP
293static int detect_unified_cgroup_hierarchy(void) {
294 const char *e;
295 int r;
296
297 /* Allow the user to control whether the unified hierarchy is used */
298 e = getenv("UNIFIED_CGROUP_HIERARCHY");
299 if (e) {
300 r = parse_boolean(e);
301 if (r < 0)
302 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
303
304 arg_unified_cgroup_hierarchy = r;
305 return 0;
306 }
307
308 /* Otherwise inherit the default from the host system */
309 r = cg_unified();
310 if (r < 0)
311 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
312
313 arg_unified_cgroup_hierarchy = r;
314 return 0;
315}
316
88213476
LP
317static int parse_argv(int argc, char *argv[]) {
318
a41fe3a2 319 enum {
acbeb427
ZJS
320 ARG_VERSION = 0x100,
321 ARG_PRIVATE_NETWORK,
bc2f673e 322 ARG_UUID,
5076f0cc 323 ARG_READ_ONLY,
57fb9fb5 324 ARG_CAPABILITY,
420c7379 325 ARG_DROP_CAPABILITY,
17fe0523
LP
326 ARG_LINK_JOURNAL,
327 ARG_BIND,
f4889f65 328 ARG_BIND_RO,
06c17c39 329 ARG_TMPFS,
5a8af538
LP
330 ARG_OVERLAY,
331 ARG_OVERLAY_RO,
f4889f65 332 ARG_SETENV,
eb91eb18 333 ARG_SHARE_SYSTEM,
89f7c846 334 ARG_REGISTER,
aa28aefe 335 ARG_KEEP_UNIT,
69c79d3c 336 ARG_NETWORK_INTERFACE,
c74e630d 337 ARG_NETWORK_MACVLAN,
4bbfe7ad 338 ARG_NETWORK_IPVLAN,
ab046dde 339 ARG_NETWORK_BRIDGE,
f6d6bad1 340 ARG_NETWORK_VETH_EXTRA,
6afc95b7 341 ARG_PERSONALITY,
4d9f07b4 342 ARG_VOLATILE,
ec16945e 343 ARG_TEMPLATE,
f36933fe 344 ARG_PROPERTY,
6dac160c 345 ARG_PRIVATE_USERS,
c6c8f6e2 346 ARG_KILL_SIGNAL,
f757855e 347 ARG_SETTINGS,
a41fe3a2
LP
348 };
349
88213476 350 static const struct option options[] = {
aa28aefe
LP
351 { "help", no_argument, NULL, 'h' },
352 { "version", no_argument, NULL, ARG_VERSION },
353 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
354 { "template", required_argument, NULL, ARG_TEMPLATE },
355 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
356 { "user", required_argument, NULL, 'u' },
357 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
358 { "boot", no_argument, NULL, 'b' },
359 { "uuid", required_argument, NULL, ARG_UUID },
360 { "read-only", no_argument, NULL, ARG_READ_ONLY },
361 { "capability", required_argument, NULL, ARG_CAPABILITY },
362 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
363 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
364 { "bind", required_argument, NULL, ARG_BIND },
365 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 366 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
367 { "overlay", required_argument, NULL, ARG_OVERLAY },
368 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
369 { "machine", required_argument, NULL, 'M' },
370 { "slice", required_argument, NULL, 'S' },
371 { "setenv", required_argument, NULL, ARG_SETENV },
372 { "selinux-context", required_argument, NULL, 'Z' },
373 { "selinux-apifs-context", required_argument, NULL, 'L' },
374 { "quiet", no_argument, NULL, 'q' },
375 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
376 { "register", required_argument, NULL, ARG_REGISTER },
377 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
378 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 379 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 380 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 381 { "network-veth", no_argument, NULL, 'n' },
f6d6bad1 382 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
ab046dde 383 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 384 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 385 { "image", required_argument, NULL, 'i' },
4d9f07b4 386 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 387 { "port", required_argument, NULL, 'p' },
f36933fe 388 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 389 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 390 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 391 { "settings", required_argument, NULL, ARG_SETTINGS },
eb9da376 392 {}
88213476
LP
393 };
394
9444b1f2 395 int c, r;
6aadfa4c 396 const char *p, *e;
a42c8b54 397 uint64_t plus = 0, minus = 0;
f757855e 398 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
399
400 assert(argc >= 0);
401 assert(argv);
402
0dfaa006 403 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
404
405 switch (c) {
406
407 case 'h':
601185b4
ZJS
408 help();
409 return 0;
88213476 410
acbeb427 411 case ARG_VERSION:
3f6fd1ba 412 return version();
acbeb427 413
88213476 414 case 'D':
0f03c2a4 415 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 416 if (r < 0)
0f03c2a4 417 return r;
ec16945e
LP
418 break;
419
420 case ARG_TEMPLATE:
0f03c2a4 421 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 422 if (r < 0)
0f03c2a4 423 return r;
88213476
LP
424 break;
425
1b9e5b12 426 case 'i':
0f03c2a4 427 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 428 if (r < 0)
0f03c2a4 429 return r;
ec16945e
LP
430 break;
431
432 case 'x':
433 arg_ephemeral = true;
1b9e5b12
LP
434 break;
435
687d0825 436 case 'u':
2fc09a9c
DM
437 r = free_and_strdup(&arg_user, optarg);
438 if (r < 0)
7027ff61 439 return log_oom();
687d0825 440
f757855e 441 arg_settings_mask |= SETTING_USER;
687d0825
MV
442 break;
443
ab046dde 444 case ARG_NETWORK_BRIDGE:
f757855e
LP
445 r = free_and_strdup(&arg_network_bridge, optarg);
446 if (r < 0)
447 return log_oom();
ab046dde
TG
448
449 /* fall through */
450
0dfaa006 451 case 'n':
69c79d3c
LP
452 arg_network_veth = true;
453 arg_private_network = true;
f757855e 454 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
455 break;
456
f6d6bad1
LP
457 case ARG_NETWORK_VETH_EXTRA:
458 r = veth_extra_parse(&arg_network_veth_extra, optarg);
459 if (r < 0)
460 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
461
462 arg_private_network = true;
463 arg_settings_mask |= SETTING_NETWORK;
464 break;
465
aa28aefe 466 case ARG_NETWORK_INTERFACE:
c74e630d
LP
467 if (strv_extend(&arg_network_interfaces, optarg) < 0)
468 return log_oom();
469
470 arg_private_network = true;
f757855e 471 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
472 break;
473
474 case ARG_NETWORK_MACVLAN:
475 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
476 return log_oom();
477
4bbfe7ad 478 arg_private_network = true;
f757855e 479 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
480 break;
481
482 case ARG_NETWORK_IPVLAN:
483 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
484 return log_oom();
485
aa28aefe
LP
486 /* fall through */
487
ff01d048
LP
488 case ARG_PRIVATE_NETWORK:
489 arg_private_network = true;
f757855e 490 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
491 break;
492
0f0dbc46
LP
493 case 'b':
494 arg_boot = true;
f757855e 495 arg_settings_mask |= SETTING_BOOT;
0f0dbc46
LP
496 break;
497
144f0fc0 498 case ARG_UUID:
9444b1f2
LP
499 r = sd_id128_from_string(optarg, &arg_uuid);
500 if (r < 0) {
aa96c6cb 501 log_error("Invalid UUID: %s", optarg);
9444b1f2 502 return r;
aa96c6cb 503 }
f757855e
LP
504
505 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 506 break;
aa96c6cb 507
9444b1f2 508 case 'S':
c74e630d 509 arg_slice = optarg;
144f0fc0
LP
510 break;
511
7027ff61 512 case 'M':
c1521918 513 if (isempty(optarg))
97b11eed 514 arg_machine = mfree(arg_machine);
c1521918 515 else {
0c3c4284 516 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
517 log_error("Invalid machine name: %s", optarg);
518 return -EINVAL;
519 }
7027ff61 520
0c3c4284
LP
521 r = free_and_strdup(&arg_machine, optarg);
522 if (r < 0)
eb91eb18
LP
523 return log_oom();
524
525 break;
526 }
7027ff61 527
82adf6af
LP
528 case 'Z':
529 arg_selinux_context = optarg;
a8828ed9
DW
530 break;
531
82adf6af
LP
532 case 'L':
533 arg_selinux_apifs_context = optarg;
a8828ed9
DW
534 break;
535
bc2f673e
LP
536 case ARG_READ_ONLY:
537 arg_read_only = true;
f757855e 538 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
539 break;
540
420c7379
LP
541 case ARG_CAPABILITY:
542 case ARG_DROP_CAPABILITY: {
6cbe4ed1
SS
543 p = optarg;
544 for(;;) {
545 _cleanup_free_ char *t = NULL;
5076f0cc 546
6cbe4ed1
SS
547 r = extract_first_word(&p, &t, ",", 0);
548 if (r < 0)
549 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 550
6cbe4ed1
SS
551 if (r == 0)
552 break;
5076f0cc 553
39ed67d1
LP
554 if (streq(t, "all")) {
555 if (c == ARG_CAPABILITY)
a42c8b54 556 plus = (uint64_t) -1;
39ed67d1 557 else
a42c8b54 558 minus = (uint64_t) -1;
39ed67d1 559 } else {
2822da4f
LP
560 int cap;
561
562 cap = capability_from_name(t);
563 if (cap < 0) {
39ed67d1
LP
564 log_error("Failed to parse capability %s.", t);
565 return -EINVAL;
566 }
567
568 if (c == ARG_CAPABILITY)
a42c8b54 569 plus |= 1ULL << (uint64_t) cap;
39ed67d1 570 else
a42c8b54 571 minus |= 1ULL << (uint64_t) cap;
5076f0cc 572 }
5076f0cc
LP
573 }
574
f757855e 575 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
576 break;
577 }
578
57fb9fb5
LP
579 case 'j':
580 arg_link_journal = LINK_GUEST;
574edc90 581 arg_link_journal_try = true;
57fb9fb5
LP
582 break;
583
584 case ARG_LINK_JOURNAL:
53e438e3 585 if (streq(optarg, "auto")) {
57fb9fb5 586 arg_link_journal = LINK_AUTO;
53e438e3
LP
587 arg_link_journal_try = false;
588 } else if (streq(optarg, "no")) {
57fb9fb5 589 arg_link_journal = LINK_NO;
53e438e3
LP
590 arg_link_journal_try = false;
591 } else if (streq(optarg, "guest")) {
57fb9fb5 592 arg_link_journal = LINK_GUEST;
53e438e3
LP
593 arg_link_journal_try = false;
594 } else if (streq(optarg, "host")) {
57fb9fb5 595 arg_link_journal = LINK_HOST;
53e438e3
LP
596 arg_link_journal_try = false;
597 } else if (streq(optarg, "try-guest")) {
574edc90
MP
598 arg_link_journal = LINK_GUEST;
599 arg_link_journal_try = true;
600 } else if (streq(optarg, "try-host")) {
601 arg_link_journal = LINK_HOST;
602 arg_link_journal_try = true;
603 } else {
57fb9fb5
LP
604 log_error("Failed to parse link journal mode %s", optarg);
605 return -EINVAL;
606 }
607
608 break;
609
17fe0523 610 case ARG_BIND:
f757855e
LP
611 case ARG_BIND_RO:
612 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
613 if (r < 0)
614 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 615
f757855e 616 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 617 break;
06c17c39 618
f757855e
LP
619 case ARG_TMPFS:
620 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 623
f757855e 624 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 625 break;
5a8af538
LP
626
627 case ARG_OVERLAY:
628 case ARG_OVERLAY_RO: {
629 _cleanup_free_ char *upper = NULL, *destination = NULL;
630 _cleanup_strv_free_ char **lower = NULL;
631 CustomMount *m;
632 unsigned n = 0;
633 char **i;
634
62f9f39a
RM
635 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
636 if (r == -ENOMEM)
06c17c39 637 return log_oom();
62f9f39a
RM
638 else if (r < 0) {
639 log_error("Invalid overlay specification: %s", optarg);
640 return r;
641 }
06c17c39 642
5a8af538
LP
643 STRV_FOREACH(i, lower) {
644 if (!path_is_absolute(*i)) {
645 log_error("Overlay path %s is not absolute.", *i);
646 return -EINVAL;
647 }
648
649 n++;
650 }
651
652 if (n < 2) {
653 log_error("--overlay= needs at least two colon-separated directories specified.");
654 return -EINVAL;
655 }
656
657 if (n == 2) {
658 /* If two parameters are specified,
659 * the first one is the lower, the
660 * second one the upper directory. And
af86c440
ZJS
661 * we'll also define the destination
662 * mount point the same as the upper. */
5a8af538
LP
663 upper = lower[1];
664 lower[1] = NULL;
665
666 destination = strdup(upper);
667 if (!destination)
668 return log_oom();
669
670 } else {
671 upper = lower[n - 2];
672 destination = lower[n - 1];
673 lower[n - 2] = NULL;
674 }
675
f757855e 676 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
677 if (!m)
678 return log_oom();
679
680 m->destination = destination;
681 m->source = upper;
682 m->lower = lower;
683 m->read_only = c == ARG_OVERLAY_RO;
684
685 upper = destination = NULL;
686 lower = NULL;
06c17c39 687
f757855e 688 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
689 break;
690 }
691
f4889f65
LP
692 case ARG_SETENV: {
693 char **n;
694
695 if (!env_assignment_is_valid(optarg)) {
696 log_error("Environment variable assignment '%s' is not valid.", optarg);
697 return -EINVAL;
698 }
699
700 n = strv_env_set(arg_setenv, optarg);
701 if (!n)
702 return log_oom();
703
704 strv_free(arg_setenv);
705 arg_setenv = n;
f757855e
LP
706
707 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
708 break;
709 }
710
284c0b91
LP
711 case 'q':
712 arg_quiet = true;
713 break;
714
8a96d94e
LP
715 case ARG_SHARE_SYSTEM:
716 arg_share_system = true;
717 break;
718
eb91eb18
LP
719 case ARG_REGISTER:
720 r = parse_boolean(optarg);
721 if (r < 0) {
722 log_error("Failed to parse --register= argument: %s", optarg);
723 return r;
724 }
725
726 arg_register = r;
727 break;
728
89f7c846
LP
729 case ARG_KEEP_UNIT:
730 arg_keep_unit = true;
731 break;
732
6afc95b7
LP
733 case ARG_PERSONALITY:
734
ac45f971 735 arg_personality = personality_from_string(optarg);
050f7277 736 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
737 log_error("Unknown or unsupported personality '%s'.", optarg);
738 return -EINVAL;
739 }
740
f757855e 741 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
742 break;
743
4d9f07b4
LP
744 case ARG_VOLATILE:
745
746 if (!optarg)
f757855e 747 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 748 else {
f757855e 749 VolatileMode m;
4d9f07b4 750
f757855e
LP
751 m = volatile_mode_from_string(optarg);
752 if (m < 0) {
753 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 754 return -EINVAL;
f757855e
LP
755 } else
756 arg_volatile_mode = m;
6d0b55c2
LP
757 }
758
f757855e
LP
759 arg_settings_mask |= SETTING_VOLATILE_MODE;
760 break;
6d0b55c2 761
f757855e
LP
762 case 'p':
763 r = expose_port_parse(&arg_expose_ports, optarg);
764 if (r == -EEXIST)
765 return log_error_errno(r, "Duplicate port specification: %s", optarg);
766 if (r < 0)
767 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 768
f757855e 769 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 770 break;
6d0b55c2 771
f36933fe
LP
772 case ARG_PROPERTY:
773 if (strv_extend(&arg_property, optarg) < 0)
774 return log_oom();
775
776 break;
777
6dac160c
LP
778 case ARG_PRIVATE_USERS:
779 if (optarg) {
780 _cleanup_free_ char *buffer = NULL;
781 const char *range, *shift;
782
783 range = strchr(optarg, ':');
784 if (range) {
785 buffer = strndup(optarg, range - optarg);
786 if (!buffer)
787 return log_oom();
788 shift = buffer;
789
790 range++;
791 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
792 log_error("Failed to parse UID range: %s", range);
793 return -EINVAL;
794 }
795 } else
796 shift = optarg;
797
798 if (parse_uid(shift, &arg_uid_shift) < 0) {
799 log_error("Failed to parse UID: %s", optarg);
800 return -EINVAL;
801 }
802 }
803
804 arg_userns = true;
805 break;
806
c6c8f6e2
LP
807 case ARG_KILL_SIGNAL:
808 arg_kill_signal = signal_from_string_try_harder(optarg);
809 if (arg_kill_signal < 0) {
810 log_error("Cannot parse signal: %s", optarg);
811 return -EINVAL;
812 }
813
f757855e
LP
814 arg_settings_mask |= SETTING_KILL_SIGNAL;
815 break;
816
817 case ARG_SETTINGS:
818
819 /* no → do not read files
820 * yes → read files, do not override cmdline, trust only subset
821 * override → read files, override cmdline, trust only subset
822 * trusted → read files, do not override cmdline, trust all
823 */
824
825 r = parse_boolean(optarg);
826 if (r < 0) {
827 if (streq(optarg, "trusted")) {
828 mask_all_settings = false;
829 mask_no_settings = false;
830 arg_settings_trusted = true;
831
832 } else if (streq(optarg, "override")) {
833 mask_all_settings = false;
834 mask_no_settings = true;
835 arg_settings_trusted = -1;
836 } else
837 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
838 } else if (r > 0) {
839 /* yes */
840 mask_all_settings = false;
841 mask_no_settings = false;
842 arg_settings_trusted = -1;
843 } else {
844 /* no */
845 mask_all_settings = true;
846 mask_no_settings = false;
847 arg_settings_trusted = false;
848 }
849
c6c8f6e2
LP
850 break;
851
88213476
LP
852 case '?':
853 return -EINVAL;
854
855 default:
eb9da376 856 assert_not_reached("Unhandled option");
88213476 857 }
88213476 858
eb91eb18
LP
859 if (arg_share_system)
860 arg_register = false;
861
862 if (arg_boot && arg_share_system) {
863 log_error("--boot and --share-system may not be combined.");
864 return -EINVAL;
865 }
866
89f7c846
LP
867 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
868 log_error("--keep-unit may not be used when invoked from a user session.");
869 return -EINVAL;
870 }
871
1b9e5b12
LP
872 if (arg_directory && arg_image) {
873 log_error("--directory= and --image= may not be combined.");
874 return -EINVAL;
875 }
876
ec16945e
LP
877 if (arg_template && arg_image) {
878 log_error("--template= and --image= may not be combined.");
879 return -EINVAL;
880 }
881
882 if (arg_template && !(arg_directory || arg_machine)) {
883 log_error("--template= needs --directory= or --machine=.");
884 return -EINVAL;
885 }
886
887 if (arg_ephemeral && arg_template) {
888 log_error("--ephemeral and --template= may not be combined.");
889 return -EINVAL;
890 }
891
892 if (arg_ephemeral && arg_image) {
893 log_error("--ephemeral and --image= may not be combined.");
894 return -EINVAL;
895 }
896
df9a75e4
LP
897 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
898 log_error("--ephemeral and --link-journal= may not be combined.");
899 return -EINVAL;
900 }
901
f757855e
LP
902 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
903 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
904
905 if (argc > optind) {
906 arg_parameters = strv_copy(argv + optind);
907 if (!arg_parameters)
908 return log_oom();
909
910 arg_settings_mask |= SETTING_BOOT;
911 }
912
913 /* Load all settings from .nspawn files */
914 if (mask_no_settings)
915 arg_settings_mask = 0;
916
917 /* Don't load any settings from .nspawn files */
918 if (mask_all_settings)
919 arg_settings_mask = _SETTINGS_MASK_ALL;
920
921 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
922
923 r = detect_unified_cgroup_hierarchy();
924 if (r < 0)
925 return r;
926
6aadfa4c
ILG
927 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
928 if (e)
929 arg_container_service_name = e;
930
f757855e
LP
931 return 1;
932}
933
934static int verify_arguments(void) {
935
936 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
937 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
938 return -EINVAL;
939 }
940
6d0b55c2
LP
941 if (arg_expose_ports && !arg_private_network) {
942 log_error("Cannot use --port= without private networking.");
943 return -EINVAL;
944 }
945
c6c8f6e2
LP
946 if (arg_boot && arg_kill_signal <= 0)
947 arg_kill_signal = SIGRTMIN+3;
948
f757855e 949 return 0;
88213476
LP
950}
951
03cfe0d5
LP
952static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
953 assert(p);
954
955 if (!arg_userns)
956 return 0;
957
958 if (uid == UID_INVALID && gid == GID_INVALID)
959 return 0;
960
961 if (uid != UID_INVALID) {
962 uid += arg_uid_shift;
963
964 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
965 return -EOVERFLOW;
966 }
967
968 if (gid != GID_INVALID) {
969 gid += (gid_t) arg_uid_shift;
970
971 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
972 return -EOVERFLOW;
973 }
974
975 if (lchown(p, uid, gid) < 0)
976 return -errno;
b12afc8c
LP
977
978 return 0;
979}
980
03cfe0d5
LP
/* Creates directory `path` below `root` with the given mode and, if it was
 * newly created, passes it to userns_lchown() with uid/gid.
 *
 * An already existing directory counts as success and is left untouched (no
 * ownership change). Returns 0 on success, -errno on mkdir() failure, or
 * whatever userns_lchown() returns. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
993
e58a1277 994static int setup_timezone(const char *dest) {
03cfe0d5
LP
995 _cleanup_free_ char *p = NULL, *q = NULL;
996 const char *where, *check, *what;
d4036145
LP
997 char *z, *y;
998 int r;
f8440af5 999
e58a1277
LP
1000 assert(dest);
1001
1002 /* Fix the timezone, if possible */
d4036145
LP
1003 r = readlink_malloc("/etc/localtime", &p);
1004 if (r < 0) {
1005 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1006 return 0;
1007 }
1008
1009 z = path_startswith(p, "../usr/share/zoneinfo/");
1010 if (!z)
1011 z = path_startswith(p, "/usr/share/zoneinfo/");
1012 if (!z) {
1013 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1014 return 0;
1015 }
1016
03cfe0d5 1017 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1018 r = readlink_malloc(where, &q);
1019 if (r >= 0) {
1020 y = path_startswith(q, "../usr/share/zoneinfo/");
1021 if (!y)
1022 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1023
d4036145
LP
1024 /* Already pointing to the right place? Then do nothing .. */
1025 if (y && streq(y, z))
1026 return 0;
1027 }
1028
03cfe0d5 1029 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1030 check = prefix_roota(dest, check);
03cfe0d5 1031 if (laccess(check, F_OK) < 0) {
d4036145
LP
1032 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1033 return 0;
1034 }
68fb0892 1035
79d80fc1
TG
1036 r = unlink(where);
1037 if (r < 0 && errno != ENOENT) {
56f64d95 1038 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1039 return 0;
1040 }
4d9f07b4 1041
03cfe0d5 1042 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1043 if (symlink(what, where) < 0) {
56f64d95 1044 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1045 return 0;
1046 }
e58a1277 1047
03cfe0d5
LP
1048 r = userns_lchown(where, 0, 0);
1049 if (r < 0)
1050 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1051
e58a1277 1052 return 0;
88213476
LP
1053}
1054
2547bb41 1055static int setup_resolv_conf(const char *dest) {
03cfe0d5 1056 const char *where = NULL;
79d80fc1 1057 int r;
2547bb41
LP
1058
1059 assert(dest);
1060
1061 if (arg_private_network)
1062 return 0;
1063
1064 /* Fix resolv.conf, if possible */
03cfe0d5 1065 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1066
f2068bcc 1067 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1068 if (r < 0) {
68a313c5
LP
1069 /* If the file already exists as symlink, let's
1070 * suppress the warning, under the assumption that
1071 * resolved or something similar runs inside and the
1072 * symlink points there.
1073 *
1074 * If the disk image is read-only, there's also no
1075 * point in complaining.
1076 */
1077 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1078 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1079 return 0;
1080 }
2547bb41 1081
03cfe0d5
LP
1082 r = userns_lchown(where, 0, 0);
1083 if (r < 0)
1084 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1085
2547bb41
LP
1086 return 0;
1087}
1088
9f24adc2 1089static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1090 assert(s);
9f24adc2
LP
1091
1092 snprintf(s, 37,
1093 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1094 SD_ID128_FORMAT_VAL(id));
1095
1096 return s;
1097}
1098
04bc4a3f 1099static int setup_boot_id(const char *dest) {
03cfe0d5 1100 const char *from, *to;
39883f62 1101 sd_id128_t rnd = {};
04bc4a3f
LP
1102 char as_uuid[37];
1103 int r;
1104
eb91eb18
LP
1105 if (arg_share_system)
1106 return 0;
1107
04bc4a3f
LP
1108 /* Generate a new randomized boot ID, so that each boot-up of
1109 * the container gets a new one */
1110
03cfe0d5
LP
1111 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1112 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1113
1114 r = sd_id128_randomize(&rnd);
f647962d
MS
1115 if (r < 0)
1116 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1117
9f24adc2 1118 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1119
4c1fc3e4 1120 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1121 if (r < 0)
1122 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1123
03cfe0d5
LP
1124 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1125 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1126 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1127 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1128
1129 unlink(from);
04bc4a3f
LP
1130 return r;
1131}
1132
e58a1277 1133static int copy_devnodes(const char *dest) {
88213476
LP
1134
1135 static const char devnodes[] =
1136 "null\0"
1137 "zero\0"
1138 "full\0"
1139 "random\0"
1140 "urandom\0"
85614d66
TG
1141 "tty\0"
1142 "net/tun\0";
88213476
LP
1143
1144 const char *d;
e58a1277 1145 int r = 0;
7fd1b19b 1146 _cleanup_umask_ mode_t u;
a258bf26
LP
1147
1148 assert(dest);
124640f1
LP
1149
1150 u = umask(0000);
88213476 1151
03cfe0d5
LP
1152 /* Create /dev/net, so that we can create /dev/net/tun in it */
1153 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1154 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1155
88213476 1156 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1157 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1158 struct stat st;
88213476 1159
7f112f50 1160 from = strappend("/dev/", d);
03cfe0d5 1161 to = prefix_root(dest, from);
88213476
LP
1162
1163 if (stat(from, &st) < 0) {
1164
4a62c710
MS
1165 if (errno != ENOENT)
1166 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1167
a258bf26 1168 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1169
03cfe0d5 1170 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1171 return -EIO;
a258bf26 1172
85614d66 1173 } else {
81f5049b
AC
1174 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1175 if (errno != EPERM)
1176 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1177
1178 /* Some systems abusively restrict mknod but
1179 * allow bind mounts. */
1180 r = touch(to);
1181 if (r < 0)
1182 return log_error_errno(r, "touch (%s) failed: %m", to);
1183 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1184 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1185 }
6278cf60 1186
03cfe0d5
LP
1187 r = userns_lchown(to, 0, 0);
1188 if (r < 0)
1189 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1190 }
88213476
LP
1191 }
1192
e58a1277
LP
1193 return r;
1194}
88213476 1195
03cfe0d5
LP
1196static int setup_pts(const char *dest) {
1197 _cleanup_free_ char *options = NULL;
1198 const char *p;
709f6e46 1199 int r;
03cfe0d5
LP
1200
1201#ifdef HAVE_SELINUX
1202 if (arg_selinux_apifs_context)
1203 (void) asprintf(&options,
3dce8915 1204 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1205 arg_uid_shift + TTY_GID,
1206 arg_selinux_apifs_context);
1207 else
1208#endif
1209 (void) asprintf(&options,
3dce8915 1210 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1211 arg_uid_shift + TTY_GID);
f2d88580 1212
03cfe0d5 1213 if (!options)
f2d88580
LP
1214 return log_oom();
1215
03cfe0d5 1216 /* Mount /dev/pts itself */
cc9fce65 1217 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1218 if (mkdir(p, 0755) < 0)
1219 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1220 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1221 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
709f6e46
MS
1222 r = userns_lchown(p, 0, 0);
1223 if (r < 0)
1224 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1225
1226 /* Create /dev/ptmx symlink */
1227 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1228 if (symlink("pts/ptmx", p) < 0)
1229 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1230 r = userns_lchown(p, 0, 0);
1231 if (r < 0)
1232 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1233
03cfe0d5
LP
1234 /* And fix /dev/pts/ptmx ownership */
1235 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1236 r = userns_lchown(p, 0, 0);
1237 if (r < 0)
1238 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1239
f2d88580
LP
1240 return 0;
1241}
1242
e58a1277 1243static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1244 _cleanup_umask_ mode_t u;
1245 const char *to;
e58a1277 1246 int r;
e58a1277
LP
1247
1248 assert(dest);
1249 assert(console);
1250
1251 u = umask(0000);
1252
03cfe0d5 1253 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1254 if (r < 0)
1255 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1256
a258bf26
LP
1257 /* We need to bind mount the right tty to /dev/console since
1258 * ptys can only exist on pts file systems. To have something
81f5049b 1259 * to bind mount things on we create a empty regular file. */
a258bf26 1260
03cfe0d5 1261 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1262 r = touch(to);
1263 if (r < 0)
1264 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1265
4543768d 1266 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1267 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1268
25ea79fe 1269 return 0;
e58a1277
LP
1270}
1271
1272static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1273 const char *from, *to;
7fd1b19b 1274 _cleanup_umask_ mode_t u;
d9603714 1275 int fd, r;
e58a1277 1276
e58a1277 1277 assert(kmsg_socket >= 0);
a258bf26 1278
e58a1277 1279 u = umask(0000);
a258bf26 1280
03cfe0d5 1281 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1282 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1283 * on the reading side behave very similar to /proc/kmsg,
1284 * their writing side behaves differently from /dev/kmsg in
1285 * that writing blocks when nothing is reading. In order to
1286 * avoid any problems with containers deadlocking due to this
1287 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1288 from = prefix_roota(dest, "/run/kmsg");
1289 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1290
4a62c710 1291 if (mkfifo(from, 0600) < 0)
03cfe0d5 1292 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1293 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1294 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1295
1296 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1297 if (fd < 0)
1298 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1299
e58a1277
LP
1300 /* Store away the fd in the socket, so that it stays open as
1301 * long as we run the child */
3ee897d6 1302 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1303 safe_close(fd);
e58a1277 1304
d9603714
DH
1305 if (r < 0)
1306 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1307
03cfe0d5
LP
1308 /* And now make the FIFO unavailable as /run/kmsg... */
1309 (void) unlink(from);
1310
25ea79fe 1311 return 0;
88213476
LP
1312}
1313
1c4baffc 1314static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1315 union in_addr_union *exposed = userdata;
1316
1317 assert(rtnl);
1318 assert(m);
1319 assert(exposed);
1320
7a8f6325 1321 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1322 return 0;
1323}
1324
3a74cea5 1325static int setup_hostname(void) {
3a74cea5 1326
eb91eb18
LP
1327 if (arg_share_system)
1328 return 0;
1329
605f81a8 1330 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1331 return -errno;
3a74cea5 1332
7027ff61 1333 return 0;
3a74cea5
LP
1334}
1335
57fb9fb5 1336static int setup_journal(const char *directory) {
4d680aee 1337 sd_id128_t machine_id, this_id;
03cfe0d5
LP
1338 _cleanup_free_ char *b = NULL, *d = NULL;
1339 const char *etc_machine_id, *p, *q;
8054d749 1340 bool try;
27407a01 1341 char *id;
57fb9fb5
LP
1342 int r;
1343
df9a75e4
LP
1344 /* Don't link journals in ephemeral mode */
1345 if (arg_ephemeral)
1346 return 0;
1347
8054d749
LP
1348 if (arg_link_journal == LINK_NO)
1349 return 0;
1350
1351 try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
1352
03cfe0d5 1353 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 1354
03cfe0d5 1355 r = read_one_line_file(etc_machine_id, &b);
8054d749 1356 if (r == -ENOENT && try)
27407a01 1357 return 0;
f647962d 1358 else if (r < 0)
03cfe0d5 1359 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 1360
27407a01 1361 id = strstrip(b);
8054d749 1362 if (isempty(id) && try)
27407a01 1363 return 0;
57fb9fb5 1364
27407a01
ZJS
1365 /* Verify validity */
1366 r = sd_id128_from_string(id, &machine_id);
f647962d 1367 if (r < 0)
03cfe0d5 1368 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 1369
4d680aee 1370 r = sd_id128_get_machine(&this_id);
f647962d
MS
1371 if (r < 0)
1372 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1373
1374 if (sd_id128_equal(machine_id, this_id)) {
8054d749 1375 log_full(try ? LOG_WARNING : LOG_ERR,
4d680aee 1376 "Host and machine ids are equal (%s): refusing to link journals", id);
8054d749 1377 if (try)
4d680aee 1378 return 0;
df9a75e4 1379 return -EEXIST;
4d680aee
ZJS
1380 }
1381
03cfe0d5
LP
1382 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1383 if (r < 0)
1384 return log_error_errno(r, "Failed to create /var: %m");
1385
1386 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1387 if (r < 0)
1388 return log_error_errno(r, "Failed to create /var/log: %m");
1389
1390 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1391 if (r < 0)
1392 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1393
1394 p = strjoina("/var/log/journal/", id);
1395 q = prefix_roota(directory, p);
27407a01 1396
e26d6ce5 1397 if (path_is_mount_point(p, 0) > 0) {
8054d749
LP
1398 if (try)
1399 return 0;
27407a01 1400
8054d749
LP
1401 log_error("%s: already a mount point, refusing to use for journal", p);
1402 return -EEXIST;
57fb9fb5
LP
1403 }
1404
e26d6ce5 1405 if (path_is_mount_point(q, 0) > 0) {
8054d749
LP
1406 if (try)
1407 return 0;
57fb9fb5 1408
8054d749
LP
1409 log_error("%s: already a mount point, refusing to use for journal", q);
1410 return -EEXIST;
57fb9fb5
LP
1411 }
1412
1413 r = readlink_and_make_absolute(p, &d);
1414 if (r >= 0) {
1415 if ((arg_link_journal == LINK_GUEST ||
1416 arg_link_journal == LINK_AUTO) &&
1417 path_equal(d, q)) {
1418
03cfe0d5 1419 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1420 if (r < 0)
709f6e46 1421 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1422 return 0;
57fb9fb5
LP
1423 }
1424
4a62c710
MS
1425 if (unlink(p) < 0)
1426 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1427 } else if (r == -EINVAL) {
1428
1429 if (arg_link_journal == LINK_GUEST &&
1430 rmdir(p) < 0) {
1431
27407a01
ZJS
1432 if (errno == ENOTDIR) {
1433 log_error("%s already exists and is neither a symlink nor a directory", p);
1434 return r;
4314d33f
MS
1435 } else
1436 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1437 }
4314d33f
MS
1438 } else if (r != -ENOENT)
1439 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1440
1441 if (arg_link_journal == LINK_GUEST) {
1442
1443 if (symlink(q, p) < 0) {
8054d749 1444 if (try) {
56f64d95 1445 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1446 return 0;
4314d33f
MS
1447 } else
1448 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1449 }
1450
03cfe0d5 1451 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1452 if (r < 0)
709f6e46 1453 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1454 return 0;
57fb9fb5
LP
1455 }
1456
1457 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1458 /* don't create parents here -- if the host doesn't have
1459 * permanent journal set up, don't force it here */
ba8e6c4d
LP
1460
1461 if (mkdir(p, 0755) < 0 && errno != EEXIST) {
8054d749 1462 if (try) {
56f64d95 1463 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1464 return 0;
4314d33f
MS
1465 } else
1466 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1467 }
1468
27407a01
ZJS
1469 } else if (access(p, F_OK) < 0)
1470 return 0;
57fb9fb5 1471
cdb2b9d0
LP
1472 if (dir_is_empty(q) == 0)
1473 log_warning("%s is not empty, proceeding anyway.", q);
1474
03cfe0d5 1475 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1476 if (r < 0)
1477 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1478
4543768d 1479 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1480 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1481
27407a01 1482 return 0;
57fb9fb5
LP
1483}
1484
88213476 1485static int drop_capabilities(void) {
a103496c 1486 return capability_bounding_set_drop(arg_retain, false);
88213476
LP
1487}
1488
db999e0f
LP
1489static int reset_audit_loginuid(void) {
1490 _cleanup_free_ char *p = NULL;
1491 int r;
1492
1493 if (arg_share_system)
1494 return 0;
1495
1496 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1497 if (r == -ENOENT)
db999e0f 1498 return 0;
f647962d
MS
1499 if (r < 0)
1500 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1501
1502 /* Already reset? */
1503 if (streq(p, "4294967295"))
1504 return 0;
1505
ad118bda 1506 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1507 if (r < 0) {
10a87006
LP
1508 log_error_errno(r,
1509 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1510 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1511 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1512 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1513 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1514
db999e0f 1515 sleep(5);
77b6e194 1516 }
db999e0f
LP
1517
1518 return 0;
77b6e194
LP
1519}
1520
/* Installs the container's seccomp filter (no-op returning 0 when built
 * without HAVE_SECCOMP).
 *
 * The filter allows everything by default, makes the syscalls in the table
 * below fail with EPERM unless the capability listed next to them is in
 * arg_retain, and makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
 * EAFNOSUPPORT. -EINVAL from seccomp_load() is only logged at debug level
 * and treated as success.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } gated_syscalls[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(gated_syscalls); idx++) {

                /* A retained capability keeps its syscalls usable */
                if (arg_retain & (1ULL << gated_syscalls[idx].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated_syscalls[idx].syscall_num, 0);
                if (r >= 0 || r == -EFAULT) /* -EFAULT: unknown syscall */
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1615
785890ac
LP
1616static int setup_propagate(const char *root) {
1617 const char *p, *q;
709f6e46 1618 int r;
785890ac
LP
1619
1620 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1621 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1622 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1623 (void) mkdir_p(p, 0600);
1624
709f6e46
MS
1625 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1626 if (r < 0)
1627 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1628
709f6e46
MS
1629 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1630 if (r < 0)
1631 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1632
709f6e46
MS
1633 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1634 if (r < 0)
1635 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1636
03cfe0d5 1637 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1638 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1639 return log_error_errno(errno, "Failed to install propagation bind mount.");
1640
1641 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1642 return log_error_errno(errno, "Failed to make propagation mount read-only");
1643
1644 return 0;
1645}
1646
1b9e5b12
LP
1647static int setup_image(char **device_path, int *loop_nr) {
1648 struct loop_info64 info = {
1649 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1650 };
1651 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1652 _cleanup_free_ char* loopdev = NULL;
1653 struct stat st;
1654 int r, nr;
1655
1656 assert(device_path);
1657 assert(loop_nr);
ec16945e 1658 assert(arg_image);
1b9e5b12
LP
1659
1660 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1661 if (fd < 0)
1662 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1663
4a62c710
MS
1664 if (fstat(fd, &st) < 0)
1665 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1666
1667 if (S_ISBLK(st.st_mode)) {
1668 char *p;
1669
1670 p = strdup(arg_image);
1671 if (!p)
1672 return log_oom();
1673
1674 *device_path = p;
1675
1676 *loop_nr = -1;
1677
1678 r = fd;
1679 fd = -1;
1680
1681 return r;
1682 }
1683
1684 if (!S_ISREG(st.st_mode)) {
070edd97 1685 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1686 return -EINVAL;
1687 }
1688
1689 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1690 if (control < 0)
1691 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1692
1693 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1694 if (nr < 0)
1695 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1696
1697 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1698 return log_oom();
1699
1700 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1701 if (loop < 0)
1702 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1703
4a62c710
MS
1704 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1705 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1706
1707 if (arg_read_only)
1708 info.lo_flags |= LO_FLAGS_READ_ONLY;
1709
4a62c710
MS
1710 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1711 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1712
1713 *device_path = loopdev;
1714 loopdev = NULL;
1715
1716 *loop_nr = nr;
1717
1718 r = loop;
1719 loop = -1;
1720
1721 return r;
1722}
1723
ada4799a
LP
/* Explanatory text appended to the "no usable partition table" errors logged
 * while dissecting the image. One string literal per line of output. */
#define PARTITION_TABLE_BLURB                                                                                                  \
        "Note that the disk image needs to either contain only a single MBR partition of\n"                                    \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                                        \
        "to be bootable with systemd-nspawn."
1730
1b9e5b12
LP
1731static int dissect_image(
1732 int fd,
727fd4fd
LP
1733 char **root_device, bool *root_device_rw,
1734 char **home_device, bool *home_device_rw,
1735 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
1736 bool *secondary) {
1737
1738#ifdef HAVE_BLKID
01dc33ce
ZJS
1739 int home_nr = -1, srv_nr = -1;
1740#ifdef GPT_ROOT_NATIVE
1741 int root_nr = -1;
1742#endif
1743#ifdef GPT_ROOT_SECONDARY
1744 int secondary_root_nr = -1;
1745#endif
f6c51a81 1746 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
1747 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1748 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1749 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1750 _cleanup_udev_unref_ struct udev *udev = NULL;
1751 struct udev_list_entry *first, *item;
f6c51a81 1752 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1753 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1754 const char *pttype = NULL;
1755 blkid_partlist pl;
1756 struct stat st;
c09ef2e4 1757 unsigned i;
1b9e5b12
LP
1758 int r;
1759
1760 assert(fd >= 0);
1761 assert(root_device);
1762 assert(home_device);
1763 assert(srv_device);
1764 assert(secondary);
ec16945e 1765 assert(arg_image);
1b9e5b12
LP
1766
1767 b = blkid_new_probe();
1768 if (!b)
1769 return log_oom();
1770
1771 errno = 0;
1772 r = blkid_probe_set_device(b, fd, 0, 0);
1773 if (r != 0) {
1774 if (errno == 0)
1775 return log_oom();
1776
e1427b13 1777 return log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1778 }
1779
1780 blkid_probe_enable_partitions(b, 1);
1781 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1782
1783 errno = 0;
1784 r = blkid_do_safeprobe(b);
1785 if (r == -2 || r == 1) {
ada4799a
LP
1786 log_error("Failed to identify any partition table on\n"
1787 " %s\n"
1788 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1789 return -EINVAL;
1790 } else if (r != 0) {
1791 if (errno == 0)
1792 errno = EIO;
e1427b13 1793 return log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1794 }
1795
48861960 1796 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1797
1798 is_gpt = streq_ptr(pttype, "gpt");
1799 is_mbr = streq_ptr(pttype, "dos");
1800
1801 if (!is_gpt && !is_mbr) {
1802 log_error("No GPT or MBR partition table discovered on\n"
1803 " %s\n"
1804 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1805 return -EINVAL;
1806 }
1807
1808 errno = 0;
1809 pl = blkid_probe_get_partitions(b);
1810 if (!pl) {
1811 if (errno == 0)
1812 return log_oom();
1813
1814 log_error("Failed to list partitions of %s", arg_image);
1815 return -errno;
1816 }
1817
1818 udev = udev_new();
1819 if (!udev)
1820 return log_oom();
1821
4a62c710
MS
1822 if (fstat(fd, &st) < 0)
1823 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1824
c09ef2e4
LP
1825 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1826 if (!d)
1b9e5b12
LP
1827 return log_oom();
1828
c09ef2e4
LP
1829 for (i = 0;; i++) {
1830 int n, m;
1b9e5b12 1831
c09ef2e4
LP
1832 if (i >= 10) {
1833 log_error("Kernel partitions never appeared.");
1834 return -ENXIO;
1835 }
1836
1837 e = udev_enumerate_new(udev);
1838 if (!e)
1839 return log_oom();
1840
1841 r = udev_enumerate_add_match_parent(e, d);
1842 if (r < 0)
1843 return log_oom();
1844
1845 r = udev_enumerate_scan_devices(e);
1846 if (r < 0)
1847 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1848
1849 /* Count the partitions enumerated by the kernel */
1850 n = 0;
1851 first = udev_enumerate_get_list_entry(e);
1852 udev_list_entry_foreach(item, first)
1853 n++;
1854
1855 /* Count the partitions enumerated by blkid */
1856 m = blkid_partlist_numof_partitions(pl);
1857 if (n == m + 1)
1858 break;
1859 if (n > m + 1) {
1860 log_error("blkid and kernel partition list do not match.");
1861 return -EIO;
1862 }
1863 if (n < m + 1) {
1864 unsigned j;
1865
1866 /* The kernel has probed fewer partitions than
1867 * blkid? Maybe the kernel prober is still
1868 * running or it got EBUSY because udev
1869 * already opened the device. Let's reprobe
1870 * the device, which is a synchronous call
1871 * that waits until probing is complete. */
1872
1873 for (j = 0; j < 20; j++) {
1874
1875 r = ioctl(fd, BLKRRPART, 0);
1876 if (r < 0)
1877 r = -errno;
1878 if (r >= 0 || r != -EBUSY)
1879 break;
1880
1881 /* If something else has the device
1882 * open, such as an udev rule, the
1883 * ioctl will return EBUSY. Since
1884 * there's no way to wait until it
1885 * isn't busy anymore, let's just wait
1886 * a bit, and try again.
1887 *
1888 * This is really something they
1889 * should fix in the kernel! */
1890
1891 usleep(50 * USEC_PER_MSEC);
1892 }
1893
1894 if (r < 0)
1895 return log_error_errno(r, "Failed to reread partition table: %m");
1896 }
1897
1898 e = udev_enumerate_unref(e);
1899 }
1b9e5b12
LP
1900
1901 first = udev_enumerate_get_list_entry(e);
1902 udev_list_entry_foreach(item, first) {
1903 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1904 const char *node;
727fd4fd 1905 unsigned long long flags;
1b9e5b12
LP
1906 blkid_partition pp;
1907 dev_t qn;
1908 int nr;
1909
1910 errno = 0;
1911 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1912 if (!q) {
1913 if (!errno)
1914 errno = ENOMEM;
1915
e1427b13 1916 return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1917 }
1918
1919 qn = udev_device_get_devnum(q);
1920 if (major(qn) == 0)
1921 continue;
1922
1923 if (st.st_rdev == qn)
1924 continue;
1925
1926 node = udev_device_get_devnode(q);
1927 if (!node)
1928 continue;
1929
1930 pp = blkid_partlist_devno_to_partition(pl, qn);
1931 if (!pp)
1932 continue;
1933
727fd4fd 1934 flags = blkid_partition_get_flags(pp);
727fd4fd 1935
1b9e5b12
LP
1936 nr = blkid_partition_get_partno(pp);
1937 if (nr < 0)
1938 continue;
1939
ada4799a
LP
1940 if (is_gpt) {
1941 sd_id128_t type_id;
1942 const char *stype;
1b9e5b12 1943
f6c51a81
LP
1944 if (flags & GPT_FLAG_NO_AUTO)
1945 continue;
1946
ada4799a
LP
1947 stype = blkid_partition_get_type_string(pp);
1948 if (!stype)
1949 continue;
1b9e5b12 1950
ada4799a 1951 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
1952 continue;
1953
ada4799a 1954 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 1955
ada4799a
LP
1956 if (home && nr >= home_nr)
1957 continue;
1b9e5b12 1958
ada4799a
LP
1959 home_nr = nr;
1960 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 1961
ada4799a
LP
1962 r = free_and_strdup(&home, node);
1963 if (r < 0)
1964 return log_oom();
727fd4fd 1965
ada4799a
LP
1966 } else if (sd_id128_equal(type_id, GPT_SRV)) {
1967
1968 if (srv && nr >= srv_nr)
1969 continue;
1970
1971 srv_nr = nr;
1972 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
1973
1974 r = free_and_strdup(&srv, node);
1975 if (r < 0)
1976 return log_oom();
1977 }
1b9e5b12 1978#ifdef GPT_ROOT_NATIVE
ada4799a 1979 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 1980
ada4799a
LP
1981 if (root && nr >= root_nr)
1982 continue;
1b9e5b12 1983
ada4799a
LP
1984 root_nr = nr;
1985 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 1986
ada4799a
LP
1987 r = free_and_strdup(&root, node);
1988 if (r < 0)
1989 return log_oom();
1990 }
1b9e5b12
LP
1991#endif
1992#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
1993 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
1994
1995 if (secondary_root && nr >= secondary_root_nr)
1996 continue;
1997
1998 secondary_root_nr = nr;
1999 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2000
2001 r = free_and_strdup(&secondary_root, node);
2002 if (r < 0)
2003 return log_oom();
2004 }
2005#endif
f6c51a81
LP
2006 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
2007
2008 if (generic)
2009 multiple_generic = true;
2010 else {
2011 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
2012
2013 r = free_and_strdup(&generic, node);
2014 if (r < 0)
2015 return log_oom();
2016 }
2017 }
ada4799a
LP
2018
2019 } else if (is_mbr) {
2020 int type;
1b9e5b12 2021
f6c51a81
LP
2022 if (flags != 0x80) /* Bootable flag */
2023 continue;
2024
ada4799a
LP
2025 type = blkid_partition_get_type(pp);
2026 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2027 continue;
2028
f6c51a81
LP
2029 if (generic)
2030 multiple_generic = true;
2031 else {
2032 generic_rw = true;
727fd4fd 2033
f6c51a81
LP
2034 r = free_and_strdup(&root, node);
2035 if (r < 0)
2036 return log_oom();
2037 }
1b9e5b12 2038 }
1b9e5b12
LP
2039 }
2040
1b9e5b12
LP
2041 if (root) {
2042 *root_device = root;
2043 root = NULL;
727fd4fd
LP
2044
2045 *root_device_rw = root_rw;
1b9e5b12
LP
2046 *secondary = false;
2047 } else if (secondary_root) {
2048 *root_device = secondary_root;
2049 secondary_root = NULL;
727fd4fd
LP
2050
2051 *root_device_rw = secondary_root_rw;
1b9e5b12 2052 *secondary = true;
f6c51a81
LP
2053 } else if (generic) {
2054
2055 /* There were no partitions with precise meanings
2056 * around, but we found generic partitions. In this
2057 * case, if there's only one, we can go ahead and boot
2058 * it, otherwise we bail out, because we really cannot
2059 * make any sense of it. */
2060
2061 if (multiple_generic) {
2062 log_error("Identified multiple bootable Linux partitions on\n"
2063 " %s\n"
2064 PARTITION_TABLE_BLURB, arg_image);
2065 return -EINVAL;
2066 }
2067
2068 *root_device = generic;
2069 generic = NULL;
2070
2071 *root_device_rw = generic_rw;
2072 *secondary = false;
2073 } else {
2074 log_error("Failed to identify root partition in disk image\n"
2075 " %s\n"
2076 PARTITION_TABLE_BLURB, arg_image);
2077 return -EINVAL;
1b9e5b12
LP
2078 }
2079
2080 if (home) {
2081 *home_device = home;
2082 home = NULL;
727fd4fd
LP
2083
2084 *home_device_rw = home_rw;
1b9e5b12
LP
2085 }
2086
2087 if (srv) {
2088 *srv_device = srv;
2089 srv = NULL;
727fd4fd
LP
2090
2091 *srv_device_rw = srv_rw;
1b9e5b12
LP
2092 }
2093
2094 return 0;
2095#else
2096 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2097 return -EOPNOTSUPP;
1b9e5b12
LP
2098#endif
2099}
2100
/*
 * Probe the file system type of the block device "what" with libblkid and
 * mount it.
 *
 * what:      device node to mount (must not be NULL)
 * where:     base mount point (must not be NULL)
 * directory: optional suffix appended to "where"; if NULL, "where" itself
 *            is the mount point
 * rw:        mount writable; forced to false when arg_read_only is set
 *
 * Returns 0 on success, a negative errno-style value on failure (all
 * failures are logged here). LUKS volumes and builds without blkid are
 * rejected with -EOPNOTSUPP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A read-only container never gets a writable partition. */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe(): 0 = success, 1 = nothing detected,
         * -2 = ambivalent result, -1 = error. Only the first two
         * non-zero cases mean "type unknown"; -1 is a real probing
         * failure and is reported with errno below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2162
727fd4fd
LP
2163static int mount_devices(
2164 const char *where,
2165 const char *root_device, bool root_device_rw,
2166 const char *home_device, bool home_device_rw,
2167 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2168 int r;
2169
2170 assert(where);
2171
2172 if (root_device) {
727fd4fd 2173 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2174 if (r < 0)
2175 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2176 }
2177
2178 if (home_device) {
727fd4fd 2179 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2180 if (r < 0)
2181 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2182 }
2183
2184 if (srv_device) {
727fd4fd 2185 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2186 if (r < 0)
2187 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2188 }
2189
2190 return 0;
2191}
2192
2193static void loop_remove(int nr, int *image_fd) {
2194 _cleanup_close_ int control = -1;
e8c8ddcc 2195 int r;
1b9e5b12
LP
2196
2197 if (nr < 0)
2198 return;
2199
2200 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2201 r = ioctl(*image_fd, LOOP_CLR_FD);
2202 if (r < 0)
5e4074aa 2203 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2204 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2205 }
2206
2207 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2208 if (control < 0) {
56f64d95 2209 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2210 return;
e8c8ddcc 2211 }
1b9e5b12 2212
e8c8ddcc
TG
2213 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2214 if (r < 0)
5e4074aa 2215 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2216}
2217
113cea80 2218/*
6d416b9c
LS
2219 * Return values:
2220 * < 0 : wait_for_terminate() failed to get the state of the
2221 * container, the container was terminated by a signal, or
2222 * failed for an unknown reason. No change is made to the
2223 * container argument.
2224 * > 0 : The program executed in the container terminated with an
2225 * error. The exit code of the program executed in the
919699ec
LP
2226 * container is returned. The container argument has been set
2227 * to CONTAINER_TERMINATED.
6d416b9c
LS
2228 * 0 : The container is being rebooted, has been shut down or exited
2229 * successfully. The container argument has been set to either
2230 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2231 *
6d416b9c
LS
2232 * That is, success is indicated by a return value of zero, and an
2233 * error is indicated by a non-zero value.
113cea80
DH
2234 */
2235static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2236 siginfo_t status;
919699ec 2237 int r;
113cea80
DH
2238
2239 r = wait_for_terminate(pid, &status);
f647962d
MS
2240 if (r < 0)
2241 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2242
2243 switch (status.si_code) {
fddbb89c 2244
113cea80 2245 case CLD_EXITED:
919699ec
LP
2246 if (status.si_status == 0) {
2247 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2248
fddbb89c 2249 } else
919699ec 2250 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2251
919699ec
LP
2252 *container = CONTAINER_TERMINATED;
2253 return status.si_status;
113cea80
DH
2254
2255 case CLD_KILLED:
2256 if (status.si_status == SIGINT) {
113cea80 2257
919699ec 2258 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2259 *container = CONTAINER_TERMINATED;
919699ec
LP
2260 return 0;
2261
113cea80 2262 } else if (status.si_status == SIGHUP) {
113cea80 2263
919699ec 2264 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2265 *container = CONTAINER_REBOOTED;
919699ec 2266 return 0;
113cea80 2267 }
919699ec 2268
113cea80
DH
2269 /* CLD_KILLED fallthrough */
2270
2271 case CLD_DUMPED:
fddbb89c 2272 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2273 return -EIO;
113cea80
DH
2274
2275 default:
fddbb89c 2276 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2277 return -EIO;
113cea80
DH
2278 }
2279
2280 return r;
2281}
2282
023fb90b
LP
2283static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2284 pid_t pid;
2285
4a0b58c4 2286 pid = PTR_TO_PID(userdata);
023fb90b 2287 if (pid > 0) {
c6c8f6e2 2288 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2289 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2290 sd_event_source_set_userdata(s, NULL);
2291 return 0;
2292 }
2293 }
2294
2295 sd_event_exit(sd_event_source_get_event(s), 0);
2296 return 0;
2297}
2298
ec16945e 2299static int determine_names(void) {
1b9cebf6 2300 int r;
ec16945e 2301
c1521918
LP
2302 if (arg_template && !arg_directory && arg_machine) {
2303
2304 /* If --template= was specified then we should not
2305 * search for a machine, but instead create a new one
2306 * in /var/lib/machine. */
2307
2308 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2309 if (!arg_directory)
2310 return log_oom();
2311 }
2312
ec16945e 2313 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2314 if (arg_machine) {
2315 _cleanup_(image_unrefp) Image *i = NULL;
2316
2317 r = image_find(arg_machine, &i);
2318 if (r < 0)
2319 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2320 else if (r == 0) {
2321 log_error("No image for machine '%s': %m", arg_machine);
2322 return -ENOENT;
2323 }
2324
aceac2f0 2325 if (i->type == IMAGE_RAW)
0f03c2a4 2326 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2327 else
0f03c2a4 2328 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2329 if (r < 0)
2330 return log_error_errno(r, "Invalid image directory: %m");
2331
aee327b8
LP
2332 if (!arg_ephemeral)
2333 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2334 } else
ec16945e
LP
2335 arg_directory = get_current_dir_name();
2336
1b9cebf6
LP
2337 if (!arg_directory && !arg_machine) {
2338 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2339 return -EINVAL;
2340 }
2341 }
2342
2343 if (!arg_machine) {
b9ba4dab
LP
2344 if (arg_directory && path_equal(arg_directory, "/"))
2345 arg_machine = gethostname_malloc();
2346 else
2347 arg_machine = strdup(basename(arg_image ?: arg_directory));
2348
ec16945e
LP
2349 if (!arg_machine)
2350 return log_oom();
2351
ae691c1d 2352 hostname_cleanup(arg_machine);
ec16945e
LP
2353 if (!machine_name_is_valid(arg_machine)) {
2354 log_error("Failed to determine machine name automatically, please use -M.");
2355 return -EINVAL;
2356 }
b9ba4dab
LP
2357
2358 if (arg_ephemeral) {
2359 char *b;
2360
2361 /* Add a random suffix when this is an
2362 * ephemeral machine, so that we can run many
2363 * instances at once without manually having
2364 * to specify -M each time. */
2365
2366 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2367 return log_oom();
2368
2369 free(arg_machine);
2370 arg_machine = b;
2371 }
ec16945e
LP
2372 }
2373
2374 return 0;
2375}
2376
03cfe0d5 2377static int determine_uid_shift(const char *directory) {
6dac160c
LP
2378 int r;
2379
03cfe0d5
LP
2380 if (!arg_userns) {
2381 arg_uid_shift = 0;
6dac160c 2382 return 0;
03cfe0d5 2383 }
6dac160c
LP
2384
2385 if (arg_uid_shift == UID_INVALID) {
2386 struct stat st;
2387
03cfe0d5 2388 r = stat(directory, &st);
6dac160c 2389 if (r < 0)
03cfe0d5 2390 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2391
2392 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2393
2394 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2395 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2396 return -EINVAL;
2397 }
2398
2399 arg_uid_range = UINT32_C(0x10000);
2400 }
2401
2402 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2403 log_error("UID base too high for UID range.");
2404 return -EINVAL;
2405 }
2406
2407 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2408 return 0;
2409}
2410
03cfe0d5
LP
2411static int inner_child(
2412 Barrier *barrier,
2413 const char *directory,
2414 bool secondary,
2415 int kmsg_socket,
2416 int rtnl_socket,
f757855e 2417 FDSet *fds) {
69c79d3c 2418
03cfe0d5 2419 _cleanup_free_ char *home = NULL;
6aadfa4c 2420 unsigned n_env = 1;
03cfe0d5
LP
2421 const char *envp[] = {
2422 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2423 NULL, /* container */
03cfe0d5
LP
2424 NULL, /* TERM */
2425 NULL, /* HOME */
2426 NULL, /* USER */
2427 NULL, /* LOGNAME */
2428 NULL, /* container_uuid */
2429 NULL, /* LISTEN_FDS */
2430 NULL, /* LISTEN_PID */
2431 NULL
2432 };
88213476 2433
2371271c 2434 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2435 int r;
88213476 2436
03cfe0d5
LP
2437 assert(barrier);
2438 assert(directory);
2439 assert(kmsg_socket >= 0);
88213476 2440
efdb0237
LP
2441 cg_unified_flush();
2442
03cfe0d5
LP
2443 if (arg_userns) {
2444 /* Tell the parent, that it now can write the UID map. */
2445 (void) barrier_place(barrier); /* #1 */
7027ff61 2446
03cfe0d5
LP
2447 /* Wait until the parent wrote the UID map */
2448 if (!barrier_place_and_sync(barrier)) { /* #2 */
2449 log_error("Parent died too early");
2450 return -ESRCH;
2451 }
88213476
LP
2452 }
2453
d1678248 2454 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2455 if (r < 0)
2456 return r;
2457
d8fc6a00
LP
2458 r = mount_sysfs(NULL);
2459 if (r < 0)
2460 return r;
2461
03cfe0d5
LP
2462 /* Wait until we are cgroup-ified, so that we
2463 * can mount the right cgroup path writable */
2464 if (!barrier_place_and_sync(barrier)) { /* #3 */
2465 log_error("Parent died too early");
2466 return -ESRCH;
88213476
LP
2467 }
2468
e83bebef 2469 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
2470 if (r < 0)
2471 return r;
ec16945e 2472
03cfe0d5
LP
2473 r = reset_uid_gid();
2474 if (r < 0)
2475 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2476
03cfe0d5
LP
2477 r = setup_boot_id(NULL);
2478 if (r < 0)
2479 return r;
ec16945e 2480
03cfe0d5
LP
2481 r = setup_kmsg(NULL, kmsg_socket);
2482 if (r < 0)
2483 return r;
2484 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2485
03cfe0d5 2486 umask(0022);
30535c16 2487
03cfe0d5
LP
2488 if (setsid() < 0)
2489 return log_error_errno(errno, "setsid() failed: %m");
2490
2491 if (arg_private_network)
2492 loopback_setup();
2493
7a8f6325
LP
2494 if (arg_expose_ports) {
2495 r = expose_port_send_rtnl(rtnl_socket);
2496 if (r < 0)
2497 return r;
2498 rtnl_socket = safe_close(rtnl_socket);
2499 }
03cfe0d5 2500
709f6e46
MS
2501 r = drop_capabilities();
2502 if (r < 0)
2503 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2504
2505 setup_hostname();
2506
050f7277 2507 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2508 if (personality(arg_personality) < 0)
2509 return log_error_errno(errno, "personality() failed: %m");
2510 } else if (secondary) {
2511 if (personality(PER_LINUX32) < 0)
2512 return log_error_errno(errno, "personality() failed: %m");
2513 }
2514
2515#ifdef HAVE_SELINUX
2516 if (arg_selinux_context)
2517 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2518 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2519#endif
2520
ee645080 2521 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2522 if (r < 0)
2523 return r;
2524
6aadfa4c
ILG
2525 /* LXC sets container=lxc, so follow the scheme here */
2526 envp[n_env++] = strjoina("container=", arg_container_service_name);
2527
03cfe0d5
LP
2528 envp[n_env] = strv_find_prefix(environ, "TERM=");
2529 if (envp[n_env])
2530 n_env ++;
2531
2532 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2533 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2534 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2535 return log_oom();
2536
2537 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2538 char as_uuid[37];
2539
2540 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2541 return log_oom();
2542 }
2543
2544 if (fdset_size(fds) > 0) {
2545 r = fdset_cloexec(fds, false);
2546 if (r < 0)
2547 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2548
2549 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2550 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2551 return log_oom();
2552 }
2553
2371271c
TG
2554 env_use = strv_env_merge(2, envp, arg_setenv);
2555 if (!env_use)
2556 return log_oom();
03cfe0d5
LP
2557
2558 /* Let the parent know that we are ready and
2559 * wait until the parent is ready with the
2560 * setup, too... */
2561 if (!barrier_place_and_sync(barrier)) { /* #4 */
2562 log_error("Parent died too early");
2563 return -ESRCH;
2564 }
2565
2566 /* Now, explicitly close the log, so that we
2567 * then can close all remaining fds. Closing
2568 * the log explicitly first has the benefit
2569 * that the logging subsystem knows about it,
2570 * and is thus ready to be reopened should we
2571 * need it again. Note that the other fds
2572 * closed here are at least the locking and
2573 * barrier fds. */
2574 log_close();
2575 (void) fdset_close_others(fds);
2576
2577 if (arg_boot) {
2578 char **a;
2579 size_t m;
2580
2581 /* Automatically search for the init system */
2582
f757855e 2583 m = 1 + strv_length(arg_parameters);
03cfe0d5 2584 a = newa(char*, m + 1);
f757855e
LP
2585 if (strv_isempty(arg_parameters))
2586 a[1] = NULL;
2587 else
2588 memcpy(a + 1, arg_parameters, m * sizeof(char*));
03cfe0d5
LP
2589
2590 a[0] = (char*) "/usr/lib/systemd/systemd";
2591 execve(a[0], a, env_use);
2592
2593 a[0] = (char*) "/lib/systemd/systemd";
2594 execve(a[0], a, env_use);
2595
2596 a[0] = (char*) "/sbin/init";
2597 execve(a[0], a, env_use);
f757855e
LP
2598 } else if (!strv_isempty(arg_parameters))
2599 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2600 else {
f757855e 2601 chdir(home ?: "/root");
03cfe0d5
LP
2602 execle("/bin/bash", "-bash", NULL, env_use);
2603 execle("/bin/sh", "-sh", NULL, env_use);
2604 }
2605
35607a8d 2606 r = -errno;
03cfe0d5 2607 (void) log_open();
35607a8d 2608 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2609}
2610
2611static int outer_child(
2612 Barrier *barrier,
2613 const char *directory,
2614 const char *console,
2615 const char *root_device, bool root_device_rw,
2616 const char *home_device, bool home_device_rw,
2617 const char *srv_device, bool srv_device_rw,
2618 bool interactive,
2619 bool secondary,
2620 int pid_socket,
2621 int kmsg_socket,
2622 int rtnl_socket,
825d5287 2623 int uid_shift_socket,
f757855e 2624 FDSet *fds) {
03cfe0d5
LP
2625
2626 pid_t pid;
2627 ssize_t l;
2628 int r;
2629
2630 assert(barrier);
2631 assert(directory);
2632 assert(console);
2633 assert(pid_socket >= 0);
2634 assert(kmsg_socket >= 0);
2635
efdb0237
LP
2636 cg_unified_flush();
2637
03cfe0d5
LP
2638 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2639 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2640
2641 if (interactive) {
2642 close_nointr(STDIN_FILENO);
2643 close_nointr(STDOUT_FILENO);
2644 close_nointr(STDERR_FILENO);
2645
2646 r = open_terminal(console, O_RDWR);
2647 if (r != STDIN_FILENO) {
2648 if (r >= 0) {
2649 safe_close(r);
2650 r = -EINVAL;
2651 }
2652
2653 return log_error_errno(r, "Failed to open console: %m");
2654 }
2655
2656 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2657 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2658 return log_error_errno(errno, "Failed to duplicate console: %m");
2659 }
2660
2661 r = reset_audit_loginuid();
2662 if (r < 0)
2663 return r;
2664
2665 /* Mark everything as slave, so that we still
2666 * receive mounts from the real root, but don't
2667 * propagate mounts to the real root. */
2668 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2669 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2670
2671 r = mount_devices(directory,
2672 root_device, root_device_rw,
2673 home_device, home_device_rw,
2674 srv_device, srv_device_rw);
2675 if (r < 0)
2676 return r;
2677
391567f4
LP
2678 r = determine_uid_shift(directory);
2679 if (r < 0)
2680 return r;
2681
825d5287
RM
2682 if (arg_userns) {
2683 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2684 if (l < 0)
2685 return log_error_errno(errno, "Failed to send UID shift: %m");
2686 if (l != sizeof(arg_uid_shift)) {
2687 log_error("Short write while sending UID shift.");
2688 return -EIO;
2689 }
2690 }
2691
03cfe0d5
LP
2692 /* Turn directory into bind mount */
2693 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2694 return log_error_errno(errno, "Failed to make bind mount: %m");
2695
e83bebef 2696 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2697 if (r < 0)
2698 return r;
2699
e83bebef 2700 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2701 if (r < 0)
2702 return r;
2703
03cfe0d5
LP
2704 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2705 if (r < 0)
2706 return r;
2707
03cfe0d5
LP
2708 if (arg_read_only) {
2709 r = bind_remount_recursive(directory, true);
2710 if (r < 0)
2711 return log_error_errno(r, "Failed to make tree read-only: %m");
2712 }
2713
d1678248 2714 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2715 if (r < 0)
2716 return r;
2717
07fa00f9
LP
2718 r = copy_devnodes(directory);
2719 if (r < 0)
03cfe0d5
LP
2720 return r;
2721
2722 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2723
07fa00f9
LP
2724 r = setup_pts(directory);
2725 if (r < 0)
03cfe0d5
LP
2726 return r;
2727
2728 r = setup_propagate(directory);
2729 if (r < 0)
2730 return r;
2731
2732 r = setup_dev_console(directory, console);
2733 if (r < 0)
2734 return r;
2735
2736 r = setup_seccomp();
2737 if (r < 0)
2738 return r;
2739
2740 r = setup_timezone(directory);
2741 if (r < 0)
2742 return r;
2743
2744 r = setup_resolv_conf(directory);
2745 if (r < 0)
2746 return r;
2747
2748 r = setup_journal(directory);
2749 if (r < 0)
2750 return r;
2751
e83bebef 2752 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2753 if (r < 0)
2754 return r;
2755
e83bebef 2756 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2757 if (r < 0)
2758 return r;
2759
2760 r = mount_move_root(directory);
2761 if (r < 0)
2762 return log_error_errno(r, "Failed to move root directory: %m");
2763
2764 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2765 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2766 (arg_private_network ? CLONE_NEWNET : 0) |
2767 (arg_userns ? CLONE_NEWUSER : 0),
2768 NULL);
2769 if (pid < 0)
2770 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2771 if (pid == 0) {
2772 pid_socket = safe_close(pid_socket);
825d5287 2773 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2774
2775 /* The inner child has all namespaces that are
2776 * requested, so that we all are owned by the user if
2777 * user namespaces are turned on. */
2778
f757855e 2779 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2780 if (r < 0)
2781 _exit(EXIT_FAILURE);
2782
2783 _exit(EXIT_SUCCESS);
2784 }
2785
2786 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2787 if (l < 0)
2788 return log_error_errno(errno, "Failed to send PID: %m");
2789 if (l != sizeof(pid)) {
2790 log_error("Short write while sending PID.");
2791 return -EIO;
2792 }
2793
2794 pid_socket = safe_close(pid_socket);
327e26d6
KN
2795 kmsg_socket = safe_close(kmsg_socket);
2796 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2797
2798 return 0;
2799}
2800
2801static int setup_uid_map(pid_t pid) {
2802 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2803 int r;
2804
2805 assert(pid > 1);
2806
2807 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2808 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2809 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2810 if (r < 0)
2811 return log_error_errno(r, "Failed to write UID map: %m");
2812
2813 /* We always assign the same UID and GID ranges */
2814 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2815 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2816 if (r < 0)
2817 return log_error_errno(r, "Failed to write GID map: %m");
2818
2819 return 0;
2820}
2821
f757855e
LP
2822static int load_settings(void) {
2823 _cleanup_(settings_freep) Settings *settings = NULL;
2824 _cleanup_fclose_ FILE *f = NULL;
2825 _cleanup_free_ char *p = NULL;
2826 const char *fn, *i;
2827 int r;
2828
2829 /* If all settings are masked, there's no point in looking for
2830 * the settings file */
2831 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2832 return 0;
2833
2834 fn = strjoina(arg_machine, ".nspawn");
2835
2836 /* We first look in the admin's directories in /etc and /run */
2837 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2838 _cleanup_free_ char *j = NULL;
2839
2840 j = strjoin(i, "/", fn, NULL);
2841 if (!j)
2842 return log_oom();
2843
2844 f = fopen(j, "re");
2845 if (f) {
2846 p = j;
2847 j = NULL;
2848
b938cb90 2849 /* By default, we trust configuration from /etc and /run */
f757855e
LP
2850 if (arg_settings_trusted < 0)
2851 arg_settings_trusted = true;
2852
2853 break;
2854 }
2855
2856 if (errno != ENOENT)
2857 return log_error_errno(errno, "Failed to open %s: %m", j);
2858 }
2859
2860 if (!f) {
2861 /* After that, let's look for a file next to the
2862 * actual image we shall boot. */
2863
2864 if (arg_image) {
2865 p = file_in_same_dir(arg_image, fn);
2866 if (!p)
2867 return log_oom();
2868 } else if (arg_directory) {
2869 p = file_in_same_dir(arg_directory, fn);
2870 if (!p)
2871 return log_oom();
2872 }
2873
2874 if (p) {
2875 f = fopen(p, "re");
2876 if (!f && errno != ENOENT)
2877 return log_error_errno(errno, "Failed to open %s: %m", p);
2878
b938cb90 2879 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
2880 if (arg_settings_trusted < 0)
2881 arg_settings_trusted = false;
2882 }
2883 }
2884
2885 if (!f)
2886 return 0;
2887
2888 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2889
2890 r = settings_load(f, p, &settings);
2891 if (r < 0)
2892 return r;
2893
2894 /* Copy over bits from the settings, unless they have been
2895 * explicitly masked by command line switches. */
2896
2897 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2898 settings->boot >= 0) {
2899 arg_boot = settings->boot;
2900
2901 strv_free(arg_parameters);
2902 arg_parameters = settings->parameters;
2903 settings->parameters = NULL;
2904 }
2905
2906 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2907 settings->environment) {
2908 strv_free(arg_setenv);
2909 arg_setenv = settings->environment;
2910 settings->environment = NULL;
2911 }
2912
2913 if ((arg_settings_mask & SETTING_USER) == 0 &&
2914 settings->user) {
2915 free(arg_user);
2916 arg_user = settings->user;
2917 settings->user = NULL;
2918 }
2919
2920 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2921 uint64_t plus;
f757855e 2922
0e265674
LP
2923 plus = settings->capability;
2924 if (settings_private_network(settings))
2925 plus |= (1ULL << CAP_NET_ADMIN);
2926
2927 if (!arg_settings_trusted && plus != 0) {
2928 if (settings->capability != 0)
2929 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2930 } else
2931 arg_retain |= plus;
f757855e
LP
2932
2933 arg_retain &= ~settings->drop_capability;
2934 }
2935
2936 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2937 settings->kill_signal > 0)
2938 arg_kill_signal = settings->kill_signal;
2939
2940 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2941 settings->personality != PERSONALITY_INVALID)
2942 arg_personality = settings->personality;
2943
2944 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2945 !sd_id128_is_null(settings->machine_id)) {
2946
2947 if (!arg_settings_trusted)
2948 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2949 else
2950 arg_uuid = settings->machine_id;
2951 }
2952
2953 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2954 settings->read_only >= 0)
2955 arg_read_only = settings->read_only;
2956
2957 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2958 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2959 arg_volatile_mode = settings->volatile_mode;
2960
2961 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2962 settings->n_custom_mounts > 0) {
2963
2964 if (!arg_settings_trusted)
2965 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2966 else {
2967 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2968 arg_custom_mounts = settings->custom_mounts;
2969 arg_n_custom_mounts = settings->n_custom_mounts;
2970
2971 settings->custom_mounts = NULL;
2972 settings->n_custom_mounts = 0;
2973 }
2974 }
2975
2976 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2977 (settings->private_network >= 0 ||
2978 settings->network_veth >= 0 ||
2979 settings->network_bridge ||
2980 settings->network_interfaces ||
2981 settings->network_macvlan ||
f6d6bad1
LP
2982 settings->network_ipvlan ||
2983 settings->network_veth_extra)) {
f757855e
LP
2984
2985 if (!arg_settings_trusted)
2986 log_warning("Ignoring network settings, file %s is not trusted.", p);
2987 else {
f6d6bad1 2988 arg_network_veth = settings_network_veth(settings);
0e265674
LP
2989 arg_private_network = settings_private_network(settings);
2990
f757855e
LP
2991 strv_free(arg_network_interfaces);
2992 arg_network_interfaces = settings->network_interfaces;
2993 settings->network_interfaces = NULL;
2994
2995 strv_free(arg_network_macvlan);
2996 arg_network_macvlan = settings->network_macvlan;
2997 settings->network_macvlan = NULL;
2998
2999 strv_free(arg_network_ipvlan);
3000 arg_network_ipvlan = settings->network_ipvlan;
3001 settings->network_ipvlan = NULL;
3002
f6d6bad1
LP
3003 strv_free(arg_network_veth_extra);
3004 arg_network_veth_extra = settings->network_veth_extra;
3005 settings->network_veth_extra = NULL;
3006
f757855e
LP
3007 free(arg_network_bridge);
3008 arg_network_bridge = settings->network_bridge;
3009 settings->network_bridge = NULL;
f757855e
LP
3010 }
3011 }
3012
3013 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3014 settings->expose_ports) {
3015
3016 if (!arg_settings_trusted)
3017 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3018 else {
3019 expose_port_free_all(arg_expose_ports);
3020 arg_expose_ports = settings->expose_ports;
3021 settings->expose_ports = NULL;
3022 }
3023 }
3024
3025 return 0;
3026}
3027
03cfe0d5
LP
3028int main(int argc, char *argv[]) {
3029
3030 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3031 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3032 _cleanup_close_ int master = -1, image_fd = -1;
3033 _cleanup_fdset_free_ FDSet *fds = NULL;
3034 int r, n_fd_passed, loop_nr = -1;
3035 char veth_name[IFNAMSIZ];
3036 bool secondary = false, remove_subvol = false;
72c0a2c2 3037 sigset_t mask_chld;
03cfe0d5
LP
3038 pid_t pid = 0;
3039 int ret = EXIT_SUCCESS;
3040 union in_addr_union exposed = {};
3041 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3042 bool interactive;
3043
3044 log_parse_environment();
3045 log_open();
3046
3047 r = parse_argv(argc, argv);
3048 if (r <= 0)
3049 goto finish;
3050
03cfe0d5
LP
3051 if (geteuid() != 0) {
3052 log_error("Need to be root.");
3053 r = -EPERM;
3054 goto finish;
3055 }
f757855e
LP
3056 r = determine_names();
3057 if (r < 0)
3058 goto finish;
3059
3060 r = load_settings();
3061 if (r < 0)
3062 goto finish;
3063
3064 r = verify_arguments();
3065 if (r < 0)
3066 goto finish;
03cfe0d5
LP
3067
3068 n_fd_passed = sd_listen_fds(false);
3069 if (n_fd_passed > 0) {
3070 r = fdset_new_listen_fds(&fds, false);
3071 if (r < 0) {
3072 log_error_errno(r, "Failed to collect file descriptors: %m");
3073 goto finish;
3074 }
3075 }
3076
3077 if (arg_directory) {
3078 assert(!arg_image);
3079
3080 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3081 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3082 r = -EINVAL;
3083 goto finish;
3084 }
3085
3086 if (arg_ephemeral) {
3087 _cleanup_free_ char *np = NULL;
3088
3089 /* If the specified path is a mount point we
3090 * generate the new snapshot immediately
3091 * inside it under a random name. However if
3092 * the specified is not a mount point we
3093 * create the new snapshot in the parent
3094 * directory, just next to it. */
e26d6ce5 3095 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3096 if (r < 0) {
3097 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3098 goto finish;
3099 }
3100 if (r > 0)
770b5ce4 3101 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3102 else
770b5ce4 3103 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3104 if (r < 0) {
3105 log_error_errno(r, "Failed to generate name for snapshot: %m");
3106 goto finish;
3107 }
3108
3109 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3110 if (r < 0) {
3111 log_error_errno(r, "Failed to lock %s: %m", np);
3112 goto finish;
3113 }
3114
5bcd08db 3115 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3116 if (r < 0) {
3117 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3118 goto finish;
ec16945e
LP
3119 }
3120
3121 free(arg_directory);
3122 arg_directory = np;
8a16a7b4 3123 np = NULL;
ec16945e
LP
3124
3125 remove_subvol = true;
30535c16
LP
3126
3127 } else {
3128 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3129 if (r == -EBUSY) {
3130 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3131 goto finish;
3132 }
3133 if (r < 0) {
3134 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3135 return r;
3136 }
3137
3138 if (arg_template) {
5bcd08db 3139 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3140 if (r == -EEXIST) {
3141 if (!arg_quiet)
3142 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3143 } else if (r < 0) {
83521414 3144 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3145 goto finish;
3146 } else {
3147 if (!arg_quiet)
3148 log_info("Populated %s from template %s.", arg_directory, arg_template);
3149 }
3150 }
ec16945e
LP
3151 }
3152
1b9e5b12
LP
3153 if (arg_boot) {
3154 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3155 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3156 r = -EINVAL;
1b9e5b12
LP
3157 goto finish;
3158 }
3159 } else {
3160 const char *p;
3161
16fb773e
LP
3162 p = strjoina(arg_directory, "/usr/");
3163 if (laccess(p, F_OK) < 0) {
3164 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3165 r = -EINVAL;
1b9e5b12 3166 goto finish;
1b9e5b12
LP
3167 }
3168 }
ec16945e 3169
6b9132a9 3170 } else {
1b9e5b12 3171 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3172
ec16945e
LP
3173 assert(arg_image);
3174 assert(!arg_template);
3175
30535c16
LP
3176 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3177 if (r == -EBUSY) {
3178 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3179 goto finish;
3180 }
3181 if (r < 0) {
3182 r = log_error_errno(r, "Failed to create image lock: %m");
3183 goto finish;
3184 }
3185
1b9e5b12 3186 if (!mkdtemp(template)) {
56f64d95 3187 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3188 r = -errno;
6b9132a9 3189 goto finish;
1b9e5b12 3190 }
6b9132a9 3191
1b9e5b12
LP
3192 arg_directory = strdup(template);
3193 if (!arg_directory) {
3194 r = log_oom();
3195 goto finish;
6b9132a9 3196 }
88213476 3197
1b9e5b12
LP
3198 image_fd = setup_image(&device_path, &loop_nr);
3199 if (image_fd < 0) {
3200 r = image_fd;
842f3b0f
LP
3201 goto finish;
3202 }
1b9e5b12 3203
4d9f07b4
LP
3204 r = dissect_image(image_fd,
3205 &root_device, &root_device_rw,
3206 &home_device, &home_device_rw,
3207 &srv_device, &srv_device_rw,
3208 &secondary);
1b9e5b12
LP
3209 if (r < 0)
3210 goto finish;
842f3b0f 3211 }
842f3b0f 3212
5a8af538
LP
3213 r = custom_mounts_prepare();
3214 if (r < 0)
3215 goto finish;
3216
03cfe0d5
LP
3217 interactive =
3218 isatty(STDIN_FILENO) > 0 &&
3219 isatty(STDOUT_FILENO) > 0;
9c857b9d 3220
db7feb7e
LP
3221 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3222 if (master < 0) {
ec16945e 3223 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3224 goto finish;
3225 }
3226
611b312b
LP
3227 r = ptsname_malloc(master, &console);
3228 if (r < 0) {
3229 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3230 goto finish;
3231 }
3232
a258bf26 3233 if (unlockpt(master) < 0) {
ec16945e 3234 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3235 goto finish;
3236 }
3237
9c857b9d
LP
3238 if (!arg_quiet)
3239 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3240 arg_machine, arg_image ?: arg_directory);
3241
72c0a2c2 3242 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3243
023fb90b
LP
3244 assert_se(sigemptyset(&mask_chld) == 0);
3245 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3246
03cfe0d5
LP
3247 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3248 r = log_error_errno(errno, "Failed to become subreaper: %m");
3249 goto finish;
3250 }
3251
d87be9b0 3252 for (;;) {
97044145 3253 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
113cea80 3254 ContainerStatus container_status;
7566e267 3255 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 3256 static const struct sigaction sa = {
189d5bac 3257 .sa_handler = nop_signal_handler,
e866af3a
DH
3258 .sa_flags = SA_NOCLDSTOP,
3259 };
03cfe0d5
LP
3260 int ifi = 0;
3261 ssize_t l;
4afd3348 3262 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
dbb60d69 3263 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4afd3348 3264 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
dbb60d69 3265 char last_char = 0;
e866af3a 3266
7566e267 3267 r = barrier_create(&barrier);
a2da110b 3268 if (r < 0) {
da927ba9 3269 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3270 goto finish;
3271 }
3272
4610de50 3273 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3274 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3275 goto finish;
3276 }
3277
4610de50 3278 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3279 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3280 goto finish;
3281 }
3282
4610de50 3283 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3284 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3285 goto finish;
3286 }
3287
825d5287 3288 if (arg_userns)
4610de50 3289 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3290 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3291 goto finish;
3292 }
3293
e866af3a
DH
3294 /* Child can be killed before execv(), so handle SIGCHLD
3295 * in order to interrupt parent's blocking calls and
3296 * give it a chance to call wait() and terminate. */
3297 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3298 if (r < 0) {
ec16945e 3299 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3300 goto finish;
3301 }
3302
e866af3a
DH
3303 r = sigaction(SIGCHLD, &sa, NULL);
3304 if (r < 0) {
ec16945e 3305 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3306 goto finish;
3307 }
3308
03cfe0d5 3309 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
3310 if (pid < 0) {
3311 if (errno == EINVAL)
ec16945e 3312 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3313 else
ec16945e 3314 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3315
d87be9b0
LP
3316 goto finish;
3317 }
a258bf26 3318
d87be9b0 3319 if (pid == 0) {
03cfe0d5 3320 /* The outer child only has a file system namespace. */
a2da110b
DH
3321 barrier_set_role(&barrier, BARRIER_CHILD);
3322
03e334a1 3323 master = safe_close(master);
a258bf26 3324
03e334a1 3325 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3326 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3327 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 3328 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3329
ce30c8dc
LP
3330 (void) reset_all_signal_handlers();
3331 (void) reset_signal_mask();
f5c1b9ee 3332
03cfe0d5
LP
3333 r = outer_child(&barrier,
3334 arg_directory,
3335 console,
3336 root_device, root_device_rw,
3337 home_device, home_device_rw,
3338 srv_device, srv_device_rw,
3339 interactive,
3340 secondary,
3341 pid_socket_pair[1],
3342 kmsg_socket_pair[1],
3343 rtnl_socket_pair[1],
825d5287 3344 uid_shift_socket_pair[1],
f757855e 3345 fds);
0cb9fbcd 3346 if (r < 0)
a2da110b 3347 _exit(EXIT_FAILURE);
d87be9b0 3348
03cfe0d5 3349 _exit(EXIT_SUCCESS);
da5b3bad 3350 }
88213476 3351
a2da110b 3352 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3353
2feceb5e 3354 fds = fdset_free(fds);
842f3b0f 3355
6d0b55c2
LP
3356 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3357 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3358 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
82116c43 3359 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3360
03cfe0d5
LP
3361 /* Wait for the outer child. */
3362 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3363 if (r < 0)
3364 goto finish;
3365 if (r != 0) {
3366 r = -EIO;
3367 goto finish;
3368 }
3369 pid = 0;
6dac160c 3370
03cfe0d5
LP
3371 /* And now retrieve the PID of the inner child. */
3372 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3373 if (l < 0) {
3374 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3375 goto finish;
3376 }
3377 if (l != sizeof(pid)) {
76d44882 3378 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3379 r = EIO;
3380 goto finish;
3381 }
354bfd2b 3382
03cfe0d5 3383 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3384
03cfe0d5
LP
3385 if (arg_userns) {
3386 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3387 log_error("Child died too early.");
3388 r = -ESRCH;
840295fc 3389 goto finish;
03cfe0d5 3390 }
ab046dde 3391
825d5287
RM
3392 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3393 if (l < 0) {
3394 r = log_error_errno(errno, "Failed to read UID shift: %m");
3395 goto finish;
3396 }
3397 if (l != sizeof(arg_uid_shift)) {
76d44882 3398 log_error("Short read while reading UID shift.");
825d5287
RM
3399 r = EIO;
3400 goto finish;
3401 }
3402
03cfe0d5 3403 r = setup_uid_map(pid);
840295fc
LP
3404 if (r < 0)
3405 goto finish;
ab046dde 3406
03cfe0d5
LP
3407 (void) barrier_place(&barrier); /* #2 */
3408 }
c74e630d 3409
9a2a5625 3410 if (arg_private_network) {
4bbfe7ad 3411
9a2a5625
LP
3412 r = move_network_interfaces(pid, arg_network_interfaces);
3413 if (r < 0)
3414 goto finish;
5aa4bb6b 3415
9a2a5625
LP
3416 if (arg_network_veth) {
3417 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3418 if (r < 0)
3419 goto finish;
3420 else if (r > 0)
3421 ifi = r;
6dac160c 3422
9a2a5625
LP
3423 if (arg_network_bridge) {
3424 r = setup_bridge(veth_name, arg_network_bridge);
3425 if (r < 0)
3426 goto finish;
3427 if (r > 0)
3428 ifi = r;
3429 }
3430 }
6dac160c 3431
f6d6bad1
LP
3432 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3433 if (r < 0)
3434 goto finish;
3435
9a2a5625
LP
3436 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3437 if (r < 0)
3438 goto finish;
3439
3440 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3441 if (r < 0)
3442 goto finish;
3443 }
6dac160c 3444
b7103bc5
LP
3445 if (arg_register) {
3446 r = register_machine(
3447 arg_machine,
3448 pid,
3449 arg_directory,
3450 arg_uuid,
3451 ifi,
3452 arg_slice,
3453 arg_custom_mounts, arg_n_custom_mounts,
3454 arg_kill_signal,
3455 arg_property,
6aadfa4c
ILG
3456 arg_keep_unit,
3457 arg_container_service_name);
b7103bc5
LP
3458 if (r < 0)
3459 goto finish;
3460 }
6dac160c 3461
34829a32 3462 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
3463 if (r < 0)
3464 goto finish;
3465
34829a32
LP
3466 if (arg_keep_unit) {
3467 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3468 if (r < 0)
3469 goto finish;
3470 }
efdb0237 3471
34829a32 3472 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
3473 if (r < 0)
3474 goto finish;
6dac160c 3475
03cfe0d5
LP
3476 /* Notify the child that the parent is ready with all
3477 * its setup (including cgroup-ification), and that
3478 * the child can now hand over control to the code to
3479 * run inside the container. */
3480 (void) barrier_place(&barrier); /* #3 */
6dac160c 3481
03cfe0d5
LP
3482 /* Block SIGCHLD here, before notifying child.
3483 * process_pty() will handle it with the other signals. */
3484 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 3485
03cfe0d5
LP
3486 /* Reset signal to default */
3487 r = default_signals(SIGCHLD, -1);
3488 if (r < 0) {
3489 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3490 goto finish;
3491 }
e866af3a 3492
03cfe0d5 3493 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
3494 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3495 log_error("Child died too early.");
03cfe0d5
LP
3496 r = -ESRCH;
3497 goto finish;
3498 }
b12afc8c 3499
03cfe0d5
LP
3500 sd_notifyf(false,
3501 "READY=1\n"
3502 "STATUS=Container running.\n"
3503 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 3504
03cfe0d5
LP
3505 r = sd_event_new(&event);
3506 if (r < 0) {
3507 log_error_errno(r, "Failed to get default event source: %m");
3508 goto finish;
3509 }
88213476 3510
03cfe0d5
LP
3511 if (arg_kill_signal > 0) {
3512 /* Try to kill the init system on SIGINT or SIGTERM */
4a0b58c4
LP
3513 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3514 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
03cfe0d5
LP
3515 } else {
3516 /* Immediately exit */
3517 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3518 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3519 }
023fb90b 3520
03cfe0d5
LP
3521 /* simply exit on sigchld */
3522 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 3523
03cfe0d5 3524 if (arg_expose_ports) {
7a8f6325 3525 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
3526 if (r < 0)
3527 goto finish;
023fb90b 3528
7a8f6325 3529 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 3530 }
023fb90b 3531
03cfe0d5 3532 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 3533
ae3dde80 3534 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
3535 if (r < 0) {
3536 log_error_errno(r, "Failed to create PTY forwarder: %m");
3537 goto finish;
3538 }
023fb90b 3539
03cfe0d5
LP
3540 r = sd_event_loop(event);
3541 if (r < 0) {
3542 log_error_errno(r, "Failed to run event loop: %m");
3543 goto finish;
3544 }
6d0b55c2 3545
03cfe0d5 3546 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 3547
03cfe0d5 3548 forward = pty_forward_free(forward);
6d0b55c2 3549
03cfe0d5
LP
3550 if (!arg_quiet && last_char != '\n')
3551 putc('\n', stdout);
04d39279 3552
03cfe0d5 3553 /* Kill if it is not dead yet anyway */
b7103bc5
LP
3554 if (arg_register && !arg_keep_unit)
3555 terminate_machine(pid);
1f0cd86b 3556
840295fc 3557 /* Normally redundant, but better safe than sorry */
04d39279 3558 kill(pid, SIGKILL);
a258bf26 3559
113cea80 3560 r = wait_for_container(pid, &container_status);
04d39279
LP
3561 pid = 0;
3562
ec16945e 3563 if (r < 0)
ce9f1527
LP
3564 /* We failed to wait for the container, or the
3565 * container exited abnormally */
ec16945e
LP
3566 goto finish;
3567 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
3568 /* The container exited with a non-zero
3569 * status, or with zero status and no reboot
3570 * was requested. */
ec16945e 3571 ret = r;
d87be9b0 3572 break;
ec16945e 3573 }
88213476 3574
113cea80 3575 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3576
3577 if (arg_keep_unit) {
3578 /* Special handling if we are running as a
3579 * service: instead of simply restarting the
3580 * machine we want to restart the entire
3581 * service, so let's inform systemd about this
3582 * with the special exit code 133. The service
3583 * file uses RestartForceExitStatus=133 so
3584 * that this results in a full nspawn
3585 * restart. This is necessary since we might
3586 * have cgroup parameters set we want to have
3587 * flushed out. */
ec16945e
LP
3588 ret = 133;
3589 r = 0;
ce38dbc8
LP
3590 break;
3591 }
6d0b55c2 3592
7a8f6325 3593 expose_port_flush(arg_expose_ports, &exposed);
d87be9b0 3594 }
88213476
LP
3595
3596finish:
af4ec430
LP
3597 sd_notify(false,
3598 "STOPPING=1\n"
3599 "STATUS=Terminating...");
3600
9444b1f2
LP
3601 if (pid > 0)
3602 kill(pid, SIGKILL);
88213476 3603
503546da
LP
3604 /* Try to flush whatever is still queued in the pty */
3605 if (master >= 0)
59f448cf 3606 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 3607
03cfe0d5
LP
3608 loop_remove(loop_nr, &image_fd);
3609
ec16945e
LP
3610 if (remove_subvol && arg_directory) {
3611 int k;
3612
5bcd08db 3613 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
3614 if (k < 0)
3615 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3616 }
3617
785890ac
LP
3618 if (arg_machine) {
3619 const char *p;
3620
63c372cb 3621 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3622 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3623 }
3624
7a8f6325 3625 expose_port_flush(arg_expose_ports, &exposed);
f757855e 3626
04d391da 3627 free(arg_directory);
ec16945e
LP
3628 free(arg_template);
3629 free(arg_image);
7027ff61 3630 free(arg_machine);
c74e630d
LP
3631 free(arg_user);
3632 strv_free(arg_setenv);
f757855e 3633 free(arg_network_bridge);
c74e630d
LP
3634 strv_free(arg_network_interfaces);
3635 strv_free(arg_network_macvlan);
4bbfe7ad 3636 strv_free(arg_network_ipvlan);
f6d6bad1 3637 strv_free(arg_network_veth_extra);
f757855e
LP
3638 strv_free(arg_parameters);
3639 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3640 expose_port_free_all(arg_expose_ports);
6d0b55c2 3641
ec16945e 3642 return r < 0 ? EXIT_FAILURE : ret;
88213476 3643}