]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Merge pull request #2463 from poettering/machined-tty-fix
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
8fe0087e
LP
22#ifdef HAVE_BLKID
23#include <blkid/blkid.h>
24#endif
88213476 25#include <errno.h>
88213476 26#include <getopt.h>
1b9e5b12 27#include <linux/loop.h>
8fe0087e 28#include <sched.h>
24fb1112
LP
29#ifdef HAVE_SECCOMP
30#include <seccomp.h>
31#endif
8fe0087e
LP
32#ifdef HAVE_SELINUX
33#include <selinux/selinux.h>
1b9e5b12 34#endif
8fe0087e
LP
35#include <signal.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#include <sys/file.h>
40#include <sys/mount.h>
41#include <sys/personality.h>
42#include <sys/prctl.h>
43#include <sys/types.h>
44#include <unistd.h>
1b9e5b12 45
1f0cd86b 46#include "sd-daemon.h"
1f0cd86b 47#include "sd-id128.h"
8fe0087e 48
b5efdb8a 49#include "alloc-util.h"
8fe0087e
LP
50#include "barrier.h"
51#include "base-filesystem.h"
52#include "blkid-util.h"
53#include "btrfs-util.h"
8fe0087e 54#include "cap-list.h"
430f0182 55#include "capability-util.h"
04d391da 56#include "cgroup-util.h"
8fe0087e 57#include "copy.h"
4fc9982c 58#include "dev-setup.h"
8fe0087e 59#include "env-util.h"
3ffd4af2 60#include "fd-util.h"
842f3b0f 61#include "fdset.h"
a5c32cff 62#include "fileio.h"
8fe0087e 63#include "formats-util.h"
f4f15635 64#include "fs-util.h"
1b9e5b12 65#include "gpt.h"
8fe0087e
LP
66#include "hostname-util.h"
67#include "log.h"
68#include "loopback-setup.h"
1b9cebf6 69#include "machine-image.h"
8fe0087e
LP
70#include "macro.h"
71#include "missing.h"
72#include "mkdir.h"
4349cd7c 73#include "mount-util.h"
8fe0087e 74#include "netlink-util.h"
07630cea
LP
75#include "nspawn-cgroup.h"
76#include "nspawn-expose-ports.h"
77#include "nspawn-mount.h"
78#include "nspawn-network.h"
79#include "nspawn-register.h"
80#include "nspawn-settings.h"
81#include "nspawn-setuid.h"
6bedfcbb 82#include "parse-util.h"
8fe0087e 83#include "path-util.h"
0b452006 84#include "process-util.h"
8fe0087e
LP
85#include "ptyfwd.h"
86#include "random-util.h"
87#include "rm-rf.h"
e9642be2
LP
88#ifdef HAVE_SECCOMP
89#include "seccomp-util.h"
90#endif
8fe0087e 91#include "signal-util.h"
2583fbea 92#include "socket-util.h"
8fcde012 93#include "stat-util.h"
15a5e950 94#include "stdio-util.h"
07630cea 95#include "string-util.h"
8fe0087e
LP
96#include "strv.h"
97#include "terminal-util.h"
98#include "udev-util.h"
affb60b1 99#include "umask-util.h"
b1d4f8e1 100#include "user-util.h"
8fe0087e 101#include "util.h"
e9642be2 102
113cea80
DH
/* Outcome of a container run.
 * NOTE(review): not referenced in this part of the file; presumably
 * distinguishes a container that exited from one that asked for a
 * reboot -- confirm against the code that waits for the payload. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
107
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, also the built-in default */
        LINK_HOST,      /* --link-journal=host and try-host */
        LINK_GUEST      /* --link-journal=guest and try-guest, and -j */
} LinkJournal;
88213476
LP
114
/* Option state. Unless noted otherwise, each variable is filled in by
 * parse_argv() from the command line switch named next to it. */
static char *arg_directory = NULL;                      /* -D, --directory= */
static char *arg_template = NULL;                       /* --template= */
static char *arg_user = NULL;                           /* -u, --user= */
static sd_id128_t arg_uuid = {};                        /* --uuid= */
static char *arg_machine = NULL;                        /* -M, --machine= */
static const char *arg_selinux_context = NULL;          /* -Z; points into argv[], not owned */
static const char *arg_selinux_apifs_context = NULL;    /* -L; points into argv[], not owned */
static const char *arg_slice = NULL;                    /* -S; points into argv[], not owned */
static bool arg_private_network = false;                /* --private-network, also implied by the other network switches */
static bool arg_read_only = false;                      /* --read-only */
static bool arg_boot = false;                           /* -b, --boot */
static bool arg_ephemeral = false;                      /* -x, --ephemeral */
static LinkJournal arg_link_journal = LINK_AUTO;        /* --link-journal=, -j */
static bool arg_link_journal_try = false;               /* true for -j and the "try-" variants of --link-journal= */
/* Capability set retained by default. parse_argv() ORs in the
 * --capability= bits (plus CAP_NET_ADMIN when private networking is
 * enabled) and then masks out the --drop-capability= bits. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
static CustomMount *arg_custom_mounts = NULL;           /* --bind=, --bind-ro=, --tmpfs=, --overlay=, --overlay-ro= */
static unsigned arg_n_custom_mounts = 0;                /* number of entries in arg_custom_mounts */
static char **arg_setenv = NULL;                        /* --setenv= */
static bool arg_quiet = false;                          /* -q, --quiet */
static bool arg_share_system = false;                   /* --share-system */
static bool arg_register = true;                        /* --register=; forced to false by --share-system */
static bool arg_keep_unit = false;                      /* --keep-unit */
static char **arg_network_interfaces = NULL;            /* --network-interface= */
static char **arg_network_macvlan = NULL;               /* --network-macvlan= */
static char **arg_network_ipvlan = NULL;                /* --network-ipvlan= */
static bool arg_network_veth = false;                   /* -n, --network-veth; also set by --network-bridge= */
static char **arg_network_veth_extra = NULL;            /* --network-veth-extra= */
static char *arg_network_bridge = NULL;                 /* --network-bridge= */
static unsigned long arg_personality = PERSONALITY_INVALID; /* --personality= */
static char *arg_image = NULL;                          /* -i, --image= */
static VolatileMode arg_volatile_mode = VOLATILE_NO;    /* --volatile[=MODE] */
static ExposePort *arg_expose_ports = NULL;             /* -p, --port= */
static char **arg_property = NULL;                      /* --property= */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; /* --private-users=UIDBASE[:NUIDS] */
static bool arg_userns = false;                         /* --private-users */
static int arg_kill_signal = 0;                         /* --kill-signal=; verify_arguments() picks SIGRTMIN+3 for --boot if unset */
static bool arg_unified_cgroup_hierarchy = false;       /* set by detect_unified_cgroup_hierarchy() */
static SettingsMask arg_settings_mask = 0;              /* SETTING_* bits for settings given on the command line */
static int arg_settings_trusted = -1;                   /* --settings=: true for "trusted", false for "no", -1 otherwise */
static char **arg_parameters = NULL;                    /* copy of the non-option arguments */
static const char *arg_container_service_name = "systemd-nspawn"; /* overridden by $SYSTEMD_NSPAWN_CONTAINER_SERVICE */
88213476 182
601185b4 183static void help(void) {
88213476
LP
184 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
185 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
186 " -h --help Show this help\n"
187 " --version Print version string\n"
69c79d3c 188 " -q --quiet Do not show status information\n"
1b9e5b12 189 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
190 " --template=PATH Initialize root directory from template directory,\n"
191 " if missing\n"
192 " -x --ephemeral Run container with snapshot of root directory, and\n"
193 " remove it after exit\n"
194 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
195 " -b --boot Boot up full system (i.e. invoke init)\n"
196 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 197 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 198 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 199 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 200 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
201 " --private-users[=UIDBASE[:NUIDS]]\n"
202 " Run within user namespace\n"
69c79d3c
LP
203 " --private-network Disable network in container\n"
204 " --network-interface=INTERFACE\n"
205 " Assign an existing network interface to the\n"
206 " container\n"
c74e630d
LP
207 " --network-macvlan=INTERFACE\n"
208 " Create a macvlan network interface based on an\n"
209 " existing network interface to the container\n"
4bbfe7ad
TG
210 " --network-ipvlan=INTERFACE\n"
211 " Create a ipvlan network interface based on an\n"
212 " existing network interface to the container\n"
a8eaaee7 213 " -n --network-veth Add a virtual Ethernet connection between host\n"
69c79d3c 214 " and container\n"
f6d6bad1
LP
215 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
216 " Add an additional virtual Ethernet link between\n"
217 " host and container\n"
ab046dde 218 " --network-bridge=INTERFACE\n"
a8eaaee7 219 " Add a virtual Ethernet connection between host\n"
ab046dde
TG
220 " and container and add it to an existing bridge on\n"
221 " the host\n"
6d0b55c2 222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 223 " Expose a container IP port on the host\n"
82adf6af
LP
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
a8828ed9
DW
230 " --capability=CAP In addition to the default, retain specified\n"
231 " capability\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 237 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
238 " --bind=PATH[:PATH[:OPTIONS]]\n"
239 " Bind mount a file or directory from the host into\n"
a8828ed9 240 " the container\n"
5e5bfa6e
EY
241 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
242 " Similar, but creates a read-only bind mount\n"
06c17c39 243 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
244 " --overlay=PATH[:PATH...]:PATH\n"
245 " Create an overlay mount from the host to \n"
246 " the container\n"
247 " --overlay-ro=PATH[:PATH...]:PATH\n"
248 " Similar, but creates a read-only overlay mount\n"
284c0b91 249 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 250 " --share-system Share system namespaces with host\n"
eb91eb18 251 " --register=BOOLEAN Register container as machine\n"
89f7c846 252 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 253 " the service unit nspawn is running in\n"
6d0b55c2 254 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 255 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 256 , program_invocation_short_name);
88213476
LP
257}
258
5a8af538
LP
259
260static int custom_mounts_prepare(void) {
261 unsigned i;
262 int r;
263
264 /* Ensure the mounts are applied prefix first. */
265 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
266
267 /* Allocate working directories for the overlay file systems that need it */
268 for (i = 0; i < arg_n_custom_mounts; i++) {
269 CustomMount *m = &arg_custom_mounts[i];
270
825d5287
RM
271 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
272 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
273 return -EINVAL;
274 }
275
5a8af538
LP
276 if (m->type != CUSTOM_MOUNT_OVERLAY)
277 continue;
278
279 if (m->work_dir)
280 continue;
281
282 if (m->read_only)
283 continue;
284
14bcf25c 285 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
286 if (r < 0)
287 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
288 }
289
290 return 0;
291}
292
efdb0237
LP
293static int detect_unified_cgroup_hierarchy(void) {
294 const char *e;
295 int r;
296
297 /* Allow the user to control whether the unified hierarchy is used */
298 e = getenv("UNIFIED_CGROUP_HIERARCHY");
299 if (e) {
300 r = parse_boolean(e);
301 if (r < 0)
302 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
303
304 arg_unified_cgroup_hierarchy = r;
305 return 0;
306 }
307
308 /* Otherwise inherit the default from the host system */
309 r = cg_unified();
310 if (r < 0)
311 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
312
313 arg_unified_cgroup_hierarchy = r;
314 return 0;
315}
316
88213476
LP
317static int parse_argv(int argc, char *argv[]) {
318
a41fe3a2 319 enum {
acbeb427
ZJS
320 ARG_VERSION = 0x100,
321 ARG_PRIVATE_NETWORK,
bc2f673e 322 ARG_UUID,
5076f0cc 323 ARG_READ_ONLY,
57fb9fb5 324 ARG_CAPABILITY,
420c7379 325 ARG_DROP_CAPABILITY,
17fe0523
LP
326 ARG_LINK_JOURNAL,
327 ARG_BIND,
f4889f65 328 ARG_BIND_RO,
06c17c39 329 ARG_TMPFS,
5a8af538
LP
330 ARG_OVERLAY,
331 ARG_OVERLAY_RO,
f4889f65 332 ARG_SETENV,
eb91eb18 333 ARG_SHARE_SYSTEM,
89f7c846 334 ARG_REGISTER,
aa28aefe 335 ARG_KEEP_UNIT,
69c79d3c 336 ARG_NETWORK_INTERFACE,
c74e630d 337 ARG_NETWORK_MACVLAN,
4bbfe7ad 338 ARG_NETWORK_IPVLAN,
ab046dde 339 ARG_NETWORK_BRIDGE,
f6d6bad1 340 ARG_NETWORK_VETH_EXTRA,
6afc95b7 341 ARG_PERSONALITY,
4d9f07b4 342 ARG_VOLATILE,
ec16945e 343 ARG_TEMPLATE,
f36933fe 344 ARG_PROPERTY,
6dac160c 345 ARG_PRIVATE_USERS,
c6c8f6e2 346 ARG_KILL_SIGNAL,
f757855e 347 ARG_SETTINGS,
a41fe3a2
LP
348 };
349
88213476 350 static const struct option options[] = {
aa28aefe
LP
351 { "help", no_argument, NULL, 'h' },
352 { "version", no_argument, NULL, ARG_VERSION },
353 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
354 { "template", required_argument, NULL, ARG_TEMPLATE },
355 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
356 { "user", required_argument, NULL, 'u' },
357 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
358 { "boot", no_argument, NULL, 'b' },
359 { "uuid", required_argument, NULL, ARG_UUID },
360 { "read-only", no_argument, NULL, ARG_READ_ONLY },
361 { "capability", required_argument, NULL, ARG_CAPABILITY },
362 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
363 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
364 { "bind", required_argument, NULL, ARG_BIND },
365 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 366 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
367 { "overlay", required_argument, NULL, ARG_OVERLAY },
368 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
369 { "machine", required_argument, NULL, 'M' },
370 { "slice", required_argument, NULL, 'S' },
371 { "setenv", required_argument, NULL, ARG_SETENV },
372 { "selinux-context", required_argument, NULL, 'Z' },
373 { "selinux-apifs-context", required_argument, NULL, 'L' },
374 { "quiet", no_argument, NULL, 'q' },
375 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
376 { "register", required_argument, NULL, ARG_REGISTER },
377 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
378 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 379 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 380 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 381 { "network-veth", no_argument, NULL, 'n' },
f6d6bad1 382 { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA},
ab046dde 383 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 384 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 385 { "image", required_argument, NULL, 'i' },
4d9f07b4 386 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 387 { "port", required_argument, NULL, 'p' },
f36933fe 388 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 389 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 390 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 391 { "settings", required_argument, NULL, ARG_SETTINGS },
eb9da376 392 {}
88213476
LP
393 };
394
9444b1f2 395 int c, r;
6aadfa4c 396 const char *p, *e;
a42c8b54 397 uint64_t plus = 0, minus = 0;
f757855e 398 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
399
400 assert(argc >= 0);
401 assert(argv);
402
0dfaa006 403 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
404
405 switch (c) {
406
407 case 'h':
601185b4
ZJS
408 help();
409 return 0;
88213476 410
acbeb427 411 case ARG_VERSION:
3f6fd1ba 412 return version();
acbeb427 413
88213476 414 case 'D':
0f03c2a4 415 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 416 if (r < 0)
0f03c2a4 417 return r;
ec16945e
LP
418 break;
419
420 case ARG_TEMPLATE:
0f03c2a4 421 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 422 if (r < 0)
0f03c2a4 423 return r;
88213476
LP
424 break;
425
1b9e5b12 426 case 'i':
0f03c2a4 427 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 428 if (r < 0)
0f03c2a4 429 return r;
ec16945e
LP
430 break;
431
432 case 'x':
433 arg_ephemeral = true;
1b9e5b12
LP
434 break;
435
687d0825 436 case 'u':
2fc09a9c
DM
437 r = free_and_strdup(&arg_user, optarg);
438 if (r < 0)
7027ff61 439 return log_oom();
687d0825 440
f757855e 441 arg_settings_mask |= SETTING_USER;
687d0825
MV
442 break;
443
ab046dde 444 case ARG_NETWORK_BRIDGE:
f757855e
LP
445 r = free_and_strdup(&arg_network_bridge, optarg);
446 if (r < 0)
447 return log_oom();
ab046dde
TG
448
449 /* fall through */
450
0dfaa006 451 case 'n':
69c79d3c
LP
452 arg_network_veth = true;
453 arg_private_network = true;
f757855e 454 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
455 break;
456
f6d6bad1
LP
457 case ARG_NETWORK_VETH_EXTRA:
458 r = veth_extra_parse(&arg_network_veth_extra, optarg);
459 if (r < 0)
460 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
461
462 arg_private_network = true;
463 arg_settings_mask |= SETTING_NETWORK;
464 break;
465
aa28aefe 466 case ARG_NETWORK_INTERFACE:
c74e630d
LP
467 if (strv_extend(&arg_network_interfaces, optarg) < 0)
468 return log_oom();
469
470 arg_private_network = true;
f757855e 471 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
472 break;
473
474 case ARG_NETWORK_MACVLAN:
475 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
476 return log_oom();
477
4bbfe7ad 478 arg_private_network = true;
f757855e 479 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
480 break;
481
482 case ARG_NETWORK_IPVLAN:
483 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
484 return log_oom();
485
aa28aefe
LP
486 /* fall through */
487
ff01d048
LP
488 case ARG_PRIVATE_NETWORK:
489 arg_private_network = true;
f757855e 490 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
491 break;
492
0f0dbc46
LP
493 case 'b':
494 arg_boot = true;
f757855e 495 arg_settings_mask |= SETTING_BOOT;
0f0dbc46
LP
496 break;
497
144f0fc0 498 case ARG_UUID:
9444b1f2
LP
499 r = sd_id128_from_string(optarg, &arg_uuid);
500 if (r < 0) {
aa96c6cb 501 log_error("Invalid UUID: %s", optarg);
9444b1f2 502 return r;
aa96c6cb 503 }
f757855e
LP
504
505 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 506 break;
aa96c6cb 507
9444b1f2 508 case 'S':
c74e630d 509 arg_slice = optarg;
144f0fc0
LP
510 break;
511
7027ff61 512 case 'M':
c1521918 513 if (isempty(optarg))
97b11eed 514 arg_machine = mfree(arg_machine);
c1521918 515 else {
0c3c4284 516 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
517 log_error("Invalid machine name: %s", optarg);
518 return -EINVAL;
519 }
7027ff61 520
0c3c4284
LP
521 r = free_and_strdup(&arg_machine, optarg);
522 if (r < 0)
eb91eb18
LP
523 return log_oom();
524
525 break;
526 }
7027ff61 527
82adf6af
LP
528 case 'Z':
529 arg_selinux_context = optarg;
a8828ed9
DW
530 break;
531
82adf6af
LP
532 case 'L':
533 arg_selinux_apifs_context = optarg;
a8828ed9
DW
534 break;
535
bc2f673e
LP
536 case ARG_READ_ONLY:
537 arg_read_only = true;
f757855e 538 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
539 break;
540
420c7379
LP
541 case ARG_CAPABILITY:
542 case ARG_DROP_CAPABILITY: {
6cbe4ed1
SS
543 p = optarg;
544 for(;;) {
545 _cleanup_free_ char *t = NULL;
5076f0cc 546
6cbe4ed1
SS
547 r = extract_first_word(&p, &t, ",", 0);
548 if (r < 0)
549 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 550
6cbe4ed1
SS
551 if (r == 0)
552 break;
5076f0cc 553
39ed67d1
LP
554 if (streq(t, "all")) {
555 if (c == ARG_CAPABILITY)
a42c8b54 556 plus = (uint64_t) -1;
39ed67d1 557 else
a42c8b54 558 minus = (uint64_t) -1;
39ed67d1 559 } else {
2822da4f
LP
560 int cap;
561
562 cap = capability_from_name(t);
563 if (cap < 0) {
39ed67d1
LP
564 log_error("Failed to parse capability %s.", t);
565 return -EINVAL;
566 }
567
568 if (c == ARG_CAPABILITY)
a42c8b54 569 plus |= 1ULL << (uint64_t) cap;
39ed67d1 570 else
a42c8b54 571 minus |= 1ULL << (uint64_t) cap;
5076f0cc 572 }
5076f0cc
LP
573 }
574
f757855e 575 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
576 break;
577 }
578
57fb9fb5
LP
579 case 'j':
580 arg_link_journal = LINK_GUEST;
574edc90 581 arg_link_journal_try = true;
57fb9fb5
LP
582 break;
583
584 case ARG_LINK_JOURNAL:
53e438e3 585 if (streq(optarg, "auto")) {
57fb9fb5 586 arg_link_journal = LINK_AUTO;
53e438e3
LP
587 arg_link_journal_try = false;
588 } else if (streq(optarg, "no")) {
57fb9fb5 589 arg_link_journal = LINK_NO;
53e438e3
LP
590 arg_link_journal_try = false;
591 } else if (streq(optarg, "guest")) {
57fb9fb5 592 arg_link_journal = LINK_GUEST;
53e438e3
LP
593 arg_link_journal_try = false;
594 } else if (streq(optarg, "host")) {
57fb9fb5 595 arg_link_journal = LINK_HOST;
53e438e3
LP
596 arg_link_journal_try = false;
597 } else if (streq(optarg, "try-guest")) {
574edc90
MP
598 arg_link_journal = LINK_GUEST;
599 arg_link_journal_try = true;
600 } else if (streq(optarg, "try-host")) {
601 arg_link_journal = LINK_HOST;
602 arg_link_journal_try = true;
603 } else {
57fb9fb5
LP
604 log_error("Failed to parse link journal mode %s", optarg);
605 return -EINVAL;
606 }
607
608 break;
609
17fe0523 610 case ARG_BIND:
f757855e
LP
611 case ARG_BIND_RO:
612 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
613 if (r < 0)
614 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 615
f757855e 616 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 617 break;
06c17c39 618
f757855e
LP
619 case ARG_TMPFS:
620 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
621 if (r < 0)
622 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 623
f757855e 624 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 625 break;
5a8af538
LP
626
627 case ARG_OVERLAY:
628 case ARG_OVERLAY_RO: {
629 _cleanup_free_ char *upper = NULL, *destination = NULL;
630 _cleanup_strv_free_ char **lower = NULL;
631 CustomMount *m;
632 unsigned n = 0;
633 char **i;
634
62f9f39a
RM
635 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
636 if (r == -ENOMEM)
06c17c39 637 return log_oom();
62f9f39a
RM
638 else if (r < 0) {
639 log_error("Invalid overlay specification: %s", optarg);
640 return r;
641 }
06c17c39 642
5a8af538
LP
643 STRV_FOREACH(i, lower) {
644 if (!path_is_absolute(*i)) {
645 log_error("Overlay path %s is not absolute.", *i);
646 return -EINVAL;
647 }
648
649 n++;
650 }
651
652 if (n < 2) {
653 log_error("--overlay= needs at least two colon-separated directories specified.");
654 return -EINVAL;
655 }
656
657 if (n == 2) {
658 /* If two parameters are specified,
659 * the first one is the lower, the
660 * second one the upper directory. And
af86c440
ZJS
661 * we'll also define the destination
662 * mount point the same as the upper. */
5a8af538
LP
663 upper = lower[1];
664 lower[1] = NULL;
665
666 destination = strdup(upper);
667 if (!destination)
668 return log_oom();
669
670 } else {
671 upper = lower[n - 2];
672 destination = lower[n - 1];
673 lower[n - 2] = NULL;
674 }
675
f757855e 676 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
677 if (!m)
678 return log_oom();
679
680 m->destination = destination;
681 m->source = upper;
682 m->lower = lower;
683 m->read_only = c == ARG_OVERLAY_RO;
684
685 upper = destination = NULL;
686 lower = NULL;
06c17c39 687
f757855e 688 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
689 break;
690 }
691
f4889f65
LP
692 case ARG_SETENV: {
693 char **n;
694
695 if (!env_assignment_is_valid(optarg)) {
696 log_error("Environment variable assignment '%s' is not valid.", optarg);
697 return -EINVAL;
698 }
699
700 n = strv_env_set(arg_setenv, optarg);
701 if (!n)
702 return log_oom();
703
704 strv_free(arg_setenv);
705 arg_setenv = n;
f757855e
LP
706
707 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
708 break;
709 }
710
284c0b91
LP
711 case 'q':
712 arg_quiet = true;
713 break;
714
8a96d94e
LP
715 case ARG_SHARE_SYSTEM:
716 arg_share_system = true;
717 break;
718
eb91eb18
LP
719 case ARG_REGISTER:
720 r = parse_boolean(optarg);
721 if (r < 0) {
722 log_error("Failed to parse --register= argument: %s", optarg);
723 return r;
724 }
725
726 arg_register = r;
727 break;
728
89f7c846
LP
729 case ARG_KEEP_UNIT:
730 arg_keep_unit = true;
731 break;
732
6afc95b7
LP
733 case ARG_PERSONALITY:
734
ac45f971 735 arg_personality = personality_from_string(optarg);
050f7277 736 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
737 log_error("Unknown or unsupported personality '%s'.", optarg);
738 return -EINVAL;
739 }
740
f757855e 741 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
742 break;
743
4d9f07b4
LP
744 case ARG_VOLATILE:
745
746 if (!optarg)
f757855e 747 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 748 else {
f757855e 749 VolatileMode m;
4d9f07b4 750
f757855e
LP
751 m = volatile_mode_from_string(optarg);
752 if (m < 0) {
753 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 754 return -EINVAL;
f757855e
LP
755 } else
756 arg_volatile_mode = m;
6d0b55c2
LP
757 }
758
f757855e
LP
759 arg_settings_mask |= SETTING_VOLATILE_MODE;
760 break;
6d0b55c2 761
f757855e
LP
762 case 'p':
763 r = expose_port_parse(&arg_expose_ports, optarg);
764 if (r == -EEXIST)
765 return log_error_errno(r, "Duplicate port specification: %s", optarg);
766 if (r < 0)
767 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 768
f757855e 769 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 770 break;
6d0b55c2 771
f36933fe
LP
772 case ARG_PROPERTY:
773 if (strv_extend(&arg_property, optarg) < 0)
774 return log_oom();
775
776 break;
777
6dac160c
LP
778 case ARG_PRIVATE_USERS:
779 if (optarg) {
780 _cleanup_free_ char *buffer = NULL;
781 const char *range, *shift;
782
783 range = strchr(optarg, ':');
784 if (range) {
785 buffer = strndup(optarg, range - optarg);
786 if (!buffer)
787 return log_oom();
788 shift = buffer;
789
790 range++;
791 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
792 log_error("Failed to parse UID range: %s", range);
793 return -EINVAL;
794 }
795 } else
796 shift = optarg;
797
798 if (parse_uid(shift, &arg_uid_shift) < 0) {
799 log_error("Failed to parse UID: %s", optarg);
800 return -EINVAL;
801 }
802 }
803
804 arg_userns = true;
805 break;
806
c6c8f6e2
LP
807 case ARG_KILL_SIGNAL:
808 arg_kill_signal = signal_from_string_try_harder(optarg);
809 if (arg_kill_signal < 0) {
810 log_error("Cannot parse signal: %s", optarg);
811 return -EINVAL;
812 }
813
f757855e
LP
814 arg_settings_mask |= SETTING_KILL_SIGNAL;
815 break;
816
817 case ARG_SETTINGS:
818
819 /* no → do not read files
820 * yes → read files, do not override cmdline, trust only subset
821 * override → read files, override cmdline, trust only subset
822 * trusted → read files, do not override cmdline, trust all
823 */
824
825 r = parse_boolean(optarg);
826 if (r < 0) {
827 if (streq(optarg, "trusted")) {
828 mask_all_settings = false;
829 mask_no_settings = false;
830 arg_settings_trusted = true;
831
832 } else if (streq(optarg, "override")) {
833 mask_all_settings = false;
834 mask_no_settings = true;
835 arg_settings_trusted = -1;
836 } else
837 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
838 } else if (r > 0) {
839 /* yes */
840 mask_all_settings = false;
841 mask_no_settings = false;
842 arg_settings_trusted = -1;
843 } else {
844 /* no */
845 mask_all_settings = true;
846 mask_no_settings = false;
847 arg_settings_trusted = false;
848 }
849
c6c8f6e2
LP
850 break;
851
88213476
LP
852 case '?':
853 return -EINVAL;
854
855 default:
eb9da376 856 assert_not_reached("Unhandled option");
88213476 857 }
88213476 858
eb91eb18
LP
859 if (arg_share_system)
860 arg_register = false;
861
862 if (arg_boot && arg_share_system) {
863 log_error("--boot and --share-system may not be combined.");
864 return -EINVAL;
865 }
866
89f7c846
LP
867 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
868 log_error("--keep-unit may not be used when invoked from a user session.");
869 return -EINVAL;
870 }
871
1b9e5b12
LP
872 if (arg_directory && arg_image) {
873 log_error("--directory= and --image= may not be combined.");
874 return -EINVAL;
875 }
876
ec16945e
LP
877 if (arg_template && arg_image) {
878 log_error("--template= and --image= may not be combined.");
879 return -EINVAL;
880 }
881
882 if (arg_template && !(arg_directory || arg_machine)) {
883 log_error("--template= needs --directory= or --machine=.");
884 return -EINVAL;
885 }
886
887 if (arg_ephemeral && arg_template) {
888 log_error("--ephemeral and --template= may not be combined.");
889 return -EINVAL;
890 }
891
892 if (arg_ephemeral && arg_image) {
893 log_error("--ephemeral and --image= may not be combined.");
894 return -EINVAL;
895 }
896
df9a75e4
LP
897 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
898 log_error("--ephemeral and --link-journal= may not be combined.");
899 return -EINVAL;
900 }
901
f757855e
LP
902 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
903 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
904
905 if (argc > optind) {
906 arg_parameters = strv_copy(argv + optind);
907 if (!arg_parameters)
908 return log_oom();
909
910 arg_settings_mask |= SETTING_BOOT;
911 }
912
913 /* Load all settings from .nspawn files */
914 if (mask_no_settings)
915 arg_settings_mask = 0;
916
917 /* Don't load any settings from .nspawn files */
918 if (mask_all_settings)
919 arg_settings_mask = _SETTINGS_MASK_ALL;
920
921 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
922
923 r = detect_unified_cgroup_hierarchy();
924 if (r < 0)
925 return r;
926
6aadfa4c
ILG
927 e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
928 if (e)
929 arg_container_service_name = e;
930
f757855e
LP
931 return 1;
932}
933
934static int verify_arguments(void) {
935
936 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
937 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
938 return -EINVAL;
939 }
940
6d0b55c2
LP
941 if (arg_expose_ports && !arg_private_network) {
942 log_error("Cannot use --port= without private networking.");
943 return -EINVAL;
944 }
945
c6c8f6e2
LP
946 if (arg_boot && arg_kill_signal <= 0)
947 arg_kill_signal = SIGRTMIN+3;
948
f757855e 949 return 0;
88213476
LP
950}
951
03cfe0d5
LP
952static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
953 assert(p);
954
955 if (!arg_userns)
956 return 0;
957
958 if (uid == UID_INVALID && gid == GID_INVALID)
959 return 0;
960
961 if (uid != UID_INVALID) {
962 uid += arg_uid_shift;
963
964 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
965 return -EOVERFLOW;
966 }
967
968 if (gid != GID_INVALID) {
969 gid += (gid_t) arg_uid_shift;
970
971 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
972 return -EOVERFLOW;
973 }
974
975 if (lchown(p, uid, gid) < 0)
976 return -errno;
b12afc8c
LP
977
978 return 0;
979}
980
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and, if it
 * was newly created, passes it to userns_lchown() with uid/gid. A directory
 * that already exists is left alone and counts as success. Returns 0 on
 * success or a negative errno-style value. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        /* Something is already there: do not touch its ownership */
        return errno == EEXIST ? 0 : -errno;
}
993
e58a1277 994static int setup_timezone(const char *dest) {
03cfe0d5
LP
995 _cleanup_free_ char *p = NULL, *q = NULL;
996 const char *where, *check, *what;
d4036145
LP
997 char *z, *y;
998 int r;
f8440af5 999
e58a1277
LP
1000 assert(dest);
1001
1002 /* Fix the timezone, if possible */
d4036145
LP
1003 r = readlink_malloc("/etc/localtime", &p);
1004 if (r < 0) {
1005 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1006 return 0;
1007 }
1008
1009 z = path_startswith(p, "../usr/share/zoneinfo/");
1010 if (!z)
1011 z = path_startswith(p, "/usr/share/zoneinfo/");
1012 if (!z) {
1013 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1014 return 0;
1015 }
1016
03cfe0d5 1017 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1018 r = readlink_malloc(where, &q);
1019 if (r >= 0) {
1020 y = path_startswith(q, "../usr/share/zoneinfo/");
1021 if (!y)
1022 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1023
d4036145
LP
1024 /* Already pointing to the right place? Then do nothing .. */
1025 if (y && streq(y, z))
1026 return 0;
1027 }
1028
03cfe0d5 1029 check = strjoina("/usr/share/zoneinfo/", z);
61e741ed 1030 check = prefix_roota(dest, check);
03cfe0d5 1031 if (laccess(check, F_OK) < 0) {
d4036145
LP
1032 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1033 return 0;
1034 }
68fb0892 1035
79d80fc1
TG
1036 r = unlink(where);
1037 if (r < 0 && errno != ENOENT) {
56f64d95 1038 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1039 return 0;
1040 }
4d9f07b4 1041
03cfe0d5 1042 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1043 if (symlink(what, where) < 0) {
56f64d95 1044 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1045 return 0;
1046 }
e58a1277 1047
03cfe0d5
LP
1048 r = userns_lchown(where, 0, 0);
1049 if (r < 0)
1050 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1051
e58a1277 1052 return 0;
88213476
LP
1053}
1054
2547bb41 1055static int setup_resolv_conf(const char *dest) {
03cfe0d5 1056 const char *where = NULL;
79d80fc1 1057 int r;
2547bb41
LP
1058
1059 assert(dest);
1060
1061 if (arg_private_network)
1062 return 0;
1063
1064 /* Fix resolv.conf, if possible */
03cfe0d5 1065 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1066
f2068bcc 1067 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1068 if (r < 0) {
68a313c5
LP
1069 /* If the file already exists as symlink, let's
1070 * suppress the warning, under the assumption that
1071 * resolved or something similar runs inside and the
1072 * symlink points there.
1073 *
1074 * If the disk image is read-only, there's also no
1075 * point in complaining.
1076 */
1077 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1078 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1079 return 0;
1080 }
2547bb41 1081
03cfe0d5
LP
1082 r = userns_lchown(where, 0, 0);
1083 if (r < 0)
1084 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1085
2547bb41
LP
1086 return 0;
1087}
1088
9f24adc2 1089static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1090 assert(s);
9f24adc2
LP
1091
1092 snprintf(s, 37,
1093 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1094 SD_ID128_FORMAT_VAL(id));
1095
1096 return s;
1097}
1098
04bc4a3f 1099static int setup_boot_id(const char *dest) {
03cfe0d5 1100 const char *from, *to;
39883f62 1101 sd_id128_t rnd = {};
04bc4a3f
LP
1102 char as_uuid[37];
1103 int r;
1104
eb91eb18
LP
1105 if (arg_share_system)
1106 return 0;
1107
04bc4a3f
LP
1108 /* Generate a new randomized boot ID, so that each boot-up of
1109 * the container gets a new one */
1110
03cfe0d5
LP
1111 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1112 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1113
1114 r = sd_id128_randomize(&rnd);
f647962d
MS
1115 if (r < 0)
1116 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1117
9f24adc2 1118 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1119
4c1fc3e4 1120 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1121 if (r < 0)
1122 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1123
03cfe0d5
LP
1124 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1125 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1126 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1127 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1128
1129 unlink(from);
04bc4a3f
LP
1130 return r;
1131}
1132
e58a1277 1133static int copy_devnodes(const char *dest) {
88213476
LP
1134
1135 static const char devnodes[] =
1136 "null\0"
1137 "zero\0"
1138 "full\0"
1139 "random\0"
1140 "urandom\0"
85614d66
TG
1141 "tty\0"
1142 "net/tun\0";
88213476
LP
1143
1144 const char *d;
e58a1277 1145 int r = 0;
7fd1b19b 1146 _cleanup_umask_ mode_t u;
a258bf26
LP
1147
1148 assert(dest);
124640f1
LP
1149
1150 u = umask(0000);
88213476 1151
03cfe0d5
LP
1152 /* Create /dev/net, so that we can create /dev/net/tun in it */
1153 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1154 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1155
88213476 1156 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1157 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1158 struct stat st;
88213476 1159
7f112f50 1160 from = strappend("/dev/", d);
03cfe0d5 1161 to = prefix_root(dest, from);
88213476
LP
1162
1163 if (stat(from, &st) < 0) {
1164
4a62c710
MS
1165 if (errno != ENOENT)
1166 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1167
a258bf26 1168 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1169
03cfe0d5 1170 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1171 return -EIO;
a258bf26 1172
85614d66 1173 } else {
81f5049b
AC
1174 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1175 if (errno != EPERM)
1176 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1177
1178 /* Some systems abusively restrict mknod but
1179 * allow bind mounts. */
1180 r = touch(to);
1181 if (r < 0)
1182 return log_error_errno(r, "touch (%s) failed: %m", to);
1183 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1184 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1185 }
6278cf60 1186
03cfe0d5
LP
1187 r = userns_lchown(to, 0, 0);
1188 if (r < 0)
1189 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1190 }
88213476
LP
1191 }
1192
e58a1277
LP
1193 return r;
1194}
88213476 1195
03cfe0d5
LP
1196static int setup_pts(const char *dest) {
1197 _cleanup_free_ char *options = NULL;
1198 const char *p;
709f6e46 1199 int r;
03cfe0d5
LP
1200
1201#ifdef HAVE_SELINUX
1202 if (arg_selinux_apifs_context)
1203 (void) asprintf(&options,
3dce8915 1204 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1205 arg_uid_shift + TTY_GID,
1206 arg_selinux_apifs_context);
1207 else
1208#endif
1209 (void) asprintf(&options,
3dce8915 1210 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1211 arg_uid_shift + TTY_GID);
f2d88580 1212
03cfe0d5 1213 if (!options)
f2d88580
LP
1214 return log_oom();
1215
03cfe0d5 1216 /* Mount /dev/pts itself */
cc9fce65 1217 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1218 if (mkdir(p, 0755) < 0)
1219 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1220 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1221 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
709f6e46
MS
1222 r = userns_lchown(p, 0, 0);
1223 if (r < 0)
1224 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1225
1226 /* Create /dev/ptmx symlink */
1227 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1228 if (symlink("pts/ptmx", p) < 0)
1229 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1230 r = userns_lchown(p, 0, 0);
1231 if (r < 0)
1232 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1233
03cfe0d5
LP
1234 /* And fix /dev/pts/ptmx ownership */
1235 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1236 r = userns_lchown(p, 0, 0);
1237 if (r < 0)
1238 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1239
f2d88580
LP
1240 return 0;
1241}
1242
e58a1277 1243static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1244 _cleanup_umask_ mode_t u;
1245 const char *to;
e58a1277 1246 int r;
e58a1277
LP
1247
1248 assert(dest);
1249 assert(console);
1250
1251 u = umask(0000);
1252
03cfe0d5 1253 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1254 if (r < 0)
1255 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1256
a258bf26
LP
1257 /* We need to bind mount the right tty to /dev/console since
1258 * ptys can only exist on pts file systems. To have something
81f5049b 1259 * to bind mount things on we create a empty regular file. */
a258bf26 1260
03cfe0d5 1261 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1262 r = touch(to);
1263 if (r < 0)
1264 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1265
4543768d 1266 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1267 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1268
25ea79fe 1269 return 0;
e58a1277
LP
1270}
1271
1272static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1273 const char *from, *to;
7fd1b19b 1274 _cleanup_umask_ mode_t u;
d9603714 1275 int fd, r;
e58a1277 1276
e58a1277 1277 assert(kmsg_socket >= 0);
a258bf26 1278
e58a1277 1279 u = umask(0000);
a258bf26 1280
03cfe0d5 1281 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1282 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1283 * on the reading side behave very similar to /proc/kmsg,
1284 * their writing side behaves differently from /dev/kmsg in
1285 * that writing blocks when nothing is reading. In order to
1286 * avoid any problems with containers deadlocking due to this
1287 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1288 from = prefix_roota(dest, "/run/kmsg");
1289 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1290
4a62c710 1291 if (mkfifo(from, 0600) < 0)
03cfe0d5 1292 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1293 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1294 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1295
1296 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1297 if (fd < 0)
1298 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1299
e58a1277
LP
1300 /* Store away the fd in the socket, so that it stays open as
1301 * long as we run the child */
3ee897d6 1302 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1303 safe_close(fd);
e58a1277 1304
d9603714
DH
1305 if (r < 0)
1306 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1307
03cfe0d5
LP
1308 /* And now make the FIFO unavailable as /run/kmsg... */
1309 (void) unlink(from);
1310
25ea79fe 1311 return 0;
88213476
LP
1312}
1313
1c4baffc 1314static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1315 union in_addr_union *exposed = userdata;
1316
1317 assert(rtnl);
1318 assert(m);
1319 assert(exposed);
1320
7a8f6325 1321 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1322 return 0;
1323}
1324
3a74cea5 1325static int setup_hostname(void) {
3a74cea5 1326
eb91eb18
LP
1327 if (arg_share_system)
1328 return 0;
1329
605f81a8 1330 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1331 return -errno;
3a74cea5 1332
7027ff61 1333 return 0;
3a74cea5
LP
1334}
1335
57fb9fb5 1336static int setup_journal(const char *directory) {
4d680aee 1337 sd_id128_t machine_id, this_id;
03cfe0d5
LP
1338 _cleanup_free_ char *b = NULL, *d = NULL;
1339 const char *etc_machine_id, *p, *q;
27407a01 1340 char *id;
57fb9fb5
LP
1341 int r;
1342
df9a75e4
LP
1343 /* Don't link journals in ephemeral mode */
1344 if (arg_ephemeral)
1345 return 0;
1346
03cfe0d5 1347 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 1348
03cfe0d5 1349 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
1350 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1351 return 0;
f647962d 1352 else if (r < 0)
03cfe0d5 1353 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 1354
27407a01
ZJS
1355 id = strstrip(b);
1356 if (isempty(id) && arg_link_journal == LINK_AUTO)
1357 return 0;
57fb9fb5 1358
27407a01
ZJS
1359 /* Verify validity */
1360 r = sd_id128_from_string(id, &machine_id);
f647962d 1361 if (r < 0)
03cfe0d5 1362 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 1363
4d680aee 1364 r = sd_id128_get_machine(&this_id);
f647962d
MS
1365 if (r < 0)
1366 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1367
1368 if (sd_id128_equal(machine_id, this_id)) {
1369 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1370 "Host and machine ids are equal (%s): refusing to link journals", id);
1371 if (arg_link_journal == LINK_AUTO)
1372 return 0;
df9a75e4 1373 return -EEXIST;
4d680aee
ZJS
1374 }
1375
1376 if (arg_link_journal == LINK_NO)
1377 return 0;
1378
03cfe0d5
LP
1379 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1380 if (r < 0)
1381 return log_error_errno(r, "Failed to create /var: %m");
1382
1383 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1384 if (r < 0)
1385 return log_error_errno(r, "Failed to create /var/log: %m");
1386
1387 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1388 if (r < 0)
1389 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1390
1391 p = strjoina("/var/log/journal/", id);
1392 q = prefix_roota(directory, p);
27407a01 1393
e26d6ce5 1394 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
1395 if (arg_link_journal != LINK_AUTO) {
1396 log_error("%s: already a mount point, refusing to use for journal", p);
1397 return -EEXIST;
1398 }
1399
1400 return 0;
57fb9fb5
LP
1401 }
1402
e26d6ce5 1403 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 1404 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1405 log_error("%s: already a mount point, refusing to use for journal", q);
1406 return -EEXIST;
57fb9fb5
LP
1407 }
1408
27407a01 1409 return 0;
57fb9fb5
LP
1410 }
1411
1412 r = readlink_and_make_absolute(p, &d);
1413 if (r >= 0) {
1414 if ((arg_link_journal == LINK_GUEST ||
1415 arg_link_journal == LINK_AUTO) &&
1416 path_equal(d, q)) {
1417
03cfe0d5 1418 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1419 if (r < 0)
709f6e46 1420 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1421 return 0;
57fb9fb5
LP
1422 }
1423
4a62c710
MS
1424 if (unlink(p) < 0)
1425 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1426 } else if (r == -EINVAL) {
1427
1428 if (arg_link_journal == LINK_GUEST &&
1429 rmdir(p) < 0) {
1430
27407a01
ZJS
1431 if (errno == ENOTDIR) {
1432 log_error("%s already exists and is neither a symlink nor a directory", p);
1433 return r;
4314d33f
MS
1434 } else
1435 return log_error_errno(errno, "Failed to remove %s: %m", p);
57fb9fb5 1436 }
4314d33f
MS
1437 } else if (r != -ENOENT)
1438 return log_error_errno(r, "readlink(%s) failed: %m", p);
57fb9fb5
LP
1439
1440 if (arg_link_journal == LINK_GUEST) {
1441
1442 if (symlink(q, p) < 0) {
574edc90 1443 if (arg_link_journal_try) {
56f64d95 1444 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90 1445 return 0;
4314d33f
MS
1446 } else
1447 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
57fb9fb5
LP
1448 }
1449
03cfe0d5 1450 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1451 if (r < 0)
709f6e46 1452 log_warning_errno(r, "Failed to create directory %s: %m", q);
27407a01 1453 return 0;
57fb9fb5
LP
1454 }
1455
1456 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1457 /* don't create parents here -- if the host doesn't have
1458 * permanent journal set up, don't force it here */
1459 r = mkdir(p, 0755);
57fb9fb5 1460 if (r < 0) {
574edc90 1461 if (arg_link_journal_try) {
56f64d95 1462 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90 1463 return 0;
4314d33f
MS
1464 } else
1465 return log_error_errno(errno, "Failed to create %s: %m", p);
57fb9fb5
LP
1466 }
1467
27407a01
ZJS
1468 } else if (access(p, F_OK) < 0)
1469 return 0;
57fb9fb5 1470
cdb2b9d0
LP
1471 if (dir_is_empty(q) == 0)
1472 log_warning("%s is not empty, proceeding anyway.", q);
1473
03cfe0d5 1474 r = userns_mkdir(directory, p, 0755, 0, 0);
709f6e46
MS
1475 if (r < 0)
1476 return log_error_errno(r, "Failed to create %s: %m", q);
57fb9fb5 1477
4543768d 1478 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1479 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1480
27407a01 1481 return 0;
57fb9fb5
LP
1482}
1483
88213476 1484static int drop_capabilities(void) {
a103496c 1485 return capability_bounding_set_drop(arg_retain, false);
88213476
LP
1486}
1487
db999e0f
LP
1488static int reset_audit_loginuid(void) {
1489 _cleanup_free_ char *p = NULL;
1490 int r;
1491
1492 if (arg_share_system)
1493 return 0;
1494
1495 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1496 if (r == -ENOENT)
db999e0f 1497 return 0;
f647962d
MS
1498 if (r < 0)
1499 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1500
1501 /* Already reset? */
1502 if (streq(p, "4294967295"))
1503 return 0;
1504
ad118bda 1505 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1506 if (r < 0) {
10a87006
LP
1507 log_error_errno(r,
1508 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1509 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1510 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1511 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1512 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1513
db999e0f 1514 sleep(5);
77b6e194 1515 }
db999e0f
LP
1516
1517 return 0;
77b6e194
LP
1518}
1519
/* Installs a seccomp filter with a default action of "allow" that
 *
 *   - makes each system call in the table below fail with EPERM, unless the
 *     capability listed next to it is part of arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op. Returns 0 on success (including the
 * case where loading the filter fails with -EINVAL), a negative errno-style
 * value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx seccomp;
        unsigned k;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* Calls guarded by a retained capability stay available */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit hookup
         * fails when run inside one. We don't care and simply turn off
         * creation of audit sockets.
         *
         * The rule below makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail
         * with EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r == -EINVAL) {
                /* Not fatal: carry on without the filter */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
1614
785890ac
LP
1615static int setup_propagate(const char *root) {
1616 const char *p, *q;
709f6e46 1617 int r;
785890ac
LP
1618
1619 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1620 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1621 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1622 (void) mkdir_p(p, 0600);
1623
709f6e46
MS
1624 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1625 if (r < 0)
1626 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1627
709f6e46
MS
1628 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1629 if (r < 0)
1630 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1631
709f6e46
MS
1632 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1633 if (r < 0)
1634 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1635
03cfe0d5 1636 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1637 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1638 return log_error_errno(errno, "Failed to install propagation bind mount.");
1639
1640 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1641 return log_error_errno(errno, "Failed to make propagation mount read-only");
1642
1643 return 0;
1644}
1645
1b9e5b12
LP
1646static int setup_image(char **device_path, int *loop_nr) {
1647 struct loop_info64 info = {
1648 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1649 };
1650 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1651 _cleanup_free_ char* loopdev = NULL;
1652 struct stat st;
1653 int r, nr;
1654
1655 assert(device_path);
1656 assert(loop_nr);
ec16945e 1657 assert(arg_image);
1b9e5b12
LP
1658
1659 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1660 if (fd < 0)
1661 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1662
4a62c710
MS
1663 if (fstat(fd, &st) < 0)
1664 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1665
1666 if (S_ISBLK(st.st_mode)) {
1667 char *p;
1668
1669 p = strdup(arg_image);
1670 if (!p)
1671 return log_oom();
1672
1673 *device_path = p;
1674
1675 *loop_nr = -1;
1676
1677 r = fd;
1678 fd = -1;
1679
1680 return r;
1681 }
1682
1683 if (!S_ISREG(st.st_mode)) {
070edd97 1684 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1685 return -EINVAL;
1686 }
1687
1688 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1689 if (control < 0)
1690 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1691
1692 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1693 if (nr < 0)
1694 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1695
1696 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1697 return log_oom();
1698
1699 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1700 if (loop < 0)
1701 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1702
4a62c710
MS
1703 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1704 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1705
1706 if (arg_read_only)
1707 info.lo_flags |= LO_FLAGS_READ_ONLY;
1708
4a62c710
MS
1709 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1710 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1711
1712 *device_path = loopdev;
1713 loopdev = NULL;
1714
1715 *loop_nr = nr;
1716
1717 r = loop;
1718 loop = -1;
1719
1720 return r;
1721}
1722
ada4799a
LP
/* Explanation appended to error messages about images whose partition table
 * cannot be used. */
#define PARTITION_TABLE_BLURB                                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"                    \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                        \
        "to be bootable with systemd-nspawn."
1729
1b9e5b12
LP
1730static int dissect_image(
1731 int fd,
727fd4fd
LP
1732 char **root_device, bool *root_device_rw,
1733 char **home_device, bool *home_device_rw,
1734 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
1735 bool *secondary) {
1736
1737#ifdef HAVE_BLKID
01dc33ce
ZJS
1738 int home_nr = -1, srv_nr = -1;
1739#ifdef GPT_ROOT_NATIVE
1740 int root_nr = -1;
1741#endif
1742#ifdef GPT_ROOT_SECONDARY
1743 int secondary_root_nr = -1;
1744#endif
f6c51a81 1745 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
1746 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1747 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1748 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1749 _cleanup_udev_unref_ struct udev *udev = NULL;
1750 struct udev_list_entry *first, *item;
f6c51a81 1751 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1752 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1753 const char *pttype = NULL;
1754 blkid_partlist pl;
1755 struct stat st;
c09ef2e4 1756 unsigned i;
1b9e5b12
LP
1757 int r;
1758
1759 assert(fd >= 0);
1760 assert(root_device);
1761 assert(home_device);
1762 assert(srv_device);
1763 assert(secondary);
ec16945e 1764 assert(arg_image);
1b9e5b12
LP
1765
1766 b = blkid_new_probe();
1767 if (!b)
1768 return log_oom();
1769
1770 errno = 0;
1771 r = blkid_probe_set_device(b, fd, 0, 0);
1772 if (r != 0) {
1773 if (errno == 0)
1774 return log_oom();
1775
e1427b13 1776 return log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1777 }
1778
1779 blkid_probe_enable_partitions(b, 1);
1780 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1781
1782 errno = 0;
1783 r = blkid_do_safeprobe(b);
1784 if (r == -2 || r == 1) {
ada4799a
LP
1785 log_error("Failed to identify any partition table on\n"
1786 " %s\n"
1787 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1788 return -EINVAL;
1789 } else if (r != 0) {
1790 if (errno == 0)
1791 errno = EIO;
e1427b13 1792 return log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1793 }
1794
48861960 1795 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1796
1797 is_gpt = streq_ptr(pttype, "gpt");
1798 is_mbr = streq_ptr(pttype, "dos");
1799
1800 if (!is_gpt && !is_mbr) {
1801 log_error("No GPT or MBR partition table discovered on\n"
1802 " %s\n"
1803 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1804 return -EINVAL;
1805 }
1806
1807 errno = 0;
1808 pl = blkid_probe_get_partitions(b);
1809 if (!pl) {
1810 if (errno == 0)
1811 return log_oom();
1812
1813 log_error("Failed to list partitions of %s", arg_image);
1814 return -errno;
1815 }
1816
1817 udev = udev_new();
1818 if (!udev)
1819 return log_oom();
1820
4a62c710
MS
1821 if (fstat(fd, &st) < 0)
1822 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1823
c09ef2e4
LP
1824 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1825 if (!d)
1b9e5b12
LP
1826 return log_oom();
1827
c09ef2e4
LP
1828 for (i = 0;; i++) {
1829 int n, m;
1b9e5b12 1830
c09ef2e4
LP
1831 if (i >= 10) {
1832 log_error("Kernel partitions never appeared.");
1833 return -ENXIO;
1834 }
1835
1836 e = udev_enumerate_new(udev);
1837 if (!e)
1838 return log_oom();
1839
1840 r = udev_enumerate_add_match_parent(e, d);
1841 if (r < 0)
1842 return log_oom();
1843
1844 r = udev_enumerate_scan_devices(e);
1845 if (r < 0)
1846 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1847
1848 /* Count the partitions enumerated by the kernel */
1849 n = 0;
1850 first = udev_enumerate_get_list_entry(e);
1851 udev_list_entry_foreach(item, first)
1852 n++;
1853
1854 /* Count the partitions enumerated by blkid */
1855 m = blkid_partlist_numof_partitions(pl);
1856 if (n == m + 1)
1857 break;
1858 if (n > m + 1) {
1859 log_error("blkid and kernel partition list do not match.");
1860 return -EIO;
1861 }
1862 if (n < m + 1) {
1863 unsigned j;
1864
1865 /* The kernel has probed fewer partitions than
1866 * blkid? Maybe the kernel prober is still
1867 * running or it got EBUSY because udev
1868 * already opened the device. Let's reprobe
1869 * the device, which is a synchronous call
1870 * that waits until probing is complete. */
1871
1872 for (j = 0; j < 20; j++) {
1873
1874 r = ioctl(fd, BLKRRPART, 0);
1875 if (r < 0)
1876 r = -errno;
1877 if (r >= 0 || r != -EBUSY)
1878 break;
1879
1880 /* If something else has the device
1881 * open, such as an udev rule, the
1882 * ioctl will return EBUSY. Since
1883 * there's no way to wait until it
1884 * isn't busy anymore, let's just wait
1885 * a bit, and try again.
1886 *
1887 * This is really something they
1888 * should fix in the kernel! */
1889
1890 usleep(50 * USEC_PER_MSEC);
1891 }
1892
1893 if (r < 0)
1894 return log_error_errno(r, "Failed to reread partition table: %m");
1895 }
1896
1897 e = udev_enumerate_unref(e);
1898 }
1b9e5b12
LP
1899
1900 first = udev_enumerate_get_list_entry(e);
1901 udev_list_entry_foreach(item, first) {
1902 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1903 const char *node;
727fd4fd 1904 unsigned long long flags;
1b9e5b12
LP
1905 blkid_partition pp;
1906 dev_t qn;
1907 int nr;
1908
1909 errno = 0;
1910 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1911 if (!q) {
1912 if (!errno)
1913 errno = ENOMEM;
1914
e1427b13 1915 return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1916 }
1917
1918 qn = udev_device_get_devnum(q);
1919 if (major(qn) == 0)
1920 continue;
1921
1922 if (st.st_rdev == qn)
1923 continue;
1924
1925 node = udev_device_get_devnode(q);
1926 if (!node)
1927 continue;
1928
1929 pp = blkid_partlist_devno_to_partition(pl, qn);
1930 if (!pp)
1931 continue;
1932
727fd4fd 1933 flags = blkid_partition_get_flags(pp);
727fd4fd 1934
1b9e5b12
LP
1935 nr = blkid_partition_get_partno(pp);
1936 if (nr < 0)
1937 continue;
1938
ada4799a
LP
1939 if (is_gpt) {
1940 sd_id128_t type_id;
1941 const char *stype;
1b9e5b12 1942
f6c51a81
LP
1943 if (flags & GPT_FLAG_NO_AUTO)
1944 continue;
1945
ada4799a
LP
1946 stype = blkid_partition_get_type_string(pp);
1947 if (!stype)
1948 continue;
1b9e5b12 1949
ada4799a 1950 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
1951 continue;
1952
ada4799a 1953 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 1954
ada4799a
LP
1955 if (home && nr >= home_nr)
1956 continue;
1b9e5b12 1957
ada4799a
LP
1958 home_nr = nr;
1959 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 1960
ada4799a
LP
1961 r = free_and_strdup(&home, node);
1962 if (r < 0)
1963 return log_oom();
727fd4fd 1964
ada4799a
LP
1965 } else if (sd_id128_equal(type_id, GPT_SRV)) {
1966
1967 if (srv && nr >= srv_nr)
1968 continue;
1969
1970 srv_nr = nr;
1971 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
1972
1973 r = free_and_strdup(&srv, node);
1974 if (r < 0)
1975 return log_oom();
1976 }
1b9e5b12 1977#ifdef GPT_ROOT_NATIVE
ada4799a 1978 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 1979
ada4799a
LP
1980 if (root && nr >= root_nr)
1981 continue;
1b9e5b12 1982
ada4799a
LP
1983 root_nr = nr;
1984 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 1985
ada4799a
LP
1986 r = free_and_strdup(&root, node);
1987 if (r < 0)
1988 return log_oom();
1989 }
1b9e5b12
LP
1990#endif
1991#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
1992 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
1993
1994 if (secondary_root && nr >= secondary_root_nr)
1995 continue;
1996
1997 secondary_root_nr = nr;
1998 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
1999
2000 r = free_and_strdup(&secondary_root, node);
2001 if (r < 0)
2002 return log_oom();
2003 }
2004#endif
f6c51a81
LP
2005 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
2006
2007 if (generic)
2008 multiple_generic = true;
2009 else {
2010 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
2011
2012 r = free_and_strdup(&generic, node);
2013 if (r < 0)
2014 return log_oom();
2015 }
2016 }
ada4799a
LP
2017
2018 } else if (is_mbr) {
2019 int type;
1b9e5b12 2020
f6c51a81
LP
2021 if (flags != 0x80) /* Bootable flag */
2022 continue;
2023
ada4799a
LP
2024 type = blkid_partition_get_type(pp);
2025 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2026 continue;
2027
f6c51a81
LP
2028 if (generic)
2029 multiple_generic = true;
2030 else {
2031 generic_rw = true;
727fd4fd 2032
f6c51a81
LP
2033 r = free_and_strdup(&root, node);
2034 if (r < 0)
2035 return log_oom();
2036 }
1b9e5b12 2037 }
1b9e5b12
LP
2038 }
2039
1b9e5b12
LP
2040 if (root) {
2041 *root_device = root;
2042 root = NULL;
727fd4fd
LP
2043
2044 *root_device_rw = root_rw;
1b9e5b12
LP
2045 *secondary = false;
2046 } else if (secondary_root) {
2047 *root_device = secondary_root;
2048 secondary_root = NULL;
727fd4fd
LP
2049
2050 *root_device_rw = secondary_root_rw;
1b9e5b12 2051 *secondary = true;
f6c51a81
LP
2052 } else if (generic) {
2053
2054 /* There were no partitions with precise meanings
2055 * around, but we found generic partitions. In this
2056 * case, if there's only one, we can go ahead and boot
2057 * it, otherwise we bail out, because we really cannot
2058 * make any sense of it. */
2059
2060 if (multiple_generic) {
2061 log_error("Identified multiple bootable Linux partitions on\n"
2062 " %s\n"
2063 PARTITION_TABLE_BLURB, arg_image);
2064 return -EINVAL;
2065 }
2066
2067 *root_device = generic;
2068 generic = NULL;
2069
2070 *root_device_rw = generic_rw;
2071 *secondary = false;
2072 } else {
2073 log_error("Failed to identify root partition in disk image\n"
2074 " %s\n"
2075 PARTITION_TABLE_BLURB, arg_image);
2076 return -EINVAL;
1b9e5b12
LP
2077 }
2078
2079 if (home) {
2080 *home_device = home;
2081 home = NULL;
727fd4fd
LP
2082
2083 *home_device_rw = home_rw;
1b9e5b12
LP
2084 }
2085
2086 if (srv) {
2087 *srv_device = srv;
2088 srv = NULL;
727fd4fd
LP
2089
2090 *srv_device_rw = srv_rw;
1b9e5b12
LP
2091 }
2092
2093 return 0;
2094#else
2095 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2096 return -EOPNOTSUPP;
1b9e5b12
LP
2097#endif
2098}
2099
/* Probes the file system type of the block device node "what" with libblkid and mounts it
 * on "where" (with "directory" appended when one is given).
 *
 * The mount is read-only unless "rw" is true and arg_read_only is unset. LUKS containers
 * are refused. Returns 0 on success and a negative errno-style code on failure; without
 * blkid support the function always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *fstype, *target;
        unsigned long mount_flags = MS_NODEV;
        int k;

        assert(what);
        assert(where);

        /* A global read-only request overrides whatever the partition flags said */
        if (arg_read_only)
                rw = false;

        target = where;
        if (directory)
                target = strjoina(where, directory);

        /* libblkid does not reliably set errno, hence clear it first and treat "still
         * zero" as an allocation failure */
        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        k = blkid_do_safeprobe(probe);
        switch (k) {

        case 0:
                break;

        case -1:
        case 1:
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;

        default:
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (!rw)
                mount_flags |= MS_RDONLY;

        if (mount(what, target, fstype, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2161
727fd4fd
LP
2162static int mount_devices(
2163 const char *where,
2164 const char *root_device, bool root_device_rw,
2165 const char *home_device, bool home_device_rw,
2166 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2167 int r;
2168
2169 assert(where);
2170
2171 if (root_device) {
727fd4fd 2172 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2173 if (r < 0)
2174 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2175 }
2176
2177 if (home_device) {
727fd4fd 2178 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2179 if (r < 0)
2180 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2181 }
2182
2183 if (srv_device) {
727fd4fd 2184 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2185 if (r < 0)
2186 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2187 }
2188
2189 return 0;
2190}
2191
2192static void loop_remove(int nr, int *image_fd) {
2193 _cleanup_close_ int control = -1;
e8c8ddcc 2194 int r;
1b9e5b12
LP
2195
2196 if (nr < 0)
2197 return;
2198
2199 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2200 r = ioctl(*image_fd, LOOP_CLR_FD);
2201 if (r < 0)
5e4074aa 2202 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2203 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2204 }
2205
2206 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2207 if (control < 0) {
56f64d95 2208 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2209 return;
e8c8ddcc 2210 }
1b9e5b12 2211
e8c8ddcc
TG
2212 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2213 if (r < 0)
5e4074aa 2214 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2215}
2216
113cea80 2217/*
6d416b9c
LS
2218 * Return values:
2219 * < 0 : wait_for_terminate() failed to get the state of the
2220 * container, the container was terminated by a signal, or
2221 * failed for an unknown reason. No change is made to the
2222 * container argument.
2223 * > 0 : The program executed in the container terminated with an
2224 * error. The exit code of the program executed in the
919699ec
LP
2225 * container is returned. The container argument has been set
2226 * to CONTAINER_TERMINATED.
6d416b9c
LS
2227 * 0 : The container is being rebooted, has been shut down or exited
2228 * successfully. The container argument has been set to either
2229 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2230 *
6d416b9c
LS
2231 * That is, success is indicated by a return value of zero, and an
2232 * error is indicated by a non-zero value.
113cea80
DH
2233 */
2234static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2235 siginfo_t status;
919699ec 2236 int r;
113cea80
DH
2237
2238 r = wait_for_terminate(pid, &status);
f647962d
MS
2239 if (r < 0)
2240 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2241
2242 switch (status.si_code) {
fddbb89c 2243
113cea80 2244 case CLD_EXITED:
919699ec
LP
2245 if (status.si_status == 0) {
2246 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2247
fddbb89c 2248 } else
919699ec 2249 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2250
919699ec
LP
2251 *container = CONTAINER_TERMINATED;
2252 return status.si_status;
113cea80
DH
2253
2254 case CLD_KILLED:
2255 if (status.si_status == SIGINT) {
113cea80 2256
919699ec 2257 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2258 *container = CONTAINER_TERMINATED;
919699ec
LP
2259 return 0;
2260
113cea80 2261 } else if (status.si_status == SIGHUP) {
113cea80 2262
919699ec 2263 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2264 *container = CONTAINER_REBOOTED;
919699ec 2265 return 0;
113cea80 2266 }
919699ec 2267
113cea80
DH
2268 /* CLD_KILLED fallthrough */
2269
2270 case CLD_DUMPED:
fddbb89c 2271 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2272 return -EIO;
113cea80
DH
2273
2274 default:
fddbb89c 2275 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2276 return -EIO;
113cea80
DH
2277 }
2278
2279 return r;
2280}
2281
023fb90b
LP
2282static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2283 pid_t pid;
2284
4a0b58c4 2285 pid = PTR_TO_PID(userdata);
023fb90b 2286 if (pid > 0) {
c6c8f6e2 2287 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2288 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2289 sd_event_source_set_userdata(s, NULL);
2290 return 0;
2291 }
2292 }
2293
2294 sd_event_exit(sd_event_source_get_event(s), 0);
2295 return 0;
2296}
2297
ec16945e 2298static int determine_names(void) {
1b9cebf6 2299 int r;
ec16945e 2300
c1521918
LP
2301 if (arg_template && !arg_directory && arg_machine) {
2302
2303 /* If --template= was specified then we should not
2304 * search for a machine, but instead create a new one
2305 * in /var/lib/machine. */
2306
2307 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2308 if (!arg_directory)
2309 return log_oom();
2310 }
2311
ec16945e 2312 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2313 if (arg_machine) {
2314 _cleanup_(image_unrefp) Image *i = NULL;
2315
2316 r = image_find(arg_machine, &i);
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2319 else if (r == 0) {
2320 log_error("No image for machine '%s': %m", arg_machine);
2321 return -ENOENT;
2322 }
2323
aceac2f0 2324 if (i->type == IMAGE_RAW)
0f03c2a4 2325 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2326 else
0f03c2a4 2327 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2328 if (r < 0)
2329 return log_error_errno(r, "Invalid image directory: %m");
2330
aee327b8
LP
2331 if (!arg_ephemeral)
2332 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2333 } else
ec16945e
LP
2334 arg_directory = get_current_dir_name();
2335
1b9cebf6
LP
2336 if (!arg_directory && !arg_machine) {
2337 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2338 return -EINVAL;
2339 }
2340 }
2341
2342 if (!arg_machine) {
b9ba4dab
LP
2343 if (arg_directory && path_equal(arg_directory, "/"))
2344 arg_machine = gethostname_malloc();
2345 else
2346 arg_machine = strdup(basename(arg_image ?: arg_directory));
2347
ec16945e
LP
2348 if (!arg_machine)
2349 return log_oom();
2350
ae691c1d 2351 hostname_cleanup(arg_machine);
ec16945e
LP
2352 if (!machine_name_is_valid(arg_machine)) {
2353 log_error("Failed to determine machine name automatically, please use -M.");
2354 return -EINVAL;
2355 }
b9ba4dab
LP
2356
2357 if (arg_ephemeral) {
2358 char *b;
2359
2360 /* Add a random suffix when this is an
2361 * ephemeral machine, so that we can run many
2362 * instances at once without manually having
2363 * to specify -M each time. */
2364
2365 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2366 return log_oom();
2367
2368 free(arg_machine);
2369 arg_machine = b;
2370 }
ec16945e
LP
2371 }
2372
2373 return 0;
2374}
2375
03cfe0d5 2376static int determine_uid_shift(const char *directory) {
6dac160c
LP
2377 int r;
2378
03cfe0d5
LP
2379 if (!arg_userns) {
2380 arg_uid_shift = 0;
6dac160c 2381 return 0;
03cfe0d5 2382 }
6dac160c
LP
2383
2384 if (arg_uid_shift == UID_INVALID) {
2385 struct stat st;
2386
03cfe0d5 2387 r = stat(directory, &st);
6dac160c 2388 if (r < 0)
03cfe0d5 2389 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2390
2391 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2392
2393 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2394 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2395 return -EINVAL;
2396 }
2397
2398 arg_uid_range = UINT32_C(0x10000);
2399 }
2400
2401 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2402 log_error("UID base too high for UID range.");
2403 return -EINVAL;
2404 }
2405
2406 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2407 return 0;
2408}
2409
03cfe0d5
LP
2410static int inner_child(
2411 Barrier *barrier,
2412 const char *directory,
2413 bool secondary,
2414 int kmsg_socket,
2415 int rtnl_socket,
f757855e 2416 FDSet *fds) {
69c79d3c 2417
03cfe0d5 2418 _cleanup_free_ char *home = NULL;
6aadfa4c 2419 unsigned n_env = 1;
03cfe0d5
LP
2420 const char *envp[] = {
2421 "PATH=" DEFAULT_PATH_SPLIT_USR,
6aadfa4c 2422 NULL, /* container */
03cfe0d5
LP
2423 NULL, /* TERM */
2424 NULL, /* HOME */
2425 NULL, /* USER */
2426 NULL, /* LOGNAME */
2427 NULL, /* container_uuid */
2428 NULL, /* LISTEN_FDS */
2429 NULL, /* LISTEN_PID */
2430 NULL
2431 };
88213476 2432
2371271c 2433 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2434 int r;
88213476 2435
03cfe0d5
LP
2436 assert(barrier);
2437 assert(directory);
2438 assert(kmsg_socket >= 0);
88213476 2439
efdb0237
LP
2440 cg_unified_flush();
2441
03cfe0d5
LP
2442 if (arg_userns) {
2443 /* Tell the parent, that it now can write the UID map. */
2444 (void) barrier_place(barrier); /* #1 */
7027ff61 2445
03cfe0d5
LP
2446 /* Wait until the parent wrote the UID map */
2447 if (!barrier_place_and_sync(barrier)) { /* #2 */
2448 log_error("Parent died too early");
2449 return -ESRCH;
2450 }
88213476
LP
2451 }
2452
d1678248 2453 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2454 if (r < 0)
2455 return r;
2456
d8fc6a00
LP
2457 r = mount_sysfs(NULL);
2458 if (r < 0)
2459 return r;
2460
03cfe0d5
LP
2461 /* Wait until we are cgroup-ified, so that we
2462 * can mount the right cgroup path writable */
2463 if (!barrier_place_and_sync(barrier)) { /* #3 */
2464 log_error("Parent died too early");
2465 return -ESRCH;
88213476
LP
2466 }
2467
e83bebef 2468 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
2469 if (r < 0)
2470 return r;
ec16945e 2471
03cfe0d5
LP
2472 r = reset_uid_gid();
2473 if (r < 0)
2474 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2475
03cfe0d5
LP
2476 r = setup_boot_id(NULL);
2477 if (r < 0)
2478 return r;
ec16945e 2479
03cfe0d5
LP
2480 r = setup_kmsg(NULL, kmsg_socket);
2481 if (r < 0)
2482 return r;
2483 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2484
03cfe0d5 2485 umask(0022);
30535c16 2486
03cfe0d5
LP
2487 if (setsid() < 0)
2488 return log_error_errno(errno, "setsid() failed: %m");
2489
2490 if (arg_private_network)
2491 loopback_setup();
2492
7a8f6325
LP
2493 if (arg_expose_ports) {
2494 r = expose_port_send_rtnl(rtnl_socket);
2495 if (r < 0)
2496 return r;
2497 rtnl_socket = safe_close(rtnl_socket);
2498 }
03cfe0d5 2499
709f6e46
MS
2500 r = drop_capabilities();
2501 if (r < 0)
2502 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2503
2504 setup_hostname();
2505
050f7277 2506 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2507 if (personality(arg_personality) < 0)
2508 return log_error_errno(errno, "personality() failed: %m");
2509 } else if (secondary) {
2510 if (personality(PER_LINUX32) < 0)
2511 return log_error_errno(errno, "personality() failed: %m");
2512 }
2513
2514#ifdef HAVE_SELINUX
2515 if (arg_selinux_context)
2516 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2517 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2518#endif
2519
ee645080 2520 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2521 if (r < 0)
2522 return r;
2523
6aadfa4c
ILG
2524 /* LXC sets container=lxc, so follow the scheme here */
2525 envp[n_env++] = strjoina("container=", arg_container_service_name);
2526
03cfe0d5
LP
2527 envp[n_env] = strv_find_prefix(environ, "TERM=");
2528 if (envp[n_env])
2529 n_env ++;
2530
2531 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2532 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2533 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2534 return log_oom();
2535
2536 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2537 char as_uuid[37];
2538
2539 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2540 return log_oom();
2541 }
2542
2543 if (fdset_size(fds) > 0) {
2544 r = fdset_cloexec(fds, false);
2545 if (r < 0)
2546 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2547
2548 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2549 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2550 return log_oom();
2551 }
2552
2371271c
TG
2553 env_use = strv_env_merge(2, envp, arg_setenv);
2554 if (!env_use)
2555 return log_oom();
03cfe0d5
LP
2556
2557 /* Let the parent know that we are ready and
2558 * wait until the parent is ready with the
2559 * setup, too... */
2560 if (!barrier_place_and_sync(barrier)) { /* #4 */
2561 log_error("Parent died too early");
2562 return -ESRCH;
2563 }
2564
2565 /* Now, explicitly close the log, so that we
2566 * then can close all remaining fds. Closing
2567 * the log explicitly first has the benefit
2568 * that the logging subsystem knows about it,
2569 * and is thus ready to be reopened should we
2570 * need it again. Note that the other fds
2571 * closed here are at least the locking and
2572 * barrier fds. */
2573 log_close();
2574 (void) fdset_close_others(fds);
2575
2576 if (arg_boot) {
2577 char **a;
2578 size_t m;
2579
2580 /* Automatically search for the init system */
2581
f757855e 2582 m = 1 + strv_length(arg_parameters);
03cfe0d5 2583 a = newa(char*, m + 1);
f757855e
LP
2584 if (strv_isempty(arg_parameters))
2585 a[1] = NULL;
2586 else
2587 memcpy(a + 1, arg_parameters, m * sizeof(char*));
03cfe0d5
LP
2588
2589 a[0] = (char*) "/usr/lib/systemd/systemd";
2590 execve(a[0], a, env_use);
2591
2592 a[0] = (char*) "/lib/systemd/systemd";
2593 execve(a[0], a, env_use);
2594
2595 a[0] = (char*) "/sbin/init";
2596 execve(a[0], a, env_use);
f757855e
LP
2597 } else if (!strv_isempty(arg_parameters))
2598 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2599 else {
f757855e 2600 chdir(home ?: "/root");
03cfe0d5
LP
2601 execle("/bin/bash", "-bash", NULL, env_use);
2602 execle("/bin/sh", "-sh", NULL, env_use);
2603 }
2604
35607a8d 2605 r = -errno;
03cfe0d5 2606 (void) log_open();
35607a8d 2607 return log_error_errno(r, "execv() failed: %m");
03cfe0d5
LP
2608}
2609
2610static int outer_child(
2611 Barrier *barrier,
2612 const char *directory,
2613 const char *console,
2614 const char *root_device, bool root_device_rw,
2615 const char *home_device, bool home_device_rw,
2616 const char *srv_device, bool srv_device_rw,
2617 bool interactive,
2618 bool secondary,
2619 int pid_socket,
2620 int kmsg_socket,
2621 int rtnl_socket,
825d5287 2622 int uid_shift_socket,
f757855e 2623 FDSet *fds) {
03cfe0d5
LP
2624
2625 pid_t pid;
2626 ssize_t l;
2627 int r;
2628
2629 assert(barrier);
2630 assert(directory);
2631 assert(console);
2632 assert(pid_socket >= 0);
2633 assert(kmsg_socket >= 0);
2634
efdb0237
LP
2635 cg_unified_flush();
2636
03cfe0d5
LP
2637 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2638 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2639
2640 if (interactive) {
2641 close_nointr(STDIN_FILENO);
2642 close_nointr(STDOUT_FILENO);
2643 close_nointr(STDERR_FILENO);
2644
2645 r = open_terminal(console, O_RDWR);
2646 if (r != STDIN_FILENO) {
2647 if (r >= 0) {
2648 safe_close(r);
2649 r = -EINVAL;
2650 }
2651
2652 return log_error_errno(r, "Failed to open console: %m");
2653 }
2654
2655 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2656 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2657 return log_error_errno(errno, "Failed to duplicate console: %m");
2658 }
2659
2660 r = reset_audit_loginuid();
2661 if (r < 0)
2662 return r;
2663
2664 /* Mark everything as slave, so that we still
2665 * receive mounts from the real root, but don't
2666 * propagate mounts to the real root. */
2667 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2668 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2669
2670 r = mount_devices(directory,
2671 root_device, root_device_rw,
2672 home_device, home_device_rw,
2673 srv_device, srv_device_rw);
2674 if (r < 0)
2675 return r;
2676
391567f4
LP
2677 r = determine_uid_shift(directory);
2678 if (r < 0)
2679 return r;
2680
825d5287
RM
2681 if (arg_userns) {
2682 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2683 if (l < 0)
2684 return log_error_errno(errno, "Failed to send UID shift: %m");
2685 if (l != sizeof(arg_uid_shift)) {
2686 log_error("Short write while sending UID shift.");
2687 return -EIO;
2688 }
2689 }
2690
03cfe0d5
LP
2691 /* Turn directory into bind mount */
2692 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2693 return log_error_errno(errno, "Failed to make bind mount: %m");
2694
e83bebef 2695 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2696 if (r < 0)
2697 return r;
2698
e83bebef 2699 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2700 if (r < 0)
2701 return r;
2702
03cfe0d5
LP
2703 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2704 if (r < 0)
2705 return r;
2706
03cfe0d5
LP
2707 if (arg_read_only) {
2708 r = bind_remount_recursive(directory, true);
2709 if (r < 0)
2710 return log_error_errno(r, "Failed to make tree read-only: %m");
2711 }
2712
d1678248 2713 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2714 if (r < 0)
2715 return r;
2716
07fa00f9
LP
2717 r = copy_devnodes(directory);
2718 if (r < 0)
03cfe0d5
LP
2719 return r;
2720
2721 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2722
07fa00f9
LP
2723 r = setup_pts(directory);
2724 if (r < 0)
03cfe0d5
LP
2725 return r;
2726
2727 r = setup_propagate(directory);
2728 if (r < 0)
2729 return r;
2730
2731 r = setup_dev_console(directory, console);
2732 if (r < 0)
2733 return r;
2734
2735 r = setup_seccomp();
2736 if (r < 0)
2737 return r;
2738
2739 r = setup_timezone(directory);
2740 if (r < 0)
2741 return r;
2742
2743 r = setup_resolv_conf(directory);
2744 if (r < 0)
2745 return r;
2746
2747 r = setup_journal(directory);
2748 if (r < 0)
2749 return r;
2750
e83bebef 2751 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2752 if (r < 0)
2753 return r;
2754
e83bebef 2755 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2756 if (r < 0)
2757 return r;
2758
2759 r = mount_move_root(directory);
2760 if (r < 0)
2761 return log_error_errno(r, "Failed to move root directory: %m");
2762
2763 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2764 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2765 (arg_private_network ? CLONE_NEWNET : 0) |
2766 (arg_userns ? CLONE_NEWUSER : 0),
2767 NULL);
2768 if (pid < 0)
2769 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2770 if (pid == 0) {
2771 pid_socket = safe_close(pid_socket);
825d5287 2772 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2773
2774 /* The inner child has all namespaces that are
2775 * requested, so that we all are owned by the user if
2776 * user namespaces are turned on. */
2777
f757855e 2778 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2779 if (r < 0)
2780 _exit(EXIT_FAILURE);
2781
2782 _exit(EXIT_SUCCESS);
2783 }
2784
2785 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2786 if (l < 0)
2787 return log_error_errno(errno, "Failed to send PID: %m");
2788 if (l != sizeof(pid)) {
2789 log_error("Short write while sending PID.");
2790 return -EIO;
2791 }
2792
2793 pid_socket = safe_close(pid_socket);
327e26d6
KN
2794 kmsg_socket = safe_close(kmsg_socket);
2795 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2796
2797 return 0;
2798}
2799
2800static int setup_uid_map(pid_t pid) {
2801 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2802 int r;
2803
2804 assert(pid > 1);
2805
2806 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2807 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2808 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2809 if (r < 0)
2810 return log_error_errno(r, "Failed to write UID map: %m");
2811
2812 /* We always assign the same UID and GID ranges */
2813 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2814 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2815 if (r < 0)
2816 return log_error_errno(r, "Failed to write GID map: %m");
2817
2818 return 0;
2819}
2820
f757855e
LP
2821static int load_settings(void) {
2822 _cleanup_(settings_freep) Settings *settings = NULL;
2823 _cleanup_fclose_ FILE *f = NULL;
2824 _cleanup_free_ char *p = NULL;
2825 const char *fn, *i;
2826 int r;
2827
2828 /* If all settings are masked, there's no point in looking for
2829 * the settings file */
2830 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2831 return 0;
2832
2833 fn = strjoina(arg_machine, ".nspawn");
2834
2835 /* We first look in the admin's directories in /etc and /run */
2836 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2837 _cleanup_free_ char *j = NULL;
2838
2839 j = strjoin(i, "/", fn, NULL);
2840 if (!j)
2841 return log_oom();
2842
2843 f = fopen(j, "re");
2844 if (f) {
2845 p = j;
2846 j = NULL;
2847
b938cb90 2848 /* By default, we trust configuration from /etc and /run */
f757855e
LP
2849 if (arg_settings_trusted < 0)
2850 arg_settings_trusted = true;
2851
2852 break;
2853 }
2854
2855 if (errno != ENOENT)
2856 return log_error_errno(errno, "Failed to open %s: %m", j);
2857 }
2858
2859 if (!f) {
2860 /* After that, let's look for a file next to the
2861 * actual image we shall boot. */
2862
2863 if (arg_image) {
2864 p = file_in_same_dir(arg_image, fn);
2865 if (!p)
2866 return log_oom();
2867 } else if (arg_directory) {
2868 p = file_in_same_dir(arg_directory, fn);
2869 if (!p)
2870 return log_oom();
2871 }
2872
2873 if (p) {
2874 f = fopen(p, "re");
2875 if (!f && errno != ENOENT)
2876 return log_error_errno(errno, "Failed to open %s: %m", p);
2877
b938cb90 2878 /* By default, we do not trust configuration from /var/lib/machines */
f757855e
LP
2879 if (arg_settings_trusted < 0)
2880 arg_settings_trusted = false;
2881 }
2882 }
2883
2884 if (!f)
2885 return 0;
2886
2887 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2888
2889 r = settings_load(f, p, &settings);
2890 if (r < 0)
2891 return r;
2892
2893 /* Copy over bits from the settings, unless they have been
2894 * explicitly masked by command line switches. */
2895
2896 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2897 settings->boot >= 0) {
2898 arg_boot = settings->boot;
2899
2900 strv_free(arg_parameters);
2901 arg_parameters = settings->parameters;
2902 settings->parameters = NULL;
2903 }
2904
2905 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2906 settings->environment) {
2907 strv_free(arg_setenv);
2908 arg_setenv = settings->environment;
2909 settings->environment = NULL;
2910 }
2911
2912 if ((arg_settings_mask & SETTING_USER) == 0 &&
2913 settings->user) {
2914 free(arg_user);
2915 arg_user = settings->user;
2916 settings->user = NULL;
2917 }
2918
2919 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2920 uint64_t plus;
f757855e 2921
0e265674
LP
2922 plus = settings->capability;
2923 if (settings_private_network(settings))
2924 plus |= (1ULL << CAP_NET_ADMIN);
2925
2926 if (!arg_settings_trusted && plus != 0) {
2927 if (settings->capability != 0)
2928 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2929 } else
2930 arg_retain |= plus;
f757855e
LP
2931
2932 arg_retain &= ~settings->drop_capability;
2933 }
2934
2935 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2936 settings->kill_signal > 0)
2937 arg_kill_signal = settings->kill_signal;
2938
2939 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2940 settings->personality != PERSONALITY_INVALID)
2941 arg_personality = settings->personality;
2942
2943 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2944 !sd_id128_is_null(settings->machine_id)) {
2945
2946 if (!arg_settings_trusted)
2947 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2948 else
2949 arg_uuid = settings->machine_id;
2950 }
2951
2952 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2953 settings->read_only >= 0)
2954 arg_read_only = settings->read_only;
2955
2956 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2957 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2958 arg_volatile_mode = settings->volatile_mode;
2959
2960 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2961 settings->n_custom_mounts > 0) {
2962
2963 if (!arg_settings_trusted)
2964 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2965 else {
2966 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2967 arg_custom_mounts = settings->custom_mounts;
2968 arg_n_custom_mounts = settings->n_custom_mounts;
2969
2970 settings->custom_mounts = NULL;
2971 settings->n_custom_mounts = 0;
2972 }
2973 }
2974
2975 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2976 (settings->private_network >= 0 ||
2977 settings->network_veth >= 0 ||
2978 settings->network_bridge ||
2979 settings->network_interfaces ||
2980 settings->network_macvlan ||
f6d6bad1
LP
2981 settings->network_ipvlan ||
2982 settings->network_veth_extra)) {
f757855e
LP
2983
2984 if (!arg_settings_trusted)
2985 log_warning("Ignoring network settings, file %s is not trusted.", p);
2986 else {
f6d6bad1 2987 arg_network_veth = settings_network_veth(settings);
0e265674
LP
2988 arg_private_network = settings_private_network(settings);
2989
f757855e
LP
2990 strv_free(arg_network_interfaces);
2991 arg_network_interfaces = settings->network_interfaces;
2992 settings->network_interfaces = NULL;
2993
2994 strv_free(arg_network_macvlan);
2995 arg_network_macvlan = settings->network_macvlan;
2996 settings->network_macvlan = NULL;
2997
2998 strv_free(arg_network_ipvlan);
2999 arg_network_ipvlan = settings->network_ipvlan;
3000 settings->network_ipvlan = NULL;
3001
f6d6bad1
LP
3002 strv_free(arg_network_veth_extra);
3003 arg_network_veth_extra = settings->network_veth_extra;
3004 settings->network_veth_extra = NULL;
3005
f757855e
LP
3006 free(arg_network_bridge);
3007 arg_network_bridge = settings->network_bridge;
3008 settings->network_bridge = NULL;
f757855e
LP
3009 }
3010 }
3011
3012 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
3013 settings->expose_ports) {
3014
3015 if (!arg_settings_trusted)
3016 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3017 else {
3018 expose_port_free_all(arg_expose_ports);
3019 arg_expose_ports = settings->expose_ports;
3020 settings->expose_ports = NULL;
3021 }
3022 }
3023
3024 return 0;
3025}
3026
03cfe0d5
LP
3027int main(int argc, char *argv[]) {
3028
3029 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3030 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3031 _cleanup_close_ int master = -1, image_fd = -1;
3032 _cleanup_fdset_free_ FDSet *fds = NULL;
3033 int r, n_fd_passed, loop_nr = -1;
3034 char veth_name[IFNAMSIZ];
3035 bool secondary = false, remove_subvol = false;
72c0a2c2 3036 sigset_t mask_chld;
03cfe0d5
LP
3037 pid_t pid = 0;
3038 int ret = EXIT_SUCCESS;
3039 union in_addr_union exposed = {};
3040 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3041 bool interactive;
3042
3043 log_parse_environment();
3044 log_open();
3045
3046 r = parse_argv(argc, argv);
3047 if (r <= 0)
3048 goto finish;
3049
03cfe0d5
LP
3050 if (geteuid() != 0) {
3051 log_error("Need to be root.");
3052 r = -EPERM;
3053 goto finish;
3054 }
f757855e
LP
3055 r = determine_names();
3056 if (r < 0)
3057 goto finish;
3058
3059 r = load_settings();
3060 if (r < 0)
3061 goto finish;
3062
3063 r = verify_arguments();
3064 if (r < 0)
3065 goto finish;
03cfe0d5
LP
3066
3067 n_fd_passed = sd_listen_fds(false);
3068 if (n_fd_passed > 0) {
3069 r = fdset_new_listen_fds(&fds, false);
3070 if (r < 0) {
3071 log_error_errno(r, "Failed to collect file descriptors: %m");
3072 goto finish;
3073 }
3074 }
3075
3076 if (arg_directory) {
3077 assert(!arg_image);
3078
3079 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3080 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3081 r = -EINVAL;
3082 goto finish;
3083 }
3084
3085 if (arg_ephemeral) {
3086 _cleanup_free_ char *np = NULL;
3087
3088 /* If the specified path is a mount point we
3089 * generate the new snapshot immediately
3090 * inside it under a random name. However if
3091 * the specified is not a mount point we
3092 * create the new snapshot in the parent
3093 * directory, just next to it. */
e26d6ce5 3094 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3095 if (r < 0) {
3096 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3097 goto finish;
3098 }
3099 if (r > 0)
770b5ce4 3100 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3101 else
770b5ce4 3102 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3103 if (r < 0) {
3104 log_error_errno(r, "Failed to generate name for snapshot: %m");
3105 goto finish;
3106 }
3107
3108 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3109 if (r < 0) {
3110 log_error_errno(r, "Failed to lock %s: %m", np);
3111 goto finish;
3112 }
3113
5bcd08db 3114 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3115 if (r < 0) {
3116 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3117 goto finish;
ec16945e
LP
3118 }
3119
3120 free(arg_directory);
3121 arg_directory = np;
8a16a7b4 3122 np = NULL;
ec16945e
LP
3123
3124 remove_subvol = true;
30535c16
LP
3125
3126 } else {
3127 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3128 if (r == -EBUSY) {
3129 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3130 goto finish;
3131 }
3132 if (r < 0) {
3133 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3134 return r;
3135 }
3136
3137 if (arg_template) {
5bcd08db 3138 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3139 if (r == -EEXIST) {
3140 if (!arg_quiet)
3141 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3142 } else if (r < 0) {
83521414 3143 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3144 goto finish;
3145 } else {
3146 if (!arg_quiet)
3147 log_info("Populated %s from template %s.", arg_directory, arg_template);
3148 }
3149 }
ec16945e
LP
3150 }
3151
1b9e5b12
LP
3152 if (arg_boot) {
3153 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3154 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3155 r = -EINVAL;
1b9e5b12
LP
3156 goto finish;
3157 }
3158 } else {
3159 const char *p;
3160
16fb773e
LP
3161 p = strjoina(arg_directory, "/usr/");
3162 if (laccess(p, F_OK) < 0) {
3163 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3164 r = -EINVAL;
1b9e5b12 3165 goto finish;
1b9e5b12
LP
3166 }
3167 }
ec16945e 3168
6b9132a9 3169 } else {
1b9e5b12 3170 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3171
ec16945e
LP
3172 assert(arg_image);
3173 assert(!arg_template);
3174
30535c16
LP
3175 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3176 if (r == -EBUSY) {
3177 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3178 goto finish;
3179 }
3180 if (r < 0) {
3181 r = log_error_errno(r, "Failed to create image lock: %m");
3182 goto finish;
3183 }
3184
1b9e5b12 3185 if (!mkdtemp(template)) {
56f64d95 3186 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3187 r = -errno;
6b9132a9 3188 goto finish;
1b9e5b12 3189 }
6b9132a9 3190
1b9e5b12
LP
3191 arg_directory = strdup(template);
3192 if (!arg_directory) {
3193 r = log_oom();
3194 goto finish;
6b9132a9 3195 }
88213476 3196
1b9e5b12
LP
3197 image_fd = setup_image(&device_path, &loop_nr);
3198 if (image_fd < 0) {
3199 r = image_fd;
842f3b0f
LP
3200 goto finish;
3201 }
1b9e5b12 3202
4d9f07b4
LP
3203 r = dissect_image(image_fd,
3204 &root_device, &root_device_rw,
3205 &home_device, &home_device_rw,
3206 &srv_device, &srv_device_rw,
3207 &secondary);
1b9e5b12
LP
3208 if (r < 0)
3209 goto finish;
842f3b0f 3210 }
842f3b0f 3211
5a8af538
LP
3212 r = custom_mounts_prepare();
3213 if (r < 0)
3214 goto finish;
3215
03cfe0d5
LP
3216 interactive =
3217 isatty(STDIN_FILENO) > 0 &&
3218 isatty(STDOUT_FILENO) > 0;
9c857b9d 3219
db7feb7e
LP
3220 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3221 if (master < 0) {
ec16945e 3222 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3223 goto finish;
3224 }
3225
611b312b
LP
3226 r = ptsname_malloc(master, &console);
3227 if (r < 0) {
3228 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3229 goto finish;
3230 }
3231
a258bf26 3232 if (unlockpt(master) < 0) {
ec16945e 3233 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3234 goto finish;
3235 }
3236
9c857b9d
LP
3237 if (!arg_quiet)
3238 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3239 arg_machine, arg_image ?: arg_directory);
3240
72c0a2c2 3241 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3242
023fb90b
LP
3243 assert_se(sigemptyset(&mask_chld) == 0);
3244 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3245
03cfe0d5
LP
3246 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3247 r = log_error_errno(errno, "Failed to become subreaper: %m");
3248 goto finish;
3249 }
3250
d87be9b0 3251 for (;;) {
97044145 3252 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
113cea80 3253 ContainerStatus container_status;
7566e267 3254 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 3255 static const struct sigaction sa = {
189d5bac 3256 .sa_handler = nop_signal_handler,
e866af3a
DH
3257 .sa_flags = SA_NOCLDSTOP,
3258 };
03cfe0d5
LP
3259 int ifi = 0;
3260 ssize_t l;
4afd3348 3261 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
dbb60d69 3262 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4afd3348 3263 _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
dbb60d69 3264 char last_char = 0;
e866af3a 3265
7566e267 3266 r = barrier_create(&barrier);
a2da110b 3267 if (r < 0) {
da927ba9 3268 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3269 goto finish;
3270 }
3271
4610de50 3272 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3273 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3274 goto finish;
3275 }
3276
4610de50 3277 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3278 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3279 goto finish;
3280 }
3281
4610de50 3282 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3283 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3284 goto finish;
3285 }
3286
825d5287 3287 if (arg_userns)
4610de50 3288 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3289 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3290 goto finish;
3291 }
3292
e866af3a
DH
3293 /* Child can be killed before execv(), so handle SIGCHLD
3294 * in order to interrupt parent's blocking calls and
3295 * give it a chance to call wait() and terminate. */
3296 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3297 if (r < 0) {
ec16945e 3298 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3299 goto finish;
3300 }
3301
e866af3a
DH
3302 r = sigaction(SIGCHLD, &sa, NULL);
3303 if (r < 0) {
ec16945e 3304 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3305 goto finish;
3306 }
3307
03cfe0d5 3308 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
3309 if (pid < 0) {
3310 if (errno == EINVAL)
ec16945e 3311 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3312 else
ec16945e 3313 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3314
d87be9b0
LP
3315 goto finish;
3316 }
a258bf26 3317
d87be9b0 3318 if (pid == 0) {
03cfe0d5 3319 /* The outer child only has a file system namespace. */
a2da110b
DH
3320 barrier_set_role(&barrier, BARRIER_CHILD);
3321
03e334a1 3322 master = safe_close(master);
a258bf26 3323
03e334a1 3324 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3325 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3326 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 3327 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3328
ce30c8dc
LP
3329 (void) reset_all_signal_handlers();
3330 (void) reset_signal_mask();
f5c1b9ee 3331
03cfe0d5
LP
3332 r = outer_child(&barrier,
3333 arg_directory,
3334 console,
3335 root_device, root_device_rw,
3336 home_device, home_device_rw,
3337 srv_device, srv_device_rw,
3338 interactive,
3339 secondary,
3340 pid_socket_pair[1],
3341 kmsg_socket_pair[1],
3342 rtnl_socket_pair[1],
825d5287 3343 uid_shift_socket_pair[1],
f757855e 3344 fds);
0cb9fbcd 3345 if (r < 0)
a2da110b 3346 _exit(EXIT_FAILURE);
d87be9b0 3347
03cfe0d5 3348 _exit(EXIT_SUCCESS);
da5b3bad 3349 }
88213476 3350
a2da110b 3351 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3352
2feceb5e 3353 fds = fdset_free(fds);
842f3b0f 3354
6d0b55c2
LP
3355 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3356 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3357 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
82116c43 3358 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3359
03cfe0d5
LP
3360 /* Wait for the outer child. */
3361 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3362 if (r < 0)
3363 goto finish;
3364 if (r != 0) {
3365 r = -EIO;
3366 goto finish;
3367 }
3368 pid = 0;
6dac160c 3369
03cfe0d5
LP
3370 /* And now retrieve the PID of the inner child. */
3371 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3372 if (l < 0) {
3373 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3374 goto finish;
3375 }
3376 if (l != sizeof(pid)) {
76d44882 3377 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3378 r = EIO;
3379 goto finish;
3380 }
354bfd2b 3381
03cfe0d5 3382 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3383
03cfe0d5
LP
3384 if (arg_userns) {
3385 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3386 log_error("Child died too early.");
3387 r = -ESRCH;
840295fc 3388 goto finish;
03cfe0d5 3389 }
ab046dde 3390
825d5287
RM
3391 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3392 if (l < 0) {
3393 r = log_error_errno(errno, "Failed to read UID shift: %m");
3394 goto finish;
3395 }
3396 if (l != sizeof(arg_uid_shift)) {
76d44882 3397 log_error("Short read while reading UID shift.");
825d5287
RM
3398 r = EIO;
3399 goto finish;
3400 }
3401
03cfe0d5 3402 r = setup_uid_map(pid);
840295fc
LP
3403 if (r < 0)
3404 goto finish;
ab046dde 3405
03cfe0d5
LP
3406 (void) barrier_place(&barrier); /* #2 */
3407 }
c74e630d 3408
9a2a5625 3409 if (arg_private_network) {
4bbfe7ad 3410
9a2a5625
LP
3411 r = move_network_interfaces(pid, arg_network_interfaces);
3412 if (r < 0)
3413 goto finish;
5aa4bb6b 3414
9a2a5625
LP
3415 if (arg_network_veth) {
3416 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3417 if (r < 0)
3418 goto finish;
3419 else if (r > 0)
3420 ifi = r;
6dac160c 3421
9a2a5625
LP
3422 if (arg_network_bridge) {
3423 r = setup_bridge(veth_name, arg_network_bridge);
3424 if (r < 0)
3425 goto finish;
3426 if (r > 0)
3427 ifi = r;
3428 }
3429 }
6dac160c 3430
f6d6bad1
LP
3431 r = setup_veth_extra(arg_machine, pid, arg_network_veth_extra);
3432 if (r < 0)
3433 goto finish;
3434
9a2a5625
LP
3435 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3436 if (r < 0)
3437 goto finish;
3438
3439 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3440 if (r < 0)
3441 goto finish;
3442 }
6dac160c 3443
b7103bc5
LP
3444 if (arg_register) {
3445 r = register_machine(
3446 arg_machine,
3447 pid,
3448 arg_directory,
3449 arg_uuid,
3450 ifi,
3451 arg_slice,
3452 arg_custom_mounts, arg_n_custom_mounts,
3453 arg_kill_signal,
3454 arg_property,
6aadfa4c
ILG
3455 arg_keep_unit,
3456 arg_container_service_name);
b7103bc5
LP
3457 if (r < 0)
3458 goto finish;
3459 }
6dac160c 3460
34829a32 3461 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
3462 if (r < 0)
3463 goto finish;
3464
34829a32
LP
3465 if (arg_keep_unit) {
3466 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3467 if (r < 0)
3468 goto finish;
3469 }
efdb0237 3470
34829a32 3471 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
3472 if (r < 0)
3473 goto finish;
6dac160c 3474
03cfe0d5
LP
3475 /* Notify the child that the parent is ready with all
3476 * its setup (including cgroup-ification), and that
3477 * the child can now hand over control to the code to
3478 * run inside the container. */
3479 (void) barrier_place(&barrier); /* #3 */
6dac160c 3480
03cfe0d5
LP
3481 /* Block SIGCHLD here, before notifying child.
3482 * process_pty() will handle it with the other signals. */
3483 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 3484
03cfe0d5
LP
3485 /* Reset signal to default */
3486 r = default_signals(SIGCHLD, -1);
3487 if (r < 0) {
3488 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3489 goto finish;
3490 }
e866af3a 3491
03cfe0d5 3492 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
3493 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3494 log_error("Child died too early.");
03cfe0d5
LP
3495 r = -ESRCH;
3496 goto finish;
3497 }
b12afc8c 3498
03cfe0d5
LP
3499 sd_notifyf(false,
3500 "READY=1\n"
3501 "STATUS=Container running.\n"
3502 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 3503
03cfe0d5
LP
3504 r = sd_event_new(&event);
3505 if (r < 0) {
3506 log_error_errno(r, "Failed to get default event source: %m");
3507 goto finish;
3508 }
88213476 3509
03cfe0d5
LP
3510 if (arg_kill_signal > 0) {
3511 /* Try to kill the init system on SIGINT or SIGTERM */
4a0b58c4
LP
3512 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(pid));
3513 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(pid));
03cfe0d5
LP
3514 } else {
3515 /* Immediately exit */
3516 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3517 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3518 }
023fb90b 3519
03cfe0d5
LP
3520 /* simply exit on sigchld */
3521 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 3522
03cfe0d5 3523 if (arg_expose_ports) {
7a8f6325 3524 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
3525 if (r < 0)
3526 goto finish;
023fb90b 3527
7a8f6325 3528 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 3529 }
023fb90b 3530
03cfe0d5 3531 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 3532
ae3dde80 3533 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
3534 if (r < 0) {
3535 log_error_errno(r, "Failed to create PTY forwarder: %m");
3536 goto finish;
3537 }
023fb90b 3538
03cfe0d5
LP
3539 r = sd_event_loop(event);
3540 if (r < 0) {
3541 log_error_errno(r, "Failed to run event loop: %m");
3542 goto finish;
3543 }
6d0b55c2 3544
03cfe0d5 3545 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 3546
03cfe0d5 3547 forward = pty_forward_free(forward);
6d0b55c2 3548
03cfe0d5
LP
3549 if (!arg_quiet && last_char != '\n')
3550 putc('\n', stdout);
04d39279 3551
03cfe0d5 3552 /* Kill if it is not dead yet anyway */
b7103bc5
LP
3553 if (arg_register && !arg_keep_unit)
3554 terminate_machine(pid);
1f0cd86b 3555
840295fc 3556 /* Normally redundant, but better safe than sorry */
04d39279 3557 kill(pid, SIGKILL);
a258bf26 3558
113cea80 3559 r = wait_for_container(pid, &container_status);
04d39279
LP
3560 pid = 0;
3561
ec16945e 3562 if (r < 0)
ce9f1527
LP
3563 /* We failed to wait for the container, or the
3564 * container exited abnormally */
ec16945e
LP
3565 goto finish;
3566 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
3567 /* The container exited with a non-zero
3568 * status, or with zero status and no reboot
3569 * was requested. */
ec16945e 3570 ret = r;
d87be9b0 3571 break;
ec16945e 3572 }
88213476 3573
113cea80 3574 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3575
3576 if (arg_keep_unit) {
3577 /* Special handling if we are running as a
3578 * service: instead of simply restarting the
3579 * machine we want to restart the entire
3580 * service, so let's inform systemd about this
3581 * with the special exit code 133. The service
3582 * file uses RestartForceExitStatus=133 so
3583 * that this results in a full nspawn
3584 * restart. This is necessary since we might
3585 * have cgroup parameters set we want to have
3586 * flushed out. */
ec16945e
LP
3587 ret = 133;
3588 r = 0;
ce38dbc8
LP
3589 break;
3590 }
6d0b55c2 3591
7a8f6325 3592 expose_port_flush(arg_expose_ports, &exposed);
d87be9b0 3593 }
88213476
LP
3594
3595finish:
af4ec430
LP
3596 sd_notify(false,
3597 "STOPPING=1\n"
3598 "STATUS=Terminating...");
3599
9444b1f2
LP
3600 if (pid > 0)
3601 kill(pid, SIGKILL);
88213476 3602
503546da
LP
3603 /* Try to flush whatever is still queued in the pty */
3604 if (master >= 0)
59f448cf 3605 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 3606
03cfe0d5
LP
3607 loop_remove(loop_nr, &image_fd);
3608
ec16945e
LP
3609 if (remove_subvol && arg_directory) {
3610 int k;
3611
5bcd08db 3612 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
3613 if (k < 0)
3614 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3615 }
3616
785890ac
LP
3617 if (arg_machine) {
3618 const char *p;
3619
63c372cb 3620 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3621 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3622 }
3623
7a8f6325 3624 expose_port_flush(arg_expose_ports, &exposed);
f757855e 3625
04d391da 3626 free(arg_directory);
ec16945e
LP
3627 free(arg_template);
3628 free(arg_image);
7027ff61 3629 free(arg_machine);
c74e630d
LP
3630 free(arg_user);
3631 strv_free(arg_setenv);
f757855e 3632 free(arg_network_bridge);
c74e630d
LP
3633 strv_free(arg_network_interfaces);
3634 strv_free(arg_network_macvlan);
4bbfe7ad 3635 strv_free(arg_network_ipvlan);
f6d6bad1 3636 strv_free(arg_network_veth_extra);
f757855e
LP
3637 strv_free(arg_parameters);
3638 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3639 expose_port_free_all(arg_expose_ports);
6d0b55c2 3640
ec16945e 3641 return r < 0 ? EXIT_FAILURE : ret;
88213476 3642}