]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
Merge pull request #1668 from ssahani/net1
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
8fe0087e
LP
22#ifdef HAVE_BLKID
23#include <blkid/blkid.h>
24#endif
88213476 25#include <errno.h>
88213476 26#include <getopt.h>
1b9e5b12 27#include <linux/loop.h>
8fe0087e 28#include <sched.h>
24fb1112
LP
29#ifdef HAVE_SECCOMP
30#include <seccomp.h>
31#endif
8fe0087e
LP
32#ifdef HAVE_SELINUX
33#include <selinux/selinux.h>
1b9e5b12 34#endif
8fe0087e
LP
35#include <signal.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#include <sys/file.h>
40#include <sys/mount.h>
41#include <sys/personality.h>
42#include <sys/prctl.h>
43#include <sys/types.h>
44#include <unistd.h>
1b9e5b12 45
1f0cd86b 46#include "sd-daemon.h"
1f0cd86b 47#include "sd-id128.h"
8fe0087e
LP
48
49#include "barrier.h"
50#include "base-filesystem.h"
51#include "blkid-util.h"
52#include "btrfs-util.h"
8fe0087e
LP
53#include "cap-list.h"
54#include "capability.h"
04d391da 55#include "cgroup-util.h"
8fe0087e 56#include "copy.h"
4fc9982c 57#include "dev-setup.h"
8fe0087e
LP
58#include "env-util.h"
59#include "event-util.h"
3ffd4af2 60#include "fd-util.h"
842f3b0f 61#include "fdset.h"
a5c32cff 62#include "fileio.h"
8fe0087e 63#include "formats-util.h"
1b9e5b12 64#include "gpt.h"
8fe0087e
LP
65#include "hostname-util.h"
66#include "log.h"
67#include "loopback-setup.h"
1b9cebf6 68#include "machine-image.h"
8fe0087e
LP
69#include "macro.h"
70#include "missing.h"
71#include "mkdir.h"
72#include "netlink-util.h"
07630cea
LP
73#include "nspawn-cgroup.h"
74#include "nspawn-expose-ports.h"
75#include "nspawn-mount.h"
76#include "nspawn-network.h"
77#include "nspawn-register.h"
78#include "nspawn-settings.h"
79#include "nspawn-setuid.h"
8fe0087e 80#include "path-util.h"
0b452006 81#include "process-util.h"
8fe0087e
LP
82#include "ptyfwd.h"
83#include "random-util.h"
84#include "rm-rf.h"
e9642be2
LP
85#ifdef HAVE_SECCOMP
86#include "seccomp-util.h"
87#endif
8fe0087e 88#include "signal-util.h"
07630cea 89#include "string-util.h"
8fe0087e
LP
90#include "strv.h"
91#include "terminal-util.h"
92#include "udev-util.h"
93#include "util.h"
e9642be2 94
113cea80
DH
/* Outcome reported for a finished container run.
 * NOTE(review): exact meaning inferred from the names only — presumably
 * "exited for good" vs. "asked to be restarted"; confirm against the users
 * of this type further down the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
99
57fb9fb5
LP
/* Journal linking mode, selected on the command line with --link-journal=
 * ("no", "auto", "host", "guest" and the "try-" variants) or -j. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; also the built-in default */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
88213476
LP
106
107static char *arg_directory = NULL;
ec16945e 108static char *arg_template = NULL;
687d0825 109static char *arg_user = NULL;
9444b1f2 110static sd_id128_t arg_uuid = {};
7027ff61 111static char *arg_machine = NULL;
c74e630d
LP
112static const char *arg_selinux_context = NULL;
113static const char *arg_selinux_apifs_context = NULL;
9444b1f2 114static const char *arg_slice = NULL;
ff01d048 115static bool arg_private_network = false;
bc2f673e 116static bool arg_read_only = false;
0f0dbc46 117static bool arg_boot = false;
ec16945e 118static bool arg_ephemeral = false;
57fb9fb5 119static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 120static bool arg_link_journal_try = false;
5076f0cc
LP
121static uint64_t arg_retain =
122 (1ULL << CAP_CHOWN) |
123 (1ULL << CAP_DAC_OVERRIDE) |
124 (1ULL << CAP_DAC_READ_SEARCH) |
125 (1ULL << CAP_FOWNER) |
126 (1ULL << CAP_FSETID) |
127 (1ULL << CAP_IPC_OWNER) |
128 (1ULL << CAP_KILL) |
129 (1ULL << CAP_LEASE) |
130 (1ULL << CAP_LINUX_IMMUTABLE) |
131 (1ULL << CAP_NET_BIND_SERVICE) |
132 (1ULL << CAP_NET_BROADCAST) |
133 (1ULL << CAP_NET_RAW) |
134 (1ULL << CAP_SETGID) |
135 (1ULL << CAP_SETFCAP) |
136 (1ULL << CAP_SETPCAP) |
137 (1ULL << CAP_SETUID) |
138 (1ULL << CAP_SYS_ADMIN) |
139 (1ULL << CAP_SYS_CHROOT) |
140 (1ULL << CAP_SYS_NICE) |
141 (1ULL << CAP_SYS_PTRACE) |
142 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 143 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
144 (1ULL << CAP_SYS_BOOT) |
145 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
146 (1ULL << CAP_AUDIT_CONTROL) |
147 (1ULL << CAP_MKNOD);
5a8af538
LP
148static CustomMount *arg_custom_mounts = NULL;
149static unsigned arg_n_custom_mounts = 0;
f4889f65 150static char **arg_setenv = NULL;
284c0b91 151static bool arg_quiet = false;
8a96d94e 152static bool arg_share_system = false;
eb91eb18 153static bool arg_register = true;
89f7c846 154static bool arg_keep_unit = false;
aa28aefe 155static char **arg_network_interfaces = NULL;
c74e630d 156static char **arg_network_macvlan = NULL;
4bbfe7ad 157static char **arg_network_ipvlan = NULL;
69c79d3c 158static bool arg_network_veth = false;
f757855e 159static char *arg_network_bridge = NULL;
050f7277 160static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 161static char *arg_image = NULL;
f757855e 162static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 163static ExposePort *arg_expose_ports = NULL;
f36933fe 164static char **arg_property = NULL;
6dac160c
LP
165static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
166static bool arg_userns = false;
c6c8f6e2 167static int arg_kill_signal = 0;
efdb0237 168static bool arg_unified_cgroup_hierarchy = false;
f757855e
LP
169static SettingsMask arg_settings_mask = 0;
170static int arg_settings_trusted = -1;
171static char **arg_parameters = NULL;
88213476 172
601185b4 173static void help(void) {
88213476
LP
174 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
175 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
176 " -h --help Show this help\n"
177 " --version Print version string\n"
69c79d3c 178 " -q --quiet Do not show status information\n"
1b9e5b12 179 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
180 " --template=PATH Initialize root directory from template directory,\n"
181 " if missing\n"
182 " -x --ephemeral Run container with snapshot of root directory, and\n"
183 " remove it after exit\n"
184 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
185 " -b --boot Boot up full system (i.e. invoke init)\n"
186 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 187 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 188 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 189 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 190 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
191 " --private-users[=UIDBASE[:NUIDS]]\n"
192 " Run within user namespace\n"
69c79d3c
LP
193 " --private-network Disable network in container\n"
194 " --network-interface=INTERFACE\n"
195 " Assign an existing network interface to the\n"
196 " container\n"
c74e630d
LP
197 " --network-macvlan=INTERFACE\n"
198 " Create a macvlan network interface based on an\n"
199 " existing network interface to the container\n"
4bbfe7ad
TG
200 " --network-ipvlan=INTERFACE\n"
201 " Create a ipvlan network interface based on an\n"
202 " existing network interface to the container\n"
0dfaa006 203 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 204 " and container\n"
ab046dde 205 " --network-bridge=INTERFACE\n"
32457153 206 " Add a virtual ethernet connection between host\n"
ab046dde
TG
207 " and container and add it to an existing bridge on\n"
208 " the host\n"
6d0b55c2 209 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 210 " Expose a container IP port on the host\n"
82adf6af
LP
211 " -Z --selinux-context=SECLABEL\n"
212 " Set the SELinux security context to be used by\n"
213 " processes in the container\n"
214 " -L --selinux-apifs-context=SECLABEL\n"
215 " Set the SELinux security context to be used by\n"
216 " API/tmpfs file systems in the container\n"
a8828ed9
DW
217 " --capability=CAP In addition to the default, retain specified\n"
218 " capability\n"
219 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 220 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
221 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
222 " try-guest, try-host\n"
223 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 224 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
225 " --bind=PATH[:PATH[:OPTIONS]]\n"
226 " Bind mount a file or directory from the host into\n"
a8828ed9 227 " the container\n"
5e5bfa6e
EY
228 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
229 " Similar, but creates a read-only bind mount\n"
06c17c39 230 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
231 " --overlay=PATH[:PATH...]:PATH\n"
232 " Create an overlay mount from the host to \n"
233 " the container\n"
234 " --overlay-ro=PATH[:PATH...]:PATH\n"
235 " Similar, but creates a read-only overlay mount\n"
284c0b91 236 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 237 " --share-system Share system namespaces with host\n"
eb91eb18 238 " --register=BOOLEAN Register container as machine\n"
89f7c846 239 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 240 " the service unit nspawn is running in\n"
6d0b55c2 241 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 242 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 243 , program_invocation_short_name);
88213476
LP
244}
245
5a8af538
LP
246
247static int custom_mounts_prepare(void) {
248 unsigned i;
249 int r;
250
251 /* Ensure the mounts are applied prefix first. */
252 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
253
254 /* Allocate working directories for the overlay file systems that need it */
255 for (i = 0; i < arg_n_custom_mounts; i++) {
256 CustomMount *m = &arg_custom_mounts[i];
257
825d5287
RM
258 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
259 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
260 return -EINVAL;
261 }
262
5a8af538
LP
263 if (m->type != CUSTOM_MOUNT_OVERLAY)
264 continue;
265
266 if (m->work_dir)
267 continue;
268
269 if (m->read_only)
270 continue;
271
14bcf25c 272 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
273 if (r < 0)
274 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
275 }
276
277 return 0;
278}
279
efdb0237
LP
280static int detect_unified_cgroup_hierarchy(void) {
281 const char *e;
282 int r;
283
284 /* Allow the user to control whether the unified hierarchy is used */
285 e = getenv("UNIFIED_CGROUP_HIERARCHY");
286 if (e) {
287 r = parse_boolean(e);
288 if (r < 0)
289 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
290
291 arg_unified_cgroup_hierarchy = r;
292 return 0;
293 }
294
295 /* Otherwise inherit the default from the host system */
296 r = cg_unified();
297 if (r < 0)
298 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
299
300 arg_unified_cgroup_hierarchy = r;
301 return 0;
302}
303
88213476
LP
304static int parse_argv(int argc, char *argv[]) {
305
a41fe3a2 306 enum {
acbeb427
ZJS
307 ARG_VERSION = 0x100,
308 ARG_PRIVATE_NETWORK,
bc2f673e 309 ARG_UUID,
5076f0cc 310 ARG_READ_ONLY,
57fb9fb5 311 ARG_CAPABILITY,
420c7379 312 ARG_DROP_CAPABILITY,
17fe0523
LP
313 ARG_LINK_JOURNAL,
314 ARG_BIND,
f4889f65 315 ARG_BIND_RO,
06c17c39 316 ARG_TMPFS,
5a8af538
LP
317 ARG_OVERLAY,
318 ARG_OVERLAY_RO,
f4889f65 319 ARG_SETENV,
eb91eb18 320 ARG_SHARE_SYSTEM,
89f7c846 321 ARG_REGISTER,
aa28aefe 322 ARG_KEEP_UNIT,
69c79d3c 323 ARG_NETWORK_INTERFACE,
c74e630d 324 ARG_NETWORK_MACVLAN,
4bbfe7ad 325 ARG_NETWORK_IPVLAN,
ab046dde 326 ARG_NETWORK_BRIDGE,
6afc95b7 327 ARG_PERSONALITY,
4d9f07b4 328 ARG_VOLATILE,
ec16945e 329 ARG_TEMPLATE,
f36933fe 330 ARG_PROPERTY,
6dac160c 331 ARG_PRIVATE_USERS,
c6c8f6e2 332 ARG_KILL_SIGNAL,
f757855e 333 ARG_SETTINGS,
a41fe3a2
LP
334 };
335
88213476 336 static const struct option options[] = {
aa28aefe
LP
337 { "help", no_argument, NULL, 'h' },
338 { "version", no_argument, NULL, ARG_VERSION },
339 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
340 { "template", required_argument, NULL, ARG_TEMPLATE },
341 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
342 { "user", required_argument, NULL, 'u' },
343 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
344 { "boot", no_argument, NULL, 'b' },
345 { "uuid", required_argument, NULL, ARG_UUID },
346 { "read-only", no_argument, NULL, ARG_READ_ONLY },
347 { "capability", required_argument, NULL, ARG_CAPABILITY },
348 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
349 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
350 { "bind", required_argument, NULL, ARG_BIND },
351 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 352 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
353 { "overlay", required_argument, NULL, ARG_OVERLAY },
354 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
355 { "machine", required_argument, NULL, 'M' },
356 { "slice", required_argument, NULL, 'S' },
357 { "setenv", required_argument, NULL, ARG_SETENV },
358 { "selinux-context", required_argument, NULL, 'Z' },
359 { "selinux-apifs-context", required_argument, NULL, 'L' },
360 { "quiet", no_argument, NULL, 'q' },
361 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
362 { "register", required_argument, NULL, ARG_REGISTER },
363 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
364 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 365 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 366 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 367 { "network-veth", no_argument, NULL, 'n' },
ab046dde 368 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 369 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 370 { "image", required_argument, NULL, 'i' },
4d9f07b4 371 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 372 { "port", required_argument, NULL, 'p' },
f36933fe 373 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 374 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 375 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 376 { "settings", required_argument, NULL, ARG_SETTINGS },
eb9da376 377 {}
88213476
LP
378 };
379
9444b1f2 380 int c, r;
a42c8b54 381 uint64_t plus = 0, minus = 0;
f757855e 382 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
383
384 assert(argc >= 0);
385 assert(argv);
386
0dfaa006 387 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
388
389 switch (c) {
390
391 case 'h':
601185b4
ZJS
392 help();
393 return 0;
88213476 394
acbeb427 395 case ARG_VERSION:
3f6fd1ba 396 return version();
acbeb427 397
88213476 398 case 'D':
0f03c2a4 399 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 400 if (r < 0)
0f03c2a4 401 return r;
ec16945e
LP
402 break;
403
404 case ARG_TEMPLATE:
0f03c2a4 405 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 406 if (r < 0)
0f03c2a4 407 return r;
88213476
LP
408 break;
409
1b9e5b12 410 case 'i':
0f03c2a4 411 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 412 if (r < 0)
0f03c2a4 413 return r;
ec16945e
LP
414 break;
415
416 case 'x':
417 arg_ephemeral = true;
1b9e5b12
LP
418 break;
419
687d0825 420 case 'u':
2fc09a9c
DM
421 r = free_and_strdup(&arg_user, optarg);
422 if (r < 0)
7027ff61 423 return log_oom();
687d0825 424
f757855e 425 arg_settings_mask |= SETTING_USER;
687d0825
MV
426 break;
427
ab046dde 428 case ARG_NETWORK_BRIDGE:
f757855e
LP
429 r = free_and_strdup(&arg_network_bridge, optarg);
430 if (r < 0)
431 return log_oom();
ab046dde
TG
432
433 /* fall through */
434
0dfaa006 435 case 'n':
69c79d3c
LP
436 arg_network_veth = true;
437 arg_private_network = true;
f757855e 438 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
439 break;
440
aa28aefe 441 case ARG_NETWORK_INTERFACE:
c74e630d
LP
442 if (strv_extend(&arg_network_interfaces, optarg) < 0)
443 return log_oom();
444
445 arg_private_network = true;
f757855e 446 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
447 break;
448
449 case ARG_NETWORK_MACVLAN:
450 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
451 return log_oom();
452
4bbfe7ad 453 arg_private_network = true;
f757855e 454 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
455 break;
456
457 case ARG_NETWORK_IPVLAN:
458 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
459 return log_oom();
460
aa28aefe
LP
461 /* fall through */
462
ff01d048
LP
463 case ARG_PRIVATE_NETWORK:
464 arg_private_network = true;
f757855e 465 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
466 break;
467
0f0dbc46
LP
468 case 'b':
469 arg_boot = true;
f757855e 470 arg_settings_mask |= SETTING_BOOT;
0f0dbc46
LP
471 break;
472
144f0fc0 473 case ARG_UUID:
9444b1f2
LP
474 r = sd_id128_from_string(optarg, &arg_uuid);
475 if (r < 0) {
aa96c6cb 476 log_error("Invalid UUID: %s", optarg);
9444b1f2 477 return r;
aa96c6cb 478 }
f757855e
LP
479
480 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 481 break;
aa96c6cb 482
9444b1f2 483 case 'S':
c74e630d 484 arg_slice = optarg;
144f0fc0
LP
485 break;
486
7027ff61 487 case 'M':
c1521918 488 if (isempty(optarg))
97b11eed 489 arg_machine = mfree(arg_machine);
c1521918 490 else {
0c3c4284 491 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
492 log_error("Invalid machine name: %s", optarg);
493 return -EINVAL;
494 }
7027ff61 495
0c3c4284
LP
496 r = free_and_strdup(&arg_machine, optarg);
497 if (r < 0)
eb91eb18
LP
498 return log_oom();
499
500 break;
501 }
7027ff61 502
82adf6af
LP
503 case 'Z':
504 arg_selinux_context = optarg;
a8828ed9
DW
505 break;
506
82adf6af
LP
507 case 'L':
508 arg_selinux_apifs_context = optarg;
a8828ed9
DW
509 break;
510
bc2f673e
LP
511 case ARG_READ_ONLY:
512 arg_read_only = true;
f757855e 513 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
514 break;
515
420c7379
LP
516 case ARG_CAPABILITY:
517 case ARG_DROP_CAPABILITY: {
a2a5291b 518 const char *state, *word;
5076f0cc
LP
519 size_t length;
520
521 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 522 _cleanup_free_ char *t;
5076f0cc
LP
523
524 t = strndup(word, length);
0d0f0c50
SL
525 if (!t)
526 return log_oom();
5076f0cc 527
39ed67d1
LP
528 if (streq(t, "all")) {
529 if (c == ARG_CAPABILITY)
a42c8b54 530 plus = (uint64_t) -1;
39ed67d1 531 else
a42c8b54 532 minus = (uint64_t) -1;
39ed67d1 533 } else {
2822da4f
LP
534 int cap;
535
536 cap = capability_from_name(t);
537 if (cap < 0) {
39ed67d1
LP
538 log_error("Failed to parse capability %s.", t);
539 return -EINVAL;
540 }
541
542 if (c == ARG_CAPABILITY)
a42c8b54 543 plus |= 1ULL << (uint64_t) cap;
39ed67d1 544 else
a42c8b54 545 minus |= 1ULL << (uint64_t) cap;
5076f0cc 546 }
5076f0cc
LP
547 }
548
f757855e 549 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
550 break;
551 }
552
57fb9fb5
LP
553 case 'j':
554 arg_link_journal = LINK_GUEST;
574edc90 555 arg_link_journal_try = true;
57fb9fb5
LP
556 break;
557
558 case ARG_LINK_JOURNAL:
53e438e3 559 if (streq(optarg, "auto")) {
57fb9fb5 560 arg_link_journal = LINK_AUTO;
53e438e3
LP
561 arg_link_journal_try = false;
562 } else if (streq(optarg, "no")) {
57fb9fb5 563 arg_link_journal = LINK_NO;
53e438e3
LP
564 arg_link_journal_try = false;
565 } else if (streq(optarg, "guest")) {
57fb9fb5 566 arg_link_journal = LINK_GUEST;
53e438e3
LP
567 arg_link_journal_try = false;
568 } else if (streq(optarg, "host")) {
57fb9fb5 569 arg_link_journal = LINK_HOST;
53e438e3
LP
570 arg_link_journal_try = false;
571 } else if (streq(optarg, "try-guest")) {
574edc90
MP
572 arg_link_journal = LINK_GUEST;
573 arg_link_journal_try = true;
574 } else if (streq(optarg, "try-host")) {
575 arg_link_journal = LINK_HOST;
576 arg_link_journal_try = true;
577 } else {
57fb9fb5
LP
578 log_error("Failed to parse link journal mode %s", optarg);
579 return -EINVAL;
580 }
581
582 break;
583
17fe0523 584 case ARG_BIND:
f757855e
LP
585 case ARG_BIND_RO:
586 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
587 if (r < 0)
588 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 589
f757855e 590 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 591 break;
06c17c39 592
f757855e
LP
593 case ARG_TMPFS:
594 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
595 if (r < 0)
596 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 597
f757855e 598 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 599 break;
5a8af538
LP
600
601 case ARG_OVERLAY:
602 case ARG_OVERLAY_RO: {
603 _cleanup_free_ char *upper = NULL, *destination = NULL;
604 _cleanup_strv_free_ char **lower = NULL;
605 CustomMount *m;
606 unsigned n = 0;
607 char **i;
608
62f9f39a
RM
609 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
610 if (r == -ENOMEM)
06c17c39 611 return log_oom();
62f9f39a
RM
612 else if (r < 0) {
613 log_error("Invalid overlay specification: %s", optarg);
614 return r;
615 }
06c17c39 616
5a8af538
LP
617 STRV_FOREACH(i, lower) {
618 if (!path_is_absolute(*i)) {
619 log_error("Overlay path %s is not absolute.", *i);
620 return -EINVAL;
621 }
622
623 n++;
624 }
625
626 if (n < 2) {
627 log_error("--overlay= needs at least two colon-separated directories specified.");
628 return -EINVAL;
629 }
630
631 if (n == 2) {
632 /* If two parameters are specified,
633 * the first one is the lower, the
634 * second one the upper directory. And
af86c440
ZJS
635 * we'll also define the destination
636 * mount point the same as the upper. */
5a8af538
LP
637 upper = lower[1];
638 lower[1] = NULL;
639
640 destination = strdup(upper);
641 if (!destination)
642 return log_oom();
643
644 } else {
645 upper = lower[n - 2];
646 destination = lower[n - 1];
647 lower[n - 2] = NULL;
648 }
649
f757855e 650 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
651 if (!m)
652 return log_oom();
653
654 m->destination = destination;
655 m->source = upper;
656 m->lower = lower;
657 m->read_only = c == ARG_OVERLAY_RO;
658
659 upper = destination = NULL;
660 lower = NULL;
06c17c39 661
f757855e 662 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
663 break;
664 }
665
f4889f65
LP
666 case ARG_SETENV: {
667 char **n;
668
669 if (!env_assignment_is_valid(optarg)) {
670 log_error("Environment variable assignment '%s' is not valid.", optarg);
671 return -EINVAL;
672 }
673
674 n = strv_env_set(arg_setenv, optarg);
675 if (!n)
676 return log_oom();
677
678 strv_free(arg_setenv);
679 arg_setenv = n;
f757855e
LP
680
681 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
682 break;
683 }
684
284c0b91
LP
685 case 'q':
686 arg_quiet = true;
687 break;
688
8a96d94e
LP
689 case ARG_SHARE_SYSTEM:
690 arg_share_system = true;
691 break;
692
eb91eb18
LP
693 case ARG_REGISTER:
694 r = parse_boolean(optarg);
695 if (r < 0) {
696 log_error("Failed to parse --register= argument: %s", optarg);
697 return r;
698 }
699
700 arg_register = r;
701 break;
702
89f7c846
LP
703 case ARG_KEEP_UNIT:
704 arg_keep_unit = true;
705 break;
706
6afc95b7
LP
707 case ARG_PERSONALITY:
708
ac45f971 709 arg_personality = personality_from_string(optarg);
050f7277 710 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
711 log_error("Unknown or unsupported personality '%s'.", optarg);
712 return -EINVAL;
713 }
714
f757855e 715 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
716 break;
717
4d9f07b4
LP
718 case ARG_VOLATILE:
719
720 if (!optarg)
f757855e 721 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 722 else {
f757855e 723 VolatileMode m;
4d9f07b4 724
f757855e
LP
725 m = volatile_mode_from_string(optarg);
726 if (m < 0) {
727 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 728 return -EINVAL;
f757855e
LP
729 } else
730 arg_volatile_mode = m;
6d0b55c2
LP
731 }
732
f757855e
LP
733 arg_settings_mask |= SETTING_VOLATILE_MODE;
734 break;
6d0b55c2 735
f757855e
LP
736 case 'p':
737 r = expose_port_parse(&arg_expose_ports, optarg);
738 if (r == -EEXIST)
739 return log_error_errno(r, "Duplicate port specification: %s", optarg);
740 if (r < 0)
741 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 742
f757855e 743 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 744 break;
6d0b55c2 745
f36933fe
LP
746 case ARG_PROPERTY:
747 if (strv_extend(&arg_property, optarg) < 0)
748 return log_oom();
749
750 break;
751
6dac160c
LP
752 case ARG_PRIVATE_USERS:
753 if (optarg) {
754 _cleanup_free_ char *buffer = NULL;
755 const char *range, *shift;
756
757 range = strchr(optarg, ':');
758 if (range) {
759 buffer = strndup(optarg, range - optarg);
760 if (!buffer)
761 return log_oom();
762 shift = buffer;
763
764 range++;
765 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
766 log_error("Failed to parse UID range: %s", range);
767 return -EINVAL;
768 }
769 } else
770 shift = optarg;
771
772 if (parse_uid(shift, &arg_uid_shift) < 0) {
773 log_error("Failed to parse UID: %s", optarg);
774 return -EINVAL;
775 }
776 }
777
778 arg_userns = true;
779 break;
780
c6c8f6e2
LP
781 case ARG_KILL_SIGNAL:
782 arg_kill_signal = signal_from_string_try_harder(optarg);
783 if (arg_kill_signal < 0) {
784 log_error("Cannot parse signal: %s", optarg);
785 return -EINVAL;
786 }
787
f757855e
LP
788 arg_settings_mask |= SETTING_KILL_SIGNAL;
789 break;
790
791 case ARG_SETTINGS:
792
793 /* no → do not read files
794 * yes → read files, do not override cmdline, trust only subset
795 * override → read files, override cmdline, trust only subset
796 * trusted → read files, do not override cmdline, trust all
797 */
798
799 r = parse_boolean(optarg);
800 if (r < 0) {
801 if (streq(optarg, "trusted")) {
802 mask_all_settings = false;
803 mask_no_settings = false;
804 arg_settings_trusted = true;
805
806 } else if (streq(optarg, "override")) {
807 mask_all_settings = false;
808 mask_no_settings = true;
809 arg_settings_trusted = -1;
810 } else
811 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
812 } else if (r > 0) {
813 /* yes */
814 mask_all_settings = false;
815 mask_no_settings = false;
816 arg_settings_trusted = -1;
817 } else {
818 /* no */
819 mask_all_settings = true;
820 mask_no_settings = false;
821 arg_settings_trusted = false;
822 }
823
c6c8f6e2
LP
824 break;
825
88213476
LP
826 case '?':
827 return -EINVAL;
828
829 default:
eb9da376 830 assert_not_reached("Unhandled option");
88213476 831 }
88213476 832
eb91eb18
LP
833 if (arg_share_system)
834 arg_register = false;
835
836 if (arg_boot && arg_share_system) {
837 log_error("--boot and --share-system may not be combined.");
838 return -EINVAL;
839 }
840
89f7c846
LP
841 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
842 log_error("--keep-unit may not be used when invoked from a user session.");
843 return -EINVAL;
844 }
845
1b9e5b12
LP
846 if (arg_directory && arg_image) {
847 log_error("--directory= and --image= may not be combined.");
848 return -EINVAL;
849 }
850
ec16945e
LP
851 if (arg_template && arg_image) {
852 log_error("--template= and --image= may not be combined.");
853 return -EINVAL;
854 }
855
856 if (arg_template && !(arg_directory || arg_machine)) {
857 log_error("--template= needs --directory= or --machine=.");
858 return -EINVAL;
859 }
860
861 if (arg_ephemeral && arg_template) {
862 log_error("--ephemeral and --template= may not be combined.");
863 return -EINVAL;
864 }
865
866 if (arg_ephemeral && arg_image) {
867 log_error("--ephemeral and --image= may not be combined.");
868 return -EINVAL;
869 }
870
df9a75e4
LP
871 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
872 log_error("--ephemeral and --link-journal= may not be combined.");
873 return -EINVAL;
874 }
875
f757855e
LP
876 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
877 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
878
879 if (argc > optind) {
880 arg_parameters = strv_copy(argv + optind);
881 if (!arg_parameters)
882 return log_oom();
883
884 arg_settings_mask |= SETTING_BOOT;
885 }
886
887 /* Load all settings from .nspawn files */
888 if (mask_no_settings)
889 arg_settings_mask = 0;
890
891 /* Don't load any settings from .nspawn files */
892 if (mask_all_settings)
893 arg_settings_mask = _SETTINGS_MASK_ALL;
894
895 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
896
897 r = detect_unified_cgroup_hierarchy();
898 if (r < 0)
899 return r;
900
901 return 1;
902}
903
904static int verify_arguments(void) {
905
906 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
907 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
908 return -EINVAL;
909 }
910
6d0b55c2
LP
911 if (arg_expose_ports && !arg_private_network) {
912 log_error("Cannot use --port= without private networking.");
913 return -EINVAL;
914 }
915
c6c8f6e2
LP
916 if (arg_boot && arg_kill_signal <= 0)
917 arg_kill_signal = SIGRTMIN+3;
918
f757855e 919 return 0;
88213476
LP
920}
921
03cfe0d5
LP
922static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
923 assert(p);
924
925 if (!arg_userns)
926 return 0;
927
928 if (uid == UID_INVALID && gid == GID_INVALID)
929 return 0;
930
931 if (uid != UID_INVALID) {
932 uid += arg_uid_shift;
933
934 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
935 return -EOVERFLOW;
936 }
937
938 if (gid != GID_INVALID) {
939 gid += (gid_t) arg_uid_shift;
940
941 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
942 return -EOVERFLOW;
943 }
944
945 if (lchown(p, uid, gid) < 0)
946 return -errno;
b12afc8c
LP
947
948 return 0;
949}
950
03cfe0d5
LP
/* Creates directory `path` below `root` with the given mode and, if it was
 * newly created, hands it to userns_lchown() with uid/gid.
 *
 * An already existing directory is left alone (no chown) and counts as
 * success. Returns 0 on success, -errno if mkdir() fails for another reason,
 * or whatever userns_lchown() returns. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        if (errno == EEXIST)
                return 0;

        return -errno;
}
963
e58a1277 964static int setup_timezone(const char *dest) {
03cfe0d5
LP
965 _cleanup_free_ char *p = NULL, *q = NULL;
966 const char *where, *check, *what;
d4036145
LP
967 char *z, *y;
968 int r;
f8440af5 969
e58a1277
LP
970 assert(dest);
971
972 /* Fix the timezone, if possible */
d4036145
LP
973 r = readlink_malloc("/etc/localtime", &p);
974 if (r < 0) {
975 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
976 return 0;
977 }
978
979 z = path_startswith(p, "../usr/share/zoneinfo/");
980 if (!z)
981 z = path_startswith(p, "/usr/share/zoneinfo/");
982 if (!z) {
983 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
984 return 0;
985 }
986
03cfe0d5 987 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
988 r = readlink_malloc(where, &q);
989 if (r >= 0) {
990 y = path_startswith(q, "../usr/share/zoneinfo/");
991 if (!y)
992 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 993
d4036145
LP
994 /* Already pointing to the right place? Then do nothing .. */
995 if (y && streq(y, z))
996 return 0;
997 }
998
03cfe0d5
LP
999 check = strjoina("/usr/share/zoneinfo/", z);
1000 check = prefix_root(dest, check);
1001 if (laccess(check, F_OK) < 0) {
d4036145
LP
1002 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1003 return 0;
1004 }
68fb0892 1005
79d80fc1
TG
1006 r = unlink(where);
1007 if (r < 0 && errno != ENOENT) {
56f64d95 1008 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1009 return 0;
1010 }
4d9f07b4 1011
03cfe0d5 1012 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1013 if (symlink(what, where) < 0) {
56f64d95 1014 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1015 return 0;
1016 }
e58a1277 1017
03cfe0d5
LP
1018 r = userns_lchown(where, 0, 0);
1019 if (r < 0)
1020 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1021
e58a1277 1022 return 0;
88213476
LP
1023}
1024
2547bb41 1025static int setup_resolv_conf(const char *dest) {
03cfe0d5 1026 const char *where = NULL;
79d80fc1 1027 int r;
2547bb41
LP
1028
1029 assert(dest);
1030
1031 if (arg_private_network)
1032 return 0;
1033
1034 /* Fix resolv.conf, if possible */
03cfe0d5 1035 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1036
f2068bcc 1037 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1038 if (r < 0) {
68a313c5
LP
1039 /* If the file already exists as symlink, let's
1040 * suppress the warning, under the assumption that
1041 * resolved or something similar runs inside and the
1042 * symlink points there.
1043 *
1044 * If the disk image is read-only, there's also no
1045 * point in complaining.
1046 */
1047 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1048 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1049 return 0;
1050 }
2547bb41 1051
03cfe0d5
LP
1052 r = userns_lchown(where, 0, 0);
1053 if (r < 0)
1054 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1055
2547bb41
LP
1056 return 0;
1057}
1058
9f24adc2 1059static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1060 assert(s);
9f24adc2
LP
1061
1062 snprintf(s, 37,
1063 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1064 SD_ID128_FORMAT_VAL(id));
1065
1066 return s;
1067}
1068
04bc4a3f 1069static int setup_boot_id(const char *dest) {
03cfe0d5 1070 const char *from, *to;
39883f62 1071 sd_id128_t rnd = {};
04bc4a3f
LP
1072 char as_uuid[37];
1073 int r;
1074
eb91eb18
LP
1075 if (arg_share_system)
1076 return 0;
1077
04bc4a3f
LP
1078 /* Generate a new randomized boot ID, so that each boot-up of
1079 * the container gets a new one */
1080
03cfe0d5
LP
1081 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1082 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1083
1084 r = sd_id128_randomize(&rnd);
f647962d
MS
1085 if (r < 0)
1086 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1087
9f24adc2 1088 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1089
4c1fc3e4 1090 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1091 if (r < 0)
1092 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1093
03cfe0d5
LP
1094 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1095 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1096 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1097 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1098
1099 unlink(from);
04bc4a3f
LP
1100 return r;
1101}
1102
e58a1277 1103static int copy_devnodes(const char *dest) {
88213476
LP
1104
1105 static const char devnodes[] =
1106 "null\0"
1107 "zero\0"
1108 "full\0"
1109 "random\0"
1110 "urandom\0"
85614d66
TG
1111 "tty\0"
1112 "net/tun\0";
88213476
LP
1113
1114 const char *d;
e58a1277 1115 int r = 0;
7fd1b19b 1116 _cleanup_umask_ mode_t u;
a258bf26
LP
1117
1118 assert(dest);
124640f1
LP
1119
1120 u = umask(0000);
88213476 1121
03cfe0d5
LP
1122 /* Create /dev/net, so that we can create /dev/net/tun in it */
1123 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1124 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1125
88213476 1126 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1127 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1128 struct stat st;
88213476 1129
7f112f50 1130 from = strappend("/dev/", d);
03cfe0d5 1131 to = prefix_root(dest, from);
88213476
LP
1132
1133 if (stat(from, &st) < 0) {
1134
4a62c710
MS
1135 if (errno != ENOENT)
1136 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1137
a258bf26 1138 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1139
03cfe0d5 1140 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1141 return -EIO;
a258bf26 1142
85614d66 1143 } else {
81f5049b
AC
1144 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1145 if (errno != EPERM)
1146 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1147
1148 /* Some systems abusively restrict mknod but
1149 * allow bind mounts. */
1150 r = touch(to);
1151 if (r < 0)
1152 return log_error_errno(r, "touch (%s) failed: %m", to);
1153 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1154 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1155 }
6278cf60 1156
03cfe0d5
LP
1157 r = userns_lchown(to, 0, 0);
1158 if (r < 0)
1159 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1160 }
88213476
LP
1161 }
1162
e58a1277
LP
1163 return r;
1164}
88213476 1165
03cfe0d5
LP
1166static int setup_pts(const char *dest) {
1167 _cleanup_free_ char *options = NULL;
1168 const char *p;
1169
1170#ifdef HAVE_SELINUX
1171 if (arg_selinux_apifs_context)
1172 (void) asprintf(&options,
3dce8915 1173 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1174 arg_uid_shift + TTY_GID,
1175 arg_selinux_apifs_context);
1176 else
1177#endif
1178 (void) asprintf(&options,
3dce8915 1179 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1180 arg_uid_shift + TTY_GID);
f2d88580 1181
03cfe0d5 1182 if (!options)
f2d88580
LP
1183 return log_oom();
1184
03cfe0d5 1185 /* Mount /dev/pts itself */
cc9fce65 1186 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1187 if (mkdir(p, 0755) < 0)
1188 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1189 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1190 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1191 if (userns_lchown(p, 0, 0) < 0)
1192 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1193
1194 /* Create /dev/ptmx symlink */
1195 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1196 if (symlink("pts/ptmx", p) < 0)
1197 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
03cfe0d5
LP
1198 if (userns_lchown(p, 0, 0) < 0)
1199 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
f2d88580 1200
03cfe0d5
LP
1201 /* And fix /dev/pts/ptmx ownership */
1202 p = prefix_roota(dest, "/dev/pts/ptmx");
1203 if (userns_lchown(p, 0, 0) < 0)
1204 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1205
f2d88580
LP
1206 return 0;
1207}
1208
e58a1277 1209static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1210 _cleanup_umask_ mode_t u;
1211 const char *to;
e58a1277 1212 int r;
e58a1277
LP
1213
1214 assert(dest);
1215 assert(console);
1216
1217 u = umask(0000);
1218
03cfe0d5 1219 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1220 if (r < 0)
1221 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1222
a258bf26
LP
1223 /* We need to bind mount the right tty to /dev/console since
1224 * ptys can only exist on pts file systems. To have something
81f5049b 1225 * to bind mount things on we create a empty regular file. */
a258bf26 1226
03cfe0d5 1227 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1228 r = touch(to);
1229 if (r < 0)
1230 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1231
4543768d 1232 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1233 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1234
25ea79fe 1235 return 0;
e58a1277
LP
1236}
1237
1238static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1239 const char *from, *to;
7fd1b19b 1240 _cleanup_umask_ mode_t u;
d9603714 1241 int fd, r;
e58a1277 1242
e58a1277 1243 assert(kmsg_socket >= 0);
a258bf26 1244
e58a1277 1245 u = umask(0000);
a258bf26 1246
03cfe0d5 1247 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1248 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1249 * on the reading side behave very similar to /proc/kmsg,
1250 * their writing side behaves differently from /dev/kmsg in
1251 * that writing blocks when nothing is reading. In order to
1252 * avoid any problems with containers deadlocking due to this
1253 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1254 from = prefix_roota(dest, "/run/kmsg");
1255 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1256
4a62c710 1257 if (mkfifo(from, 0600) < 0)
03cfe0d5 1258 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1259 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1260 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1261
1262 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1263 if (fd < 0)
1264 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1265
e58a1277
LP
1266 /* Store away the fd in the socket, so that it stays open as
1267 * long as we run the child */
3ee897d6 1268 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1269 safe_close(fd);
e58a1277 1270
d9603714
DH
1271 if (r < 0)
1272 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1273
03cfe0d5
LP
1274 /* And now make the FIFO unavailable as /run/kmsg... */
1275 (void) unlink(from);
1276
25ea79fe 1277 return 0;
88213476
LP
1278}
1279
1c4baffc 1280static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1281 union in_addr_union *exposed = userdata;
1282
1283 assert(rtnl);
1284 assert(m);
1285 assert(exposed);
1286
7a8f6325 1287 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1288 return 0;
1289}
1290
3a74cea5 1291static int setup_hostname(void) {
3a74cea5 1292
eb91eb18
LP
1293 if (arg_share_system)
1294 return 0;
1295
605f81a8 1296 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1297 return -errno;
3a74cea5 1298
7027ff61 1299 return 0;
3a74cea5
LP
1300}
1301
57fb9fb5 1302static int setup_journal(const char *directory) {
4d680aee 1303 sd_id128_t machine_id, this_id;
03cfe0d5
LP
1304 _cleanup_free_ char *b = NULL, *d = NULL;
1305 const char *etc_machine_id, *p, *q;
27407a01 1306 char *id;
57fb9fb5
LP
1307 int r;
1308
df9a75e4
LP
1309 /* Don't link journals in ephemeral mode */
1310 if (arg_ephemeral)
1311 return 0;
1312
03cfe0d5 1313 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 1314
03cfe0d5 1315 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
1316 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1317 return 0;
f647962d 1318 else if (r < 0)
03cfe0d5 1319 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 1320
27407a01
ZJS
1321 id = strstrip(b);
1322 if (isempty(id) && arg_link_journal == LINK_AUTO)
1323 return 0;
57fb9fb5 1324
27407a01
ZJS
1325 /* Verify validity */
1326 r = sd_id128_from_string(id, &machine_id);
f647962d 1327 if (r < 0)
03cfe0d5 1328 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 1329
4d680aee 1330 r = sd_id128_get_machine(&this_id);
f647962d
MS
1331 if (r < 0)
1332 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1333
1334 if (sd_id128_equal(machine_id, this_id)) {
1335 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1336 "Host and machine ids are equal (%s): refusing to link journals", id);
1337 if (arg_link_journal == LINK_AUTO)
1338 return 0;
df9a75e4 1339 return -EEXIST;
4d680aee
ZJS
1340 }
1341
1342 if (arg_link_journal == LINK_NO)
1343 return 0;
1344
03cfe0d5
LP
1345 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1346 if (r < 0)
1347 return log_error_errno(r, "Failed to create /var: %m");
1348
1349 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1350 if (r < 0)
1351 return log_error_errno(r, "Failed to create /var/log: %m");
1352
1353 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1354 if (r < 0)
1355 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1356
1357 p = strjoina("/var/log/journal/", id);
1358 q = prefix_roota(directory, p);
27407a01 1359
e26d6ce5 1360 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
1361 if (arg_link_journal != LINK_AUTO) {
1362 log_error("%s: already a mount point, refusing to use for journal", p);
1363 return -EEXIST;
1364 }
1365
1366 return 0;
57fb9fb5
LP
1367 }
1368
e26d6ce5 1369 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 1370 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1371 log_error("%s: already a mount point, refusing to use for journal", q);
1372 return -EEXIST;
57fb9fb5
LP
1373 }
1374
27407a01 1375 return 0;
57fb9fb5
LP
1376 }
1377
1378 r = readlink_and_make_absolute(p, &d);
1379 if (r >= 0) {
1380 if ((arg_link_journal == LINK_GUEST ||
1381 arg_link_journal == LINK_AUTO) &&
1382 path_equal(d, q)) {
1383
03cfe0d5 1384 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1385 if (r < 0)
56f64d95 1386 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1387 return 0;
57fb9fb5
LP
1388 }
1389
4a62c710
MS
1390 if (unlink(p) < 0)
1391 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1392 } else if (r == -EINVAL) {
1393
1394 if (arg_link_journal == LINK_GUEST &&
1395 rmdir(p) < 0) {
1396
27407a01
ZJS
1397 if (errno == ENOTDIR) {
1398 log_error("%s already exists and is neither a symlink nor a directory", p);
1399 return r;
1400 } else {
56f64d95 1401 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1402 return -errno;
57fb9fb5 1403 }
57fb9fb5
LP
1404 }
1405 } else if (r != -ENOENT) {
56f64d95 1406 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1407 return r;
57fb9fb5
LP
1408 }
1409
1410 if (arg_link_journal == LINK_GUEST) {
1411
1412 if (symlink(q, p) < 0) {
574edc90 1413 if (arg_link_journal_try) {
56f64d95 1414 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1415 return 0;
1416 } else {
56f64d95 1417 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1418 return -errno;
1419 }
57fb9fb5
LP
1420 }
1421
03cfe0d5 1422 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1423 if (r < 0)
56f64d95 1424 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1425 return 0;
57fb9fb5
LP
1426 }
1427
1428 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1429 /* don't create parents here -- if the host doesn't have
1430 * permanent journal set up, don't force it here */
1431 r = mkdir(p, 0755);
57fb9fb5 1432 if (r < 0) {
574edc90 1433 if (arg_link_journal_try) {
56f64d95 1434 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1435 return 0;
1436 } else {
56f64d95 1437 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1438 return r;
1439 }
57fb9fb5
LP
1440 }
1441
27407a01
ZJS
1442 } else if (access(p, F_OK) < 0)
1443 return 0;
57fb9fb5 1444
cdb2b9d0
LP
1445 if (dir_is_empty(q) == 0)
1446 log_warning("%s is not empty, proceeding anyway.", q);
1447
03cfe0d5 1448 r = userns_mkdir(directory, p, 0755, 0, 0);
57fb9fb5 1449 if (r < 0) {
56f64d95 1450 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1451 return r;
57fb9fb5
LP
1452 }
1453
4543768d 1454 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1455 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1456
27407a01 1457 return 0;
57fb9fb5
LP
1458}
1459
88213476 1460static int drop_capabilities(void) {
5076f0cc 1461 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1462}
1463
db999e0f
LP
1464static int reset_audit_loginuid(void) {
1465 _cleanup_free_ char *p = NULL;
1466 int r;
1467
1468 if (arg_share_system)
1469 return 0;
1470
1471 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1472 if (r == -ENOENT)
db999e0f 1473 return 0;
f647962d
MS
1474 if (r < 0)
1475 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1476
1477 /* Already reset? */
1478 if (streq(p, "4294967295"))
1479 return 0;
1480
ad118bda 1481 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1482 if (r < 0) {
10a87006
LP
1483 log_error_errno(r,
1484 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1485 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1486 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1487 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1488 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1489
db999e0f 1490 sleep(5);
77b6e194 1491 }
db999e0f
LP
1492
1493 return 0;
77b6e194
LP
1494}
1495
/* Installs a seccomp filter that allows everything by default, except:
 *
 *   - the system calls in the table below fail with EPERM, unless the
 *     capability listed next to them is part of arg_retain;
 *   - socket(AF_NETLINK, *, NETLINK_AUDIT) fails with EAFNOSUPPORT.
 *
 * System calls for which seccomp_rule_add() returns -EFAULT are skipped, and
 * -EINVAL from seccomp_load() is only logged at debug level and treated as
 * success. Without HAVE_SECCOMP this is a no-op. Returns 0 on success, a
 * negative errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx filter;
        unsigned idx;
        int r;

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(blacklist); idx++) {

                /* Leave the call alone if its capability is retained */
                if (arg_retain & (1ULL << blacklist[idx].capability))
                        continue;

                r = seccomp_rule_add(filter, SCMP_ACT_ERRNO(EPERM), blacklist[idx].syscall_num, 0);

                /* -EFAULT means unknown syscall, which is not an error */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(filter,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(filter);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(filter);
        return r;
#else
        return 0;
#endif

}
1590
785890ac
LP
1591static int setup_propagate(const char *root) {
1592 const char *p, *q;
1593
1594 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1595 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1596 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1597 (void) mkdir_p(p, 0600);
1598
03cfe0d5
LP
1599 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1600 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1601
1602 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1603 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1604
1605 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1606 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1607
03cfe0d5 1608 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1609 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1610 return log_error_errno(errno, "Failed to install propagation bind mount.");
1611
1612 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1613 return log_error_errno(errno, "Failed to make propagation mount read-only");
1614
1615 return 0;
1616}
1617
1b9e5b12
LP
1618static int setup_image(char **device_path, int *loop_nr) {
1619 struct loop_info64 info = {
1620 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1621 };
1622 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1623 _cleanup_free_ char* loopdev = NULL;
1624 struct stat st;
1625 int r, nr;
1626
1627 assert(device_path);
1628 assert(loop_nr);
ec16945e 1629 assert(arg_image);
1b9e5b12
LP
1630
1631 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1632 if (fd < 0)
1633 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1634
4a62c710
MS
1635 if (fstat(fd, &st) < 0)
1636 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1637
1638 if (S_ISBLK(st.st_mode)) {
1639 char *p;
1640
1641 p = strdup(arg_image);
1642 if (!p)
1643 return log_oom();
1644
1645 *device_path = p;
1646
1647 *loop_nr = -1;
1648
1649 r = fd;
1650 fd = -1;
1651
1652 return r;
1653 }
1654
1655 if (!S_ISREG(st.st_mode)) {
56f64d95 1656 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
1657 return -EINVAL;
1658 }
1659
1660 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1661 if (control < 0)
1662 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1663
1664 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1665 if (nr < 0)
1666 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1667
1668 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1669 return log_oom();
1670
1671 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1672 if (loop < 0)
1673 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1674
4a62c710
MS
1675 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1676 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1677
1678 if (arg_read_only)
1679 info.lo_flags |= LO_FLAGS_READ_ONLY;
1680
4a62c710
MS
1681 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1682 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1683
1684 *device_path = loopdev;
1685 loopdev = NULL;
1686
1687 *loop_nr = nr;
1688
1689 r = loop;
1690 loop = -1;
1691
1692 return r;
1693}
1694
ada4799a
LP
/* Explanatory text appended to the error messages about unusable partition
 * tables; describes which disk image layouts are accepted. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a "      \
        "single MBR partition of\n"                                     \
        "type 0x83 that is marked bootable, or a single GPT partition " \
        "of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"      \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1701
1b9e5b12
LP
1702static int dissect_image(
1703 int fd,
727fd4fd
LP
1704 char **root_device, bool *root_device_rw,
1705 char **home_device, bool *home_device_rw,
1706 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
1707 bool *secondary) {
1708
1709#ifdef HAVE_BLKID
01dc33ce
ZJS
1710 int home_nr = -1, srv_nr = -1;
1711#ifdef GPT_ROOT_NATIVE
1712 int root_nr = -1;
1713#endif
1714#ifdef GPT_ROOT_SECONDARY
1715 int secondary_root_nr = -1;
1716#endif
f6c51a81 1717 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
1718 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1719 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1720 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1721 _cleanup_udev_unref_ struct udev *udev = NULL;
1722 struct udev_list_entry *first, *item;
f6c51a81 1723 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1724 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1725 const char *pttype = NULL;
1726 blkid_partlist pl;
1727 struct stat st;
c09ef2e4 1728 unsigned i;
1b9e5b12
LP
1729 int r;
1730
1731 assert(fd >= 0);
1732 assert(root_device);
1733 assert(home_device);
1734 assert(srv_device);
1735 assert(secondary);
ec16945e 1736 assert(arg_image);
1b9e5b12
LP
1737
1738 b = blkid_new_probe();
1739 if (!b)
1740 return log_oom();
1741
1742 errno = 0;
1743 r = blkid_probe_set_device(b, fd, 0, 0);
1744 if (r != 0) {
1745 if (errno == 0)
1746 return log_oom();
1747
56f64d95 1748 log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1749 return -errno;
1750 }
1751
1752 blkid_probe_enable_partitions(b, 1);
1753 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1754
1755 errno = 0;
1756 r = blkid_do_safeprobe(b);
1757 if (r == -2 || r == 1) {
ada4799a
LP
1758 log_error("Failed to identify any partition table on\n"
1759 " %s\n"
1760 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1761 return -EINVAL;
1762 } else if (r != 0) {
1763 if (errno == 0)
1764 errno = EIO;
56f64d95 1765 log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1766 return -errno;
1767 }
1768
48861960 1769 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1770
1771 is_gpt = streq_ptr(pttype, "gpt");
1772 is_mbr = streq_ptr(pttype, "dos");
1773
1774 if (!is_gpt && !is_mbr) {
1775 log_error("No GPT or MBR partition table discovered on\n"
1776 " %s\n"
1777 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1778 return -EINVAL;
1779 }
1780
1781 errno = 0;
1782 pl = blkid_probe_get_partitions(b);
1783 if (!pl) {
1784 if (errno == 0)
1785 return log_oom();
1786
1787 log_error("Failed to list partitions of %s", arg_image);
1788 return -errno;
1789 }
1790
1791 udev = udev_new();
1792 if (!udev)
1793 return log_oom();
1794
4a62c710
MS
1795 if (fstat(fd, &st) < 0)
1796 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1797
c09ef2e4
LP
1798 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1799 if (!d)
1b9e5b12
LP
1800 return log_oom();
1801
c09ef2e4
LP
1802 for (i = 0;; i++) {
1803 int n, m;
1b9e5b12 1804
c09ef2e4
LP
1805 if (i >= 10) {
1806 log_error("Kernel partitions never appeared.");
1807 return -ENXIO;
1808 }
1809
1810 e = udev_enumerate_new(udev);
1811 if (!e)
1812 return log_oom();
1813
1814 r = udev_enumerate_add_match_parent(e, d);
1815 if (r < 0)
1816 return log_oom();
1817
1818 r = udev_enumerate_scan_devices(e);
1819 if (r < 0)
1820 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1821
1822 /* Count the partitions enumerated by the kernel */
1823 n = 0;
1824 first = udev_enumerate_get_list_entry(e);
1825 udev_list_entry_foreach(item, first)
1826 n++;
1827
1828 /* Count the partitions enumerated by blkid */
1829 m = blkid_partlist_numof_partitions(pl);
1830 if (n == m + 1)
1831 break;
1832 if (n > m + 1) {
1833 log_error("blkid and kernel partition list do not match.");
1834 return -EIO;
1835 }
1836 if (n < m + 1) {
1837 unsigned j;
1838
1839 /* The kernel has probed fewer partitions than
1840 * blkid? Maybe the kernel prober is still
1841 * running or it got EBUSY because udev
1842 * already opened the device. Let's reprobe
1843 * the device, which is a synchronous call
1844 * that waits until probing is complete. */
1845
1846 for (j = 0; j < 20; j++) {
1847
1848 r = ioctl(fd, BLKRRPART, 0);
1849 if (r < 0)
1850 r = -errno;
1851 if (r >= 0 || r != -EBUSY)
1852 break;
1853
1854 /* If something else has the device
1855 * open, such as an udev rule, the
1856 * ioctl will return EBUSY. Since
1857 * there's no way to wait until it
1858 * isn't busy anymore, let's just wait
1859 * a bit, and try again.
1860 *
1861 * This is really something they
1862 * should fix in the kernel! */
1863
1864 usleep(50 * USEC_PER_MSEC);
1865 }
1866
1867 if (r < 0)
1868 return log_error_errno(r, "Failed to reread partition table: %m");
1869 }
1870
1871 e = udev_enumerate_unref(e);
1872 }
1b9e5b12
LP
1873
1874 first = udev_enumerate_get_list_entry(e);
1875 udev_list_entry_foreach(item, first) {
1876 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1877 const char *node;
727fd4fd 1878 unsigned long long flags;
1b9e5b12
LP
1879 blkid_partition pp;
1880 dev_t qn;
1881 int nr;
1882
1883 errno = 0;
1884 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1885 if (!q) {
1886 if (!errno)
1887 errno = ENOMEM;
1888
56f64d95 1889 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1890 return -errno;
1891 }
1892
1893 qn = udev_device_get_devnum(q);
1894 if (major(qn) == 0)
1895 continue;
1896
1897 if (st.st_rdev == qn)
1898 continue;
1899
1900 node = udev_device_get_devnode(q);
1901 if (!node)
1902 continue;
1903
1904 pp = blkid_partlist_devno_to_partition(pl, qn);
1905 if (!pp)
1906 continue;
1907
727fd4fd 1908 flags = blkid_partition_get_flags(pp);
727fd4fd 1909
1b9e5b12
LP
1910 nr = blkid_partition_get_partno(pp);
1911 if (nr < 0)
1912 continue;
1913
ada4799a
LP
1914 if (is_gpt) {
1915 sd_id128_t type_id;
1916 const char *stype;
1b9e5b12 1917
f6c51a81
LP
1918 if (flags & GPT_FLAG_NO_AUTO)
1919 continue;
1920
ada4799a
LP
1921 stype = blkid_partition_get_type_string(pp);
1922 if (!stype)
1923 continue;
1b9e5b12 1924
ada4799a 1925 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
1926 continue;
1927
ada4799a 1928 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 1929
ada4799a
LP
1930 if (home && nr >= home_nr)
1931 continue;
1b9e5b12 1932
ada4799a
LP
1933 home_nr = nr;
1934 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 1935
ada4799a
LP
1936 r = free_and_strdup(&home, node);
1937 if (r < 0)
1938 return log_oom();
727fd4fd 1939
ada4799a
LP
1940 } else if (sd_id128_equal(type_id, GPT_SRV)) {
1941
1942 if (srv && nr >= srv_nr)
1943 continue;
1944
1945 srv_nr = nr;
1946 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
1947
1948 r = free_and_strdup(&srv, node);
1949 if (r < 0)
1950 return log_oom();
1951 }
1b9e5b12 1952#ifdef GPT_ROOT_NATIVE
ada4799a 1953 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 1954
ada4799a
LP
1955 if (root && nr >= root_nr)
1956 continue;
1b9e5b12 1957
ada4799a
LP
1958 root_nr = nr;
1959 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 1960
ada4799a
LP
1961 r = free_and_strdup(&root, node);
1962 if (r < 0)
1963 return log_oom();
1964 }
1b9e5b12
LP
1965#endif
1966#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
1967 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
1968
1969 if (secondary_root && nr >= secondary_root_nr)
1970 continue;
1971
1972 secondary_root_nr = nr;
1973 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
1974
1975 r = free_and_strdup(&secondary_root, node);
1976 if (r < 0)
1977 return log_oom();
1978 }
1979#endif
f6c51a81
LP
1980 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
1981
1982 if (generic)
1983 multiple_generic = true;
1984 else {
1985 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
1986
1987 r = free_and_strdup(&generic, node);
1988 if (r < 0)
1989 return log_oom();
1990 }
1991 }
ada4799a
LP
1992
1993 } else if (is_mbr) {
1994 int type;
1b9e5b12 1995
f6c51a81
LP
1996 if (flags != 0x80) /* Bootable flag */
1997 continue;
1998
ada4799a
LP
1999 type = blkid_partition_get_type(pp);
2000 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2001 continue;
2002
f6c51a81
LP
2003 if (generic)
2004 multiple_generic = true;
2005 else {
2006 generic_rw = true;
727fd4fd 2007
f6c51a81
LP
2008 r = free_and_strdup(&root, node);
2009 if (r < 0)
2010 return log_oom();
2011 }
1b9e5b12 2012 }
1b9e5b12
LP
2013 }
2014
1b9e5b12
LP
2015 if (root) {
2016 *root_device = root;
2017 root = NULL;
727fd4fd
LP
2018
2019 *root_device_rw = root_rw;
1b9e5b12
LP
2020 *secondary = false;
2021 } else if (secondary_root) {
2022 *root_device = secondary_root;
2023 secondary_root = NULL;
727fd4fd
LP
2024
2025 *root_device_rw = secondary_root_rw;
1b9e5b12 2026 *secondary = true;
f6c51a81
LP
2027 } else if (generic) {
2028
2029 /* There were no partitions with precise meanings
2030 * around, but we found generic partitions. In this
2031 * case, if there's only one, we can go ahead and boot
2032 * it, otherwise we bail out, because we really cannot
2033 * make any sense of it. */
2034
2035 if (multiple_generic) {
2036 log_error("Identified multiple bootable Linux partitions on\n"
2037 " %s\n"
2038 PARTITION_TABLE_BLURB, arg_image);
2039 return -EINVAL;
2040 }
2041
2042 *root_device = generic;
2043 generic = NULL;
2044
2045 *root_device_rw = generic_rw;
2046 *secondary = false;
2047 } else {
2048 log_error("Failed to identify root partition in disk image\n"
2049 " %s\n"
2050 PARTITION_TABLE_BLURB, arg_image);
2051 return -EINVAL;
1b9e5b12
LP
2052 }
2053
2054 if (home) {
2055 *home_device = home;
2056 home = NULL;
727fd4fd
LP
2057
2058 *home_device_rw = home_rw;
1b9e5b12
LP
2059 }
2060
2061 if (srv) {
2062 *srv_device = srv;
2063 srv = NULL;
727fd4fd
LP
2064
2065 *srv_device_rw = srv_rw;
1b9e5b12
LP
2066 }
2067
2068 return 0;
2069#else
2070 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2071 return -EOPNOTSUPP;
1b9e5b12
LP
2072#endif
2073}
2074
727fd4fd 2075static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
1b9e5b12
LP
2076#ifdef HAVE_BLKID
2077 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2078 const char *fstype, *p;
2079 int r;
2080
2081 assert(what);
2082 assert(where);
2083
727fd4fd
LP
2084 if (arg_read_only)
2085 rw = false;
2086
1b9e5b12 2087 if (directory)
63c372cb 2088 p = strjoina(where, directory);
1b9e5b12
LP
2089 else
2090 p = where;
2091
2092 errno = 0;
2093 b = blkid_new_probe_from_filename(what);
2094 if (!b) {
2095 if (errno == 0)
2096 return log_oom();
56f64d95 2097 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
1b9e5b12
LP
2098 return -errno;
2099 }
2100
2101 blkid_probe_enable_superblocks(b, 1);
2102 blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
2103
2104 errno = 0;
2105 r = blkid_do_safeprobe(b);
2106 if (r == -1 || r == 1) {
2107 log_error("Cannot determine file system type of %s", what);
2108 return -EINVAL;
2109 } else if (r != 0) {
2110 if (errno == 0)
2111 errno = EIO;
56f64d95 2112 log_error_errno(errno, "Failed to probe %s: %m", what);
1b9e5b12
LP
2113 return -errno;
2114 }
2115
2116 errno = 0;
2117 if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
2118 if (errno == 0)
2119 errno = EINVAL;
2120 log_error("Failed to determine file system type of %s", what);
2121 return -errno;
2122 }
2123
2124 if (streq(fstype, "crypto_LUKS")) {
2125 log_error("nspawn currently does not support LUKS disk images.");
15411c0c 2126 return -EOPNOTSUPP;
1b9e5b12
LP
2127 }
2128
4a62c710
MS
2129 if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
2130 return log_error_errno(errno, "Failed to mount %s: %m", what);
1b9e5b12
LP
2131
2132 return 0;
2133#else
2134 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2135 return -EOPNOTSUPP;
1b9e5b12
LP
2136#endif
2137}
2138
727fd4fd
LP
2139static int mount_devices(
2140 const char *where,
2141 const char *root_device, bool root_device_rw,
2142 const char *home_device, bool home_device_rw,
2143 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2144 int r;
2145
2146 assert(where);
2147
2148 if (root_device) {
727fd4fd 2149 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2150 if (r < 0)
2151 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2152 }
2153
2154 if (home_device) {
727fd4fd 2155 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2156 if (r < 0)
2157 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2158 }
2159
2160 if (srv_device) {
727fd4fd 2161 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2162 if (r < 0)
2163 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2164 }
2165
2166 return 0;
2167}
2168
2169static void loop_remove(int nr, int *image_fd) {
2170 _cleanup_close_ int control = -1;
e8c8ddcc 2171 int r;
1b9e5b12
LP
2172
2173 if (nr < 0)
2174 return;
2175
2176 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2177 r = ioctl(*image_fd, LOOP_CLR_FD);
2178 if (r < 0)
5e4074aa 2179 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2180 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2181 }
2182
2183 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2184 if (control < 0) {
56f64d95 2185 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2186 return;
e8c8ddcc 2187 }
1b9e5b12 2188
e8c8ddcc
TG
2189 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2190 if (r < 0)
5e4074aa 2191 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2192}
2193
113cea80 2194/*
6d416b9c
LS
2195 * Return values:
2196 * < 0 : wait_for_terminate() failed to get the state of the
2197 * container, the container was terminated by a signal, or
2198 * failed for an unknown reason. No change is made to the
2199 * container argument.
2200 * > 0 : The program executed in the container terminated with an
2201 * error. The exit code of the program executed in the
919699ec
LP
2202 * container is returned. The container argument has been set
2203 * to CONTAINER_TERMINATED.
6d416b9c
LS
2204 * 0 : The container is being rebooted, has been shut down or exited
2205 * successfully. The container argument has been set to either
2206 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2207 *
6d416b9c
LS
2208 * That is, success is indicated by a return value of zero, and an
2209 * error is indicated by a non-zero value.
113cea80
DH
2210 */
2211static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2212 siginfo_t status;
919699ec 2213 int r;
113cea80
DH
2214
2215 r = wait_for_terminate(pid, &status);
f647962d
MS
2216 if (r < 0)
2217 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2218
2219 switch (status.si_code) {
fddbb89c 2220
113cea80 2221 case CLD_EXITED:
919699ec
LP
2222 if (status.si_status == 0) {
2223 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2224
fddbb89c 2225 } else
919699ec 2226 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2227
919699ec
LP
2228 *container = CONTAINER_TERMINATED;
2229 return status.si_status;
113cea80
DH
2230
2231 case CLD_KILLED:
2232 if (status.si_status == SIGINT) {
113cea80 2233
919699ec 2234 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2235 *container = CONTAINER_TERMINATED;
919699ec
LP
2236 return 0;
2237
113cea80 2238 } else if (status.si_status == SIGHUP) {
113cea80 2239
919699ec 2240 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2241 *container = CONTAINER_REBOOTED;
919699ec 2242 return 0;
113cea80 2243 }
919699ec 2244
113cea80
DH
2245 /* CLD_KILLED fallthrough */
2246
2247 case CLD_DUMPED:
fddbb89c 2248 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2249 return -EIO;
113cea80
DH
2250
2251 default:
fddbb89c 2252 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2253 return -EIO;
113cea80
DH
2254 }
2255
2256 return r;
2257}
2258
023fb90b
LP
2259static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2260 pid_t pid;
2261
2262 pid = PTR_TO_UINT32(userdata);
2263 if (pid > 0) {
c6c8f6e2 2264 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2265 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2266 sd_event_source_set_userdata(s, NULL);
2267 return 0;
2268 }
2269 }
2270
2271 sd_event_exit(sd_event_source_get_event(s), 0);
2272 return 0;
2273}
2274
ec16945e 2275static int determine_names(void) {
1b9cebf6 2276 int r;
ec16945e 2277
c1521918
LP
2278 if (arg_template && !arg_directory && arg_machine) {
2279
2280 /* If --template= was specified then we should not
2281 * search for a machine, but instead create a new one
2282 * in /var/lib/machine. */
2283
2284 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2285 if (!arg_directory)
2286 return log_oom();
2287 }
2288
ec16945e 2289 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2290 if (arg_machine) {
2291 _cleanup_(image_unrefp) Image *i = NULL;
2292
2293 r = image_find(arg_machine, &i);
2294 if (r < 0)
2295 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2296 else if (r == 0) {
2297 log_error("No image for machine '%s': %m", arg_machine);
2298 return -ENOENT;
2299 }
2300
aceac2f0 2301 if (i->type == IMAGE_RAW)
0f03c2a4 2302 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2303 else
0f03c2a4 2304 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2305 if (r < 0)
2306 return log_error_errno(r, "Invalid image directory: %m");
2307
aee327b8
LP
2308 if (!arg_ephemeral)
2309 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2310 } else
ec16945e
LP
2311 arg_directory = get_current_dir_name();
2312
1b9cebf6
LP
2313 if (!arg_directory && !arg_machine) {
2314 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2315 return -EINVAL;
2316 }
2317 }
2318
2319 if (!arg_machine) {
b9ba4dab
LP
2320 if (arg_directory && path_equal(arg_directory, "/"))
2321 arg_machine = gethostname_malloc();
2322 else
2323 arg_machine = strdup(basename(arg_image ?: arg_directory));
2324
ec16945e
LP
2325 if (!arg_machine)
2326 return log_oom();
2327
ae691c1d 2328 hostname_cleanup(arg_machine);
ec16945e
LP
2329 if (!machine_name_is_valid(arg_machine)) {
2330 log_error("Failed to determine machine name automatically, please use -M.");
2331 return -EINVAL;
2332 }
b9ba4dab
LP
2333
2334 if (arg_ephemeral) {
2335 char *b;
2336
2337 /* Add a random suffix when this is an
2338 * ephemeral machine, so that we can run many
2339 * instances at once without manually having
2340 * to specify -M each time. */
2341
2342 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2343 return log_oom();
2344
2345 free(arg_machine);
2346 arg_machine = b;
2347 }
ec16945e
LP
2348 }
2349
2350 return 0;
2351}
2352
03cfe0d5 2353static int determine_uid_shift(const char *directory) {
6dac160c
LP
2354 int r;
2355
03cfe0d5
LP
2356 if (!arg_userns) {
2357 arg_uid_shift = 0;
6dac160c 2358 return 0;
03cfe0d5 2359 }
6dac160c
LP
2360
2361 if (arg_uid_shift == UID_INVALID) {
2362 struct stat st;
2363
03cfe0d5 2364 r = stat(directory, &st);
6dac160c 2365 if (r < 0)
03cfe0d5 2366 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2367
2368 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2369
2370 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2371 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2372 return -EINVAL;
2373 }
2374
2375 arg_uid_range = UINT32_C(0x10000);
2376 }
2377
2378 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2379 log_error("UID base too high for UID range.");
2380 return -EINVAL;
2381 }
2382
2383 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2384 return 0;
2385}
2386
03cfe0d5
LP
2387static int inner_child(
2388 Barrier *barrier,
2389 const char *directory,
2390 bool secondary,
2391 int kmsg_socket,
2392 int rtnl_socket,
f757855e 2393 FDSet *fds) {
69c79d3c 2394
03cfe0d5
LP
2395 _cleanup_free_ char *home = NULL;
2396 unsigned n_env = 2;
2397 const char *envp[] = {
2398 "PATH=" DEFAULT_PATH_SPLIT_USR,
2399 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2400 NULL, /* TERM */
2401 NULL, /* HOME */
2402 NULL, /* USER */
2403 NULL, /* LOGNAME */
2404 NULL, /* container_uuid */
2405 NULL, /* LISTEN_FDS */
2406 NULL, /* LISTEN_PID */
2407 NULL
2408 };
88213476 2409
2371271c 2410 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2411 int r;
88213476 2412
03cfe0d5
LP
2413 assert(barrier);
2414 assert(directory);
2415 assert(kmsg_socket >= 0);
88213476 2416
efdb0237
LP
2417 cg_unified_flush();
2418
03cfe0d5
LP
2419 if (arg_userns) {
2420 /* Tell the parent, that it now can write the UID map. */
2421 (void) barrier_place(barrier); /* #1 */
7027ff61 2422
03cfe0d5
LP
2423 /* Wait until the parent wrote the UID map */
2424 if (!barrier_place_and_sync(barrier)) { /* #2 */
2425 log_error("Parent died too early");
2426 return -ESRCH;
2427 }
88213476
LP
2428 }
2429
d1678248 2430 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2431 if (r < 0)
2432 return r;
2433
d8fc6a00
LP
2434 r = mount_sysfs(NULL);
2435 if (r < 0)
2436 return r;
2437
03cfe0d5
LP
2438 /* Wait until we are cgroup-ified, so that we
2439 * can mount the right cgroup path writable */
2440 if (!barrier_place_and_sync(barrier)) { /* #3 */
2441 log_error("Parent died too early");
2442 return -ESRCH;
88213476
LP
2443 }
2444
e83bebef 2445 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
2446 if (r < 0)
2447 return r;
ec16945e 2448
03cfe0d5
LP
2449 r = reset_uid_gid();
2450 if (r < 0)
2451 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2452
03cfe0d5
LP
2453 r = setup_boot_id(NULL);
2454 if (r < 0)
2455 return r;
ec16945e 2456
03cfe0d5
LP
2457 r = setup_kmsg(NULL, kmsg_socket);
2458 if (r < 0)
2459 return r;
2460 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2461
03cfe0d5 2462 umask(0022);
30535c16 2463
03cfe0d5
LP
2464 if (setsid() < 0)
2465 return log_error_errno(errno, "setsid() failed: %m");
2466
2467 if (arg_private_network)
2468 loopback_setup();
2469
7a8f6325
LP
2470 if (arg_expose_ports) {
2471 r = expose_port_send_rtnl(rtnl_socket);
2472 if (r < 0)
2473 return r;
2474 rtnl_socket = safe_close(rtnl_socket);
2475 }
03cfe0d5
LP
2476
2477 if (drop_capabilities() < 0)
2478 return log_error_errno(errno, "drop_capabilities() failed: %m");
2479
2480 setup_hostname();
2481
050f7277 2482 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2483 if (personality(arg_personality) < 0)
2484 return log_error_errno(errno, "personality() failed: %m");
2485 } else if (secondary) {
2486 if (personality(PER_LINUX32) < 0)
2487 return log_error_errno(errno, "personality() failed: %m");
2488 }
2489
2490#ifdef HAVE_SELINUX
2491 if (arg_selinux_context)
2492 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2493 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2494#endif
2495
ee645080 2496 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2497 if (r < 0)
2498 return r;
2499
2500 envp[n_env] = strv_find_prefix(environ, "TERM=");
2501 if (envp[n_env])
2502 n_env ++;
2503
2504 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2505 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2506 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2507 return log_oom();
2508
2509 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2510 char as_uuid[37];
2511
2512 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2513 return log_oom();
2514 }
2515
2516 if (fdset_size(fds) > 0) {
2517 r = fdset_cloexec(fds, false);
2518 if (r < 0)
2519 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2520
2521 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2522 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2523 return log_oom();
2524 }
2525
2371271c
TG
2526 env_use = strv_env_merge(2, envp, arg_setenv);
2527 if (!env_use)
2528 return log_oom();
03cfe0d5
LP
2529
2530 /* Let the parent know that we are ready and
2531 * wait until the parent is ready with the
2532 * setup, too... */
2533 if (!barrier_place_and_sync(barrier)) { /* #4 */
2534 log_error("Parent died too early");
2535 return -ESRCH;
2536 }
2537
2538 /* Now, explicitly close the log, so that we
2539 * then can close all remaining fds. Closing
2540 * the log explicitly first has the benefit
2541 * that the logging subsystem knows about it,
2542 * and is thus ready to be reopened should we
2543 * need it again. Note that the other fds
2544 * closed here are at least the locking and
2545 * barrier fds. */
2546 log_close();
2547 (void) fdset_close_others(fds);
2548
2549 if (arg_boot) {
2550 char **a;
2551 size_t m;
2552
2553 /* Automatically search for the init system */
2554
f757855e 2555 m = 1 + strv_length(arg_parameters);
03cfe0d5 2556 a = newa(char*, m + 1);
f757855e
LP
2557 if (strv_isempty(arg_parameters))
2558 a[1] = NULL;
2559 else
2560 memcpy(a + 1, arg_parameters, m * sizeof(char*));
03cfe0d5
LP
2561
2562 a[0] = (char*) "/usr/lib/systemd/systemd";
2563 execve(a[0], a, env_use);
2564
2565 a[0] = (char*) "/lib/systemd/systemd";
2566 execve(a[0], a, env_use);
2567
2568 a[0] = (char*) "/sbin/init";
2569 execve(a[0], a, env_use);
f757855e
LP
2570 } else if (!strv_isempty(arg_parameters))
2571 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2572 else {
f757855e 2573 chdir(home ?: "/root");
03cfe0d5
LP
2574 execle("/bin/bash", "-bash", NULL, env_use);
2575 execle("/bin/sh", "-sh", NULL, env_use);
2576 }
2577
2578 (void) log_open();
2579 return log_error_errno(errno, "execv() failed: %m");
2580}
2581
2582static int outer_child(
2583 Barrier *barrier,
2584 const char *directory,
2585 const char *console,
2586 const char *root_device, bool root_device_rw,
2587 const char *home_device, bool home_device_rw,
2588 const char *srv_device, bool srv_device_rw,
2589 bool interactive,
2590 bool secondary,
2591 int pid_socket,
2592 int kmsg_socket,
2593 int rtnl_socket,
825d5287 2594 int uid_shift_socket,
f757855e 2595 FDSet *fds) {
03cfe0d5
LP
2596
2597 pid_t pid;
2598 ssize_t l;
2599 int r;
2600
2601 assert(barrier);
2602 assert(directory);
2603 assert(console);
2604 assert(pid_socket >= 0);
2605 assert(kmsg_socket >= 0);
2606
efdb0237
LP
2607 cg_unified_flush();
2608
03cfe0d5
LP
2609 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2610 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2611
2612 if (interactive) {
2613 close_nointr(STDIN_FILENO);
2614 close_nointr(STDOUT_FILENO);
2615 close_nointr(STDERR_FILENO);
2616
2617 r = open_terminal(console, O_RDWR);
2618 if (r != STDIN_FILENO) {
2619 if (r >= 0) {
2620 safe_close(r);
2621 r = -EINVAL;
2622 }
2623
2624 return log_error_errno(r, "Failed to open console: %m");
2625 }
2626
2627 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2628 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2629 return log_error_errno(errno, "Failed to duplicate console: %m");
2630 }
2631
2632 r = reset_audit_loginuid();
2633 if (r < 0)
2634 return r;
2635
2636 /* Mark everything as slave, so that we still
2637 * receive mounts from the real root, but don't
2638 * propagate mounts to the real root. */
2639 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2640 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2641
2642 r = mount_devices(directory,
2643 root_device, root_device_rw,
2644 home_device, home_device_rw,
2645 srv_device, srv_device_rw);
2646 if (r < 0)
2647 return r;
2648
391567f4
LP
2649 r = determine_uid_shift(directory);
2650 if (r < 0)
2651 return r;
2652
825d5287
RM
2653 if (arg_userns) {
2654 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2655 if (l < 0)
2656 return log_error_errno(errno, "Failed to send UID shift: %m");
2657 if (l != sizeof(arg_uid_shift)) {
2658 log_error("Short write while sending UID shift.");
2659 return -EIO;
2660 }
2661 }
2662
03cfe0d5
LP
2663 /* Turn directory into bind mount */
2664 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2665 return log_error_errno(errno, "Failed to make bind mount: %m");
2666
e83bebef 2667 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2668 if (r < 0)
2669 return r;
2670
e83bebef 2671 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2672 if (r < 0)
2673 return r;
2674
03cfe0d5
LP
2675 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2676 if (r < 0)
2677 return r;
2678
03cfe0d5
LP
2679 if (arg_read_only) {
2680 r = bind_remount_recursive(directory, true);
2681 if (r < 0)
2682 return log_error_errno(r, "Failed to make tree read-only: %m");
2683 }
2684
d1678248 2685 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2686 if (r < 0)
2687 return r;
2688
07fa00f9
LP
2689 r = copy_devnodes(directory);
2690 if (r < 0)
03cfe0d5
LP
2691 return r;
2692
2693 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2694
07fa00f9
LP
2695 r = setup_pts(directory);
2696 if (r < 0)
03cfe0d5
LP
2697 return r;
2698
2699 r = setup_propagate(directory);
2700 if (r < 0)
2701 return r;
2702
2703 r = setup_dev_console(directory, console);
2704 if (r < 0)
2705 return r;
2706
2707 r = setup_seccomp();
2708 if (r < 0)
2709 return r;
2710
2711 r = setup_timezone(directory);
2712 if (r < 0)
2713 return r;
2714
2715 r = setup_resolv_conf(directory);
2716 if (r < 0)
2717 return r;
2718
2719 r = setup_journal(directory);
2720 if (r < 0)
2721 return r;
2722
e83bebef 2723 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2724 if (r < 0)
2725 return r;
2726
e83bebef 2727 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2728 if (r < 0)
2729 return r;
2730
2731 r = mount_move_root(directory);
2732 if (r < 0)
2733 return log_error_errno(r, "Failed to move root directory: %m");
2734
2735 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2736 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2737 (arg_private_network ? CLONE_NEWNET : 0) |
2738 (arg_userns ? CLONE_NEWUSER : 0),
2739 NULL);
2740 if (pid < 0)
2741 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2742 if (pid == 0) {
2743 pid_socket = safe_close(pid_socket);
825d5287 2744 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2745
2746 /* The inner child has all namespaces that are
2747 * requested, so that we all are owned by the user if
2748 * user namespaces are turned on. */
2749
f757855e 2750 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2751 if (r < 0)
2752 _exit(EXIT_FAILURE);
2753
2754 _exit(EXIT_SUCCESS);
2755 }
2756
2757 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2758 if (l < 0)
2759 return log_error_errno(errno, "Failed to send PID: %m");
2760 if (l != sizeof(pid)) {
2761 log_error("Short write while sending PID.");
2762 return -EIO;
2763 }
2764
2765 pid_socket = safe_close(pid_socket);
327e26d6
KN
2766 kmsg_socket = safe_close(kmsg_socket);
2767 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2768
2769 return 0;
2770}
2771
2772static int setup_uid_map(pid_t pid) {
2773 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2774 int r;
2775
2776 assert(pid > 1);
2777
2778 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2779 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2780 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2781 if (r < 0)
2782 return log_error_errno(r, "Failed to write UID map: %m");
2783
2784 /* We always assign the same UID and GID ranges */
2785 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2786 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2787 if (r < 0)
2788 return log_error_errno(r, "Failed to write GID map: %m");
2789
2790 return 0;
2791}
2792
f757855e
LP
2793static int load_settings(void) {
2794 _cleanup_(settings_freep) Settings *settings = NULL;
2795 _cleanup_fclose_ FILE *f = NULL;
2796 _cleanup_free_ char *p = NULL;
2797 const char *fn, *i;
2798 int r;
2799
2800 /* If all settings are masked, there's no point in looking for
2801 * the settings file */
2802 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2803 return 0;
2804
2805 fn = strjoina(arg_machine, ".nspawn");
2806
2807 /* We first look in the admin's directories in /etc and /run */
2808 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2809 _cleanup_free_ char *j = NULL;
2810
2811 j = strjoin(i, "/", fn, NULL);
2812 if (!j)
2813 return log_oom();
2814
2815 f = fopen(j, "re");
2816 if (f) {
2817 p = j;
2818 j = NULL;
2819
2820 /* By default we trust configuration from /etc and /run */
2821 if (arg_settings_trusted < 0)
2822 arg_settings_trusted = true;
2823
2824 break;
2825 }
2826
2827 if (errno != ENOENT)
2828 return log_error_errno(errno, "Failed to open %s: %m", j);
2829 }
2830
2831 if (!f) {
2832 /* After that, let's look for a file next to the
2833 * actual image we shall boot. */
2834
2835 if (arg_image) {
2836 p = file_in_same_dir(arg_image, fn);
2837 if (!p)
2838 return log_oom();
2839 } else if (arg_directory) {
2840 p = file_in_same_dir(arg_directory, fn);
2841 if (!p)
2842 return log_oom();
2843 }
2844
2845 if (p) {
2846 f = fopen(p, "re");
2847 if (!f && errno != ENOENT)
2848 return log_error_errno(errno, "Failed to open %s: %m", p);
2849
2850 /* By default we do not trust configuration from /var/lib/machines */
2851 if (arg_settings_trusted < 0)
2852 arg_settings_trusted = false;
2853 }
2854 }
2855
2856 if (!f)
2857 return 0;
2858
2859 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2860
2861 r = settings_load(f, p, &settings);
2862 if (r < 0)
2863 return r;
2864
2865 /* Copy over bits from the settings, unless they have been
2866 * explicitly masked by command line switches. */
2867
2868 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2869 settings->boot >= 0) {
2870 arg_boot = settings->boot;
2871
2872 strv_free(arg_parameters);
2873 arg_parameters = settings->parameters;
2874 settings->parameters = NULL;
2875 }
2876
2877 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2878 settings->environment) {
2879 strv_free(arg_setenv);
2880 arg_setenv = settings->environment;
2881 settings->environment = NULL;
2882 }
2883
2884 if ((arg_settings_mask & SETTING_USER) == 0 &&
2885 settings->user) {
2886 free(arg_user);
2887 arg_user = settings->user;
2888 settings->user = NULL;
2889 }
2890
2891 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2892 uint64_t plus;
f757855e 2893
0e265674
LP
2894 plus = settings->capability;
2895 if (settings_private_network(settings))
2896 plus |= (1ULL << CAP_NET_ADMIN);
2897
2898 if (!arg_settings_trusted && plus != 0) {
2899 if (settings->capability != 0)
2900 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2901 } else
2902 arg_retain |= plus;
f757855e
LP
2903
2904 arg_retain &= ~settings->drop_capability;
2905 }
2906
2907 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2908 settings->kill_signal > 0)
2909 arg_kill_signal = settings->kill_signal;
2910
2911 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2912 settings->personality != PERSONALITY_INVALID)
2913 arg_personality = settings->personality;
2914
2915 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2916 !sd_id128_is_null(settings->machine_id)) {
2917
2918 if (!arg_settings_trusted)
2919 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2920 else
2921 arg_uuid = settings->machine_id;
2922 }
2923
2924 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2925 settings->read_only >= 0)
2926 arg_read_only = settings->read_only;
2927
2928 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2929 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2930 arg_volatile_mode = settings->volatile_mode;
2931
2932 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2933 settings->n_custom_mounts > 0) {
2934
2935 if (!arg_settings_trusted)
2936 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2937 else {
2938 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2939 arg_custom_mounts = settings->custom_mounts;
2940 arg_n_custom_mounts = settings->n_custom_mounts;
2941
2942 settings->custom_mounts = NULL;
2943 settings->n_custom_mounts = 0;
2944 }
2945 }
2946
2947 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2948 (settings->private_network >= 0 ||
2949 settings->network_veth >= 0 ||
2950 settings->network_bridge ||
2951 settings->network_interfaces ||
2952 settings->network_macvlan ||
2953 settings->network_ipvlan)) {
2954
2955 if (!arg_settings_trusted)
2956 log_warning("Ignoring network settings, file %s is not trusted.", p);
2957 else {
0e265674
LP
2958 arg_network_veth = settings_private_network(settings);
2959 arg_private_network = settings_private_network(settings);
2960
f757855e
LP
2961 strv_free(arg_network_interfaces);
2962 arg_network_interfaces = settings->network_interfaces;
2963 settings->network_interfaces = NULL;
2964
2965 strv_free(arg_network_macvlan);
2966 arg_network_macvlan = settings->network_macvlan;
2967 settings->network_macvlan = NULL;
2968
2969 strv_free(arg_network_ipvlan);
2970 arg_network_ipvlan = settings->network_ipvlan;
2971 settings->network_ipvlan = NULL;
2972
2973 free(arg_network_bridge);
2974 arg_network_bridge = settings->network_bridge;
2975 settings->network_bridge = NULL;
f757855e
LP
2976 }
2977 }
2978
2979 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2980 settings->expose_ports) {
2981
2982 if (!arg_settings_trusted)
2983 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
2984 else {
2985 expose_port_free_all(arg_expose_ports);
2986 arg_expose_ports = settings->expose_ports;
2987 settings->expose_ports = NULL;
2988 }
2989 }
2990
2991 return 0;
2992}
2993
03cfe0d5
LP
2994int main(int argc, char *argv[]) {
2995
2996 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
2997 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2998 _cleanup_close_ int master = -1, image_fd = -1;
2999 _cleanup_fdset_free_ FDSet *fds = NULL;
3000 int r, n_fd_passed, loop_nr = -1;
3001 char veth_name[IFNAMSIZ];
3002 bool secondary = false, remove_subvol = false;
72c0a2c2 3003 sigset_t mask_chld;
03cfe0d5
LP
3004 pid_t pid = 0;
3005 int ret = EXIT_SUCCESS;
3006 union in_addr_union exposed = {};
3007 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3008 bool interactive;
3009
3010 log_parse_environment();
3011 log_open();
3012
3013 r = parse_argv(argc, argv);
3014 if (r <= 0)
3015 goto finish;
3016
03cfe0d5
LP
3017 if (geteuid() != 0) {
3018 log_error("Need to be root.");
3019 r = -EPERM;
3020 goto finish;
3021 }
f757855e
LP
3022 r = determine_names();
3023 if (r < 0)
3024 goto finish;
3025
3026 r = load_settings();
3027 if (r < 0)
3028 goto finish;
3029
3030 r = verify_arguments();
3031 if (r < 0)
3032 goto finish;
03cfe0d5
LP
3033
3034 n_fd_passed = sd_listen_fds(false);
3035 if (n_fd_passed > 0) {
3036 r = fdset_new_listen_fds(&fds, false);
3037 if (r < 0) {
3038 log_error_errno(r, "Failed to collect file descriptors: %m");
3039 goto finish;
3040 }
3041 }
3042
3043 if (arg_directory) {
3044 assert(!arg_image);
3045
3046 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3047 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3048 r = -EINVAL;
3049 goto finish;
3050 }
3051
3052 if (arg_ephemeral) {
3053 _cleanup_free_ char *np = NULL;
3054
3055 /* If the specified path is a mount point we
3056 * generate the new snapshot immediately
3057 * inside it under a random name. However if
3058 * the specified is not a mount point we
3059 * create the new snapshot in the parent
3060 * directory, just next to it. */
e26d6ce5 3061 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3062 if (r < 0) {
3063 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3064 goto finish;
3065 }
3066 if (r > 0)
770b5ce4 3067 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3068 else
770b5ce4 3069 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3070 if (r < 0) {
3071 log_error_errno(r, "Failed to generate name for snapshot: %m");
3072 goto finish;
3073 }
3074
3075 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3076 if (r < 0) {
3077 log_error_errno(r, "Failed to lock %s: %m", np);
3078 goto finish;
3079 }
3080
5bcd08db 3081 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3082 if (r < 0) {
3083 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3084 goto finish;
ec16945e
LP
3085 }
3086
3087 free(arg_directory);
3088 arg_directory = np;
8a16a7b4 3089 np = NULL;
ec16945e
LP
3090
3091 remove_subvol = true;
30535c16
LP
3092
3093 } else {
3094 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3095 if (r == -EBUSY) {
3096 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3097 goto finish;
3098 }
3099 if (r < 0) {
3100 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3101 return r;
3102 }
3103
3104 if (arg_template) {
5bcd08db 3105 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3106 if (r == -EEXIST) {
3107 if (!arg_quiet)
3108 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3109 } else if (r < 0) {
83521414 3110 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3111 goto finish;
3112 } else {
3113 if (!arg_quiet)
3114 log_info("Populated %s from template %s.", arg_directory, arg_template);
3115 }
3116 }
ec16945e
LP
3117 }
3118
1b9e5b12
LP
3119 if (arg_boot) {
3120 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3121 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3122 r = -EINVAL;
1b9e5b12
LP
3123 goto finish;
3124 }
3125 } else {
3126 const char *p;
3127
16fb773e
LP
3128 p = strjoina(arg_directory, "/usr/");
3129 if (laccess(p, F_OK) < 0) {
3130 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3131 r = -EINVAL;
1b9e5b12 3132 goto finish;
1b9e5b12
LP
3133 }
3134 }
ec16945e 3135
6b9132a9 3136 } else {
1b9e5b12 3137 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3138
ec16945e
LP
3139 assert(arg_image);
3140 assert(!arg_template);
3141
30535c16
LP
3142 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3143 if (r == -EBUSY) {
3144 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3145 goto finish;
3146 }
3147 if (r < 0) {
3148 r = log_error_errno(r, "Failed to create image lock: %m");
3149 goto finish;
3150 }
3151
1b9e5b12 3152 if (!mkdtemp(template)) {
56f64d95 3153 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3154 r = -errno;
6b9132a9 3155 goto finish;
1b9e5b12 3156 }
6b9132a9 3157
1b9e5b12
LP
3158 arg_directory = strdup(template);
3159 if (!arg_directory) {
3160 r = log_oom();
3161 goto finish;
6b9132a9 3162 }
88213476 3163
1b9e5b12
LP
3164 image_fd = setup_image(&device_path, &loop_nr);
3165 if (image_fd < 0) {
3166 r = image_fd;
842f3b0f
LP
3167 goto finish;
3168 }
1b9e5b12 3169
4d9f07b4
LP
3170 r = dissect_image(image_fd,
3171 &root_device, &root_device_rw,
3172 &home_device, &home_device_rw,
3173 &srv_device, &srv_device_rw,
3174 &secondary);
1b9e5b12
LP
3175 if (r < 0)
3176 goto finish;
842f3b0f 3177 }
842f3b0f 3178
5a8af538
LP
3179 r = custom_mounts_prepare();
3180 if (r < 0)
3181 goto finish;
3182
03cfe0d5
LP
3183 interactive =
3184 isatty(STDIN_FILENO) > 0 &&
3185 isatty(STDOUT_FILENO) > 0;
9c857b9d 3186
db7feb7e
LP
3187 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3188 if (master < 0) {
ec16945e 3189 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3190 goto finish;
3191 }
3192
611b312b
LP
3193 r = ptsname_malloc(master, &console);
3194 if (r < 0) {
3195 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3196 goto finish;
3197 }
3198
a258bf26 3199 if (unlockpt(master) < 0) {
ec16945e 3200 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3201 goto finish;
3202 }
3203
9c857b9d
LP
3204 if (!arg_quiet)
3205 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3206 arg_machine, arg_image ?: arg_directory);
3207
72c0a2c2 3208 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3209
023fb90b
LP
3210 assert_se(sigemptyset(&mask_chld) == 0);
3211 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3212
03cfe0d5
LP
3213 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3214 r = log_error_errno(errno, "Failed to become subreaper: %m");
3215 goto finish;
3216 }
3217
d87be9b0 3218 for (;;) {
825d5287
RM
3219 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3220 uid_shift_socket_pair[2] = { -1, -1 };
113cea80 3221 ContainerStatus container_status;
7566e267 3222 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 3223 static const struct sigaction sa = {
189d5bac 3224 .sa_handler = nop_signal_handler,
e866af3a
DH
3225 .sa_flags = SA_NOCLDSTOP,
3226 };
03cfe0d5
LP
3227 int ifi = 0;
3228 ssize_t l;
dbb60d69
LP
3229 _cleanup_event_unref_ sd_event *event = NULL;
3230 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3231 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3232 char last_char = 0;
e866af3a 3233
7566e267 3234 r = barrier_create(&barrier);
a2da110b 3235 if (r < 0) {
da927ba9 3236 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3237 goto finish;
3238 }
3239
4610de50 3240 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3241 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3242 goto finish;
3243 }
3244
4610de50 3245 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3246 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3247 goto finish;
3248 }
3249
4610de50 3250 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3251 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3252 goto finish;
3253 }
3254
825d5287 3255 if (arg_userns)
4610de50 3256 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3257 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3258 goto finish;
3259 }
3260
e866af3a
DH
3261 /* Child can be killed before execv(), so handle SIGCHLD
3262 * in order to interrupt parent's blocking calls and
3263 * give it a chance to call wait() and terminate. */
3264 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3265 if (r < 0) {
ec16945e 3266 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3267 goto finish;
3268 }
3269
e866af3a
DH
3270 r = sigaction(SIGCHLD, &sa, NULL);
3271 if (r < 0) {
ec16945e 3272 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3273 goto finish;
3274 }
3275
03cfe0d5 3276 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
3277 if (pid < 0) {
3278 if (errno == EINVAL)
ec16945e 3279 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3280 else
ec16945e 3281 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3282
d87be9b0
LP
3283 goto finish;
3284 }
a258bf26 3285
d87be9b0 3286 if (pid == 0) {
03cfe0d5 3287 /* The outer child only has a file system namespace. */
a2da110b
DH
3288 barrier_set_role(&barrier, BARRIER_CHILD);
3289
03e334a1 3290 master = safe_close(master);
a258bf26 3291
03e334a1 3292 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3293 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3294 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 3295 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3296
ce30c8dc
LP
3297 (void) reset_all_signal_handlers();
3298 (void) reset_signal_mask();
f5c1b9ee 3299
03cfe0d5
LP
3300 r = outer_child(&barrier,
3301 arg_directory,
3302 console,
3303 root_device, root_device_rw,
3304 home_device, home_device_rw,
3305 srv_device, srv_device_rw,
3306 interactive,
3307 secondary,
3308 pid_socket_pair[1],
3309 kmsg_socket_pair[1],
3310 rtnl_socket_pair[1],
825d5287 3311 uid_shift_socket_pair[1],
f757855e 3312 fds);
0cb9fbcd 3313 if (r < 0)
a2da110b 3314 _exit(EXIT_FAILURE);
d87be9b0 3315
03cfe0d5 3316 _exit(EXIT_SUCCESS);
da5b3bad 3317 }
88213476 3318
a2da110b 3319 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3320
2feceb5e 3321 fds = fdset_free(fds);
842f3b0f 3322
6d0b55c2
LP
3323 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3324 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3325 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
82116c43 3326 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3327
03cfe0d5
LP
3328 /* Wait for the outer child. */
3329 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3330 if (r < 0)
3331 goto finish;
3332 if (r != 0) {
3333 r = -EIO;
3334 goto finish;
3335 }
3336 pid = 0;
6dac160c 3337
03cfe0d5
LP
3338 /* And now retrieve the PID of the inner child. */
3339 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3340 if (l < 0) {
3341 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3342 goto finish;
3343 }
3344 if (l != sizeof(pid)) {
76d44882 3345 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3346 r = EIO;
3347 goto finish;
3348 }
354bfd2b 3349
03cfe0d5 3350 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3351
03cfe0d5
LP
3352 if (arg_userns) {
3353 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3354 log_error("Child died too early.");
3355 r = -ESRCH;
840295fc 3356 goto finish;
03cfe0d5 3357 }
ab046dde 3358
825d5287
RM
3359 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3360 if (l < 0) {
3361 r = log_error_errno(errno, "Failed to read UID shift: %m");
3362 goto finish;
3363 }
3364 if (l != sizeof(arg_uid_shift)) {
76d44882 3365 log_error("Short read while reading UID shift.");
825d5287
RM
3366 r = EIO;
3367 goto finish;
3368 }
3369
03cfe0d5 3370 r = setup_uid_map(pid);
840295fc
LP
3371 if (r < 0)
3372 goto finish;
ab046dde 3373
03cfe0d5
LP
3374 (void) barrier_place(&barrier); /* #2 */
3375 }
c74e630d 3376
9a2a5625 3377 if (arg_private_network) {
4bbfe7ad 3378
9a2a5625
LP
3379 r = move_network_interfaces(pid, arg_network_interfaces);
3380 if (r < 0)
3381 goto finish;
5aa4bb6b 3382
9a2a5625
LP
3383 if (arg_network_veth) {
3384 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3385 if (r < 0)
3386 goto finish;
3387 else if (r > 0)
3388 ifi = r;
6dac160c 3389
9a2a5625
LP
3390 if (arg_network_bridge) {
3391 r = setup_bridge(veth_name, arg_network_bridge);
3392 if (r < 0)
3393 goto finish;
3394 if (r > 0)
3395 ifi = r;
3396 }
3397 }
6dac160c 3398
9a2a5625
LP
3399 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3400 if (r < 0)
3401 goto finish;
3402
3403 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3404 if (r < 0)
3405 goto finish;
3406 }
6dac160c 3407
b7103bc5
LP
3408 if (arg_register) {
3409 r = register_machine(
3410 arg_machine,
3411 pid,
3412 arg_directory,
3413 arg_uuid,
3414 ifi,
3415 arg_slice,
3416 arg_custom_mounts, arg_n_custom_mounts,
3417 arg_kill_signal,
3418 arg_property,
3419 arg_keep_unit);
3420 if (r < 0)
3421 goto finish;
3422 }
6dac160c 3423
34829a32 3424 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
3425 if (r < 0)
3426 goto finish;
3427
34829a32
LP
3428 if (arg_keep_unit) {
3429 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3430 if (r < 0)
3431 goto finish;
3432 }
efdb0237 3433
34829a32 3434 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
3435 if (r < 0)
3436 goto finish;
6dac160c 3437
03cfe0d5
LP
3438 /* Notify the child that the parent is ready with all
3439 * its setup (including cgroup-ification), and that
3440 * the child can now hand over control to the code to
3441 * run inside the container. */
3442 (void) barrier_place(&barrier); /* #3 */
6dac160c 3443
03cfe0d5
LP
3444 /* Block SIGCHLD here, before notifying child.
3445 * process_pty() will handle it with the other signals. */
3446 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 3447
03cfe0d5
LP
3448 /* Reset signal to default */
3449 r = default_signals(SIGCHLD, -1);
3450 if (r < 0) {
3451 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3452 goto finish;
3453 }
e866af3a 3454
03cfe0d5 3455 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
3456 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3457 log_error("Child died too early.");
03cfe0d5
LP
3458 r = -ESRCH;
3459 goto finish;
3460 }
b12afc8c 3461
03cfe0d5
LP
3462 sd_notifyf(false,
3463 "READY=1\n"
3464 "STATUS=Container running.\n"
3465 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 3466
03cfe0d5
LP
3467 r = sd_event_new(&event);
3468 if (r < 0) {
3469 log_error_errno(r, "Failed to get default event source: %m");
3470 goto finish;
3471 }
88213476 3472
03cfe0d5
LP
3473 if (arg_kill_signal > 0) {
3474 /* Try to kill the init system on SIGINT or SIGTERM */
3475 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3476 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3477 } else {
3478 /* Immediately exit */
3479 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3480 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3481 }
023fb90b 3482
03cfe0d5
LP
3483 /* simply exit on sigchld */
3484 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 3485
03cfe0d5 3486 if (arg_expose_ports) {
7a8f6325 3487 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
3488 if (r < 0)
3489 goto finish;
023fb90b 3490
7a8f6325 3491 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 3492 }
023fb90b 3493
03cfe0d5 3494 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 3495
ae3dde80 3496 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
3497 if (r < 0) {
3498 log_error_errno(r, "Failed to create PTY forwarder: %m");
3499 goto finish;
3500 }
023fb90b 3501
03cfe0d5
LP
3502 r = sd_event_loop(event);
3503 if (r < 0) {
3504 log_error_errno(r, "Failed to run event loop: %m");
3505 goto finish;
3506 }
6d0b55c2 3507
03cfe0d5 3508 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 3509
03cfe0d5 3510 forward = pty_forward_free(forward);
6d0b55c2 3511
03cfe0d5
LP
3512 if (!arg_quiet && last_char != '\n')
3513 putc('\n', stdout);
04d39279 3514
03cfe0d5 3515 /* Kill if it is not dead yet anyway */
b7103bc5
LP
3516 if (arg_register && !arg_keep_unit)
3517 terminate_machine(pid);
1f0cd86b 3518
840295fc 3519 /* Normally redundant, but better safe than sorry */
04d39279 3520 kill(pid, SIGKILL);
a258bf26 3521
113cea80 3522 r = wait_for_container(pid, &container_status);
04d39279
LP
3523 pid = 0;
3524
ec16945e 3525 if (r < 0)
ce9f1527
LP
3526 /* We failed to wait for the container, or the
3527 * container exited abnormally */
ec16945e
LP
3528 goto finish;
3529 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
3530 /* The container exited with a non-zero
3531 * status, or with zero status and no reboot
3532 * was requested. */
ec16945e 3533 ret = r;
d87be9b0 3534 break;
ec16945e 3535 }
88213476 3536
113cea80 3537 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3538
3539 if (arg_keep_unit) {
3540 /* Special handling if we are running as a
3541 * service: instead of simply restarting the
3542 * machine we want to restart the entire
3543 * service, so let's inform systemd about this
3544 * with the special exit code 133. The service
3545 * file uses RestartForceExitStatus=133 so
3546 * that this results in a full nspawn
3547 * restart. This is necessary since we might
3548 * have cgroup parameters set we want to have
3549 * flushed out. */
ec16945e
LP
3550 ret = 133;
3551 r = 0;
ce38dbc8
LP
3552 break;
3553 }
6d0b55c2 3554
7a8f6325 3555 expose_port_flush(arg_expose_ports, &exposed);
d87be9b0 3556 }
88213476
LP
3557
3558finish:
af4ec430
LP
3559 sd_notify(false,
3560 "STOPPING=1\n"
3561 "STATUS=Terminating...");
3562
9444b1f2
LP
3563 if (pid > 0)
3564 kill(pid, SIGKILL);
88213476 3565
503546da
LP
3566 /* Try to flush whatever is still queued in the pty */
3567 if (master >= 0)
59f448cf 3568 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 3569
03cfe0d5
LP
3570 loop_remove(loop_nr, &image_fd);
3571
ec16945e
LP
3572 if (remove_subvol && arg_directory) {
3573 int k;
3574
5bcd08db 3575 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
3576 if (k < 0)
3577 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3578 }
3579
785890ac
LP
3580 if (arg_machine) {
3581 const char *p;
3582
63c372cb 3583 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3584 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3585 }
3586
7a8f6325 3587 expose_port_flush(arg_expose_ports, &exposed);
f757855e 3588
04d391da 3589 free(arg_directory);
ec16945e
LP
3590 free(arg_template);
3591 free(arg_image);
7027ff61 3592 free(arg_machine);
c74e630d
LP
3593 free(arg_user);
3594 strv_free(arg_setenv);
f757855e 3595 free(arg_network_bridge);
c74e630d
LP
3596 strv_free(arg_network_interfaces);
3597 strv_free(arg_network_macvlan);
4bbfe7ad 3598 strv_free(arg_network_ipvlan);
f757855e
LP
3599 strv_free(arg_parameters);
3600 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3601 expose_port_free_all(arg_expose_ports);
6d0b55c2 3602
ec16945e 3603 return r < 0 ? EXIT_FAILURE : ret;
88213476 3604}