]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: no fake errno
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
8fe0087e
LP
22#ifdef HAVE_BLKID
23#include <blkid/blkid.h>
24#endif
88213476 25#include <errno.h>
88213476 26#include <getopt.h>
1b9e5b12 27#include <linux/loop.h>
8fe0087e 28#include <sched.h>
24fb1112
LP
29#ifdef HAVE_SECCOMP
30#include <seccomp.h>
31#endif
8fe0087e
LP
32#ifdef HAVE_SELINUX
33#include <selinux/selinux.h>
1b9e5b12 34#endif
8fe0087e
LP
35#include <signal.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#include <sys/file.h>
40#include <sys/mount.h>
41#include <sys/personality.h>
42#include <sys/prctl.h>
43#include <sys/types.h>
44#include <unistd.h>
1b9e5b12 45
1f0cd86b 46#include "sd-daemon.h"
1f0cd86b 47#include "sd-id128.h"
8fe0087e 48
b5efdb8a 49#include "alloc-util.h"
8fe0087e
LP
50#include "barrier.h"
51#include "base-filesystem.h"
52#include "blkid-util.h"
53#include "btrfs-util.h"
8fe0087e 54#include "cap-list.h"
430f0182 55#include "capability-util.h"
04d391da 56#include "cgroup-util.h"
8fe0087e 57#include "copy.h"
4fc9982c 58#include "dev-setup.h"
8fe0087e
LP
59#include "env-util.h"
60#include "event-util.h"
3ffd4af2 61#include "fd-util.h"
842f3b0f 62#include "fdset.h"
a5c32cff 63#include "fileio.h"
8fe0087e 64#include "formats-util.h"
f4f15635 65#include "fs-util.h"
1b9e5b12 66#include "gpt.h"
8fe0087e
LP
67#include "hostname-util.h"
68#include "log.h"
69#include "loopback-setup.h"
1b9cebf6 70#include "machine-image.h"
8fe0087e
LP
71#include "macro.h"
72#include "missing.h"
73#include "mkdir.h"
4349cd7c 74#include "mount-util.h"
8fe0087e 75#include "netlink-util.h"
07630cea
LP
76#include "nspawn-cgroup.h"
77#include "nspawn-expose-ports.h"
78#include "nspawn-mount.h"
79#include "nspawn-network.h"
80#include "nspawn-register.h"
81#include "nspawn-settings.h"
82#include "nspawn-setuid.h"
6bedfcbb 83#include "parse-util.h"
8fe0087e 84#include "path-util.h"
0b452006 85#include "process-util.h"
8fe0087e
LP
86#include "ptyfwd.h"
87#include "random-util.h"
88#include "rm-rf.h"
e9642be2
LP
89#ifdef HAVE_SECCOMP
90#include "seccomp-util.h"
91#endif
8fe0087e 92#include "signal-util.h"
2583fbea 93#include "socket-util.h"
8fcde012 94#include "stat-util.h"
15a5e950 95#include "stdio-util.h"
07630cea 96#include "string-util.h"
8fe0087e
LP
97#include "strv.h"
98#include "terminal-util.h"
99#include "udev-util.h"
affb60b1 100#include "umask-util.h"
b1d4f8e1 101#include "user-util.h"
8fe0087e 102#include "util.h"
e9642be2 103
113cea80
DH
/* Outcome of a container run, as reported back to the caller. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
108
57fb9fb5
LP
/* Journal linking modes selectable via --link-journal= (see parse_argv()). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
88213476
LP
115
116static char *arg_directory = NULL;
ec16945e 117static char *arg_template = NULL;
687d0825 118static char *arg_user = NULL;
9444b1f2 119static sd_id128_t arg_uuid = {};
7027ff61 120static char *arg_machine = NULL;
c74e630d
LP
121static const char *arg_selinux_context = NULL;
122static const char *arg_selinux_apifs_context = NULL;
9444b1f2 123static const char *arg_slice = NULL;
ff01d048 124static bool arg_private_network = false;
bc2f673e 125static bool arg_read_only = false;
0f0dbc46 126static bool arg_boot = false;
ec16945e 127static bool arg_ephemeral = false;
57fb9fb5 128static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 129static bool arg_link_journal_try = false;
5076f0cc
LP
130static uint64_t arg_retain =
131 (1ULL << CAP_CHOWN) |
132 (1ULL << CAP_DAC_OVERRIDE) |
133 (1ULL << CAP_DAC_READ_SEARCH) |
134 (1ULL << CAP_FOWNER) |
135 (1ULL << CAP_FSETID) |
136 (1ULL << CAP_IPC_OWNER) |
137 (1ULL << CAP_KILL) |
138 (1ULL << CAP_LEASE) |
139 (1ULL << CAP_LINUX_IMMUTABLE) |
140 (1ULL << CAP_NET_BIND_SERVICE) |
141 (1ULL << CAP_NET_BROADCAST) |
142 (1ULL << CAP_NET_RAW) |
143 (1ULL << CAP_SETGID) |
144 (1ULL << CAP_SETFCAP) |
145 (1ULL << CAP_SETPCAP) |
146 (1ULL << CAP_SETUID) |
147 (1ULL << CAP_SYS_ADMIN) |
148 (1ULL << CAP_SYS_CHROOT) |
149 (1ULL << CAP_SYS_NICE) |
150 (1ULL << CAP_SYS_PTRACE) |
151 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 152 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
153 (1ULL << CAP_SYS_BOOT) |
154 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
155 (1ULL << CAP_AUDIT_CONTROL) |
156 (1ULL << CAP_MKNOD);
5a8af538
LP
157static CustomMount *arg_custom_mounts = NULL;
158static unsigned arg_n_custom_mounts = 0;
f4889f65 159static char **arg_setenv = NULL;
284c0b91 160static bool arg_quiet = false;
8a96d94e 161static bool arg_share_system = false;
eb91eb18 162static bool arg_register = true;
89f7c846 163static bool arg_keep_unit = false;
aa28aefe 164static char **arg_network_interfaces = NULL;
c74e630d 165static char **arg_network_macvlan = NULL;
4bbfe7ad 166static char **arg_network_ipvlan = NULL;
69c79d3c 167static bool arg_network_veth = false;
f757855e 168static char *arg_network_bridge = NULL;
050f7277 169static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 170static char *arg_image = NULL;
f757855e 171static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 172static ExposePort *arg_expose_ports = NULL;
f36933fe 173static char **arg_property = NULL;
6dac160c
LP
174static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
175static bool arg_userns = false;
c6c8f6e2 176static int arg_kill_signal = 0;
efdb0237 177static bool arg_unified_cgroup_hierarchy = false;
f757855e
LP
178static SettingsMask arg_settings_mask = 0;
179static int arg_settings_trusted = -1;
180static char **arg_parameters = NULL;
88213476 181
601185b4 182static void help(void) {
88213476
LP
183 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
184 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
185 " -h --help Show this help\n"
186 " --version Print version string\n"
69c79d3c 187 " -q --quiet Do not show status information\n"
1b9e5b12 188 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
189 " --template=PATH Initialize root directory from template directory,\n"
190 " if missing\n"
191 " -x --ephemeral Run container with snapshot of root directory, and\n"
192 " remove it after exit\n"
193 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
194 " -b --boot Boot up full system (i.e. invoke init)\n"
195 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 196 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 197 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 198 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 199 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
200 " --private-users[=UIDBASE[:NUIDS]]\n"
201 " Run within user namespace\n"
69c79d3c
LP
202 " --private-network Disable network in container\n"
203 " --network-interface=INTERFACE\n"
204 " Assign an existing network interface to the\n"
205 " container\n"
c74e630d
LP
206 " --network-macvlan=INTERFACE\n"
207 " Create a macvlan network interface based on an\n"
208 " existing network interface to the container\n"
4bbfe7ad
TG
209 " --network-ipvlan=INTERFACE\n"
210 " Create a ipvlan network interface based on an\n"
211 " existing network interface to the container\n"
0dfaa006 212 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 213 " and container\n"
ab046dde 214 " --network-bridge=INTERFACE\n"
32457153 215 " Add a virtual ethernet connection between host\n"
ab046dde
TG
216 " and container and add it to an existing bridge on\n"
217 " the host\n"
6d0b55c2 218 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 219 " Expose a container IP port on the host\n"
82adf6af
LP
220 " -Z --selinux-context=SECLABEL\n"
221 " Set the SELinux security context to be used by\n"
222 " processes in the container\n"
223 " -L --selinux-apifs-context=SECLABEL\n"
224 " Set the SELinux security context to be used by\n"
225 " API/tmpfs file systems in the container\n"
a8828ed9
DW
226 " --capability=CAP In addition to the default, retain specified\n"
227 " capability\n"
228 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 229 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
230 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
231 " try-guest, try-host\n"
232 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 233 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
234 " --bind=PATH[:PATH[:OPTIONS]]\n"
235 " Bind mount a file or directory from the host into\n"
a8828ed9 236 " the container\n"
5e5bfa6e
EY
237 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
238 " Similar, but creates a read-only bind mount\n"
06c17c39 239 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
240 " --overlay=PATH[:PATH...]:PATH\n"
241 " Create an overlay mount from the host to \n"
242 " the container\n"
243 " --overlay-ro=PATH[:PATH...]:PATH\n"
244 " Similar, but creates a read-only overlay mount\n"
284c0b91 245 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 246 " --share-system Share system namespaces with host\n"
eb91eb18 247 " --register=BOOLEAN Register container as machine\n"
89f7c846 248 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 249 " the service unit nspawn is running in\n"
6d0b55c2 250 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 251 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 252 , program_invocation_short_name);
88213476
LP
253}
254
5a8af538
LP
255
256static int custom_mounts_prepare(void) {
257 unsigned i;
258 int r;
259
260 /* Ensure the mounts are applied prefix first. */
261 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
262
263 /* Allocate working directories for the overlay file systems that need it */
264 for (i = 0; i < arg_n_custom_mounts; i++) {
265 CustomMount *m = &arg_custom_mounts[i];
266
825d5287
RM
267 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
268 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
269 return -EINVAL;
270 }
271
5a8af538
LP
272 if (m->type != CUSTOM_MOUNT_OVERLAY)
273 continue;
274
275 if (m->work_dir)
276 continue;
277
278 if (m->read_only)
279 continue;
280
14bcf25c 281 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
282 if (r < 0)
283 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
284 }
285
286 return 0;
287}
288
efdb0237
LP
289static int detect_unified_cgroup_hierarchy(void) {
290 const char *e;
291 int r;
292
293 /* Allow the user to control whether the unified hierarchy is used */
294 e = getenv("UNIFIED_CGROUP_HIERARCHY");
295 if (e) {
296 r = parse_boolean(e);
297 if (r < 0)
298 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
299
300 arg_unified_cgroup_hierarchy = r;
301 return 0;
302 }
303
304 /* Otherwise inherit the default from the host system */
305 r = cg_unified();
306 if (r < 0)
307 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
308
309 arg_unified_cgroup_hierarchy = r;
310 return 0;
311}
312
88213476
LP
313static int parse_argv(int argc, char *argv[]) {
314
a41fe3a2 315 enum {
acbeb427
ZJS
316 ARG_VERSION = 0x100,
317 ARG_PRIVATE_NETWORK,
bc2f673e 318 ARG_UUID,
5076f0cc 319 ARG_READ_ONLY,
57fb9fb5 320 ARG_CAPABILITY,
420c7379 321 ARG_DROP_CAPABILITY,
17fe0523
LP
322 ARG_LINK_JOURNAL,
323 ARG_BIND,
f4889f65 324 ARG_BIND_RO,
06c17c39 325 ARG_TMPFS,
5a8af538
LP
326 ARG_OVERLAY,
327 ARG_OVERLAY_RO,
f4889f65 328 ARG_SETENV,
eb91eb18 329 ARG_SHARE_SYSTEM,
89f7c846 330 ARG_REGISTER,
aa28aefe 331 ARG_KEEP_UNIT,
69c79d3c 332 ARG_NETWORK_INTERFACE,
c74e630d 333 ARG_NETWORK_MACVLAN,
4bbfe7ad 334 ARG_NETWORK_IPVLAN,
ab046dde 335 ARG_NETWORK_BRIDGE,
6afc95b7 336 ARG_PERSONALITY,
4d9f07b4 337 ARG_VOLATILE,
ec16945e 338 ARG_TEMPLATE,
f36933fe 339 ARG_PROPERTY,
6dac160c 340 ARG_PRIVATE_USERS,
c6c8f6e2 341 ARG_KILL_SIGNAL,
f757855e 342 ARG_SETTINGS,
a41fe3a2
LP
343 };
344
88213476 345 static const struct option options[] = {
aa28aefe
LP
346 { "help", no_argument, NULL, 'h' },
347 { "version", no_argument, NULL, ARG_VERSION },
348 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
349 { "template", required_argument, NULL, ARG_TEMPLATE },
350 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
351 { "user", required_argument, NULL, 'u' },
352 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
353 { "boot", no_argument, NULL, 'b' },
354 { "uuid", required_argument, NULL, ARG_UUID },
355 { "read-only", no_argument, NULL, ARG_READ_ONLY },
356 { "capability", required_argument, NULL, ARG_CAPABILITY },
357 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
358 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
359 { "bind", required_argument, NULL, ARG_BIND },
360 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 361 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
362 { "overlay", required_argument, NULL, ARG_OVERLAY },
363 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
364 { "machine", required_argument, NULL, 'M' },
365 { "slice", required_argument, NULL, 'S' },
366 { "setenv", required_argument, NULL, ARG_SETENV },
367 { "selinux-context", required_argument, NULL, 'Z' },
368 { "selinux-apifs-context", required_argument, NULL, 'L' },
369 { "quiet", no_argument, NULL, 'q' },
370 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
371 { "register", required_argument, NULL, ARG_REGISTER },
372 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
373 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 374 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 375 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 376 { "network-veth", no_argument, NULL, 'n' },
ab046dde 377 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 378 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 379 { "image", required_argument, NULL, 'i' },
4d9f07b4 380 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 381 { "port", required_argument, NULL, 'p' },
f36933fe 382 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 383 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 384 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 385 { "settings", required_argument, NULL, ARG_SETTINGS },
eb9da376 386 {}
88213476
LP
387 };
388
9444b1f2 389 int c, r;
6cbe4ed1 390 const char *p;
a42c8b54 391 uint64_t plus = 0, minus = 0;
f757855e 392 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
393
394 assert(argc >= 0);
395 assert(argv);
396
0dfaa006 397 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
398
399 switch (c) {
400
401 case 'h':
601185b4
ZJS
402 help();
403 return 0;
88213476 404
acbeb427 405 case ARG_VERSION:
3f6fd1ba 406 return version();
acbeb427 407
88213476 408 case 'D':
0f03c2a4 409 r = parse_path_argument_and_warn(optarg, false, &arg_directory);
ec16945e 410 if (r < 0)
0f03c2a4 411 return r;
ec16945e
LP
412 break;
413
414 case ARG_TEMPLATE:
0f03c2a4 415 r = parse_path_argument_and_warn(optarg, false, &arg_template);
ec16945e 416 if (r < 0)
0f03c2a4 417 return r;
88213476
LP
418 break;
419
1b9e5b12 420 case 'i':
0f03c2a4 421 r = parse_path_argument_and_warn(optarg, false, &arg_image);
ec16945e 422 if (r < 0)
0f03c2a4 423 return r;
ec16945e
LP
424 break;
425
426 case 'x':
427 arg_ephemeral = true;
1b9e5b12
LP
428 break;
429
687d0825 430 case 'u':
2fc09a9c
DM
431 r = free_and_strdup(&arg_user, optarg);
432 if (r < 0)
7027ff61 433 return log_oom();
687d0825 434
f757855e 435 arg_settings_mask |= SETTING_USER;
687d0825
MV
436 break;
437
ab046dde 438 case ARG_NETWORK_BRIDGE:
f757855e
LP
439 r = free_and_strdup(&arg_network_bridge, optarg);
440 if (r < 0)
441 return log_oom();
ab046dde
TG
442
443 /* fall through */
444
0dfaa006 445 case 'n':
69c79d3c
LP
446 arg_network_veth = true;
447 arg_private_network = true;
f757855e 448 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
449 break;
450
aa28aefe 451 case ARG_NETWORK_INTERFACE:
c74e630d
LP
452 if (strv_extend(&arg_network_interfaces, optarg) < 0)
453 return log_oom();
454
455 arg_private_network = true;
f757855e 456 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
457 break;
458
459 case ARG_NETWORK_MACVLAN:
460 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
461 return log_oom();
462
4bbfe7ad 463 arg_private_network = true;
f757855e 464 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
465 break;
466
467 case ARG_NETWORK_IPVLAN:
468 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
469 return log_oom();
470
aa28aefe
LP
471 /* fall through */
472
ff01d048
LP
473 case ARG_PRIVATE_NETWORK:
474 arg_private_network = true;
f757855e 475 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
476 break;
477
0f0dbc46
LP
478 case 'b':
479 arg_boot = true;
f757855e 480 arg_settings_mask |= SETTING_BOOT;
0f0dbc46
LP
481 break;
482
144f0fc0 483 case ARG_UUID:
9444b1f2
LP
484 r = sd_id128_from_string(optarg, &arg_uuid);
485 if (r < 0) {
aa96c6cb 486 log_error("Invalid UUID: %s", optarg);
9444b1f2 487 return r;
aa96c6cb 488 }
f757855e
LP
489
490 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 491 break;
aa96c6cb 492
9444b1f2 493 case 'S':
c74e630d 494 arg_slice = optarg;
144f0fc0
LP
495 break;
496
7027ff61 497 case 'M':
c1521918 498 if (isempty(optarg))
97b11eed 499 arg_machine = mfree(arg_machine);
c1521918 500 else {
0c3c4284 501 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
502 log_error("Invalid machine name: %s", optarg);
503 return -EINVAL;
504 }
7027ff61 505
0c3c4284
LP
506 r = free_and_strdup(&arg_machine, optarg);
507 if (r < 0)
eb91eb18
LP
508 return log_oom();
509
510 break;
511 }
7027ff61 512
82adf6af
LP
513 case 'Z':
514 arg_selinux_context = optarg;
a8828ed9
DW
515 break;
516
82adf6af
LP
517 case 'L':
518 arg_selinux_apifs_context = optarg;
a8828ed9
DW
519 break;
520
bc2f673e
LP
521 case ARG_READ_ONLY:
522 arg_read_only = true;
f757855e 523 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
524 break;
525
420c7379
LP
526 case ARG_CAPABILITY:
527 case ARG_DROP_CAPABILITY: {
6cbe4ed1
SS
528 p = optarg;
529 for(;;) {
530 _cleanup_free_ char *t = NULL;
5076f0cc 531
6cbe4ed1
SS
532 r = extract_first_word(&p, &t, ",", 0);
533 if (r < 0)
534 return log_error_errno(r, "Failed to parse capability %s.", t);
5076f0cc 535
6cbe4ed1
SS
536 if (r == 0)
537 break;
5076f0cc 538
39ed67d1
LP
539 if (streq(t, "all")) {
540 if (c == ARG_CAPABILITY)
a42c8b54 541 plus = (uint64_t) -1;
39ed67d1 542 else
a42c8b54 543 minus = (uint64_t) -1;
39ed67d1 544 } else {
2822da4f
LP
545 int cap;
546
547 cap = capability_from_name(t);
548 if (cap < 0) {
39ed67d1
LP
549 log_error("Failed to parse capability %s.", t);
550 return -EINVAL;
551 }
552
553 if (c == ARG_CAPABILITY)
a42c8b54 554 plus |= 1ULL << (uint64_t) cap;
39ed67d1 555 else
a42c8b54 556 minus |= 1ULL << (uint64_t) cap;
5076f0cc 557 }
5076f0cc
LP
558 }
559
f757855e 560 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
561 break;
562 }
563
57fb9fb5
LP
564 case 'j':
565 arg_link_journal = LINK_GUEST;
574edc90 566 arg_link_journal_try = true;
57fb9fb5
LP
567 break;
568
569 case ARG_LINK_JOURNAL:
53e438e3 570 if (streq(optarg, "auto")) {
57fb9fb5 571 arg_link_journal = LINK_AUTO;
53e438e3
LP
572 arg_link_journal_try = false;
573 } else if (streq(optarg, "no")) {
57fb9fb5 574 arg_link_journal = LINK_NO;
53e438e3
LP
575 arg_link_journal_try = false;
576 } else if (streq(optarg, "guest")) {
57fb9fb5 577 arg_link_journal = LINK_GUEST;
53e438e3
LP
578 arg_link_journal_try = false;
579 } else if (streq(optarg, "host")) {
57fb9fb5 580 arg_link_journal = LINK_HOST;
53e438e3
LP
581 arg_link_journal_try = false;
582 } else if (streq(optarg, "try-guest")) {
574edc90
MP
583 arg_link_journal = LINK_GUEST;
584 arg_link_journal_try = true;
585 } else if (streq(optarg, "try-host")) {
586 arg_link_journal = LINK_HOST;
587 arg_link_journal_try = true;
588 } else {
57fb9fb5
LP
589 log_error("Failed to parse link journal mode %s", optarg);
590 return -EINVAL;
591 }
592
593 break;
594
17fe0523 595 case ARG_BIND:
f757855e
LP
596 case ARG_BIND_RO:
597 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
598 if (r < 0)
599 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 600
f757855e 601 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 602 break;
06c17c39 603
f757855e
LP
604 case ARG_TMPFS:
605 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
606 if (r < 0)
607 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 608
f757855e 609 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 610 break;
5a8af538
LP
611
612 case ARG_OVERLAY:
613 case ARG_OVERLAY_RO: {
614 _cleanup_free_ char *upper = NULL, *destination = NULL;
615 _cleanup_strv_free_ char **lower = NULL;
616 CustomMount *m;
617 unsigned n = 0;
618 char **i;
619
62f9f39a
RM
620 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
621 if (r == -ENOMEM)
06c17c39 622 return log_oom();
62f9f39a
RM
623 else if (r < 0) {
624 log_error("Invalid overlay specification: %s", optarg);
625 return r;
626 }
06c17c39 627
5a8af538
LP
628 STRV_FOREACH(i, lower) {
629 if (!path_is_absolute(*i)) {
630 log_error("Overlay path %s is not absolute.", *i);
631 return -EINVAL;
632 }
633
634 n++;
635 }
636
637 if (n < 2) {
638 log_error("--overlay= needs at least two colon-separated directories specified.");
639 return -EINVAL;
640 }
641
642 if (n == 2) {
643 /* If two parameters are specified,
644 * the first one is the lower, the
645 * second one the upper directory. And
af86c440
ZJS
646 * we'll also define the destination
647 * mount point the same as the upper. */
5a8af538
LP
648 upper = lower[1];
649 lower[1] = NULL;
650
651 destination = strdup(upper);
652 if (!destination)
653 return log_oom();
654
655 } else {
656 upper = lower[n - 2];
657 destination = lower[n - 1];
658 lower[n - 2] = NULL;
659 }
660
f757855e 661 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
662 if (!m)
663 return log_oom();
664
665 m->destination = destination;
666 m->source = upper;
667 m->lower = lower;
668 m->read_only = c == ARG_OVERLAY_RO;
669
670 upper = destination = NULL;
671 lower = NULL;
06c17c39 672
f757855e 673 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
674 break;
675 }
676
f4889f65
LP
677 case ARG_SETENV: {
678 char **n;
679
680 if (!env_assignment_is_valid(optarg)) {
681 log_error("Environment variable assignment '%s' is not valid.", optarg);
682 return -EINVAL;
683 }
684
685 n = strv_env_set(arg_setenv, optarg);
686 if (!n)
687 return log_oom();
688
689 strv_free(arg_setenv);
690 arg_setenv = n;
f757855e
LP
691
692 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
693 break;
694 }
695
284c0b91
LP
696 case 'q':
697 arg_quiet = true;
698 break;
699
8a96d94e
LP
700 case ARG_SHARE_SYSTEM:
701 arg_share_system = true;
702 break;
703
eb91eb18
LP
704 case ARG_REGISTER:
705 r = parse_boolean(optarg);
706 if (r < 0) {
707 log_error("Failed to parse --register= argument: %s", optarg);
708 return r;
709 }
710
711 arg_register = r;
712 break;
713
89f7c846
LP
714 case ARG_KEEP_UNIT:
715 arg_keep_unit = true;
716 break;
717
6afc95b7
LP
718 case ARG_PERSONALITY:
719
ac45f971 720 arg_personality = personality_from_string(optarg);
050f7277 721 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
722 log_error("Unknown or unsupported personality '%s'.", optarg);
723 return -EINVAL;
724 }
725
f757855e 726 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
727 break;
728
4d9f07b4
LP
729 case ARG_VOLATILE:
730
731 if (!optarg)
f757855e 732 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 733 else {
f757855e 734 VolatileMode m;
4d9f07b4 735
f757855e
LP
736 m = volatile_mode_from_string(optarg);
737 if (m < 0) {
738 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 739 return -EINVAL;
f757855e
LP
740 } else
741 arg_volatile_mode = m;
6d0b55c2
LP
742 }
743
f757855e
LP
744 arg_settings_mask |= SETTING_VOLATILE_MODE;
745 break;
6d0b55c2 746
f757855e
LP
747 case 'p':
748 r = expose_port_parse(&arg_expose_ports, optarg);
749 if (r == -EEXIST)
750 return log_error_errno(r, "Duplicate port specification: %s", optarg);
751 if (r < 0)
752 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 753
f757855e 754 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 755 break;
6d0b55c2 756
f36933fe
LP
757 case ARG_PROPERTY:
758 if (strv_extend(&arg_property, optarg) < 0)
759 return log_oom();
760
761 break;
762
6dac160c
LP
763 case ARG_PRIVATE_USERS:
764 if (optarg) {
765 _cleanup_free_ char *buffer = NULL;
766 const char *range, *shift;
767
768 range = strchr(optarg, ':');
769 if (range) {
770 buffer = strndup(optarg, range - optarg);
771 if (!buffer)
772 return log_oom();
773 shift = buffer;
774
775 range++;
776 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
777 log_error("Failed to parse UID range: %s", range);
778 return -EINVAL;
779 }
780 } else
781 shift = optarg;
782
783 if (parse_uid(shift, &arg_uid_shift) < 0) {
784 log_error("Failed to parse UID: %s", optarg);
785 return -EINVAL;
786 }
787 }
788
789 arg_userns = true;
790 break;
791
c6c8f6e2
LP
792 case ARG_KILL_SIGNAL:
793 arg_kill_signal = signal_from_string_try_harder(optarg);
794 if (arg_kill_signal < 0) {
795 log_error("Cannot parse signal: %s", optarg);
796 return -EINVAL;
797 }
798
f757855e
LP
799 arg_settings_mask |= SETTING_KILL_SIGNAL;
800 break;
801
802 case ARG_SETTINGS:
803
804 /* no → do not read files
805 * yes → read files, do not override cmdline, trust only subset
806 * override → read files, override cmdline, trust only subset
807 * trusted → read files, do not override cmdline, trust all
808 */
809
810 r = parse_boolean(optarg);
811 if (r < 0) {
812 if (streq(optarg, "trusted")) {
813 mask_all_settings = false;
814 mask_no_settings = false;
815 arg_settings_trusted = true;
816
817 } else if (streq(optarg, "override")) {
818 mask_all_settings = false;
819 mask_no_settings = true;
820 arg_settings_trusted = -1;
821 } else
822 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
823 } else if (r > 0) {
824 /* yes */
825 mask_all_settings = false;
826 mask_no_settings = false;
827 arg_settings_trusted = -1;
828 } else {
829 /* no */
830 mask_all_settings = true;
831 mask_no_settings = false;
832 arg_settings_trusted = false;
833 }
834
c6c8f6e2
LP
835 break;
836
88213476
LP
837 case '?':
838 return -EINVAL;
839
840 default:
eb9da376 841 assert_not_reached("Unhandled option");
88213476 842 }
88213476 843
eb91eb18
LP
844 if (arg_share_system)
845 arg_register = false;
846
847 if (arg_boot && arg_share_system) {
848 log_error("--boot and --share-system may not be combined.");
849 return -EINVAL;
850 }
851
89f7c846
LP
852 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
853 log_error("--keep-unit may not be used when invoked from a user session.");
854 return -EINVAL;
855 }
856
1b9e5b12
LP
857 if (arg_directory && arg_image) {
858 log_error("--directory= and --image= may not be combined.");
859 return -EINVAL;
860 }
861
ec16945e
LP
862 if (arg_template && arg_image) {
863 log_error("--template= and --image= may not be combined.");
864 return -EINVAL;
865 }
866
867 if (arg_template && !(arg_directory || arg_machine)) {
868 log_error("--template= needs --directory= or --machine=.");
869 return -EINVAL;
870 }
871
872 if (arg_ephemeral && arg_template) {
873 log_error("--ephemeral and --template= may not be combined.");
874 return -EINVAL;
875 }
876
877 if (arg_ephemeral && arg_image) {
878 log_error("--ephemeral and --image= may not be combined.");
879 return -EINVAL;
880 }
881
df9a75e4
LP
882 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
883 log_error("--ephemeral and --link-journal= may not be combined.");
884 return -EINVAL;
885 }
886
f757855e
LP
887 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
888 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
889
890 if (argc > optind) {
891 arg_parameters = strv_copy(argv + optind);
892 if (!arg_parameters)
893 return log_oom();
894
895 arg_settings_mask |= SETTING_BOOT;
896 }
897
898 /* Load all settings from .nspawn files */
899 if (mask_no_settings)
900 arg_settings_mask = 0;
901
902 /* Don't load any settings from .nspawn files */
903 if (mask_all_settings)
904 arg_settings_mask = _SETTINGS_MASK_ALL;
905
906 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
907
908 r = detect_unified_cgroup_hierarchy();
909 if (r < 0)
910 return r;
911
912 return 1;
913}
914
915static int verify_arguments(void) {
916
917 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
918 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
919 return -EINVAL;
920 }
921
6d0b55c2
LP
922 if (arg_expose_ports && !arg_private_network) {
923 log_error("Cannot use --port= without private networking.");
924 return -EINVAL;
925 }
926
c6c8f6e2
LP
927 if (arg_boot && arg_kill_signal <= 0)
928 arg_kill_signal = SIGRTMIN+3;
929
f757855e 930 return 0;
88213476
LP
931}
932
03cfe0d5
LP
933static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
934 assert(p);
935
936 if (!arg_userns)
937 return 0;
938
939 if (uid == UID_INVALID && gid == GID_INVALID)
940 return 0;
941
942 if (uid != UID_INVALID) {
943 uid += arg_uid_shift;
944
945 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
946 return -EOVERFLOW;
947 }
948
949 if (gid != GID_INVALID) {
950 gid += (gid_t) arg_uid_shift;
951
952 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
953 return -EOVERFLOW;
954 }
955
956 if (lchown(p, uid, gid) < 0)
957 return -errno;
b12afc8c
LP
958
959 return 0;
960}
961
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and then
 * hands it to userns_lchown(). A directory that already exists is left
 * untouched (no chown) and counts as success.
 *
 * Returns 0 on success, -errno if mkdir() fails for another reason, or the
 * result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
974
e58a1277 975static int setup_timezone(const char *dest) {
03cfe0d5
LP
976 _cleanup_free_ char *p = NULL, *q = NULL;
977 const char *where, *check, *what;
d4036145
LP
978 char *z, *y;
979 int r;
f8440af5 980
e58a1277
LP
981 assert(dest);
982
983 /* Fix the timezone, if possible */
d4036145
LP
984 r = readlink_malloc("/etc/localtime", &p);
985 if (r < 0) {
986 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
987 return 0;
988 }
989
990 z = path_startswith(p, "../usr/share/zoneinfo/");
991 if (!z)
992 z = path_startswith(p, "/usr/share/zoneinfo/");
993 if (!z) {
994 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
995 return 0;
996 }
997
03cfe0d5 998 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
999 r = readlink_malloc(where, &q);
1000 if (r >= 0) {
1001 y = path_startswith(q, "../usr/share/zoneinfo/");
1002 if (!y)
1003 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1004
d4036145
LP
1005 /* Already pointing to the right place? Then do nothing .. */
1006 if (y && streq(y, z))
1007 return 0;
1008 }
1009
03cfe0d5
LP
1010 check = strjoina("/usr/share/zoneinfo/", z);
1011 check = prefix_root(dest, check);
1012 if (laccess(check, F_OK) < 0) {
d4036145
LP
1013 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1014 return 0;
1015 }
68fb0892 1016
79d80fc1
TG
1017 r = unlink(where);
1018 if (r < 0 && errno != ENOENT) {
56f64d95 1019 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1020 return 0;
1021 }
4d9f07b4 1022
03cfe0d5 1023 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1024 if (symlink(what, where) < 0) {
56f64d95 1025 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1026 return 0;
1027 }
e58a1277 1028
03cfe0d5
LP
1029 r = userns_lchown(where, 0, 0);
1030 if (r < 0)
1031 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1032
e58a1277 1033 return 0;
88213476
LP
1034}
1035
2547bb41 1036static int setup_resolv_conf(const char *dest) {
03cfe0d5 1037 const char *where = NULL;
79d80fc1 1038 int r;
2547bb41
LP
1039
1040 assert(dest);
1041
1042 if (arg_private_network)
1043 return 0;
1044
1045 /* Fix resolv.conf, if possible */
03cfe0d5 1046 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1047
f2068bcc 1048 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1049 if (r < 0) {
68a313c5
LP
1050 /* If the file already exists as symlink, let's
1051 * suppress the warning, under the assumption that
1052 * resolved or something similar runs inside and the
1053 * symlink points there.
1054 *
1055 * If the disk image is read-only, there's also no
1056 * point in complaining.
1057 */
1058 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1059 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1060 return 0;
1061 }
2547bb41 1062
03cfe0d5
LP
1063 r = userns_lchown(where, 0, 0);
1064 if (r < 0)
1065 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1066
2547bb41
LP
1067 return 0;
1068}
1069
9f24adc2 1070static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1071 assert(s);
9f24adc2
LP
1072
1073 snprintf(s, 37,
1074 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1075 SD_ID128_FORMAT_VAL(id));
1076
1077 return s;
1078}
1079
04bc4a3f 1080static int setup_boot_id(const char *dest) {
03cfe0d5 1081 const char *from, *to;
39883f62 1082 sd_id128_t rnd = {};
04bc4a3f
LP
1083 char as_uuid[37];
1084 int r;
1085
eb91eb18
LP
1086 if (arg_share_system)
1087 return 0;
1088
04bc4a3f
LP
1089 /* Generate a new randomized boot ID, so that each boot-up of
1090 * the container gets a new one */
1091
03cfe0d5
LP
1092 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1093 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1094
1095 r = sd_id128_randomize(&rnd);
f647962d
MS
1096 if (r < 0)
1097 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1098
9f24adc2 1099 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1100
4c1fc3e4 1101 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1102 if (r < 0)
1103 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1104
03cfe0d5
LP
1105 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1106 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1107 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1108 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1109
1110 unlink(from);
04bc4a3f
LP
1111 return r;
1112}
1113
e58a1277 1114static int copy_devnodes(const char *dest) {
88213476
LP
1115
1116 static const char devnodes[] =
1117 "null\0"
1118 "zero\0"
1119 "full\0"
1120 "random\0"
1121 "urandom\0"
85614d66
TG
1122 "tty\0"
1123 "net/tun\0";
88213476
LP
1124
1125 const char *d;
e58a1277 1126 int r = 0;
7fd1b19b 1127 _cleanup_umask_ mode_t u;
a258bf26
LP
1128
1129 assert(dest);
124640f1
LP
1130
1131 u = umask(0000);
88213476 1132
03cfe0d5
LP
1133 /* Create /dev/net, so that we can create /dev/net/tun in it */
1134 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1135 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1136
88213476 1137 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1138 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1139 struct stat st;
88213476 1140
7f112f50 1141 from = strappend("/dev/", d);
03cfe0d5 1142 to = prefix_root(dest, from);
88213476
LP
1143
1144 if (stat(from, &st) < 0) {
1145
4a62c710
MS
1146 if (errno != ENOENT)
1147 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1148
a258bf26 1149 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1150
03cfe0d5 1151 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1152 return -EIO;
a258bf26 1153
85614d66 1154 } else {
81f5049b
AC
1155 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1156 if (errno != EPERM)
1157 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1158
1159 /* Some systems abusively restrict mknod but
1160 * allow bind mounts. */
1161 r = touch(to);
1162 if (r < 0)
1163 return log_error_errno(r, "touch (%s) failed: %m", to);
1164 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1165 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1166 }
6278cf60 1167
03cfe0d5
LP
1168 r = userns_lchown(to, 0, 0);
1169 if (r < 0)
1170 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1171 }
88213476
LP
1172 }
1173
e58a1277
LP
1174 return r;
1175}
88213476 1176
03cfe0d5
LP
1177static int setup_pts(const char *dest) {
1178 _cleanup_free_ char *options = NULL;
1179 const char *p;
709f6e46 1180 int r;
03cfe0d5
LP
1181
1182#ifdef HAVE_SELINUX
1183 if (arg_selinux_apifs_context)
1184 (void) asprintf(&options,
3dce8915 1185 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1186 arg_uid_shift + TTY_GID,
1187 arg_selinux_apifs_context);
1188 else
1189#endif
1190 (void) asprintf(&options,
3dce8915 1191 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1192 arg_uid_shift + TTY_GID);
f2d88580 1193
03cfe0d5 1194 if (!options)
f2d88580
LP
1195 return log_oom();
1196
03cfe0d5 1197 /* Mount /dev/pts itself */
cc9fce65 1198 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1199 if (mkdir(p, 0755) < 0)
1200 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1201 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1202 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
709f6e46
MS
1203 r = userns_lchown(p, 0, 0);
1204 if (r < 0)
1205 return log_error_errno(r, "Failed to chown /dev/pts: %m");
03cfe0d5
LP
1206
1207 /* Create /dev/ptmx symlink */
1208 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1209 if (symlink("pts/ptmx", p) < 0)
1210 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
709f6e46
MS
1211 r = userns_lchown(p, 0, 0);
1212 if (r < 0)
1213 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
f2d88580 1214
03cfe0d5
LP
1215 /* And fix /dev/pts/ptmx ownership */
1216 p = prefix_roota(dest, "/dev/pts/ptmx");
709f6e46
MS
1217 r = userns_lchown(p, 0, 0);
1218 if (r < 0)
1219 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1220
f2d88580
LP
1221 return 0;
1222}
1223
e58a1277 1224static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1225 _cleanup_umask_ mode_t u;
1226 const char *to;
e58a1277 1227 int r;
e58a1277
LP
1228
1229 assert(dest);
1230 assert(console);
1231
1232 u = umask(0000);
1233
03cfe0d5 1234 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1235 if (r < 0)
1236 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1237
a258bf26
LP
1238 /* We need to bind mount the right tty to /dev/console since
1239 * ptys can only exist on pts file systems. To have something
81f5049b 1240 * to bind mount things on we create a empty regular file. */
a258bf26 1241
03cfe0d5 1242 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1243 r = touch(to);
1244 if (r < 0)
1245 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1246
4543768d 1247 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1248 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1249
25ea79fe 1250 return 0;
e58a1277
LP
1251}
1252
1253static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1254 const char *from, *to;
7fd1b19b 1255 _cleanup_umask_ mode_t u;
d9603714 1256 int fd, r;
e58a1277 1257
e58a1277 1258 assert(kmsg_socket >= 0);
a258bf26 1259
e58a1277 1260 u = umask(0000);
a258bf26 1261
03cfe0d5 1262 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1263 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1264 * on the reading side behave very similar to /proc/kmsg,
1265 * their writing side behaves differently from /dev/kmsg in
1266 * that writing blocks when nothing is reading. In order to
1267 * avoid any problems with containers deadlocking due to this
1268 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1269 from = prefix_roota(dest, "/run/kmsg");
1270 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1271
4a62c710 1272 if (mkfifo(from, 0600) < 0)
03cfe0d5 1273 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1274 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1275 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1276
1277 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1278 if (fd < 0)
1279 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1280
e58a1277
LP
1281 /* Store away the fd in the socket, so that it stays open as
1282 * long as we run the child */
3ee897d6 1283 r = send_one_fd(kmsg_socket, fd, 0);
03e334a1 1284 safe_close(fd);
e58a1277 1285
d9603714
DH
1286 if (r < 0)
1287 return log_error_errno(r, "Failed to send FIFO fd: %m");
a258bf26 1288
03cfe0d5
LP
1289 /* And now make the FIFO unavailable as /run/kmsg... */
1290 (void) unlink(from);
1291
25ea79fe 1292 return 0;
88213476
LP
1293}
1294
1c4baffc 1295static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1296 union in_addr_union *exposed = userdata;
1297
1298 assert(rtnl);
1299 assert(m);
1300 assert(exposed);
1301
7a8f6325 1302 expose_port_execute(rtnl, arg_expose_ports, exposed);
6d0b55c2
LP
1303 return 0;
1304}
1305
3a74cea5 1306static int setup_hostname(void) {
3a74cea5 1307
eb91eb18
LP
1308 if (arg_share_system)
1309 return 0;
1310
605f81a8 1311 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1312 return -errno;
3a74cea5 1313
7027ff61 1314 return 0;
3a74cea5
LP
1315}
1316
/* Links the journal directory of the container rooted at `directory` with
 * /var/log/journal/<machine-id> on the host, according to arg_link_journal.
 *
 * Throughout the function, p is the journal directory path on the host and q
 * is the same path prefixed with `directory`, i.e. inside the container tree.
 *
 *   LINK_NO    - nothing is linked (the machine ID checks still run first).
 *   LINK_GUEST - p becomes a symlink to q.
 *   LINK_HOST  - p is created as a directory and bind mounted onto q.
 *   LINK_AUTO  - bind mounts p onto q only if p already exists; several
 *                error conditions are downgraded to "return 0".
 *
 * Returns 0 on success or when linking is skipped, a negative errno-style
 * value on failure. */
static int setup_journal(const char *directory) {
        sd_id128_t machine_id, this_id;
        _cleanup_free_ char *b = NULL, *d = NULL;
        const char *etc_machine_id, *p, *q;
        char *id;
        int r;

        /* Don't link journals in ephemeral mode */
        if (arg_ephemeral)
                return 0;

        etc_machine_id = prefix_roota(directory, "/etc/machine-id");

        /* A missing machine ID file is tolerated only in LINK_AUTO mode */
        r = read_one_line_file(etc_machine_id, &b);
        if (r == -ENOENT && arg_link_journal == LINK_AUTO)
                return 0;
        else if (r < 0)
                return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);

        id = strstrip(b);
        if (isempty(id) && arg_link_journal == LINK_AUTO)
                return 0;

        /* Verify validity */
        r = sd_id128_from_string(id, &machine_id);
        if (r < 0)
                return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);

        r = sd_id128_get_machine(&this_id);
        if (r < 0)
                return log_error_errno(r, "Failed to retrieve machine ID: %m");

        /* Container and host sharing one machine ID would share one journal
         * directory; refuse, but only fail hard outside of LINK_AUTO. */
        if (sd_id128_equal(machine_id, this_id)) {
                log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
                         "Host and machine ids are equal (%s): refusing to link journals", id);
                if (arg_link_journal == LINK_AUTO)
                        return 0;
                return -EEXIST;
        }

        if (arg_link_journal == LINK_NO)
                return 0;

        /* Make sure the parent directories exist inside the container tree */
        r = userns_mkdir(directory, "/var", 0755, 0, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create /var: %m");

        r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create /var/log: %m");

        r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create /var/log/journal: %m");

        p = strjoina("/var/log/journal/", id);
        q = prefix_roota(directory, p);

        /* If either side is already a mount point, leave it alone */
        if (path_is_mount_point(p, 0) > 0) {
                if (arg_link_journal != LINK_AUTO) {
                        log_error("%s: already a mount point, refusing to use for journal", p);
                        return -EEXIST;
                }

                return 0;
        }

        if (path_is_mount_point(q, 0) > 0) {
                if (arg_link_journal != LINK_AUTO) {
                        log_error("%s: already a mount point, refusing to use for journal", q);
                        return -EEXIST;
                }

                return 0;
        }

        /* Inspect what currently exists at p on the host */
        r = readlink_and_make_absolute(p, &d);
        if (r >= 0) {
                /* p is a symlink. If it already points at q and a guest or
                 * auto link is wanted, only make sure the target exists. */
                if ((arg_link_journal == LINK_GUEST ||
                     arg_link_journal == LINK_AUTO) &&
                    path_equal(d, q)) {

                        r = userns_mkdir(directory, p, 0755, 0, 0);
                        if (r < 0)
                                log_warning_errno(r, "Failed to create directory %s: %m", q);
                        return 0;
                }

                /* Otherwise the stale symlink is removed */
                if (unlink(p) < 0)
                        return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
        } else if (r == -EINVAL) {

                /* p exists but is not a symlink: for LINK_GUEST try to
                 * remove it as a directory to make room for the symlink. */
                if (arg_link_journal == LINK_GUEST &&
                    rmdir(p) < 0) {

                        if (errno == ENOTDIR) {
                                log_error("%s already exists and is neither a symlink nor a directory", p);
                                return r;
                        } else
                                return log_error_errno(errno, "Failed to remove %s: %m", p);
                }
        } else if (r != -ENOENT)
                return log_error_errno(r, "readlink(%s) failed: %m", p);

        if (arg_link_journal == LINK_GUEST) {

                if (symlink(q, p) < 0) {
                        /* With arg_link_journal_try set, a failure here just
                         * skips journal setup */
                        if (arg_link_journal_try) {
                                log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
                                return 0;
                        } else
                                return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
                }

                r = userns_mkdir(directory, p, 0755, 0, 0);
                if (r < 0)
                        log_warning_errno(r, "Failed to create directory %s: %m", q);
                return 0;
        }

        if (arg_link_journal == LINK_HOST) {
                /* don't create parents here -- if the host doesn't have
                 * permanent journal set up, don't force it here */
                r = mkdir(p, 0755);
                if (r < 0) {
                        if (arg_link_journal_try) {
                                log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
                                return 0;
                        } else
                                return log_error_errno(errno, "Failed to create %s: %m", p);
                }

        } else if (access(p, F_OK) < 0)
                /* Remaining mode: only link if the host directory exists */
                return 0;

        if (dir_is_empty(q) == 0)
                log_warning("%s is not empty, proceeding anyway.", q);

        /* Create the mount point inside the container, then bind mount the
         * host directory on top of it */
        r = userns_mkdir(directory, p, 0755, 0, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create %s: %m", q);

        if (mount(p, q, NULL, MS_BIND, NULL) < 0)
                return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");

        return 0;
}
1464
/* Passes the bitwise complement of arg_retain to
 * capability_bounding_set_drop() and returns its result. Presumably this
 * removes every capability not listed in arg_retain from the bounding set;
 * NOTE(review): the meaning of the second argument (false) is not visible
 * here - confirm against the helper. */
static int drop_capabilities(void) {
        return capability_bounding_set_drop(~arg_retain, false);
}
1468
db999e0f
LP
1469static int reset_audit_loginuid(void) {
1470 _cleanup_free_ char *p = NULL;
1471 int r;
1472
1473 if (arg_share_system)
1474 return 0;
1475
1476 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1477 if (r == -ENOENT)
db999e0f 1478 return 0;
f647962d
MS
1479 if (r < 0)
1480 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1481
1482 /* Already reset? */
1483 if (streq(p, "4294967295"))
1484 return 0;
1485
ad118bda 1486 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1487 if (r < 0) {
10a87006
LP
1488 log_error_errno(r,
1489 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1490 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1491 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1492 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1493 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 1494
db999e0f 1495 sleep(5);
77b6e194 1496 }
db999e0f
LP
1497
1498 return 0;
77b6e194
LP
1499}
1500
/* Builds and loads a seccomp filter (default action: allow) when compiled
 * with HAVE_SECCOMP; otherwise does nothing and returns 0.
 *
 * The filter makes each syscall in the table below fail with EPERM unless
 * the capability paired with it is set in arg_retain, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success or when seccomp_load() reports -EINVAL, a negative
 * value (after logging) on any other failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* Capability is retained, so leave its syscall alone */
                if ((arg_retain & (1ULL << blacklist[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work inside containers, and much of the userspace
         * audit hookup fails there. Rather than caring, creation of audit
         * sockets is turned off: socket(AF_NETLINK, *, NETLINK_AUDIT) will
         * fail with EAFNOSUPPORT, which audit userspace takes as a sign that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                /* Treated as "seccomp unavailable" rather than as an error */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1595
785890ac
LP
1596static int setup_propagate(const char *root) {
1597 const char *p, *q;
709f6e46 1598 int r;
785890ac
LP
1599
1600 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1601 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 1602 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
1603 (void) mkdir_p(p, 0600);
1604
709f6e46
MS
1605 r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
1606 if (r < 0)
1607 return log_error_errno(r, "Failed to create /run/systemd: %m");
03cfe0d5 1608
709f6e46
MS
1609 r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
1610 if (r < 0)
1611 return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
03cfe0d5 1612
709f6e46
MS
1613 r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
1614 if (r < 0)
1615 return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 1616
03cfe0d5 1617 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
1618 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1619 return log_error_errno(errno, "Failed to install propagation bind mount.");
1620
1621 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1622 return log_error_errno(errno, "Failed to make propagation mount read-only");
1623
1624 return 0;
1625}
1626
1b9e5b12
LP
1627static int setup_image(char **device_path, int *loop_nr) {
1628 struct loop_info64 info = {
1629 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1630 };
1631 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1632 _cleanup_free_ char* loopdev = NULL;
1633 struct stat st;
1634 int r, nr;
1635
1636 assert(device_path);
1637 assert(loop_nr);
ec16945e 1638 assert(arg_image);
1b9e5b12
LP
1639
1640 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1641 if (fd < 0)
1642 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 1643
4a62c710
MS
1644 if (fstat(fd, &st) < 0)
1645 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
1646
1647 if (S_ISBLK(st.st_mode)) {
1648 char *p;
1649
1650 p = strdup(arg_image);
1651 if (!p)
1652 return log_oom();
1653
1654 *device_path = p;
1655
1656 *loop_nr = -1;
1657
1658 r = fd;
1659 fd = -1;
1660
1661 return r;
1662 }
1663
1664 if (!S_ISREG(st.st_mode)) {
070edd97 1665 log_error("%s is not a regular file or block device.", arg_image);
1b9e5b12
LP
1666 return -EINVAL;
1667 }
1668
1669 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
1670 if (control < 0)
1671 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
1672
1673 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
1674 if (nr < 0)
1675 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
1676
1677 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1678 return log_oom();
1679
1680 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
1681 if (loop < 0)
1682 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 1683
4a62c710
MS
1684 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1685 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
1686
1687 if (arg_read_only)
1688 info.lo_flags |= LO_FLAGS_READ_ONLY;
1689
4a62c710
MS
1690 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1691 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
1692
1693 *device_path = loopdev;
1694 loopdev = NULL;
1695
1696 *loop_nr = nr;
1697
1698 r = loop;
1699 loop = -1;
1700
1701 return r;
1702}
1703
ada4799a
LP
/* Explanatory text on the partition table layouts accepted for disk images.
 * It is a string literal meant to be concatenated directly after a log
 * format string that names the image. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1710
1b9e5b12
LP
1711static int dissect_image(
1712 int fd,
727fd4fd
LP
1713 char **root_device, bool *root_device_rw,
1714 char **home_device, bool *home_device_rw,
1715 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
1716 bool *secondary) {
1717
1718#ifdef HAVE_BLKID
01dc33ce
ZJS
1719 int home_nr = -1, srv_nr = -1;
1720#ifdef GPT_ROOT_NATIVE
1721 int root_nr = -1;
1722#endif
1723#ifdef GPT_ROOT_SECONDARY
1724 int secondary_root_nr = -1;
1725#endif
f6c51a81 1726 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
1b9e5b12
LP
1727 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
1728 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1729 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
1730 _cleanup_udev_unref_ struct udev *udev = NULL;
1731 struct udev_list_entry *first, *item;
f6c51a81 1732 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
c09ef2e4 1733 bool is_gpt, is_mbr, multiple_generic = false;
1b9e5b12
LP
1734 const char *pttype = NULL;
1735 blkid_partlist pl;
1736 struct stat st;
c09ef2e4 1737 unsigned i;
1b9e5b12
LP
1738 int r;
1739
1740 assert(fd >= 0);
1741 assert(root_device);
1742 assert(home_device);
1743 assert(srv_device);
1744 assert(secondary);
ec16945e 1745 assert(arg_image);
1b9e5b12
LP
1746
1747 b = blkid_new_probe();
1748 if (!b)
1749 return log_oom();
1750
1751 errno = 0;
1752 r = blkid_probe_set_device(b, fd, 0, 0);
1753 if (r != 0) {
1754 if (errno == 0)
1755 return log_oom();
1756
56f64d95 1757 log_error_errno(errno, "Failed to set device on blkid probe: %m");
1b9e5b12
LP
1758 return -errno;
1759 }
1760
1761 blkid_probe_enable_partitions(b, 1);
1762 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
1763
1764 errno = 0;
1765 r = blkid_do_safeprobe(b);
1766 if (r == -2 || r == 1) {
ada4799a
LP
1767 log_error("Failed to identify any partition table on\n"
1768 " %s\n"
1769 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1770 return -EINVAL;
1771 } else if (r != 0) {
1772 if (errno == 0)
1773 errno = EIO;
56f64d95 1774 log_error_errno(errno, "Failed to probe: %m");
1b9e5b12
LP
1775 return -errno;
1776 }
1777
48861960 1778 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
ada4799a
LP
1779
1780 is_gpt = streq_ptr(pttype, "gpt");
1781 is_mbr = streq_ptr(pttype, "dos");
1782
1783 if (!is_gpt && !is_mbr) {
1784 log_error("No GPT or MBR partition table discovered on\n"
1785 " %s\n"
1786 PARTITION_TABLE_BLURB, arg_image);
1b9e5b12
LP
1787 return -EINVAL;
1788 }
1789
1790 errno = 0;
1791 pl = blkid_probe_get_partitions(b);
1792 if (!pl) {
1793 if (errno == 0)
1794 return log_oom();
1795
1796 log_error("Failed to list partitions of %s", arg_image);
1797 return -errno;
1798 }
1799
1800 udev = udev_new();
1801 if (!udev)
1802 return log_oom();
1803
4a62c710
MS
1804 if (fstat(fd, &st) < 0)
1805 return log_error_errno(errno, "Failed to stat block device: %m");
1b9e5b12 1806
c09ef2e4
LP
1807 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
1808 if (!d)
1b9e5b12
LP
1809 return log_oom();
1810
c09ef2e4
LP
1811 for (i = 0;; i++) {
1812 int n, m;
1b9e5b12 1813
c09ef2e4
LP
1814 if (i >= 10) {
1815 log_error("Kernel partitions never appeared.");
1816 return -ENXIO;
1817 }
1818
1819 e = udev_enumerate_new(udev);
1820 if (!e)
1821 return log_oom();
1822
1823 r = udev_enumerate_add_match_parent(e, d);
1824 if (r < 0)
1825 return log_oom();
1826
1827 r = udev_enumerate_scan_devices(e);
1828 if (r < 0)
1829 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
1830
1831 /* Count the partitions enumerated by the kernel */
1832 n = 0;
1833 first = udev_enumerate_get_list_entry(e);
1834 udev_list_entry_foreach(item, first)
1835 n++;
1836
1837 /* Count the partitions enumerated by blkid */
1838 m = blkid_partlist_numof_partitions(pl);
1839 if (n == m + 1)
1840 break;
1841 if (n > m + 1) {
1842 log_error("blkid and kernel partition list do not match.");
1843 return -EIO;
1844 }
1845 if (n < m + 1) {
1846 unsigned j;
1847
1848 /* The kernel has probed fewer partitions than
1849 * blkid? Maybe the kernel prober is still
1850 * running or it got EBUSY because udev
1851 * already opened the device. Let's reprobe
1852 * the device, which is a synchronous call
1853 * that waits until probing is complete. */
1854
1855 for (j = 0; j < 20; j++) {
1856
1857 r = ioctl(fd, BLKRRPART, 0);
1858 if (r < 0)
1859 r = -errno;
1860 if (r >= 0 || r != -EBUSY)
1861 break;
1862
1863 /* If something else has the device
1864 * open, such as an udev rule, the
1865 * ioctl will return EBUSY. Since
1866 * there's no way to wait until it
1867 * isn't busy anymore, let's just wait
1868 * a bit, and try again.
1869 *
1870 * This is really something they
1871 * should fix in the kernel! */
1872
1873 usleep(50 * USEC_PER_MSEC);
1874 }
1875
1876 if (r < 0)
1877 return log_error_errno(r, "Failed to reread partition table: %m");
1878 }
1879
1880 e = udev_enumerate_unref(e);
1881 }
1b9e5b12
LP
1882
1883 first = udev_enumerate_get_list_entry(e);
1884 udev_list_entry_foreach(item, first) {
1885 _cleanup_udev_device_unref_ struct udev_device *q;
ada4799a 1886 const char *node;
727fd4fd 1887 unsigned long long flags;
1b9e5b12
LP
1888 blkid_partition pp;
1889 dev_t qn;
1890 int nr;
1891
1892 errno = 0;
1893 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
1894 if (!q) {
1895 if (!errno)
1896 errno = ENOMEM;
1897
56f64d95 1898 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
1b9e5b12
LP
1899 return -errno;
1900 }
1901
1902 qn = udev_device_get_devnum(q);
1903 if (major(qn) == 0)
1904 continue;
1905
1906 if (st.st_rdev == qn)
1907 continue;
1908
1909 node = udev_device_get_devnode(q);
1910 if (!node)
1911 continue;
1912
1913 pp = blkid_partlist_devno_to_partition(pl, qn);
1914 if (!pp)
1915 continue;
1916
727fd4fd 1917 flags = blkid_partition_get_flags(pp);
727fd4fd 1918
1b9e5b12
LP
1919 nr = blkid_partition_get_partno(pp);
1920 if (nr < 0)
1921 continue;
1922
ada4799a
LP
1923 if (is_gpt) {
1924 sd_id128_t type_id;
1925 const char *stype;
1b9e5b12 1926
f6c51a81
LP
1927 if (flags & GPT_FLAG_NO_AUTO)
1928 continue;
1929
ada4799a
LP
1930 stype = blkid_partition_get_type_string(pp);
1931 if (!stype)
1932 continue;
1b9e5b12 1933
ada4799a 1934 if (sd_id128_from_string(stype, &type_id) < 0)
1b9e5b12
LP
1935 continue;
1936
ada4799a 1937 if (sd_id128_equal(type_id, GPT_HOME)) {
727fd4fd 1938
ada4799a
LP
1939 if (home && nr >= home_nr)
1940 continue;
1b9e5b12 1941
ada4799a
LP
1942 home_nr = nr;
1943 home_rw = !(flags & GPT_FLAG_READ_ONLY);
1b9e5b12 1944
ada4799a
LP
1945 r = free_and_strdup(&home, node);
1946 if (r < 0)
1947 return log_oom();
727fd4fd 1948
ada4799a
LP
1949 } else if (sd_id128_equal(type_id, GPT_SRV)) {
1950
1951 if (srv && nr >= srv_nr)
1952 continue;
1953
1954 srv_nr = nr;
1955 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
1956
1957 r = free_and_strdup(&srv, node);
1958 if (r < 0)
1959 return log_oom();
1960 }
1b9e5b12 1961#ifdef GPT_ROOT_NATIVE
ada4799a 1962 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
1b9e5b12 1963
ada4799a
LP
1964 if (root && nr >= root_nr)
1965 continue;
1b9e5b12 1966
ada4799a
LP
1967 root_nr = nr;
1968 root_rw = !(flags & GPT_FLAG_READ_ONLY);
727fd4fd 1969
ada4799a
LP
1970 r = free_and_strdup(&root, node);
1971 if (r < 0)
1972 return log_oom();
1973 }
1b9e5b12
LP
1974#endif
1975#ifdef GPT_ROOT_SECONDARY
ada4799a
LP
1976 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
1977
1978 if (secondary_root && nr >= secondary_root_nr)
1979 continue;
1980
1981 secondary_root_nr = nr;
1982 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
1983
1984 r = free_and_strdup(&secondary_root, node);
1985 if (r < 0)
1986 return log_oom();
1987 }
1988#endif
f6c51a81
LP
1989 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
1990
1991 if (generic)
1992 multiple_generic = true;
1993 else {
1994 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
1995
1996 r = free_and_strdup(&generic, node);
1997 if (r < 0)
1998 return log_oom();
1999 }
2000 }
ada4799a
LP
2001
2002 } else if (is_mbr) {
2003 int type;
1b9e5b12 2004
f6c51a81
LP
2005 if (flags != 0x80) /* Bootable flag */
2006 continue;
2007
ada4799a
LP
2008 type = blkid_partition_get_type(pp);
2009 if (type != 0x83) /* Linux partition */
1b9e5b12
LP
2010 continue;
2011
f6c51a81
LP
2012 if (generic)
2013 multiple_generic = true;
2014 else {
2015 generic_rw = true;
727fd4fd 2016
f6c51a81
LP
2017 r = free_and_strdup(&root, node);
2018 if (r < 0)
2019 return log_oom();
2020 }
1b9e5b12 2021 }
1b9e5b12
LP
2022 }
2023
1b9e5b12
LP
2024 if (root) {
2025 *root_device = root;
2026 root = NULL;
727fd4fd
LP
2027
2028 *root_device_rw = root_rw;
1b9e5b12
LP
2029 *secondary = false;
2030 } else if (secondary_root) {
2031 *root_device = secondary_root;
2032 secondary_root = NULL;
727fd4fd
LP
2033
2034 *root_device_rw = secondary_root_rw;
1b9e5b12 2035 *secondary = true;
f6c51a81
LP
2036 } else if (generic) {
2037
2038 /* There were no partitions with precise meanings
2039 * around, but we found generic partitions. In this
2040 * case, if there's only one, we can go ahead and boot
2041 * it, otherwise we bail out, because we really cannot
2042 * make any sense of it. */
2043
2044 if (multiple_generic) {
2045 log_error("Identified multiple bootable Linux partitions on\n"
2046 " %s\n"
2047 PARTITION_TABLE_BLURB, arg_image);
2048 return -EINVAL;
2049 }
2050
2051 *root_device = generic;
2052 generic = NULL;
2053
2054 *root_device_rw = generic_rw;
2055 *secondary = false;
2056 } else {
2057 log_error("Failed to identify root partition in disk image\n"
2058 " %s\n"
2059 PARTITION_TABLE_BLURB, arg_image);
2060 return -EINVAL;
1b9e5b12
LP
2061 }
2062
2063 if (home) {
2064 *home_device = home;
2065 home = NULL;
727fd4fd
LP
2066
2067 *home_device_rw = home_rw;
1b9e5b12
LP
2068 }
2069
2070 if (srv) {
2071 *srv_device = srv;
2072 srv = NULL;
727fd4fd
LP
2073
2074 *srv_device_rw = srv_rw;
1b9e5b12
LP
2075 }
2076
2077 return 0;
2078#else
2079 log_error("--image= is not supported, compiled without blkid support.");
15411c0c 2080 return -EOPNOTSUPP;
1b9e5b12
LP
2081#endif
2082}
2083
/*
 * Probes the file system type of the block device node 'what' with libblkid
 * and mounts it on 'where', or on 'where' with 'directory' appended when
 * 'directory' is non-NULL.
 *
 * The mount is read-only when 'rw' is false or when arg_read_only is set.
 * LUKS containers are rejected.
 *
 * Returns 0 on success, a negative errno-style value on failure. Without
 * blkid support this always fails with -EOPNOTSUPP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally requested read-only mode overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* No errno set: treat the failure as an allocation failure */
                if (errno == 0)
                        return log_oom();

                /* Return the logger's result instead of re-reading errno after logging */
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -1 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Capture the error code before logging, fall back to EINVAL if none was set */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2147
727fd4fd
LP
2148static int mount_devices(
2149 const char *where,
2150 const char *root_device, bool root_device_rw,
2151 const char *home_device, bool home_device_rw,
2152 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2153 int r;
2154
2155 assert(where);
2156
2157 if (root_device) {
727fd4fd 2158 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2159 if (r < 0)
2160 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2161 }
2162
2163 if (home_device) {
727fd4fd 2164 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2165 if (r < 0)
2166 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2167 }
2168
2169 if (srv_device) {
727fd4fd 2170 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2171 if (r < 0)
2172 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2173 }
2174
2175 return 0;
2176}
2177
2178static void loop_remove(int nr, int *image_fd) {
2179 _cleanup_close_ int control = -1;
e8c8ddcc 2180 int r;
1b9e5b12
LP
2181
2182 if (nr < 0)
2183 return;
2184
2185 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2186 r = ioctl(*image_fd, LOOP_CLR_FD);
2187 if (r < 0)
5e4074aa 2188 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2189 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2190 }
2191
2192 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2193 if (control < 0) {
56f64d95 2194 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2195 return;
e8c8ddcc 2196 }
1b9e5b12 2197
e8c8ddcc
TG
2198 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2199 if (r < 0)
5e4074aa 2200 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2201}
2202
113cea80 2203/*
6d416b9c
LS
2204 * Return values:
2205 * < 0 : wait_for_terminate() failed to get the state of the
2206 * container, the container was terminated by a signal, or
2207 * failed for an unknown reason. No change is made to the
2208 * container argument.
2209 * > 0 : The program executed in the container terminated with an
2210 * error. The exit code of the program executed in the
919699ec
LP
2211 * container is returned. The container argument has been set
2212 * to CONTAINER_TERMINATED.
6d416b9c
LS
2213 * 0 : The container is being rebooted, has been shut down or exited
2214 * successfully. The container argument has been set to either
2215 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2216 *
6d416b9c
LS
2217 * That is, success is indicated by a return value of zero, and an
2218 * error is indicated by a non-zero value.
113cea80
DH
2219 */
2220static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2221 siginfo_t status;
919699ec 2222 int r;
113cea80
DH
2223
2224 r = wait_for_terminate(pid, &status);
f647962d
MS
2225 if (r < 0)
2226 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2227
2228 switch (status.si_code) {
fddbb89c 2229
113cea80 2230 case CLD_EXITED:
919699ec
LP
2231 if (status.si_status == 0) {
2232 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2233
fddbb89c 2234 } else
919699ec 2235 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2236
919699ec
LP
2237 *container = CONTAINER_TERMINATED;
2238 return status.si_status;
113cea80
DH
2239
2240 case CLD_KILLED:
2241 if (status.si_status == SIGINT) {
113cea80 2242
919699ec 2243 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2244 *container = CONTAINER_TERMINATED;
919699ec
LP
2245 return 0;
2246
113cea80 2247 } else if (status.si_status == SIGHUP) {
113cea80 2248
919699ec 2249 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2250 *container = CONTAINER_REBOOTED;
919699ec 2251 return 0;
113cea80 2252 }
919699ec 2253
113cea80
DH
2254 /* CLD_KILLED fallthrough */
2255
2256 case CLD_DUMPED:
fddbb89c 2257 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2258 return -EIO;
113cea80
DH
2259
2260 default:
fddbb89c 2261 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2262 return -EIO;
113cea80
DH
2263 }
2264
2265 return r;
2266}
2267
023fb90b
LP
2268static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2269 pid_t pid;
2270
2271 pid = PTR_TO_UINT32(userdata);
2272 if (pid > 0) {
c6c8f6e2 2273 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
2274 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2275 sd_event_source_set_userdata(s, NULL);
2276 return 0;
2277 }
2278 }
2279
2280 sd_event_exit(sd_event_source_get_event(s), 0);
2281 return 0;
2282}
2283
ec16945e 2284static int determine_names(void) {
1b9cebf6 2285 int r;
ec16945e 2286
c1521918
LP
2287 if (arg_template && !arg_directory && arg_machine) {
2288
2289 /* If --template= was specified then we should not
2290 * search for a machine, but instead create a new one
2291 * in /var/lib/machine. */
2292
2293 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2294 if (!arg_directory)
2295 return log_oom();
2296 }
2297
ec16945e 2298 if (!arg_image && !arg_directory) {
1b9cebf6
LP
2299 if (arg_machine) {
2300 _cleanup_(image_unrefp) Image *i = NULL;
2301
2302 r = image_find(arg_machine, &i);
2303 if (r < 0)
2304 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2305 else if (r == 0) {
2306 log_error("No image for machine '%s': %m", arg_machine);
2307 return -ENOENT;
2308 }
2309
aceac2f0 2310 if (i->type == IMAGE_RAW)
0f03c2a4 2311 r = free_and_strdup(&arg_image, i->path);
1b9cebf6 2312 else
0f03c2a4 2313 r = free_and_strdup(&arg_directory, i->path);
1b9cebf6
LP
2314 if (r < 0)
2315 return log_error_errno(r, "Invalid image directory: %m");
2316
aee327b8
LP
2317 if (!arg_ephemeral)
2318 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 2319 } else
ec16945e
LP
2320 arg_directory = get_current_dir_name();
2321
1b9cebf6
LP
2322 if (!arg_directory && !arg_machine) {
2323 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
2324 return -EINVAL;
2325 }
2326 }
2327
2328 if (!arg_machine) {
b9ba4dab
LP
2329 if (arg_directory && path_equal(arg_directory, "/"))
2330 arg_machine = gethostname_malloc();
2331 else
2332 arg_machine = strdup(basename(arg_image ?: arg_directory));
2333
ec16945e
LP
2334 if (!arg_machine)
2335 return log_oom();
2336
ae691c1d 2337 hostname_cleanup(arg_machine);
ec16945e
LP
2338 if (!machine_name_is_valid(arg_machine)) {
2339 log_error("Failed to determine machine name automatically, please use -M.");
2340 return -EINVAL;
2341 }
b9ba4dab
LP
2342
2343 if (arg_ephemeral) {
2344 char *b;
2345
2346 /* Add a random suffix when this is an
2347 * ephemeral machine, so that we can run many
2348 * instances at once without manually having
2349 * to specify -M each time. */
2350
2351 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2352 return log_oom();
2353
2354 free(arg_machine);
2355 arg_machine = b;
2356 }
ec16945e
LP
2357 }
2358
2359 return 0;
2360}
2361
03cfe0d5 2362static int determine_uid_shift(const char *directory) {
6dac160c
LP
2363 int r;
2364
03cfe0d5
LP
2365 if (!arg_userns) {
2366 arg_uid_shift = 0;
6dac160c 2367 return 0;
03cfe0d5 2368 }
6dac160c
LP
2369
2370 if (arg_uid_shift == UID_INVALID) {
2371 struct stat st;
2372
03cfe0d5 2373 r = stat(directory, &st);
6dac160c 2374 if (r < 0)
03cfe0d5 2375 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
2376
2377 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
2378
2379 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 2380 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
2381 return -EINVAL;
2382 }
2383
2384 arg_uid_range = UINT32_C(0x10000);
2385 }
2386
2387 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2388 log_error("UID base too high for UID range.");
2389 return -EINVAL;
2390 }
2391
2392 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2393 return 0;
2394}
2395
03cfe0d5
LP
2396static int inner_child(
2397 Barrier *barrier,
2398 const char *directory,
2399 bool secondary,
2400 int kmsg_socket,
2401 int rtnl_socket,
f757855e 2402 FDSet *fds) {
69c79d3c 2403
03cfe0d5
LP
2404 _cleanup_free_ char *home = NULL;
2405 unsigned n_env = 2;
2406 const char *envp[] = {
2407 "PATH=" DEFAULT_PATH_SPLIT_USR,
2408 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2409 NULL, /* TERM */
2410 NULL, /* HOME */
2411 NULL, /* USER */
2412 NULL, /* LOGNAME */
2413 NULL, /* container_uuid */
2414 NULL, /* LISTEN_FDS */
2415 NULL, /* LISTEN_PID */
2416 NULL
2417 };
88213476 2418
2371271c 2419 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 2420 int r;
88213476 2421
03cfe0d5
LP
2422 assert(barrier);
2423 assert(directory);
2424 assert(kmsg_socket >= 0);
88213476 2425
efdb0237
LP
2426 cg_unified_flush();
2427
03cfe0d5
LP
2428 if (arg_userns) {
2429 /* Tell the parent, that it now can write the UID map. */
2430 (void) barrier_place(barrier); /* #1 */
7027ff61 2431
03cfe0d5
LP
2432 /* Wait until the parent wrote the UID map */
2433 if (!barrier_place_and_sync(barrier)) { /* #2 */
2434 log_error("Parent died too early");
2435 return -ESRCH;
2436 }
88213476
LP
2437 }
2438
d1678248 2439 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2440 if (r < 0)
2441 return r;
2442
d8fc6a00
LP
2443 r = mount_sysfs(NULL);
2444 if (r < 0)
2445 return r;
2446
03cfe0d5
LP
2447 /* Wait until we are cgroup-ified, so that we
2448 * can mount the right cgroup path writable */
2449 if (!barrier_place_and_sync(barrier)) { /* #3 */
2450 log_error("Parent died too early");
2451 return -ESRCH;
88213476
LP
2452 }
2453
e83bebef 2454 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
2455 if (r < 0)
2456 return r;
ec16945e 2457
03cfe0d5
LP
2458 r = reset_uid_gid();
2459 if (r < 0)
2460 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 2461
03cfe0d5
LP
2462 r = setup_boot_id(NULL);
2463 if (r < 0)
2464 return r;
ec16945e 2465
03cfe0d5
LP
2466 r = setup_kmsg(NULL, kmsg_socket);
2467 if (r < 0)
2468 return r;
2469 kmsg_socket = safe_close(kmsg_socket);
ec16945e 2470
03cfe0d5 2471 umask(0022);
30535c16 2472
03cfe0d5
LP
2473 if (setsid() < 0)
2474 return log_error_errno(errno, "setsid() failed: %m");
2475
2476 if (arg_private_network)
2477 loopback_setup();
2478
7a8f6325
LP
2479 if (arg_expose_ports) {
2480 r = expose_port_send_rtnl(rtnl_socket);
2481 if (r < 0)
2482 return r;
2483 rtnl_socket = safe_close(rtnl_socket);
2484 }
03cfe0d5 2485
709f6e46
MS
2486 r = drop_capabilities();
2487 if (r < 0)
2488 return log_error_errno(r, "drop_capabilities() failed: %m");
03cfe0d5
LP
2489
2490 setup_hostname();
2491
050f7277 2492 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
2493 if (personality(arg_personality) < 0)
2494 return log_error_errno(errno, "personality() failed: %m");
2495 } else if (secondary) {
2496 if (personality(PER_LINUX32) < 0)
2497 return log_error_errno(errno, "personality() failed: %m");
2498 }
2499
2500#ifdef HAVE_SELINUX
2501 if (arg_selinux_context)
2502 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2503 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2504#endif
2505
ee645080 2506 r = change_uid_gid(arg_user, &home);
03cfe0d5
LP
2507 if (r < 0)
2508 return r;
2509
2510 envp[n_env] = strv_find_prefix(environ, "TERM=");
2511 if (envp[n_env])
2512 n_env ++;
2513
2514 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2515 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2516 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2517 return log_oom();
2518
2519 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2520 char as_uuid[37];
2521
2522 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2523 return log_oom();
2524 }
2525
2526 if (fdset_size(fds) > 0) {
2527 r = fdset_cloexec(fds, false);
2528 if (r < 0)
2529 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2530
2531 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2532 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2533 return log_oom();
2534 }
2535
2371271c
TG
2536 env_use = strv_env_merge(2, envp, arg_setenv);
2537 if (!env_use)
2538 return log_oom();
03cfe0d5
LP
2539
2540 /* Let the parent know that we are ready and
2541 * wait until the parent is ready with the
2542 * setup, too... */
2543 if (!barrier_place_and_sync(barrier)) { /* #4 */
2544 log_error("Parent died too early");
2545 return -ESRCH;
2546 }
2547
2548 /* Now, explicitly close the log, so that we
2549 * then can close all remaining fds. Closing
2550 * the log explicitly first has the benefit
2551 * that the logging subsystem knows about it,
2552 * and is thus ready to be reopened should we
2553 * need it again. Note that the other fds
2554 * closed here are at least the locking and
2555 * barrier fds. */
2556 log_close();
2557 (void) fdset_close_others(fds);
2558
2559 if (arg_boot) {
2560 char **a;
2561 size_t m;
2562
2563 /* Automatically search for the init system */
2564
f757855e 2565 m = 1 + strv_length(arg_parameters);
03cfe0d5 2566 a = newa(char*, m + 1);
f757855e
LP
2567 if (strv_isempty(arg_parameters))
2568 a[1] = NULL;
2569 else
2570 memcpy(a + 1, arg_parameters, m * sizeof(char*));
03cfe0d5
LP
2571
2572 a[0] = (char*) "/usr/lib/systemd/systemd";
2573 execve(a[0], a, env_use);
2574
2575 a[0] = (char*) "/lib/systemd/systemd";
2576 execve(a[0], a, env_use);
2577
2578 a[0] = (char*) "/sbin/init";
2579 execve(a[0], a, env_use);
f757855e
LP
2580 } else if (!strv_isempty(arg_parameters))
2581 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 2582 else {
f757855e 2583 chdir(home ?: "/root");
03cfe0d5
LP
2584 execle("/bin/bash", "-bash", NULL, env_use);
2585 execle("/bin/sh", "-sh", NULL, env_use);
2586 }
2587
2588 (void) log_open();
2589 return log_error_errno(errno, "execv() failed: %m");
2590}
2591
2592static int outer_child(
2593 Barrier *barrier,
2594 const char *directory,
2595 const char *console,
2596 const char *root_device, bool root_device_rw,
2597 const char *home_device, bool home_device_rw,
2598 const char *srv_device, bool srv_device_rw,
2599 bool interactive,
2600 bool secondary,
2601 int pid_socket,
2602 int kmsg_socket,
2603 int rtnl_socket,
825d5287 2604 int uid_shift_socket,
f757855e 2605 FDSet *fds) {
03cfe0d5
LP
2606
2607 pid_t pid;
2608 ssize_t l;
2609 int r;
2610
2611 assert(barrier);
2612 assert(directory);
2613 assert(console);
2614 assert(pid_socket >= 0);
2615 assert(kmsg_socket >= 0);
2616
efdb0237
LP
2617 cg_unified_flush();
2618
03cfe0d5
LP
2619 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2620 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2621
2622 if (interactive) {
2623 close_nointr(STDIN_FILENO);
2624 close_nointr(STDOUT_FILENO);
2625 close_nointr(STDERR_FILENO);
2626
2627 r = open_terminal(console, O_RDWR);
2628 if (r != STDIN_FILENO) {
2629 if (r >= 0) {
2630 safe_close(r);
2631 r = -EINVAL;
2632 }
2633
2634 return log_error_errno(r, "Failed to open console: %m");
2635 }
2636
2637 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2638 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2639 return log_error_errno(errno, "Failed to duplicate console: %m");
2640 }
2641
2642 r = reset_audit_loginuid();
2643 if (r < 0)
2644 return r;
2645
2646 /* Mark everything as slave, so that we still
2647 * receive mounts from the real root, but don't
2648 * propagate mounts to the real root. */
2649 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2650 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2651
2652 r = mount_devices(directory,
2653 root_device, root_device_rw,
2654 home_device, home_device_rw,
2655 srv_device, srv_device_rw);
2656 if (r < 0)
2657 return r;
2658
391567f4
LP
2659 r = determine_uid_shift(directory);
2660 if (r < 0)
2661 return r;
2662
825d5287
RM
2663 if (arg_userns) {
2664 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2665 if (l < 0)
2666 return log_error_errno(errno, "Failed to send UID shift: %m");
2667 if (l != sizeof(arg_uid_shift)) {
2668 log_error("Short write while sending UID shift.");
2669 return -EIO;
2670 }
2671 }
2672
03cfe0d5
LP
2673 /* Turn directory into bind mount */
2674 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2675 return log_error_errno(errno, "Failed to make bind mount: %m");
2676
e83bebef 2677 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2678 if (r < 0)
2679 return r;
2680
e83bebef 2681 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
2682 if (r < 0)
2683 return r;
2684
03cfe0d5
LP
2685 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2686 if (r < 0)
2687 return r;
2688
03cfe0d5
LP
2689 if (arg_read_only) {
2690 r = bind_remount_recursive(directory, true);
2691 if (r < 0)
2692 return log_error_errno(r, "Failed to make tree read-only: %m");
2693 }
2694
d1678248 2695 r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2696 if (r < 0)
2697 return r;
2698
07fa00f9
LP
2699 r = copy_devnodes(directory);
2700 if (r < 0)
03cfe0d5
LP
2701 return r;
2702
2703 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2704
07fa00f9
LP
2705 r = setup_pts(directory);
2706 if (r < 0)
03cfe0d5
LP
2707 return r;
2708
2709 r = setup_propagate(directory);
2710 if (r < 0)
2711 return r;
2712
2713 r = setup_dev_console(directory, console);
2714 if (r < 0)
2715 return r;
2716
2717 r = setup_seccomp();
2718 if (r < 0)
2719 return r;
2720
2721 r = setup_timezone(directory);
2722 if (r < 0)
2723 return r;
2724
2725 r = setup_resolv_conf(directory);
2726 if (r < 0)
2727 return r;
2728
2729 r = setup_journal(directory);
2730 if (r < 0)
2731 return r;
2732
e83bebef 2733 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2734 if (r < 0)
2735 return r;
2736
e83bebef 2737 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
2738 if (r < 0)
2739 return r;
2740
2741 r = mount_move_root(directory);
2742 if (r < 0)
2743 return log_error_errno(r, "Failed to move root directory: %m");
2744
2745 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2746 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2747 (arg_private_network ? CLONE_NEWNET : 0) |
2748 (arg_userns ? CLONE_NEWUSER : 0),
2749 NULL);
2750 if (pid < 0)
2751 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
2752 if (pid == 0) {
2753 pid_socket = safe_close(pid_socket);
825d5287 2754 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
2755
2756 /* The inner child has all namespaces that are
2757 * requested, so that we all are owned by the user if
2758 * user namespaces are turned on. */
2759
f757855e 2760 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
2761 if (r < 0)
2762 _exit(EXIT_FAILURE);
2763
2764 _exit(EXIT_SUCCESS);
2765 }
2766
2767 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2768 if (l < 0)
2769 return log_error_errno(errno, "Failed to send PID: %m");
2770 if (l != sizeof(pid)) {
2771 log_error("Short write while sending PID.");
2772 return -EIO;
2773 }
2774
2775 pid_socket = safe_close(pid_socket);
327e26d6
KN
2776 kmsg_socket = safe_close(kmsg_socket);
2777 rtnl_socket = safe_close(rtnl_socket);
03cfe0d5
LP
2778
2779 return 0;
2780}
2781
2782static int setup_uid_map(pid_t pid) {
2783 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2784 int r;
2785
2786 assert(pid > 1);
2787
2788 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2789 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 2790 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2791 if (r < 0)
2792 return log_error_errno(r, "Failed to write UID map: %m");
2793
2794 /* We always assign the same UID and GID ranges */
2795 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 2796 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
2797 if (r < 0)
2798 return log_error_errno(r, "Failed to write GID map: %m");
2799
2800 return 0;
2801}
2802
f757855e
LP
2803static int load_settings(void) {
2804 _cleanup_(settings_freep) Settings *settings = NULL;
2805 _cleanup_fclose_ FILE *f = NULL;
2806 _cleanup_free_ char *p = NULL;
2807 const char *fn, *i;
2808 int r;
2809
2810 /* If all settings are masked, there's no point in looking for
2811 * the settings file */
2812 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2813 return 0;
2814
2815 fn = strjoina(arg_machine, ".nspawn");
2816
2817 /* We first look in the admin's directories in /etc and /run */
2818 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2819 _cleanup_free_ char *j = NULL;
2820
2821 j = strjoin(i, "/", fn, NULL);
2822 if (!j)
2823 return log_oom();
2824
2825 f = fopen(j, "re");
2826 if (f) {
2827 p = j;
2828 j = NULL;
2829
2830 /* By default we trust configuration from /etc and /run */
2831 if (arg_settings_trusted < 0)
2832 arg_settings_trusted = true;
2833
2834 break;
2835 }
2836
2837 if (errno != ENOENT)
2838 return log_error_errno(errno, "Failed to open %s: %m", j);
2839 }
2840
2841 if (!f) {
2842 /* After that, let's look for a file next to the
2843 * actual image we shall boot. */
2844
2845 if (arg_image) {
2846 p = file_in_same_dir(arg_image, fn);
2847 if (!p)
2848 return log_oom();
2849 } else if (arg_directory) {
2850 p = file_in_same_dir(arg_directory, fn);
2851 if (!p)
2852 return log_oom();
2853 }
2854
2855 if (p) {
2856 f = fopen(p, "re");
2857 if (!f && errno != ENOENT)
2858 return log_error_errno(errno, "Failed to open %s: %m", p);
2859
2860 /* By default we do not trust configuration from /var/lib/machines */
2861 if (arg_settings_trusted < 0)
2862 arg_settings_trusted = false;
2863 }
2864 }
2865
2866 if (!f)
2867 return 0;
2868
2869 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2870
2871 r = settings_load(f, p, &settings);
2872 if (r < 0)
2873 return r;
2874
2875 /* Copy over bits from the settings, unless they have been
2876 * explicitly masked by command line switches. */
2877
2878 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2879 settings->boot >= 0) {
2880 arg_boot = settings->boot;
2881
2882 strv_free(arg_parameters);
2883 arg_parameters = settings->parameters;
2884 settings->parameters = NULL;
2885 }
2886
2887 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2888 settings->environment) {
2889 strv_free(arg_setenv);
2890 arg_setenv = settings->environment;
2891 settings->environment = NULL;
2892 }
2893
2894 if ((arg_settings_mask & SETTING_USER) == 0 &&
2895 settings->user) {
2896 free(arg_user);
2897 arg_user = settings->user;
2898 settings->user = NULL;
2899 }
2900
2901 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
0e265674 2902 uint64_t plus;
f757855e 2903
0e265674
LP
2904 plus = settings->capability;
2905 if (settings_private_network(settings))
2906 plus |= (1ULL << CAP_NET_ADMIN);
2907
2908 if (!arg_settings_trusted && plus != 0) {
2909 if (settings->capability != 0)
2910 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2911 } else
2912 arg_retain |= plus;
f757855e
LP
2913
2914 arg_retain &= ~settings->drop_capability;
2915 }
2916
2917 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2918 settings->kill_signal > 0)
2919 arg_kill_signal = settings->kill_signal;
2920
2921 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2922 settings->personality != PERSONALITY_INVALID)
2923 arg_personality = settings->personality;
2924
2925 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2926 !sd_id128_is_null(settings->machine_id)) {
2927
2928 if (!arg_settings_trusted)
2929 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2930 else
2931 arg_uuid = settings->machine_id;
2932 }
2933
2934 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2935 settings->read_only >= 0)
2936 arg_read_only = settings->read_only;
2937
2938 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2939 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2940 arg_volatile_mode = settings->volatile_mode;
2941
2942 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2943 settings->n_custom_mounts > 0) {
2944
2945 if (!arg_settings_trusted)
2946 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2947 else {
2948 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2949 arg_custom_mounts = settings->custom_mounts;
2950 arg_n_custom_mounts = settings->n_custom_mounts;
2951
2952 settings->custom_mounts = NULL;
2953 settings->n_custom_mounts = 0;
2954 }
2955 }
2956
2957 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2958 (settings->private_network >= 0 ||
2959 settings->network_veth >= 0 ||
2960 settings->network_bridge ||
2961 settings->network_interfaces ||
2962 settings->network_macvlan ||
2963 settings->network_ipvlan)) {
2964
2965 if (!arg_settings_trusted)
2966 log_warning("Ignoring network settings, file %s is not trusted.", p);
2967 else {
0e265674
LP
2968 arg_network_veth = settings_private_network(settings);
2969 arg_private_network = settings_private_network(settings);
2970
f757855e
LP
2971 strv_free(arg_network_interfaces);
2972 arg_network_interfaces = settings->network_interfaces;
2973 settings->network_interfaces = NULL;
2974
2975 strv_free(arg_network_macvlan);
2976 arg_network_macvlan = settings->network_macvlan;
2977 settings->network_macvlan = NULL;
2978
2979 strv_free(arg_network_ipvlan);
2980 arg_network_ipvlan = settings->network_ipvlan;
2981 settings->network_ipvlan = NULL;
2982
2983 free(arg_network_bridge);
2984 arg_network_bridge = settings->network_bridge;
2985 settings->network_bridge = NULL;
f757855e
LP
2986 }
2987 }
2988
2989 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2990 settings->expose_ports) {
2991
2992 if (!arg_settings_trusted)
2993 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
2994 else {
2995 expose_port_free_all(arg_expose_ports);
2996 arg_expose_ports = settings->expose_ports;
2997 settings->expose_ports = NULL;
2998 }
2999 }
3000
3001 return 0;
3002}
3003
03cfe0d5
LP
3004int main(int argc, char *argv[]) {
3005
3006 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3007 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3008 _cleanup_close_ int master = -1, image_fd = -1;
3009 _cleanup_fdset_free_ FDSet *fds = NULL;
3010 int r, n_fd_passed, loop_nr = -1;
3011 char veth_name[IFNAMSIZ];
3012 bool secondary = false, remove_subvol = false;
72c0a2c2 3013 sigset_t mask_chld;
03cfe0d5
LP
3014 pid_t pid = 0;
3015 int ret = EXIT_SUCCESS;
3016 union in_addr_union exposed = {};
3017 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3018 bool interactive;
3019
3020 log_parse_environment();
3021 log_open();
3022
3023 r = parse_argv(argc, argv);
3024 if (r <= 0)
3025 goto finish;
3026
03cfe0d5
LP
3027 if (geteuid() != 0) {
3028 log_error("Need to be root.");
3029 r = -EPERM;
3030 goto finish;
3031 }
f757855e
LP
3032 r = determine_names();
3033 if (r < 0)
3034 goto finish;
3035
3036 r = load_settings();
3037 if (r < 0)
3038 goto finish;
3039
3040 r = verify_arguments();
3041 if (r < 0)
3042 goto finish;
03cfe0d5
LP
3043
3044 n_fd_passed = sd_listen_fds(false);
3045 if (n_fd_passed > 0) {
3046 r = fdset_new_listen_fds(&fds, false);
3047 if (r < 0) {
3048 log_error_errno(r, "Failed to collect file descriptors: %m");
3049 goto finish;
3050 }
3051 }
3052
3053 if (arg_directory) {
3054 assert(!arg_image);
3055
3056 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3057 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3058 r = -EINVAL;
3059 goto finish;
3060 }
3061
3062 if (arg_ephemeral) {
3063 _cleanup_free_ char *np = NULL;
3064
3065 /* If the specified path is a mount point we
3066 * generate the new snapshot immediately
3067 * inside it under a random name. However if
3068 * the specified is not a mount point we
3069 * create the new snapshot in the parent
3070 * directory, just next to it. */
e26d6ce5 3071 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
3072 if (r < 0) {
3073 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3074 goto finish;
3075 }
3076 if (r > 0)
770b5ce4 3077 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 3078 else
770b5ce4 3079 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
3080 if (r < 0) {
3081 log_error_errno(r, "Failed to generate name for snapshot: %m");
3082 goto finish;
3083 }
3084
3085 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3086 if (r < 0) {
3087 log_error_errno(r, "Failed to lock %s: %m", np);
3088 goto finish;
3089 }
3090
5bcd08db 3091 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
03cfe0d5
LP
3092 if (r < 0) {
3093 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3094 goto finish;
ec16945e
LP
3095 }
3096
3097 free(arg_directory);
3098 arg_directory = np;
8a16a7b4 3099 np = NULL;
ec16945e
LP
3100
3101 remove_subvol = true;
30535c16
LP
3102
3103 } else {
3104 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3105 if (r == -EBUSY) {
3106 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3107 goto finish;
3108 }
3109 if (r < 0) {
3110 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3111 return r;
3112 }
3113
3114 if (arg_template) {
5bcd08db 3115 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE | BTRFS_SNAPSHOT_QUOTA);
30535c16
LP
3116 if (r == -EEXIST) {
3117 if (!arg_quiet)
3118 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3119 } else if (r < 0) {
83521414 3120 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3121 goto finish;
3122 } else {
3123 if (!arg_quiet)
3124 log_info("Populated %s from template %s.", arg_directory, arg_template);
3125 }
3126 }
ec16945e
LP
3127 }
3128
1b9e5b12
LP
3129 if (arg_boot) {
3130 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3131 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3132 r = -EINVAL;
1b9e5b12
LP
3133 goto finish;
3134 }
3135 } else {
3136 const char *p;
3137
16fb773e
LP
3138 p = strjoina(arg_directory, "/usr/");
3139 if (laccess(p, F_OK) < 0) {
3140 log_error("Directory %s doesn't look like it has an OS tree. Refusing.", arg_directory);
ec16945e 3141 r = -EINVAL;
1b9e5b12 3142 goto finish;
1b9e5b12
LP
3143 }
3144 }
ec16945e 3145
6b9132a9 3146 } else {
1b9e5b12 3147 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3148
ec16945e
LP
3149 assert(arg_image);
3150 assert(!arg_template);
3151
30535c16
LP
3152 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3153 if (r == -EBUSY) {
3154 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3155 goto finish;
3156 }
3157 if (r < 0) {
3158 r = log_error_errno(r, "Failed to create image lock: %m");
3159 goto finish;
3160 }
3161
1b9e5b12 3162 if (!mkdtemp(template)) {
56f64d95 3163 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3164 r = -errno;
6b9132a9 3165 goto finish;
1b9e5b12 3166 }
6b9132a9 3167
1b9e5b12
LP
3168 arg_directory = strdup(template);
3169 if (!arg_directory) {
3170 r = log_oom();
3171 goto finish;
6b9132a9 3172 }
88213476 3173
1b9e5b12
LP
3174 image_fd = setup_image(&device_path, &loop_nr);
3175 if (image_fd < 0) {
3176 r = image_fd;
842f3b0f
LP
3177 goto finish;
3178 }
1b9e5b12 3179
4d9f07b4
LP
3180 r = dissect_image(image_fd,
3181 &root_device, &root_device_rw,
3182 &home_device, &home_device_rw,
3183 &srv_device, &srv_device_rw,
3184 &secondary);
1b9e5b12
LP
3185 if (r < 0)
3186 goto finish;
842f3b0f 3187 }
842f3b0f 3188
5a8af538
LP
3189 r = custom_mounts_prepare();
3190 if (r < 0)
3191 goto finish;
3192
03cfe0d5
LP
3193 interactive =
3194 isatty(STDIN_FILENO) > 0 &&
3195 isatty(STDOUT_FILENO) > 0;
9c857b9d 3196
db7feb7e
LP
3197 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3198 if (master < 0) {
ec16945e 3199 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3200 goto finish;
3201 }
3202
611b312b
LP
3203 r = ptsname_malloc(master, &console);
3204 if (r < 0) {
3205 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3206 goto finish;
3207 }
3208
a258bf26 3209 if (unlockpt(master) < 0) {
ec16945e 3210 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3211 goto finish;
3212 }
3213
9c857b9d
LP
3214 if (!arg_quiet)
3215 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3216 arg_machine, arg_image ?: arg_directory);
3217
72c0a2c2 3218 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 3219
023fb90b
LP
3220 assert_se(sigemptyset(&mask_chld) == 0);
3221 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3222
03cfe0d5
LP
3223 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3224 r = log_error_errno(errno, "Failed to become subreaper: %m");
3225 goto finish;
3226 }
3227
d87be9b0 3228 for (;;) {
97044145 3229 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
113cea80 3230 ContainerStatus container_status;
7566e267 3231 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 3232 static const struct sigaction sa = {
189d5bac 3233 .sa_handler = nop_signal_handler,
e866af3a
DH
3234 .sa_flags = SA_NOCLDSTOP,
3235 };
03cfe0d5
LP
3236 int ifi = 0;
3237 ssize_t l;
dbb60d69
LP
3238 _cleanup_event_unref_ sd_event *event = NULL;
3239 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3240 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3241 char last_char = 0;
e866af3a 3242
7566e267 3243 r = barrier_create(&barrier);
a2da110b 3244 if (r < 0) {
da927ba9 3245 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3246 goto finish;
3247 }
3248
4610de50 3249 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
6d0b55c2
LP
3250 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3251 goto finish;
3252 }
3253
4610de50 3254 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
6d0b55c2
LP
3255 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3256 goto finish;
3257 }
3258
4610de50 3259 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
03cfe0d5
LP
3260 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3261 goto finish;
3262 }
3263
825d5287 3264 if (arg_userns)
4610de50 3265 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
825d5287
RM
3266 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3267 goto finish;
3268 }
3269
e866af3a
DH
3270 /* Child can be killed before execv(), so handle SIGCHLD
3271 * in order to interrupt parent's blocking calls and
3272 * give it a chance to call wait() and terminate. */
3273 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3274 if (r < 0) {
ec16945e 3275 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3276 goto finish;
3277 }
3278
e866af3a
DH
3279 r = sigaction(SIGCHLD, &sa, NULL);
3280 if (r < 0) {
ec16945e 3281 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3282 goto finish;
3283 }
3284
03cfe0d5 3285 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
3286 if (pid < 0) {
3287 if (errno == EINVAL)
ec16945e 3288 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3289 else
ec16945e 3290 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3291
d87be9b0
LP
3292 goto finish;
3293 }
a258bf26 3294
d87be9b0 3295 if (pid == 0) {
03cfe0d5 3296 /* The outer child only has a file system namespace. */
a2da110b
DH
3297 barrier_set_role(&barrier, BARRIER_CHILD);
3298
03e334a1 3299 master = safe_close(master);
a258bf26 3300
03e334a1 3301 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3302 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 3303 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 3304 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 3305
ce30c8dc
LP
3306 (void) reset_all_signal_handlers();
3307 (void) reset_signal_mask();
f5c1b9ee 3308
03cfe0d5
LP
3309 r = outer_child(&barrier,
3310 arg_directory,
3311 console,
3312 root_device, root_device_rw,
3313 home_device, home_device_rw,
3314 srv_device, srv_device_rw,
3315 interactive,
3316 secondary,
3317 pid_socket_pair[1],
3318 kmsg_socket_pair[1],
3319 rtnl_socket_pair[1],
825d5287 3320 uid_shift_socket_pair[1],
f757855e 3321 fds);
0cb9fbcd 3322 if (r < 0)
a2da110b 3323 _exit(EXIT_FAILURE);
d87be9b0 3324
03cfe0d5 3325 _exit(EXIT_SUCCESS);
da5b3bad 3326 }
88213476 3327
a2da110b 3328 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 3329
2feceb5e 3330 fds = fdset_free(fds);
842f3b0f 3331
6d0b55c2
LP
3332 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3333 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 3334 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
82116c43 3335 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
6d0b55c2 3336
03cfe0d5
LP
3337 /* Wait for the outer child. */
3338 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3339 if (r < 0)
3340 goto finish;
3341 if (r != 0) {
3342 r = -EIO;
3343 goto finish;
3344 }
3345 pid = 0;
6dac160c 3346
03cfe0d5
LP
3347 /* And now retrieve the PID of the inner child. */
3348 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3349 if (l < 0) {
3350 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3351 goto finish;
3352 }
3353 if (l != sizeof(pid)) {
76d44882 3354 log_error("Short read while reading inner child PID.");
03cfe0d5
LP
3355 r = EIO;
3356 goto finish;
3357 }
354bfd2b 3358
03cfe0d5 3359 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 3360
03cfe0d5
LP
3361 if (arg_userns) {
3362 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3363 log_error("Child died too early.");
3364 r = -ESRCH;
840295fc 3365 goto finish;
03cfe0d5 3366 }
ab046dde 3367
825d5287
RM
3368 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3369 if (l < 0) {
3370 r = log_error_errno(errno, "Failed to read UID shift: %m");
3371 goto finish;
3372 }
3373 if (l != sizeof(arg_uid_shift)) {
76d44882 3374 log_error("Short read while reading UID shift.");
825d5287
RM
3375 r = EIO;
3376 goto finish;
3377 }
3378
03cfe0d5 3379 r = setup_uid_map(pid);
840295fc
LP
3380 if (r < 0)
3381 goto finish;
ab046dde 3382
03cfe0d5
LP
3383 (void) barrier_place(&barrier); /* #2 */
3384 }
c74e630d 3385
9a2a5625 3386 if (arg_private_network) {
4bbfe7ad 3387
9a2a5625
LP
3388 r = move_network_interfaces(pid, arg_network_interfaces);
3389 if (r < 0)
3390 goto finish;
5aa4bb6b 3391
9a2a5625
LP
3392 if (arg_network_veth) {
3393 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3394 if (r < 0)
3395 goto finish;
3396 else if (r > 0)
3397 ifi = r;
6dac160c 3398
9a2a5625
LP
3399 if (arg_network_bridge) {
3400 r = setup_bridge(veth_name, arg_network_bridge);
3401 if (r < 0)
3402 goto finish;
3403 if (r > 0)
3404 ifi = r;
3405 }
3406 }
6dac160c 3407
9a2a5625
LP
3408 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3409 if (r < 0)
3410 goto finish;
3411
3412 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3413 if (r < 0)
3414 goto finish;
3415 }
6dac160c 3416
b7103bc5
LP
3417 if (arg_register) {
3418 r = register_machine(
3419 arg_machine,
3420 pid,
3421 arg_directory,
3422 arg_uuid,
3423 ifi,
3424 arg_slice,
3425 arg_custom_mounts, arg_n_custom_mounts,
3426 arg_kill_signal,
3427 arg_property,
3428 arg_keep_unit);
3429 if (r < 0)
3430 goto finish;
3431 }
6dac160c 3432
34829a32 3433 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
efdb0237
LP
3434 if (r < 0)
3435 goto finish;
3436
34829a32
LP
3437 if (arg_keep_unit) {
3438 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3439 if (r < 0)
3440 goto finish;
3441 }
efdb0237 3442
34829a32 3443 r = chown_cgroup(pid, arg_uid_shift);
03cfe0d5
LP
3444 if (r < 0)
3445 goto finish;
6dac160c 3446
03cfe0d5
LP
3447 /* Notify the child that the parent is ready with all
3448 * its setup (including cgroup-ification), and that
3449 * the child can now hand over control to the code to
3450 * run inside the container. */
3451 (void) barrier_place(&barrier); /* #3 */
6dac160c 3452
03cfe0d5
LP
3453 /* Block SIGCHLD here, before notifying child.
3454 * process_pty() will handle it with the other signals. */
3455 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 3456
03cfe0d5
LP
3457 /* Reset signal to default */
3458 r = default_signals(SIGCHLD, -1);
3459 if (r < 0) {
3460 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3461 goto finish;
3462 }
e866af3a 3463
03cfe0d5 3464 /* Let the child know that we are ready and wait that the child is completely ready now. */
c0ffce2b
KN
3465 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3466 log_error("Child died too early.");
03cfe0d5
LP
3467 r = -ESRCH;
3468 goto finish;
3469 }
b12afc8c 3470
03cfe0d5
LP
3471 sd_notifyf(false,
3472 "READY=1\n"
3473 "STATUS=Container running.\n"
3474 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 3475
03cfe0d5
LP
3476 r = sd_event_new(&event);
3477 if (r < 0) {
3478 log_error_errno(r, "Failed to get default event source: %m");
3479 goto finish;
3480 }
88213476 3481
03cfe0d5
LP
3482 if (arg_kill_signal > 0) {
3483 /* Try to kill the init system on SIGINT or SIGTERM */
3484 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3485 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3486 } else {
3487 /* Immediately exit */
3488 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3489 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3490 }
023fb90b 3491
03cfe0d5
LP
3492 /* simply exit on sigchld */
3493 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 3494
03cfe0d5 3495 if (arg_expose_ports) {
7a8f6325 3496 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
03cfe0d5
LP
3497 if (r < 0)
3498 goto finish;
023fb90b 3499
7a8f6325 3500 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
03cfe0d5 3501 }
023fb90b 3502
03cfe0d5 3503 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 3504
ae3dde80 3505 r = pty_forward_new(event, master, PTY_FORWARD_IGNORE_VHANGUP | (interactive ? 0 : PTY_FORWARD_READ_ONLY), &forward);
03cfe0d5
LP
3506 if (r < 0) {
3507 log_error_errno(r, "Failed to create PTY forwarder: %m");
3508 goto finish;
3509 }
023fb90b 3510
03cfe0d5
LP
3511 r = sd_event_loop(event);
3512 if (r < 0) {
3513 log_error_errno(r, "Failed to run event loop: %m");
3514 goto finish;
3515 }
6d0b55c2 3516
03cfe0d5 3517 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 3518
03cfe0d5 3519 forward = pty_forward_free(forward);
6d0b55c2 3520
03cfe0d5
LP
3521 if (!arg_quiet && last_char != '\n')
3522 putc('\n', stdout);
04d39279 3523
03cfe0d5 3524 /* Kill if it is not dead yet anyway */
b7103bc5
LP
3525 if (arg_register && !arg_keep_unit)
3526 terminate_machine(pid);
1f0cd86b 3527
840295fc 3528 /* Normally redundant, but better safe than sorry */
04d39279 3529 kill(pid, SIGKILL);
a258bf26 3530
113cea80 3531 r = wait_for_container(pid, &container_status);
04d39279
LP
3532 pid = 0;
3533
ec16945e 3534 if (r < 0)
ce9f1527
LP
3535 /* We failed to wait for the container, or the
3536 * container exited abnormally */
ec16945e
LP
3537 goto finish;
3538 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
3539 /* The container exited with a non-zero
3540 * status, or with zero status and no reboot
3541 * was requested. */
ec16945e 3542 ret = r;
d87be9b0 3543 break;
ec16945e 3544 }
88213476 3545
113cea80 3546 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3547
3548 if (arg_keep_unit) {
3549 /* Special handling if we are running as a
3550 * service: instead of simply restarting the
3551 * machine we want to restart the entire
3552 * service, so let's inform systemd about this
3553 * with the special exit code 133. The service
3554 * file uses RestartForceExitStatus=133 so
3555 * that this results in a full nspawn
3556 * restart. This is necessary since we might
3557 * have cgroup parameters set we want to have
3558 * flushed out. */
ec16945e
LP
3559 ret = 133;
3560 r = 0;
ce38dbc8
LP
3561 break;
3562 }
6d0b55c2 3563
7a8f6325 3564 expose_port_flush(arg_expose_ports, &exposed);
d87be9b0 3565 }
88213476
LP
3566
3567finish:
af4ec430
LP
3568 sd_notify(false,
3569 "STOPPING=1\n"
3570 "STATUS=Terminating...");
3571
9444b1f2
LP
3572 if (pid > 0)
3573 kill(pid, SIGKILL);
88213476 3574
503546da
LP
3575 /* Try to flush whatever is still queued in the pty */
3576 if (master >= 0)
59f448cf 3577 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
503546da 3578
03cfe0d5
LP
3579 loop_remove(loop_nr, &image_fd);
3580
ec16945e
LP
3581 if (remove_subvol && arg_directory) {
3582 int k;
3583
5bcd08db 3584 k = btrfs_subvol_remove(arg_directory, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
ec16945e
LP
3585 if (k < 0)
3586 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3587 }
3588
785890ac
LP
3589 if (arg_machine) {
3590 const char *p;
3591
63c372cb 3592 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 3593 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
3594 }
3595
7a8f6325 3596 expose_port_flush(arg_expose_ports, &exposed);
f757855e 3597
04d391da 3598 free(arg_directory);
ec16945e
LP
3599 free(arg_template);
3600 free(arg_image);
7027ff61 3601 free(arg_machine);
c74e630d
LP
3602 free(arg_user);
3603 strv_free(arg_setenv);
f757855e 3604 free(arg_network_bridge);
c74e630d
LP
3605 strv_free(arg_network_interfaces);
3606 strv_free(arg_network_macvlan);
4bbfe7ad 3607 strv_free(arg_network_ipvlan);
f757855e
LP
3608 strv_free(arg_parameters);
3609 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3610 expose_port_free_all(arg_expose_ports);
6d0b55c2 3611
ec16945e 3612 return r < 0 ? EXIT_FAILURE : ret;
88213476 3613}