]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
import: rename "gpt" disk image type to "raw"
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
88213476 34#include <getopt.h>
a258bf26
LP
35#include <termios.h>
36#include <sys/signalfd.h>
687d0825 37#include <grp.h>
5ed27dbd 38#include <linux/fs.h>
9537eab0
LP
39#include <sys/un.h>
40#include <sys/socket.h>
aea38d80 41#include <linux/netlink.h>
aa28aefe 42#include <net/if.h>
69c79d3c 43#include <linux/veth.h>
6afc95b7 44#include <sys/personality.h>
1b9e5b12 45#include <linux/loop.h>
aa28aefe 46
5d63309c 47#ifdef HAVE_SELINUX
a8828ed9
DW
48#include <selinux/selinux.h>
49#endif
88213476 50
24fb1112
LP
51#ifdef HAVE_SECCOMP
52#include <seccomp.h>
53#endif
54
1b9e5b12
LP
55#ifdef HAVE_BLKID
56#include <blkid/blkid.h>
57#endif
58
1f0cd86b
LP
59#include "sd-daemon.h"
60#include "sd-bus.h"
61#include "sd-id128.h"
aa28aefe 62#include "sd-rtnl.h"
88213476
LP
63#include "log.h"
64#include "util.h"
49e942b2 65#include "mkdir.h"
6b2d0e85 66#include "macro.h"
d7832d2c 67#include "audit.h"
94d82985 68#include "missing.h"
04d391da 69#include "cgroup-util.h"
a258bf26 70#include "strv.h"
9eb977db 71#include "path-util.h"
a41fe3a2 72#include "loopback-setup.h"
4fc9982c 73#include "dev-setup.h"
842f3b0f 74#include "fdset.h"
acbeb427 75#include "build.h"
a5c32cff 76#include "fileio.h"
40ca29a1 77#include "bus-util.h"
1f0cd86b 78#include "bus-error.h"
4ba93280 79#include "ptyfwd.h"
9bd37b40 80#include "bus-kernel.h"
f4889f65 81#include "env-util.h"
7f112f50 82#include "def.h"
aa28aefe 83#include "rtnl-util.h"
7e227024 84#include "udev-util.h"
1b9e5b12
LP
85#include "blkid-util.h"
86#include "gpt.h"
01dde061 87#include "siphash24.h"
849958d1 88#include "copy.h"
3577de7a 89#include "base-filesystem.h"
a2da110b 90#include "barrier.h"
023fb90b 91#include "event-util.h"
f01ae826 92#include "capability.h"
2822da4f 93#include "cap-list.h"
ec16945e 94#include "btrfs-util.h"
1b9cebf6 95#include "machine-image.h"
6d0b55c2
LP
96#include "list.h"
97#include "in-addr-util.h"
98#include "fw-util.h"
99#include "local-addresses.h"
f2d88580 100
e9642be2
LP
101#ifdef HAVE_SECCOMP
102#include "seccomp-util.h"
103#endif
104
6d0b55c2
LP
/* One port forwarding request, as collected from -p/--port= by parse_argv().
 * Entries are chained into the arg_expose_ports list. */
typedef struct ExposePort {
        int protocol;                           /* IPPROTO_TCP or IPPROTO_UDP */
        uint16_t host_port;                     /* port on the host side */
        uint16_t container_port;                /* port inside the container */
        LIST_FIELDS(struct ExposePort, ports);  /* list linkage, list name "ports" */
} ExposePort;
111
113cea80
DH
/* How a container run ended. NOTE(review): the consumers of this enum are
 * outside this part of the file; the names suggest "exited for good" versus
 * "asked to be restarted" -- confirm against the caller. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
116
57fb9fb5
LP
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of the option are stored separately in arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
88213476 123
4d9f07b4
LP
/* Volatile mode selected with --volatile[=MODE]: a boolean, or "state". */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
129
/* Settings filled in by parse_argv(). Strings declared "char *" are owned
 * copies; those declared "const char *" point straight at optarg. */
static char *arg_directory = NULL;                      /* -D/--directory= (sanitized path) */
static char *arg_template = NULL;                       /* --template= (sanitized path) */
static char *arg_user = NULL;                           /* -u/--user= */
static sd_id128_t arg_uuid = {};                        /* --uuid= */
static char *arg_machine = NULL;                        /* -M/--machine= */
static const char *arg_selinux_context = NULL;          /* -Z/--selinux-context= */
static const char *arg_selinux_apifs_context = NULL;    /* -L/--selinux-apifs-context= */
static const char *arg_slice = NULL;                    /* -S/--slice= */
static bool arg_private_network = false;                /* --private-network, also implied by the other network options */
static bool arg_read_only = false;                      /* --read-only */
static bool arg_boot = false;                           /* -b/--boot */
static bool arg_ephemeral = false;                      /* -x/--ephemeral */
static LinkJournal arg_link_journal = LINK_AUTO;        /* --link-journal= / -j */
static bool arg_link_journal_try = false;               /* set for -j and the "try-" modes */

/* Capability mask. Starts out as the default set below; parse_argv() adds
 * --capability= entries (and CAP_NET_ADMIN with private networking) and then
 * removes --drop-capability= entries. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);

static char **arg_bind = NULL;                          /* --bind=: flat list of (source, destination) pairs */
static char **arg_bind_ro = NULL;                       /* --bind-ro=: same layout as arg_bind */
static char **arg_tmpfs = NULL;                         /* --tmpfs=: flat list of (path, mount options) pairs */
static char **arg_setenv = NULL;                        /* --setenv= assignments */
static bool arg_quiet = false;                          /* -q/--quiet */
static bool arg_share_system = false;                   /* --share-system */
static bool arg_register = true;                        /* --register=; forced off by --share-system */
static bool arg_keep_unit = false;                      /* --keep-unit */
static char **arg_network_interfaces = NULL;            /* --network-interface= */
static char **arg_network_macvlan = NULL;               /* --network-macvlan= */
static bool arg_network_veth = false;                   /* -n/--network-veth, also set by --network-bridge= */
static const char *arg_network_bridge = NULL;           /* --network-bridge= */
static unsigned long arg_personality = 0xffffffffLU;    /* --personality=; 0xffffffff is also the parser's "unknown" value */
static char *arg_image = NULL;                          /* -i/--image= (sanitized path) */
static Volatile arg_volatile = VOLATILE_NO;             /* --volatile[=MODE] */
static ExposePort *arg_expose_ports = NULL;             /* -p/--port= entries, newest first */
88213476 187
/* Prints the command line synopsis and the list of supported options to
 * stdout. The first line is prefixed with the name the program was invoked
 * under. Every option accepted by parse_argv() is listed, including
 * --personality=. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
               "                            try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               "     --personality=ARCH     Pick personality for this container\n"
               , program_invocation_short_name);
}
245
ec16945e
LP
/* Stores a cleaned-up, absolute form of 'path' in *b, freeing whatever *b
 * held before.
 *
 * The path is canonicalized if it exists. If it does not exist (ENOENT) it
 * is merely made absolute relative to the current working directory, so that
 * not-yet-created directories can be named.
 *
 * Returns 0 on success, a negative errno-style value on failure (in which
 * case *b is left untouched). */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Path does not exist: fall back to a purely textual absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
266
88213476
LP
267static int parse_argv(int argc, char *argv[]) {
268
a41fe3a2 269 enum {
acbeb427
ZJS
270 ARG_VERSION = 0x100,
271 ARG_PRIVATE_NETWORK,
bc2f673e 272 ARG_UUID,
5076f0cc 273 ARG_READ_ONLY,
57fb9fb5 274 ARG_CAPABILITY,
420c7379 275 ARG_DROP_CAPABILITY,
17fe0523
LP
276 ARG_LINK_JOURNAL,
277 ARG_BIND,
f4889f65 278 ARG_BIND_RO,
06c17c39 279 ARG_TMPFS,
f4889f65 280 ARG_SETENV,
eb91eb18 281 ARG_SHARE_SYSTEM,
89f7c846 282 ARG_REGISTER,
aa28aefe 283 ARG_KEEP_UNIT,
69c79d3c 284 ARG_NETWORK_INTERFACE,
c74e630d 285 ARG_NETWORK_MACVLAN,
ab046dde 286 ARG_NETWORK_BRIDGE,
6afc95b7 287 ARG_PERSONALITY,
4d9f07b4 288 ARG_VOLATILE,
ec16945e 289 ARG_TEMPLATE,
a41fe3a2
LP
290 };
291
88213476 292 static const struct option options[] = {
aa28aefe
LP
293 { "help", no_argument, NULL, 'h' },
294 { "version", no_argument, NULL, ARG_VERSION },
295 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
296 { "template", required_argument, NULL, ARG_TEMPLATE },
297 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
298 { "user", required_argument, NULL, 'u' },
299 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
300 { "boot", no_argument, NULL, 'b' },
301 { "uuid", required_argument, NULL, ARG_UUID },
302 { "read-only", no_argument, NULL, ARG_READ_ONLY },
303 { "capability", required_argument, NULL, ARG_CAPABILITY },
304 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
305 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
306 { "bind", required_argument, NULL, ARG_BIND },
307 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 308 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
309 { "machine", required_argument, NULL, 'M' },
310 { "slice", required_argument, NULL, 'S' },
311 { "setenv", required_argument, NULL, ARG_SETENV },
312 { "selinux-context", required_argument, NULL, 'Z' },
313 { "selinux-apifs-context", required_argument, NULL, 'L' },
314 { "quiet", no_argument, NULL, 'q' },
315 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
316 { "register", required_argument, NULL, ARG_REGISTER },
317 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
318 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 319 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
0dfaa006 320 { "network-veth", no_argument, NULL, 'n' },
ab046dde 321 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 322 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 323 { "image", required_argument, NULL, 'i' },
4d9f07b4 324 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 325 { "port", required_argument, NULL, 'p' },
eb9da376 326 {}
88213476
LP
327 };
328
9444b1f2 329 int c, r;
a42c8b54 330 uint64_t plus = 0, minus = 0;
88213476
LP
331
332 assert(argc >= 0);
333 assert(argv);
334
0dfaa006 335 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
336
337 switch (c) {
338
339 case 'h':
601185b4
ZJS
340 help();
341 return 0;
88213476 342
acbeb427
ZJS
343 case ARG_VERSION:
344 puts(PACKAGE_STRING);
345 puts(SYSTEMD_FEATURES);
346 return 0;
347
88213476 348 case 'D':
ec16945e
LP
349 r = set_sanitized_path(&arg_directory, optarg);
350 if (r < 0)
351 return log_error_errno(r, "Invalid root directory: %m");
352
353 break;
354
355 case ARG_TEMPLATE:
356 r = set_sanitized_path(&arg_template, optarg);
357 if (r < 0)
358 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
359
360 break;
361
1b9e5b12 362 case 'i':
ec16945e
LP
363 r = set_sanitized_path(&arg_image, optarg);
364 if (r < 0)
365 return log_error_errno(r, "Invalid image path: %m");
366
367 break;
368
369 case 'x':
370 arg_ephemeral = true;
1b9e5b12
LP
371 break;
372
687d0825
MV
373 case 'u':
374 free(arg_user);
7027ff61
LP
375 arg_user = strdup(optarg);
376 if (!arg_user)
377 return log_oom();
687d0825
MV
378
379 break;
380
ab046dde 381 case ARG_NETWORK_BRIDGE:
c74e630d 382 arg_network_bridge = optarg;
ab046dde
TG
383
384 /* fall through */
385
0dfaa006 386 case 'n':
69c79d3c
LP
387 arg_network_veth = true;
388 arg_private_network = true;
389 break;
390
aa28aefe 391 case ARG_NETWORK_INTERFACE:
c74e630d
LP
392 if (strv_extend(&arg_network_interfaces, optarg) < 0)
393 return log_oom();
394
395 arg_private_network = true;
396 break;
397
398 case ARG_NETWORK_MACVLAN:
399 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
400 return log_oom();
401
402 /* fall through */
403
ff01d048
LP
404 case ARG_PRIVATE_NETWORK:
405 arg_private_network = true;
a41fe3a2
LP
406 break;
407
0f0dbc46
LP
408 case 'b':
409 arg_boot = true;
410 break;
411
144f0fc0 412 case ARG_UUID:
9444b1f2
LP
413 r = sd_id128_from_string(optarg, &arg_uuid);
414 if (r < 0) {
aa96c6cb 415 log_error("Invalid UUID: %s", optarg);
9444b1f2 416 return r;
aa96c6cb 417 }
9444b1f2 418 break;
aa96c6cb 419
9444b1f2 420 case 'S':
c74e630d 421 arg_slice = optarg;
144f0fc0
LP
422 break;
423
7027ff61 424 case 'M':
eb91eb18
LP
425 if (isempty(optarg)) {
426 free(arg_machine);
427 arg_machine = NULL;
428 } else {
0c3c4284 429 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
430 log_error("Invalid machine name: %s", optarg);
431 return -EINVAL;
432 }
7027ff61 433
0c3c4284
LP
434 r = free_and_strdup(&arg_machine, optarg);
435 if (r < 0)
eb91eb18
LP
436 return log_oom();
437
438 break;
439 }
7027ff61 440
82adf6af
LP
441 case 'Z':
442 arg_selinux_context = optarg;
a8828ed9
DW
443 break;
444
82adf6af
LP
445 case 'L':
446 arg_selinux_apifs_context = optarg;
a8828ed9
DW
447 break;
448
bc2f673e
LP
449 case ARG_READ_ONLY:
450 arg_read_only = true;
451 break;
452
420c7379
LP
453 case ARG_CAPABILITY:
454 case ARG_DROP_CAPABILITY: {
a2a5291b 455 const char *state, *word;
5076f0cc
LP
456 size_t length;
457
458 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 459 _cleanup_free_ char *t;
5076f0cc
LP
460
461 t = strndup(word, length);
0d0f0c50
SL
462 if (!t)
463 return log_oom();
5076f0cc 464
39ed67d1
LP
465 if (streq(t, "all")) {
466 if (c == ARG_CAPABILITY)
a42c8b54 467 plus = (uint64_t) -1;
39ed67d1 468 else
a42c8b54 469 minus = (uint64_t) -1;
39ed67d1 470 } else {
2822da4f
LP
471 int cap;
472
473 cap = capability_from_name(t);
474 if (cap < 0) {
39ed67d1
LP
475 log_error("Failed to parse capability %s.", t);
476 return -EINVAL;
477 }
478
479 if (c == ARG_CAPABILITY)
a42c8b54 480 plus |= 1ULL << (uint64_t) cap;
39ed67d1 481 else
a42c8b54 482 minus |= 1ULL << (uint64_t) cap;
5076f0cc 483 }
5076f0cc
LP
484 }
485
486 break;
487 }
488
57fb9fb5
LP
489 case 'j':
490 arg_link_journal = LINK_GUEST;
574edc90 491 arg_link_journal_try = true;
57fb9fb5
LP
492 break;
493
494 case ARG_LINK_JOURNAL:
53e438e3 495 if (streq(optarg, "auto")) {
57fb9fb5 496 arg_link_journal = LINK_AUTO;
53e438e3
LP
497 arg_link_journal_try = false;
498 } else if (streq(optarg, "no")) {
57fb9fb5 499 arg_link_journal = LINK_NO;
53e438e3
LP
500 arg_link_journal_try = false;
501 } else if (streq(optarg, "guest")) {
57fb9fb5 502 arg_link_journal = LINK_GUEST;
53e438e3
LP
503 arg_link_journal_try = false;
504 } else if (streq(optarg, "host")) {
57fb9fb5 505 arg_link_journal = LINK_HOST;
53e438e3
LP
506 arg_link_journal_try = false;
507 } else if (streq(optarg, "try-guest")) {
574edc90
MP
508 arg_link_journal = LINK_GUEST;
509 arg_link_journal_try = true;
510 } else if (streq(optarg, "try-host")) {
511 arg_link_journal = LINK_HOST;
512 arg_link_journal_try = true;
513 } else {
57fb9fb5
LP
514 log_error("Failed to parse link journal mode %s", optarg);
515 return -EINVAL;
516 }
517
518 break;
519
17fe0523
LP
520 case ARG_BIND:
521 case ARG_BIND_RO: {
522 _cleanup_free_ char *a = NULL, *b = NULL;
523 char *e;
524 char ***x;
17fe0523
LP
525
526 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
527
528 e = strchr(optarg, ':');
529 if (e) {
530 a = strndup(optarg, e - optarg);
531 b = strdup(e + 1);
532 } else {
533 a = strdup(optarg);
534 b = strdup(optarg);
535 }
536
537 if (!a || !b)
538 return log_oom();
539
540 if (!path_is_absolute(a) || !path_is_absolute(b)) {
541 log_error("Invalid bind mount specification: %s", optarg);
542 return -EINVAL;
543 }
544
545 r = strv_extend(x, a);
546 if (r < 0)
b3451bed 547 return log_oom();
17fe0523
LP
548
549 r = strv_extend(x, b);
550 if (r < 0)
b3451bed 551 return log_oom();
17fe0523
LP
552
553 break;
554 }
555
06c17c39
LP
556 case ARG_TMPFS: {
557 _cleanup_free_ char *a = NULL, *b = NULL;
558 char *e;
559
560 e = strchr(optarg, ':');
561 if (e) {
562 a = strndup(optarg, e - optarg);
563 b = strdup(e + 1);
564 } else {
565 a = strdup(optarg);
566 b = strdup("mode=0755");
567 }
568
569 if (!a || !b)
570 return log_oom();
571
572 if (!path_is_absolute(a)) {
573 log_error("Invalid tmpfs specification: %s", optarg);
574 return -EINVAL;
575 }
576
577 r = strv_push(&arg_tmpfs, a);
578 if (r < 0)
579 return log_oom();
580
581 a = NULL;
582
583 r = strv_push(&arg_tmpfs, b);
584 if (r < 0)
585 return log_oom();
586
587 b = NULL;
588
589 break;
590 }
591
f4889f65
LP
592 case ARG_SETENV: {
593 char **n;
594
595 if (!env_assignment_is_valid(optarg)) {
596 log_error("Environment variable assignment '%s' is not valid.", optarg);
597 return -EINVAL;
598 }
599
600 n = strv_env_set(arg_setenv, optarg);
601 if (!n)
602 return log_oom();
603
604 strv_free(arg_setenv);
605 arg_setenv = n;
606 break;
607 }
608
284c0b91
LP
609 case 'q':
610 arg_quiet = true;
611 break;
612
8a96d94e
LP
613 case ARG_SHARE_SYSTEM:
614 arg_share_system = true;
615 break;
616
eb91eb18
LP
617 case ARG_REGISTER:
618 r = parse_boolean(optarg);
619 if (r < 0) {
620 log_error("Failed to parse --register= argument: %s", optarg);
621 return r;
622 }
623
624 arg_register = r;
625 break;
626
89f7c846
LP
627 case ARG_KEEP_UNIT:
628 arg_keep_unit = true;
629 break;
630
6afc95b7
LP
631 case ARG_PERSONALITY:
632
ac45f971 633 arg_personality = personality_from_string(optarg);
6afc95b7
LP
634 if (arg_personality == 0xffffffffLU) {
635 log_error("Unknown or unsupported personality '%s'.", optarg);
636 return -EINVAL;
637 }
638
639 break;
640
4d9f07b4
LP
641 case ARG_VOLATILE:
642
643 if (!optarg)
644 arg_volatile = VOLATILE_YES;
645 else {
646 r = parse_boolean(optarg);
647 if (r < 0) {
648 if (streq(optarg, "state"))
649 arg_volatile = VOLATILE_STATE;
650 else {
651 log_error("Failed to parse --volatile= argument: %s", optarg);
652 return r;
653 }
654 } else
655 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
656 }
657
658 break;
659
6d0b55c2
LP
660 case 'p': {
661 const char *split, *e;
662 uint16_t container_port, host_port;
663 int protocol;
664 ExposePort *p;
665
666 if ((e = startswith(optarg, "tcp:")))
667 protocol = IPPROTO_TCP;
668 else if ((e = startswith(optarg, "udp:")))
669 protocol = IPPROTO_UDP;
670 else {
671 e = optarg;
672 protocol = IPPROTO_TCP;
673 }
674
675 split = strchr(e, ':');
676 if (split) {
677 char v[split - e + 1];
678
679 memcpy(v, e, split - e);
680 v[split - e] = 0;
681
682 r = safe_atou16(v, &host_port);
683 if (r < 0 || host_port <= 0) {
684 log_error("Failed to parse host port: %s", optarg);
685 return -EINVAL;
686 }
687
688 r = safe_atou16(split + 1, &container_port);
689 } else {
690 r = safe_atou16(e, &container_port);
691 host_port = container_port;
692 }
693
694 if (r < 0 || container_port <= 0) {
695 log_error("Failed to parse host port: %s", optarg);
696 return -EINVAL;
697 }
698
699 LIST_FOREACH(ports, p, arg_expose_ports) {
700 if (p->protocol == protocol && p->host_port == host_port) {
701 log_error("Duplicate port specification: %s", optarg);
702 return -EINVAL;
703 }
704 }
705
706 p = new(ExposePort, 1);
707 if (!p)
708 return log_oom();
709
710 p->protocol = protocol;
711 p->host_port = host_port;
712 p->container_port = container_port;
713
714 LIST_PREPEND(ports, arg_expose_ports, p);
715
716 break;
717 }
718
88213476
LP
719 case '?':
720 return -EINVAL;
721
722 default:
eb9da376 723 assert_not_reached("Unhandled option");
88213476 724 }
88213476 725
eb91eb18
LP
726 if (arg_share_system)
727 arg_register = false;
728
729 if (arg_boot && arg_share_system) {
730 log_error("--boot and --share-system may not be combined.");
731 return -EINVAL;
732 }
733
89f7c846
LP
734 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
735 log_error("--keep-unit may not be used when invoked from a user session.");
736 return -EINVAL;
737 }
738
1b9e5b12
LP
739 if (arg_directory && arg_image) {
740 log_error("--directory= and --image= may not be combined.");
741 return -EINVAL;
742 }
743
ec16945e
LP
744 if (arg_template && arg_image) {
745 log_error("--template= and --image= may not be combined.");
746 return -EINVAL;
747 }
748
749 if (arg_template && !(arg_directory || arg_machine)) {
750 log_error("--template= needs --directory= or --machine=.");
751 return -EINVAL;
752 }
753
754 if (arg_ephemeral && arg_template) {
755 log_error("--ephemeral and --template= may not be combined.");
756 return -EINVAL;
757 }
758
759 if (arg_ephemeral && arg_image) {
760 log_error("--ephemeral and --image= may not be combined.");
761 return -EINVAL;
762 }
763
df9a75e4
LP
764 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
765 log_error("--ephemeral and --link-journal= may not be combined.");
766 return -EINVAL;
767 }
768
4d9f07b4
LP
769 if (arg_volatile != VOLATILE_NO && arg_read_only) {
770 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
771 return -EINVAL;
772 }
773
6d0b55c2
LP
774 if (arg_expose_ports && !arg_private_network) {
775 log_error("Cannot use --port= without private networking.");
776 return -EINVAL;
777 }
778
a42c8b54
LP
779 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
780
88213476
LP
781 return 1;
782}
783
784static int mount_all(const char *dest) {
785
786 typedef struct MountPoint {
787 const char *what;
788 const char *where;
789 const char *type;
790 const char *options;
791 unsigned long flags;
3bd66c05 792 bool fatal;
88213476
LP
793 } MountPoint;
794
795 static const MountPoint mount_table[] = {
06c17c39
LP
796 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
797 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
798 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
799 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
800 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 801 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
802 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
803 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 804#ifdef HAVE_SELINUX
06c17c39
LP
805 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
806 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 807#endif
88213476
LP
808 };
809
810 unsigned k;
811 int r = 0;
812
813 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 814 _cleanup_free_ char *where = NULL;
d002827b 815#ifdef HAVE_SELINUX
a8828ed9 816 _cleanup_free_ char *options = NULL;
d002827b
LP
817#endif
818 const char *o;
88213476
LP
819 int t;
820
17fe0523
LP
821 where = strjoin(dest, "/", mount_table[k].where, NULL);
822 if (!where)
823 return log_oom();
88213476 824
e65aec12 825 t = path_is_mount_point(where, true);
68fb0892 826 if (t < 0) {
da927ba9 827 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
828
829 if (r == 0)
830 r = t;
831
832 continue;
833 }
834
9c1c7f71
LP
835 /* Skip this entry if it is not a remount. */
836 if (mount_table[k].what && t > 0)
014a9c77
LP
837 continue;
838
79d80fc1
TG
839 t = mkdir_p(where, 0755);
840 if (t < 0) {
841 if (mount_table[k].fatal) {
da927ba9 842 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
843
844 if (r == 0)
845 r = t;
846 } else
da927ba9 847 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
848
849 continue;
850 }
88213476 851
a8828ed9 852#ifdef HAVE_SELINUX
82adf6af
LP
853 if (arg_selinux_apifs_context &&
854 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
855 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
856 if (!options)
857 return log_oom();
858
859 o = options;
860 } else
a8828ed9 861#endif
d002827b 862 o = mount_table[k].options;
a8828ed9 863
a8828ed9 864
88213476
LP
865 if (mount(mount_table[k].what,
866 where,
867 mount_table[k].type,
868 mount_table[k].flags,
79d80fc1 869 o) < 0) {
88213476 870
79d80fc1 871 if (mount_table[k].fatal) {
56f64d95 872 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 873
79d80fc1
TG
874 if (r == 0)
875 r = -errno;
876 } else
56f64d95 877 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 878 }
88213476
LP
879 }
880
e58a1277
LP
881 return r;
882}
f8440af5 883
d6797c92 884static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
885 char **x, **y;
886
887 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 888 _cleanup_free_ char *where = NULL;
d2421337 889 struct stat source_st, dest_st;
2ed4e5e0 890 int r;
d2421337 891
4a62c710
MS
892 if (stat(*x, &source_st) < 0)
893 return log_error_errno(errno, "Failed to stat %s: %m", *x);
17fe0523 894
06c17c39
LP
895 where = strappend(dest, *y);
896 if (!where)
897 return log_oom();
898
2ed4e5e0
SL
899 r = stat(where, &dest_st);
900 if (r == 0) {
d2421337 901 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
06c17c39 902 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
d2421337
DR
903 return -EINVAL;
904 }
2ed4e5e0
SL
905 } else if (errno == ENOENT) {
906 r = mkdir_parents_label(where, 0755);
f647962d
MS
907 if (r < 0)
908 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
2ed4e5e0 909 } else {
56f64d95 910 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
911 return -errno;
912 }
06c17c39 913
2ed4e5e0 914 /* Create the mount point, but be conservative -- refuse to create block
4d9f07b4 915 * and char devices. */
79d80fc1
TG
916 if (S_ISDIR(source_st.st_mode)) {
917 r = mkdir_label(where, 0755);
f647962d
MS
918 if (r < 0 && errno != EEXIST)
919 return log_error_errno(r, "Failed to create mount point %s: %m", where);
79d80fc1
TG
920 } else if (S_ISFIFO(source_st.st_mode)) {
921 r = mkfifo(where, 0644);
4a62c710
MS
922 if (r < 0 && errno != EEXIST)
923 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
79d80fc1
TG
924 } else if (S_ISSOCK(source_st.st_mode)) {
925 r = mknod(where, 0644 | S_IFSOCK, 0);
4a62c710
MS
926 if (r < 0 && errno != EEXIST)
927 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
79d80fc1
TG
928 } else if (S_ISREG(source_st.st_mode)) {
929 r = touch(where);
f647962d
MS
930 if (r < 0)
931 return log_error_errno(r, "Failed to create mount point %s: %m", where);
79d80fc1 932 } else {
2ed4e5e0
SL
933 log_error("Refusing to create mountpoint for file: %s", *x);
934 return -ENOTSUP;
d2421337 935 }
17fe0523 936
4a62c710
MS
937 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
938 return log_error_errno(errno, "mount(%s) failed: %m", where);
17fe0523 939
d6797c92
LP
940 if (ro) {
941 r = bind_remount_recursive(where, true);
f647962d
MS
942 if (r < 0)
943 return log_error_errno(r, "Read-Only bind mount failed: %m");
17fe0523
LP
944 }
945 }
946
947 return 0;
948}
949
b12afc8c
LP
/* Mounts one cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>, passing
 * 'controller' as the mount options, optionally read-only.
 *
 * Returns 1 if the hierarchy was mounted, 0 if something is mounted there
 * already, and a negative errno-style value on failure (already logged). */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *mount_path;
        int k;

        mount_path = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        k = path_is_mount_point(mount_path, false);
        if (k < 0)
                return log_error_errno(k, "Failed to determine if %s is mounted already: %m", mount_path);
        if (k > 0)
                return 0;

        /* Result deliberately not checked: if the directory is missing the mount below fails and reports it. */
        mkdir_p(mount_path, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", mount_path, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", mount_path);

        return 1;
}
969
970static int mount_cgroup(const char *dest) {
971 _cleanup_set_free_free_ Set *controllers = NULL;
972 _cleanup_free_ char *own_cgroup_path = NULL;
973 const char *cgroup_root, *systemd_root, *systemd_own;
974 int r;
975
976 controllers = set_new(&string_hash_ops);
977 if (!controllers)
978 return log_oom();
979
980 r = cg_kernel_controllers(controllers);
981 if (r < 0)
982 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
983
984 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
985 if (r < 0)
986 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
987
988 cgroup_root = strappenda(dest, "/sys/fs/cgroup");
989 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
990 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
991
992 for (;;) {
993 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
994
995 controller = set_steal_first(controllers);
996 if (!controller)
997 break;
998
999 origin = strappend("/sys/fs/cgroup/", controller);
1000 if (!origin)
1001 return log_oom();
1002
1003 r = readlink_malloc(origin, &combined);
1004 if (r == -EINVAL) {
1005 /* Not a symbolic link, but directly a single cgroup hierarchy */
1006
1007 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1008 if (r < 0)
1009 return r;
1010
1011 } else if (r < 0)
1012 return log_error_errno(r, "Failed to read link %s: %m", origin);
1013 else {
1014 _cleanup_free_ char *target = NULL;
1015
1016 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1017 if (!target)
1018 return log_oom();
1019
1020 /* A symbolic link, a combination of controllers in one hierarchy */
1021
1022 if (!filename_is_valid(combined)) {
1023 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1024 continue;
1025 }
1026
1027 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1028 if (r < 0)
1029 return r;
1030
1031 if (symlink(combined, target) < 0)
1032 return log_error_errno(errno, "Failed to create symlink for combined hiearchy: %m");
1033 }
1034 }
1035
1036 r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1037 if (r < 0)
1038 return r;
1039
1040 /* Make our own cgroup a (writable) bind mount */
1041 systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1042 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1043 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1044
1045 /* And then remount the systemd cgroup root read-only */
1046 systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1047 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1048 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1049
1050 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1051 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1052
1053 return 0;
1054}
1055
06c17c39
LP
1056static int mount_tmpfs(const char *dest) {
1057 char **i, **o;
1058
1059 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1060 _cleanup_free_ char *where = NULL;
79d80fc1 1061 int r;
06c17c39
LP
1062
1063 where = strappend(dest, *i);
1064 if (!where)
1065 return log_oom();
1066
79d80fc1 1067 r = mkdir_label(where, 0755);
04a91939
LP
1068 if (r < 0 && r != -EEXIST)
1069 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
06c17c39 1070
4a62c710
MS
1071 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1072 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
06c17c39
LP
1073 }
1074
1075 return 0;
1076}
1077
e58a1277 1078static int setup_timezone(const char *dest) {
d4036145
LP
1079 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1080 char *z, *y;
1081 int r;
f8440af5 1082
e58a1277
LP
1083 assert(dest);
1084
1085 /* Fix the timezone, if possible */
d4036145
LP
1086 r = readlink_malloc("/etc/localtime", &p);
1087 if (r < 0) {
1088 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1089 return 0;
1090 }
1091
1092 z = path_startswith(p, "../usr/share/zoneinfo/");
1093 if (!z)
1094 z = path_startswith(p, "/usr/share/zoneinfo/");
1095 if (!z) {
1096 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1097 return 0;
1098 }
1099
04bc4a3f
LP
1100 where = strappend(dest, "/etc/localtime");
1101 if (!where)
0d0f0c50 1102 return log_oom();
715ac17a 1103
d4036145
LP
1104 r = readlink_malloc(where, &q);
1105 if (r >= 0) {
1106 y = path_startswith(q, "../usr/share/zoneinfo/");
1107 if (!y)
1108 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1109
d4036145
LP
1110 /* Already pointing to the right place? Then do nothing .. */
1111 if (y && streq(y, z))
1112 return 0;
1113 }
1114
1115 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1116 if (!check)
0d0f0c50 1117 return log_oom();
4d1c38b8 1118
d4036145
LP
1119 if (access(check, F_OK) < 0) {
1120 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1121 return 0;
1122 }
68fb0892 1123
d4036145
LP
1124 what = strappend("../usr/share/zoneinfo/", z);
1125 if (!what)
1126 return log_oom();
1127
79d80fc1
TG
1128 r = mkdir_parents(where, 0755);
1129 if (r < 0) {
da927ba9 1130 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
1131
1132 return 0;
1133 }
1134
1135 r = unlink(where);
1136 if (r < 0 && errno != ENOENT) {
56f64d95 1137 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1138
1139 return 0;
1140 }
4d9f07b4 1141
d4036145 1142 if (symlink(what, where) < 0) {
56f64d95 1143 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1144 return 0;
1145 }
e58a1277
LP
1146
1147 return 0;
88213476
LP
1148}
1149
2547bb41 1150static int setup_resolv_conf(const char *dest) {
c8b32e11 1151 _cleanup_free_ char *where = NULL;
79d80fc1 1152 int r;
2547bb41
LP
1153
1154 assert(dest);
1155
1156 if (arg_private_network)
1157 return 0;
1158
1159 /* Fix resolv.conf, if possible */
04bc4a3f
LP
1160 where = strappend(dest, "/etc/resolv.conf");
1161 if (!where)
0d0f0c50 1162 return log_oom();
2547bb41 1163
77e63faf
LP
1164 /* We don't really care for the results of this really. If it
1165 * fails, it fails, but meh... */
79d80fc1
TG
1166 r = mkdir_parents(where, 0755);
1167 if (r < 0) {
da927ba9 1168 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
1169
1170 return 0;
1171 }
1172
f2068bcc 1173 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1174 if (r < 0) {
da927ba9 1175 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1176
1177 return 0;
1178 }
2547bb41
LP
1179
1180 return 0;
1181}
1182
4d9f07b4
LP
1183static int setup_volatile_state(const char *directory) {
1184 const char *p;
1185 int r;
1186
1187 assert(directory);
1188
1189 if (arg_volatile != VOLATILE_STATE)
1190 return 0;
1191
1192 /* --volatile=state means we simply overmount /var
1193 with a tmpfs, and the rest read-only. */
1194
1195 r = bind_remount_recursive(directory, true);
f647962d
MS
1196 if (r < 0)
1197 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4
LP
1198
1199 p = strappenda(directory, "/var");
79d80fc1 1200 r = mkdir(p, 0755);
4a62c710
MS
1201 if (r < 0 && errno != EEXIST)
1202 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1203
4a62c710
MS
1204 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1205 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1206
1207 return 0;
1208}
1209
1210static int setup_volatile(const char *directory) {
1211 bool tmpfs_mounted = false, bind_mounted = false;
1212 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1213 const char *f, *t;
1214 int r;
1215
1216 assert(directory);
1217
1218 if (arg_volatile != VOLATILE_YES)
1219 return 0;
1220
1221 /* --volatile=yes means we mount a tmpfs to the root dir, and
1222 the original /usr to use inside it, and that read-only. */
1223
4a62c710
MS
1224 if (!mkdtemp(template))
1225 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
1226
1227 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 1228 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1229 r = -errno;
1230 goto fail;
1231 }
1232
1233 tmpfs_mounted = true;
1234
1235 f = strappenda(directory, "/usr");
1236 t = strappenda(template, "/usr");
1237
79d80fc1
TG
1238 r = mkdir(t, 0755);
1239 if (r < 0 && errno != EEXIST) {
56f64d95 1240 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1241 r = -errno;
1242 goto fail;
1243 }
1244
4d9f07b4 1245 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 1246 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1247 r = -errno;
1248 goto fail;
1249 }
1250
1251 bind_mounted = true;
1252
1253 r = bind_remount_recursive(t, true);
1254 if (r < 0) {
da927ba9 1255 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1256 goto fail;
1257 }
1258
1259 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1260 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1261 r = -errno;
1262 goto fail;
1263 }
1264
1265 rmdir(template);
1266
1267 return 0;
1268
1269fail:
1270 if (bind_mounted)
1271 umount(t);
1272 if (tmpfs_mounted)
1273 umount(template);
1274 rmdir(template);
1275 return r;
1276}
1277
9f24adc2
LP
1278static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1279
1280 snprintf(s, 37,
1281 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1282 SD_ID128_FORMAT_VAL(id));
1283
1284 return s;
1285}
1286
04bc4a3f 1287static int setup_boot_id(const char *dest) {
7fd1b19b 1288 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1289 sd_id128_t rnd = {};
04bc4a3f
LP
1290 char as_uuid[37];
1291 int r;
1292
1293 assert(dest);
1294
eb91eb18
LP
1295 if (arg_share_system)
1296 return 0;
1297
04bc4a3f
LP
1298 /* Generate a new randomized boot ID, so that each boot-up of
1299 * the container gets a new one */
1300
1301 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1302 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1303 if (!from || !to)
1304 return log_oom();
04bc4a3f
LP
1305
1306 r = sd_id128_randomize(&rnd);
f647962d
MS
1307 if (r < 0)
1308 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1309
9f24adc2 1310 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1311
574d5f2d 1312 r = write_string_file(from, as_uuid);
f647962d
MS
1313 if (r < 0)
1314 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f
LP
1315
1316 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
56f64d95 1317 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1318 r = -errno;
10d18763 1319 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1320 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1321
1322 unlink(from);
04bc4a3f
LP
1323 return r;
1324}
1325
e58a1277 1326static int copy_devnodes(const char *dest) {
88213476
LP
1327
1328 static const char devnodes[] =
1329 "null\0"
1330 "zero\0"
1331 "full\0"
1332 "random\0"
1333 "urandom\0"
85614d66
TG
1334 "tty\0"
1335 "net/tun\0";
88213476
LP
1336
1337 const char *d;
e58a1277 1338 int r = 0;
7fd1b19b 1339 _cleanup_umask_ mode_t u;
a258bf26
LP
1340
1341 assert(dest);
124640f1
LP
1342
1343 u = umask(0000);
88213476
LP
1344
1345 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1346 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1347 struct stat st;
88213476 1348
7f112f50
LP
1349 from = strappend("/dev/", d);
1350 to = strjoin(dest, "/dev/", d, NULL);
1351 if (!from || !to)
1352 return log_oom();
88213476
LP
1353
1354 if (stat(from, &st) < 0) {
1355
4a62c710
MS
1356 if (errno != ENOENT)
1357 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1358
a258bf26 1359 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1360
ed8b7a3e 1361 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1362 return -EIO;
a258bf26 1363
85614d66
TG
1364 } else {
1365 r = mkdir_parents(to, 0775);
1366 if (r < 0) {
da927ba9 1367 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1368 return -r;
1369 }
a258bf26 1370
4a62c710 1371 if (mknod(to, st.st_mode, st.st_rdev) < 0)
080e7832 1372 return log_error_errno(errno, "mknod(%s) failed: %m", to);
88213476 1373 }
88213476
LP
1374 }
1375
e58a1277
LP
1376 return r;
1377}
88213476 1378
f2d88580
LP
1379static int setup_ptmx(const char *dest) {
1380 _cleanup_free_ char *p = NULL;
1381
1382 p = strappend(dest, "/dev/ptmx");
1383 if (!p)
1384 return log_oom();
1385
4a62c710
MS
1386 if (symlink("pts/ptmx", p) < 0)
1387 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580
LP
1388
1389 return 0;
1390}
1391
e58a1277 1392static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1393 _cleanup_umask_ mode_t u;
1394 const char *to;
e58a1277 1395 struct stat st;
e58a1277 1396 int r;
e58a1277
LP
1397
1398 assert(dest);
1399 assert(console);
1400
1401 u = umask(0000);
1402
4a62c710
MS
1403 if (stat("/dev/null", &st) < 0)
1404 return log_error_errno(errno, "Failed to stat /dev/null: %m");
88213476 1405
e58a1277 1406 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1407 if (r < 0)
1408 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1409
a258bf26
LP
1410 /* We need to bind mount the right tty to /dev/console since
1411 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
1412 * to bind mount things on we create a device node first, and
1413 * use /dev/null for that since we the cgroups device policy
1414 * allows us to create that freely, while we cannot create
1415 * /dev/console. (Note that the major minor doesn't actually
1416 * matter here, since we mount it over anyway). */
a258bf26 1417
eb0f0863 1418 to = strappenda(dest, "/dev/console");
4a62c710
MS
1419 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1420 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
a258bf26 1421
4a62c710
MS
1422 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1423 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1424
25ea79fe 1425 return 0;
e58a1277
LP
1426}
1427
1428static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1429 _cleanup_free_ char *from = NULL, *to = NULL;
7fd1b19b 1430 _cleanup_umask_ mode_t u;
6d0b55c2 1431 int r, fd, k;
e58a1277
LP
1432 union {
1433 struct cmsghdr cmsghdr;
1434 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1435 } control = {};
1436 struct msghdr mh = {
1437 .msg_control = &control,
1438 .msg_controllen = sizeof(control),
1439 };
e58a1277
LP
1440 struct cmsghdr *cmsg;
1441
1442 assert(dest);
1443 assert(kmsg_socket >= 0);
a258bf26 1444
e58a1277 1445 u = umask(0000);
a258bf26 1446
f1e5dfe2
LP
1447 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1448 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1449 * on the reading side behave very similar to /proc/kmsg,
1450 * their writing side behaves differently from /dev/kmsg in
1451 * that writing blocks when nothing is reading. In order to
1452 * avoid any problems with containers deadlocking due to this
1453 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1454 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1455 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1456 return log_oom();
e58a1277 1457
4a62c710
MS
1458 if (mkfifo(from, 0600) < 0)
1459 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
e58a1277
LP
1460
1461 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1462 if (r < 0)
1463 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277 1464
4a62c710
MS
1465 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1466 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1467
1468 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1469 if (fd < 0)
1470 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1471
e58a1277
LP
1472 cmsg = CMSG_FIRSTHDR(&mh);
1473 cmsg->cmsg_level = SOL_SOCKET;
1474 cmsg->cmsg_type = SCM_RIGHTS;
1475 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1476 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1477
1478 mh.msg_controllen = cmsg->cmsg_len;
1479
1480 /* Store away the fd in the socket, so that it stays open as
1481 * long as we run the child */
6d0b55c2 1482 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1483 safe_close(fd);
e58a1277 1484
4a62c710
MS
1485 if (k < 0)
1486 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1487
f1e5dfe2
LP
1488 /* And now make the FIFO unavailable as /dev/kmsg... */
1489 unlink(from);
25ea79fe 1490 return 0;
88213476
LP
1491}
1492
6d0b55c2
LP
1493static int send_rtnl(int send_fd) {
1494 union {
1495 struct cmsghdr cmsghdr;
1496 uint8_t buf[CMSG_SPACE(sizeof(int))];
1497 } control = {};
1498 struct msghdr mh = {
1499 .msg_control = &control,
1500 .msg_controllen = sizeof(control),
1501 };
1502 struct cmsghdr *cmsg;
1503 _cleanup_close_ int fd = -1;
1504 ssize_t k;
1505
1506 assert(send_fd >= 0);
1507
1508 if (!arg_expose_ports)
1509 return 0;
1510
1511 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1512 if (fd < 0)
1513 return log_error_errno(errno, "failed to allocate container netlink: %m");
1514
1515 cmsg = CMSG_FIRSTHDR(&mh);
1516 cmsg->cmsg_level = SOL_SOCKET;
1517 cmsg->cmsg_type = SCM_RIGHTS;
1518 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1519 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1520
1521 mh.msg_controllen = cmsg->cmsg_len;
1522
1523 /* Store away the fd in the socket, so that it stays open as
1524 * long as we run the child */
1525 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1526 if (k < 0)
1527 return log_error_errno(errno, "Failed to send netlink fd: %m");
1528
1529 return 0;
1530}
1531
1532static int flush_ports(union in_addr_union *exposed) {
1533 ExposePort *p;
1534 int r, af = AF_INET;
1535
1536 assert(exposed);
1537
1538 if (!arg_expose_ports)
1539 return 0;
1540
1541 if (in_addr_is_null(af, exposed))
1542 return 0;
1543
1544 log_debug("Lost IP address.");
1545
1546 LIST_FOREACH(ports, p, arg_expose_ports) {
1547 r = fw_add_local_dnat(false,
1548 af,
1549 p->protocol,
1550 NULL,
1551 NULL, 0,
1552 NULL, 0,
1553 p->host_port,
1554 exposed,
1555 p->container_port,
1556 NULL);
1557 if (r < 0)
1558 log_warning_errno(r, "Failed to modify firewall: %m");
1559 }
1560
1561 *exposed = IN_ADDR_NULL;
1562 return 0;
1563}
1564
1565static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1566 _cleanup_free_ struct local_address *addresses = NULL;
1567 _cleanup_free_ char *pretty = NULL;
1568 union in_addr_union new_exposed;
1569 ExposePort *p;
1570 bool add;
1571 int af = AF_INET, r;
1572
1573 assert(exposed);
1574
1575 /* Invoked each time an address is added or removed inside the
1576 * container */
1577
1578 if (!arg_expose_ports)
1579 return 0;
1580
1581 r = local_addresses(rtnl, 0, af, &addresses);
1582 if (r < 0)
1583 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1584
1585 add = r > 0 &&
1586 addresses[0].family == af &&
1587 addresses[0].scope < RT_SCOPE_LINK;
1588
1589 if (!add)
1590 return flush_ports(exposed);
1591
1592 new_exposed = addresses[0].address;
1593 if (in_addr_equal(af, exposed, &new_exposed))
1594 return 0;
1595
1596 in_addr_to_string(af, &new_exposed, &pretty);
1597 log_debug("New container IP is %s.", strna(pretty));
1598
1599 LIST_FOREACH(ports, p, arg_expose_ports) {
1600
1601 r = fw_add_local_dnat(true,
1602 af,
1603 p->protocol,
1604 NULL,
1605 NULL, 0,
1606 NULL, 0,
1607 p->host_port,
1608 &new_exposed,
1609 p->container_port,
1610 in_addr_is_null(af, exposed) ? NULL : exposed);
1611 if (r < 0)
1612 log_warning_errno(r, "Failed to modify firewall: %m");
1613 }
1614
1615 *exposed = new_exposed;
1616 return 0;
1617}
1618
1619static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1620 union in_addr_union *exposed = userdata;
1621
1622 assert(rtnl);
1623 assert(m);
1624 assert(exposed);
1625
1626 expose_ports(rtnl, exposed);
1627 return 0;
1628}
1629
1630static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1631 union {
1632 struct cmsghdr cmsghdr;
1633 uint8_t buf[CMSG_SPACE(sizeof(int))];
1634 } control = {};
1635 struct msghdr mh = {
1636 .msg_control = &control,
1637 .msg_controllen = sizeof(control),
1638 };
1639 struct cmsghdr *cmsg;
1640 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1641 int fd, r;
1642 ssize_t k;
1643
1644 assert(event);
1645 assert(recv_fd >= 0);
1646 assert(ret);
1647
1648 if (!arg_expose_ports)
1649 return 0;
1650
1651 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1652 if (k < 0)
1653 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1654
1655 cmsg = CMSG_FIRSTHDR(&mh);
1656 assert(cmsg->cmsg_level == SOL_SOCKET);
1657 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 1658 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
1659 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1660
1661 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1662 if (r < 0) {
1663 safe_close(fd);
1664 return log_error_errno(r, "Failed to create rtnl object: %m");
1665 }
1666
1667 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1668 if (r < 0)
1669 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1670
1671 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1672 if (r < 0)
1673 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1674
1675 r = sd_rtnl_attach_event(rtnl, event, 0);
1676 if (r < 0)
1677 return log_error_errno(r, "Failed to add to even loop: %m");
1678
1679 *ret = rtnl;
1680 rtnl = NULL;
1681
1682 return 0;
1683}
1684
3a74cea5 1685static int setup_hostname(void) {
3a74cea5 1686
eb91eb18
LP
1687 if (arg_share_system)
1688 return 0;
1689
605f81a8 1690 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1691 return -errno;
3a74cea5 1692
7027ff61 1693 return 0;
3a74cea5
LP
1694}
1695
57fb9fb5 1696static int setup_journal(const char *directory) {
4d680aee 1697 sd_id128_t machine_id, this_id;
7fd1b19b 1698 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1699 char *id;
57fb9fb5
LP
1700 int r;
1701
df9a75e4
LP
1702 /* Don't link journals in ephemeral mode */
1703 if (arg_ephemeral)
1704 return 0;
1705
57fb9fb5 1706 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1707 if (!p)
1708 return log_oom();
57fb9fb5
LP
1709
1710 r = read_one_line_file(p, &b);
27407a01
ZJS
1711 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1712 return 0;
f647962d
MS
1713 else if (r < 0)
1714 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 1715
27407a01
ZJS
1716 id = strstrip(b);
1717 if (isempty(id) && arg_link_journal == LINK_AUTO)
1718 return 0;
57fb9fb5 1719
27407a01
ZJS
1720 /* Verify validity */
1721 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
1722 if (r < 0)
1723 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 1724
4d680aee 1725 r = sd_id128_get_machine(&this_id);
f647962d
MS
1726 if (r < 0)
1727 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1728
1729 if (sd_id128_equal(machine_id, this_id)) {
1730 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1731 "Host and machine ids are equal (%s): refusing to link journals", id);
1732 if (arg_link_journal == LINK_AUTO)
1733 return 0;
df9a75e4 1734 return -EEXIST;
4d680aee
ZJS
1735 }
1736
1737 if (arg_link_journal == LINK_NO)
1738 return 0;
1739
57fb9fb5 1740 free(p);
27407a01
ZJS
1741 p = strappend("/var/log/journal/", id);
1742 q = strjoin(directory, "/var/log/journal/", id, NULL);
1743 if (!p || !q)
1744 return log_oom();
1745
1746 if (path_is_mount_point(p, false) > 0) {
1747 if (arg_link_journal != LINK_AUTO) {
1748 log_error("%s: already a mount point, refusing to use for journal", p);
1749 return -EEXIST;
1750 }
1751
1752 return 0;
57fb9fb5
LP
1753 }
1754
27407a01 1755 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1756 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1757 log_error("%s: already a mount point, refusing to use for journal", q);
1758 return -EEXIST;
57fb9fb5
LP
1759 }
1760
27407a01 1761 return 0;
57fb9fb5
LP
1762 }
1763
1764 r = readlink_and_make_absolute(p, &d);
1765 if (r >= 0) {
1766 if ((arg_link_journal == LINK_GUEST ||
1767 arg_link_journal == LINK_AUTO) &&
1768 path_equal(d, q)) {
1769
27407a01
ZJS
1770 r = mkdir_p(q, 0755);
1771 if (r < 0)
56f64d95 1772 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1773 return 0;
57fb9fb5
LP
1774 }
1775
4a62c710
MS
1776 if (unlink(p) < 0)
1777 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1778 } else if (r == -EINVAL) {
1779
1780 if (arg_link_journal == LINK_GUEST &&
1781 rmdir(p) < 0) {
1782
27407a01
ZJS
1783 if (errno == ENOTDIR) {
1784 log_error("%s already exists and is neither a symlink nor a directory", p);
1785 return r;
1786 } else {
56f64d95 1787 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1788 return -errno;
57fb9fb5 1789 }
57fb9fb5
LP
1790 }
1791 } else if (r != -ENOENT) {
56f64d95 1792 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1793 return r;
57fb9fb5
LP
1794 }
1795
1796 if (arg_link_journal == LINK_GUEST) {
1797
1798 if (symlink(q, p) < 0) {
574edc90 1799 if (arg_link_journal_try) {
56f64d95 1800 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1801 return 0;
1802 } else {
56f64d95 1803 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1804 return -errno;
1805 }
57fb9fb5
LP
1806 }
1807
27407a01
ZJS
1808 r = mkdir_p(q, 0755);
1809 if (r < 0)
56f64d95 1810 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1811 return 0;
57fb9fb5
LP
1812 }
1813
1814 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1815 /* don't create parents here -- if the host doesn't have
1816 * permanent journal set up, don't force it here */
1817 r = mkdir(p, 0755);
57fb9fb5 1818 if (r < 0) {
574edc90 1819 if (arg_link_journal_try) {
56f64d95 1820 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1821 return 0;
1822 } else {
56f64d95 1823 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1824 return r;
1825 }
57fb9fb5
LP
1826 }
1827
27407a01
ZJS
1828 } else if (access(p, F_OK) < 0)
1829 return 0;
57fb9fb5 1830
cdb2b9d0
LP
1831 if (dir_is_empty(q) == 0)
1832 log_warning("%s is not empty, proceeding anyway.", q);
1833
57fb9fb5
LP
1834 r = mkdir_p(q, 0755);
1835 if (r < 0) {
56f64d95 1836 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1837 return r;
57fb9fb5
LP
1838 }
1839
4a62c710
MS
1840 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1841 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1842
27407a01 1843 return 0;
57fb9fb5
LP
1844}
1845
88213476 1846static int drop_capabilities(void) {
5076f0cc 1847 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1848}
1849
5aa4bb6b 1850static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1851 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1852 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1853 int r;
1854
eb91eb18
LP
1855 if (!arg_register)
1856 return 0;
1857
1c03020c 1858 r = sd_bus_default_system(&bus);
f647962d
MS
1859 if (r < 0)
1860 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1861
89f7c846
LP
1862 if (arg_keep_unit) {
1863 r = sd_bus_call_method(
1864 bus,
1865 "org.freedesktop.machine1",
1866 "/org/freedesktop/machine1",
1867 "org.freedesktop.machine1.Manager",
5aa4bb6b 1868 "RegisterMachineWithNetwork",
89f7c846
LP
1869 &error,
1870 NULL,
5aa4bb6b 1871 "sayssusai",
89f7c846
LP
1872 arg_machine,
1873 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1874 "nspawn",
1875 "container",
1876 (uint32_t) pid,
5aa4bb6b
LP
1877 strempty(arg_directory),
1878 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1879 } else {
9457ac5b
LP
1880 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1881
1882 r = sd_bus_message_new_method_call(
89f7c846 1883 bus,
9457ac5b 1884 &m,
89f7c846
LP
1885 "org.freedesktop.machine1",
1886 "/org/freedesktop/machine1",
1887 "org.freedesktop.machine1.Manager",
5aa4bb6b 1888 "CreateMachineWithNetwork");
f647962d
MS
1889 if (r < 0)
1890 return log_error_errno(r, "Failed to create message: %m");
9457ac5b
LP
1891
1892 r = sd_bus_message_append(
1893 m,
5aa4bb6b 1894 "sayssusai",
89f7c846
LP
1895 arg_machine,
1896 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1897 "nspawn",
1898 "container",
1899 (uint32_t) pid,
5aa4bb6b
LP
1900 strempty(arg_directory),
1901 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d
MS
1902 if (r < 0)
1903 return log_error_errno(r, "Failed to append message arguments: %m");
9457ac5b
LP
1904
1905 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d
MS
1906 if (r < 0)
1907 return log_error_errno(r, "Failed to open container: %m");
9457ac5b
LP
1908
1909 if (!isempty(arg_slice)) {
1910 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d
MS
1911 if (r < 0)
1912 return log_error_errno(r, "Failed to append slice: %m");
9457ac5b
LP
1913 }
1914
1915 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d
MS
1916 if (r < 0)
1917 return log_error_errno(r, "Failed to add device policy: %m");
9457ac5b 1918
63cc4c31 1919 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
1920 /* Allow the container to
1921 * access and create the API
1922 * device nodes, so that
1923 * PrivateDevices= in the
1924 * container can work
1925 * fine */
1926 "/dev/null", "rwm",
1927 "/dev/zero", "rwm",
1928 "/dev/full", "rwm",
1929 "/dev/random", "rwm",
1930 "/dev/urandom", "rwm",
1931 "/dev/tty", "rwm",
864e1706 1932 "/dev/net/tun", "rwm",
9457ac5b
LP
1933 /* Allow the container
1934 * access to ptys. However,
1935 * do not permit the
1936 * container to ever create
1937 * these device nodes. */
1938 "/dev/pts/ptmx", "rw",
63cc4c31 1939 "char-pts", "rw");
f647962d
MS
1940 if (r < 0)
1941 return log_error_errno(r, "Failed to add device whitelist: %m");
9457ac5b
LP
1942
1943 r = sd_bus_message_close_container(m);
f647962d
MS
1944 if (r < 0)
1945 return log_error_errno(r, "Failed to close container: %m");
9457ac5b
LP
1946
1947 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1948 }
1949
9444b1f2 1950 if (r < 0) {
1f0cd86b
LP
1951 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1952 return r;
1953 }
1954
1955 return 0;
1956}
1957
1958static int terminate_machine(pid_t pid) {
1959 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1960 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 1961 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
1962 const char *path;
1963 int r;
1964
eb91eb18
LP
1965 if (!arg_register)
1966 return 0;
1967
76b54375 1968 r = sd_bus_default_system(&bus);
f647962d
MS
1969 if (r < 0)
1970 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
1971
1972 r = sd_bus_call_method(
1973 bus,
1974 "org.freedesktop.machine1",
1975 "/org/freedesktop/machine1",
1976 "org.freedesktop.machine1.Manager",
1977 "GetMachineByPID",
1978 &error,
1979 &reply,
1980 "u",
1981 (uint32_t) pid);
1982 if (r < 0) {
1983 /* Note that the machine might already have been
1984 * cleaned up automatically, hence don't consider it a
1985 * failure if we cannot get the machine object. */
1986 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1987 return 0;
1988 }
1989
1990 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1991 if (r < 0)
1992 return bus_log_parse_error(r);
9444b1f2 1993
1f0cd86b
LP
1994 r = sd_bus_call_method(
1995 bus,
1996 "org.freedesktop.machine1",
1997 path,
1998 "org.freedesktop.machine1.Machine",
1999 "Terminate",
2000 &error,
2001 NULL,
2002 NULL);
2003 if (r < 0) {
2004 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2005 return 0;
2006 }
2007
9444b1f2
LP
2008 return 0;
2009}
2010
db999e0f
LP
2011static int reset_audit_loginuid(void) {
2012 _cleanup_free_ char *p = NULL;
2013 int r;
2014
2015 if (arg_share_system)
2016 return 0;
2017
2018 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2019 if (r == -ENOENT)
db999e0f 2020 return 0;
f647962d
MS
2021 if (r < 0)
2022 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2023
2024 /* Already reset? */
2025 if (streq(p, "4294967295"))
2026 return 0;
2027
2028 r = write_string_file("/proc/self/loginuid", "4294967295");
2029 if (r < 0) {
2030 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2031 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2032 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2033 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2034 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 2035
db999e0f 2036 sleep(5);
77b6e194 2037 }
db999e0f
LP
2038
2039 return 0;
77b6e194
LP
2040}
2041
4f758c23
LP
2042#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2043#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 2044#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2045
a90e2305 2046static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2047 uint8_t result[8];
2048 size_t l, sz;
a90e2305
LP
2049 uint8_t *v, *i;
2050 int r;
01dde061
TG
2051
2052 l = strlen(arg_machine);
2053 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2054 if (idx > 0)
2055 sz += sizeof(idx);
a90e2305 2056
01dde061
TG
2057 v = alloca(sz);
2058
2059 /* fetch some persistent data unique to the host */
2060 r = sd_id128_get_machine((sd_id128_t*) v);
2061 if (r < 0)
2062 return r;
2063
2064 /* combine with some data unique (on this host) to this
2065 * container instance */
a90e2305
LP
2066 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2067 if (idx > 0) {
2068 idx = htole64(idx);
2069 memcpy(i, &idx, sizeof(idx));
2070 }
01dde061
TG
2071
2072 /* Let's hash the host machine ID plus the container name. We
2073 * use a fixed, but originally randomly created hash key here. */
4f758c23 2074 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2075
2076 assert_cc(ETH_ALEN <= sizeof(result));
2077 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2078
2079 /* see eth_random_addr in the kernel */
2080 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2081 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2082
2083 return 0;
2084}
2085
5aa4bb6b 2086static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 2087 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 2088 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 2089 struct ether_addr mac_host, mac_container;
5aa4bb6b 2090 int r, i;
69c79d3c
LP
2091
2092 if (!arg_private_network)
2093 return 0;
2094
2095 if (!arg_network_veth)
2096 return 0;
2097
08af0da2
LP
2098 /* Use two different interface name prefixes depending whether
2099 * we are in bridge mode or not. */
c00524c9 2100 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2101 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2102
e867ceb6
LP
2103 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2104 if (r < 0)
2105 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2106
e867ceb6
LP
2107 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2108 if (r < 0)
2109 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2110
151b9b96 2111 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2112 if (r < 0)
2113 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2114
151b9b96 2115 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2116 if (r < 0)
2117 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2118
ab046dde 2119 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2120 if (r < 0)
2121 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2122
4f758c23 2123 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2124 if (r < 0)
2125 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2126
ee3a6a51 2127 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2128 if (r < 0)
2129 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2130
d8e538ec 2131 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2132 if (r < 0)
2133 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2134
ee3a6a51 2135 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2136 if (r < 0)
2137 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2138
ab046dde 2139 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2140 if (r < 0)
2141 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2142
4f758c23 2143 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2144 if (r < 0)
2145 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2146
ab046dde 2147 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2148 if (r < 0)
2149 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
2150
2151 r = sd_rtnl_message_close_container(m);
f647962d
MS
2152 if (r < 0)
2153 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2154
2155 r = sd_rtnl_message_close_container(m);
f647962d
MS
2156 if (r < 0)
2157 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2158
2159 r = sd_rtnl_message_close_container(m);
f647962d
MS
2160 if (r < 0)
2161 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2162
2163 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2164 if (r < 0)
2165 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 2166
5aa4bb6b 2167 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2168 if (i <= 0)
2169 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2170
2171 *ifi = i;
2172
69c79d3c
LP
2173 return 0;
2174}
2175
5aa4bb6b 2176static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
2177 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2178 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2179 int r, bridge;
2180
2181 if (!arg_private_network)
2182 return 0;
2183
2184 if (!arg_network_veth)
2185 return 0;
2186
2187 if (!arg_network_bridge)
2188 return 0;
2189
2190 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2191 if (bridge <= 0)
2192 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2193
5aa4bb6b
LP
2194 *ifi = bridge;
2195
151b9b96 2196 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2197 if (r < 0)
2198 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2199
151b9b96 2200 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2201 if (r < 0)
2202 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2203
039dd4af 2204 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2205 if (r < 0)
2206 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2207
ab046dde 2208 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2209 if (r < 0)
2210 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
2211
2212 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2213 if (r < 0)
2214 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
2215
2216 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2217 if (r < 0)
2218 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2219
2220 return 0;
2221}
2222
c74e630d
LP
2223static int parse_interface(struct udev *udev, const char *name) {
2224 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2225 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2226 int ifi;
2227
2228 ifi = (int) if_nametoindex(name);
4a62c710
MS
2229 if (ifi <= 0)
2230 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2231
2232 sprintf(ifi_str, "n%i", ifi);
2233 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2234 if (!d)
2235 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2236
2237 if (udev_device_get_is_initialized(d) <= 0) {
2238 log_error("Network interface %s is not initialized yet.", name);
2239 return -EBUSY;
2240 }
2241
2242 return ifi;
2243}
2244
69c79d3c 2245static int move_network_interfaces(pid_t pid) {
7e227024 2246 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 2247 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
2248 char **i;
2249 int r;
2250
2251 if (!arg_private_network)
2252 return 0;
2253
2254 if (strv_isempty(arg_network_interfaces))
2255 return 0;
2256
151b9b96 2257 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2258 if (r < 0)
2259 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2260
7e227024
LP
2261 udev = udev_new();
2262 if (!udev) {
2263 log_error("Failed to connect to udev.");
2264 return -ENOMEM;
2265 }
2266
aa28aefe 2267 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 2268 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 2269 int ifi;
aa28aefe 2270
c74e630d
LP
2271 ifi = parse_interface(udev, *i);
2272 if (ifi < 0)
2273 return ifi;
2274
3125b3ef 2275 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2276 if (r < 0)
2277 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2278
c74e630d 2279 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2280 if (r < 0)
2281 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2282
c74e630d 2283 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2284 if (r < 0)
2285 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2286 }
7e227024 2287
c74e630d
LP
2288 return 0;
2289}
2290
2291static int setup_macvlan(pid_t pid) {
2292 _cleanup_udev_unref_ struct udev *udev = NULL;
2293 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
e867ceb6 2294 unsigned idx = 0;
c74e630d
LP
2295 char **i;
2296 int r;
2297
2298 if (!arg_private_network)
2299 return 0;
2300
2301 if (strv_isempty(arg_network_macvlan))
2302 return 0;
2303
2304 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2305 if (r < 0)
2306 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2307
2308 udev = udev_new();
2309 if (!udev) {
2310 log_error("Failed to connect to udev.");
2311 return -ENOMEM;
2312 }
2313
2314 STRV_FOREACH(i, arg_network_macvlan) {
2315 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2316 _cleanup_free_ char *n = NULL;
e867ceb6 2317 struct ether_addr mac;
c74e630d
LP
2318 int ifi;
2319
2320 ifi = parse_interface(udev, *i);
2321 if (ifi < 0)
2322 return ifi;
2323
e867ceb6
LP
2324 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2325 if (r < 0)
2326 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2327
c74e630d 2328 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2329 if (r < 0)
2330 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2331
c74e630d 2332 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2333 if (r < 0)
2334 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2335
2336 n = strappend("mv-", *i);
2337 if (!n)
2338 return log_oom();
2339
2340 strshorten(n, IFNAMSIZ-1);
2341
2342 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2343 if (r < 0)
2344 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2345
e867ceb6
LP
2346 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2347 if (r < 0)
2348 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2349
aa28aefe 2350 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2351 if (r < 0)
2352 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
2353
2354 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2355 if (r < 0)
2356 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2357
d8e538ec 2358 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2359 if (r < 0)
2360 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
2361
2362 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2363 if (r < 0)
2364 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
2365
2366 r = sd_rtnl_message_close_container(m);
f647962d
MS
2367 if (r < 0)
2368 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
2369
2370 r = sd_rtnl_message_close_container(m);
f647962d
MS
2371 if (r < 0)
2372 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
2373
2374 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2375 if (r < 0)
2376 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2377 }
2378
2379 return 0;
2380}
2381
/* Builds and loads a seccomp filter that allows everything by default but:
 *
 *   - makes a fixed set of syscalls (kexec, module loading, raw I/O port
 *     access, swap control, open_by_handle_at) fail with EPERM, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * The filter's SCMP_FLTATR_CTL_NNP attribute is set to 0 before loading.
 *
 * Returns 0 when built without seccomp support; otherwise the result of
 * the last libseccomp call made (negative on error). */
static int setup_seccomp(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);

                /* -EFAULT indicates an unknown syscall, which we simply skip */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /* Audit is broken in containers: much of the userspace audit
         * hookup will fail if running inside a container. We don't care
         * and just turn off creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace uses as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#endif

}
2461
785890ac
LP
2462static int setup_propagate(const char *root) {
2463 const char *p, *q;
2464
2465 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2466 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2467 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2468 (void) mkdir_p(p, 0600);
2469
2470 q = strappenda(root, "/run/systemd/nspawn/incoming");
2471 mkdir_parents(q, 0755);
2472 mkdir_p(q, 0600);
2473
2474 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2475 return log_error_errno(errno, "Failed to install propagation bind mount.");
2476
2477 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2478 return log_error_errno(errno, "Failed to make propagation mount read-only");
2479
2480 return 0;
2481}
2482
1b9e5b12
LP
2483static int setup_image(char **device_path, int *loop_nr) {
2484 struct loop_info64 info = {
2485 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2486 };
2487 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2488 _cleanup_free_ char* loopdev = NULL;
2489 struct stat st;
2490 int r, nr;
2491
2492 assert(device_path);
2493 assert(loop_nr);
ec16945e 2494 assert(arg_image);
1b9e5b12
LP
2495
2496 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2497 if (fd < 0)
2498 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2499
4a62c710
MS
2500 if (fstat(fd, &st) < 0)
2501 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2502
2503 if (S_ISBLK(st.st_mode)) {
2504 char *p;
2505
2506 p = strdup(arg_image);
2507 if (!p)
2508 return log_oom();
2509
2510 *device_path = p;
2511
2512 *loop_nr = -1;
2513
2514 r = fd;
2515 fd = -1;
2516
2517 return r;
2518 }
2519
2520 if (!S_ISREG(st.st_mode)) {
56f64d95 2521 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2522 return -EINVAL;
2523 }
2524
2525 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
2526 if (control < 0)
2527 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2528
2529 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
2530 if (nr < 0)
2531 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2532
2533 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2534 return log_oom();
2535
2536 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2537 if (loop < 0)
2538 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 2539
4a62c710
MS
2540 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2541 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2542
2543 if (arg_read_only)
2544 info.lo_flags |= LO_FLAGS_READ_ONLY;
2545
4a62c710
MS
2546 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2547 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2548
2549 *device_path = loopdev;
2550 loopdev = NULL;
2551
2552 *loop_nr = nr;
2553
2554 r = loop;
2555 loop = -1;
2556
2557 return r;
2558}
2559
ada4799a
LP
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Inspects the partition table of the block device open on fd and picks
 * the partitions to mount for the container.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are skipped. Partitions are
 * matched by type UUID against GPT_HOME, GPT_SRV and, where defined for
 * the build architecture, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY. If
 * several partitions share a type, the one with the lowest partition
 * number wins. The read-only GPT flag is reported through the *_rw
 * outputs.
 *
 * MBR: only partitions whose flags are exactly 0x80 (bootable) and whose
 * type is 0x83 (Linux) are considered; more than one such partition is an
 * error.
 *
 * On success returns 0 and passes ownership of newly allocated device node
 * paths to the caller: *root_device is always set (with *secondary telling
 * whether the secondary-architecture root was used), while *home_device
 * and *srv_device and their *_rw flags are only written when a matching
 * partition was found. On failure returns a negative errno-style value.
 * Returns -ENOTSUP when built without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;
        bool is_gpt, is_mbr;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe: %m");
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        /* Enumerate the devices that have our block device as parent */
        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (is_gpt && (flags & GPT_FLAG_NO_AUTO))
                        continue;
                if (is_mbr && (flags != 0x80)) /* Bootable flag */
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif

                } else if (is_mbr) {
                        int type;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* Note that there's a certain, intended asymmetry
                         * here: while for GPT we simply take the first
                         * valid partition and ignore all others of the
                         * same type, for MBR we fail if there are multiple
                         * suitable partitions. This is because the GPT
                         * partition types are defined by us, and hence we
                         * can define their lookup semantics, while for the
                         * MBR logic we reuse existing definitions, and
                         * simply don't want to guess. */

                        if (root) {
                                log_error("Identified multiple bootable Linux 0x83 partitions on\n"
                                          " %s\n"
                                          PARTITION_TABLE_BLURB, arg_image);
                                return -EINVAL;
                        }

                        /* The partition number is deliberately not recorded
                         * here: it is only needed for the GPT tie-breaking
                         * above, and root_nr does not even exist on
                         * architectures without GPT_ROOT_NATIVE. */

                        r = free_and_strdup(&root, node);
                        if (r < 0)
                                return log_oom();
                }
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        /* Prefer the native root over the secondary one; ownership of the
         * strings moves to the caller. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2861
/* Probes the file system type of block device `what` with libblkid and
 * mounts it on `where`, or on `where` + `directory` if `directory` is
 * non-NULL.
 *
 * The mount is always MS_NODEV, and read-only unless rw is true and
 * arg_read_only is unset. LUKS volumes are rejected with -ENOTSUP.
 *
 * Returns 0 on success, a negative errno-style value on failure, and
 * -ENOTSUP when built without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A hard
                 * error (-1) is handled below, so that errno is reported. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2925
727fd4fd
LP
2926static int mount_devices(
2927 const char *where,
2928 const char *root_device, bool root_device_rw,
2929 const char *home_device, bool home_device_rw,
2930 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2931 int r;
2932
2933 assert(where);
2934
2935 if (root_device) {
727fd4fd 2936 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2937 if (r < 0)
2938 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2939 }
2940
2941 if (home_device) {
727fd4fd 2942 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2943 if (r < 0)
2944 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2945 }
2946
2947 if (srv_device) {
727fd4fd 2948 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2949 if (r < 0)
2950 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2951 }
2952
2953 return 0;
2954}
2955
2956static void loop_remove(int nr, int *image_fd) {
2957 _cleanup_close_ int control = -1;
e8c8ddcc 2958 int r;
1b9e5b12
LP
2959
2960 if (nr < 0)
2961 return;
2962
2963 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2964 r = ioctl(*image_fd, LOOP_CLR_FD);
2965 if (r < 0)
5e4074aa 2966 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 2967 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2968 }
2969
2970 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2971 if (control < 0) {
56f64d95 2972 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2973 return;
e8c8ddcc 2974 }
1b9e5b12 2975
e8c8ddcc
TG
2976 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2977 if (r < 0)
5e4074aa 2978 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2979}
2980
0cb9fbcd
LP
2981static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2982 int pipe_fds[2];
2983 pid_t pid;
2984
2985 assert(database);
2986 assert(key);
2987 assert(rpid);
2988
4a62c710
MS
2989 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2990 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
2991
2992 pid = fork();
4a62c710
MS
2993 if (pid < 0)
2994 return log_error_errno(errno, "Failed to fork getent child: %m");
2995 else if (pid == 0) {
0cb9fbcd
LP
2996 int nullfd;
2997 char *empty_env = NULL;
2998
2999 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3000 _exit(EXIT_FAILURE);
3001
3002 if (pipe_fds[0] > 2)
03e334a1 3003 safe_close(pipe_fds[0]);
0cb9fbcd 3004 if (pipe_fds[1] > 2)
03e334a1 3005 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3006
3007 nullfd = open("/dev/null", O_RDWR);
3008 if (nullfd < 0)
3009 _exit(EXIT_FAILURE);
3010
3011 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3012 _exit(EXIT_FAILURE);
3013
3014 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3015 _exit(EXIT_FAILURE);
3016
3017 if (nullfd > 2)
03e334a1 3018 safe_close(nullfd);
0cb9fbcd
LP
3019
3020 reset_all_signal_handlers();
3021 close_all_fds(NULL, 0);
3022
4de82926
MM
3023 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3024 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3025 _exit(EXIT_FAILURE);
3026 }
3027
03e334a1 3028 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3029
3030 *rpid = pid;
3031
3032 return pipe_fds[0];
3033}
3034
3035static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3036 char line[LINE_MAX], *x, *u, *g, *h;
3037 const char *word, *state;
0cb9fbcd
LP
3038 _cleanup_free_ uid_t *uids = NULL;
3039 _cleanup_free_ char *home = NULL;
3040 _cleanup_fclose_ FILE *f = NULL;
3041 _cleanup_close_ int fd = -1;
3042 unsigned n_uids = 0;
70f539ca 3043 size_t sz = 0, l;
0cb9fbcd
LP
3044 uid_t uid;
3045 gid_t gid;
3046 pid_t pid;
3047 int r;
3048
3049 assert(_home);
3050
3051 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3052 /* Reset everything fully to 0, just in case */
3053
4a62c710
MS
3054 if (setgroups(0, NULL) < 0)
3055 return log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd 3056
4a62c710
MS
3057 if (setresgid(0, 0, 0) < 0)
3058 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3059
4a62c710
MS
3060 if (setresuid(0, 0, 0) < 0)
3061 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3062
3063 *_home = NULL;
3064 return 0;
3065 }
3066
3067 /* First, get user credentials */
3068 fd = spawn_getent("passwd", arg_user, &pid);
3069 if (fd < 0)
3070 return fd;
3071
3072 f = fdopen(fd, "r");
3073 if (!f)
3074 return log_oom();
3075 fd = -1;
3076
3077 if (!fgets(line, sizeof(line), f)) {
3078
3079 if (!ferror(f)) {
3080 log_error("Failed to resolve user %s.", arg_user);
3081 return -ESRCH;
3082 }
3083
56f64d95 3084 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3085 return -errno;
3086 }
3087
3088 truncate_nl(line);
3089
820d3acf 3090 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3091
3092 x = strchr(line, ':');
3093 if (!x) {
3094 log_error("/etc/passwd entry has invalid user field.");
3095 return -EIO;
3096 }
3097
3098 u = strchr(x+1, ':');
3099 if (!u) {
3100 log_error("/etc/passwd entry has invalid password field.");
3101 return -EIO;
3102 }
3103
3104 u++;
3105 g = strchr(u, ':');
3106 if (!g) {
3107 log_error("/etc/passwd entry has invalid UID field.");
3108 return -EIO;
3109 }
3110
3111 *g = 0;
3112 g++;
3113 x = strchr(g, ':');
3114 if (!x) {
3115 log_error("/etc/passwd entry has invalid GID field.");
3116 return -EIO;
3117 }
3118
3119 *x = 0;
3120 h = strchr(x+1, ':');
3121 if (!h) {
3122 log_error("/etc/passwd entry has invalid GECOS field.");
3123 return -EIO;
3124 }
3125
3126 h++;
3127 x = strchr(h, ':');
3128 if (!x) {
3129 log_error("/etc/passwd entry has invalid home directory field.");
3130 return -EIO;
3131 }
3132
3133 *x = 0;
3134
3135 r = parse_uid(u, &uid);
3136 if (r < 0) {
3137 log_error("Failed to parse UID of user.");
3138 return -EIO;
3139 }
3140
3141 r = parse_gid(g, &gid);
3142 if (r < 0) {
3143 log_error("Failed to parse GID of user.");
3144 return -EIO;
3145 }
3146
3147 home = strdup(h);
3148 if (!home)
3149 return log_oom();
3150
3151 /* Second, get group memberships */
3152 fd = spawn_getent("initgroups", arg_user, &pid);
3153 if (fd < 0)
3154 return fd;
3155
3156 fclose(f);
3157 f = fdopen(fd, "r");
3158 if (!f)
3159 return log_oom();
3160 fd = -1;
3161
3162 if (!fgets(line, sizeof(line), f)) {
3163 if (!ferror(f)) {
3164 log_error("Failed to resolve user %s.", arg_user);
3165 return -ESRCH;
3166 }
3167
56f64d95 3168 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3169 return -errno;
3170 }
3171
3172 truncate_nl(line);
3173
820d3acf 3174 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3175
3176 /* Skip over the username and subsequent separator whitespace */
3177 x = line;
3178 x += strcspn(x, WHITESPACE);
3179 x += strspn(x, WHITESPACE);
3180
a2a5291b 3181 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3182 char c[l+1];
3183
a2a5291b 3184 memcpy(c, word, l);
0cb9fbcd
LP
3185 c[l] = 0;
3186
3187 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3188 return log_oom();
3189
3190 r = parse_uid(c, &uids[n_uids++]);
3191 if (r < 0) {
3192 log_error("Failed to parse group data from getent.");
3193 return -EIO;
3194 }
3195 }
3196
3197 r = mkdir_parents(home, 0775);
f647962d
MS
3198 if (r < 0)
3199 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3200
3201 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3202 if (r < 0 && r != -EEXIST)
3203 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
3204
3205 fchown(STDIN_FILENO, uid, gid);
3206 fchown(STDOUT_FILENO, uid, gid);
3207 fchown(STDERR_FILENO, uid, gid);
3208
4a62c710
MS
3209 if (setgroups(n_uids, uids) < 0)
3210 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3211
4a62c710
MS
3212 if (setresgid(gid, gid, gid) < 0)
3213 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3214
4a62c710
MS
3215 if (setresuid(uid, uid, uid) < 0)
3216 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3217
3218 if (_home) {
3219 *_home = home;
3220 home = NULL;
3221 }
3222
3223 return 0;
3224}
3225
113cea80 3226/*
6d416b9c
LS
3227 * Return values:
3228 * < 0 : wait_for_terminate() failed to get the state of the
3229 * container, the container was terminated by a signal, or
3230 * failed for an unknown reason. No change is made to the
3231 * container argument.
3232 * > 0 : The program executed in the container terminated with an
3233 * error. The exit code of the program executed in the
919699ec
LP
3234 * container is returned. The container argument has been set
3235 * to CONTAINER_TERMINATED.
6d416b9c
LS
3236 * 0 : The container is being rebooted, has been shut down or exited
3237 * successfully. The container argument has been set to either
3238 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3239 *
6d416b9c
LS
3240 * That is, success is indicated by a return value of zero, and an
3241 * error is indicated by a non-zero value.
113cea80
DH
3242 */
3243static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3244 siginfo_t status;
919699ec 3245 int r;
113cea80
DH
3246
3247 r = wait_for_terminate(pid, &status);
f647962d
MS
3248 if (r < 0)
3249 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3250
3251 switch (status.si_code) {
fddbb89c 3252
113cea80 3253 case CLD_EXITED:
919699ec
LP
3254 if (status.si_status == 0) {
3255 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3256
fddbb89c 3257 } else
919699ec 3258 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3259
919699ec
LP
3260 *container = CONTAINER_TERMINATED;
3261 return status.si_status;
113cea80
DH
3262
3263 case CLD_KILLED:
3264 if (status.si_status == SIGINT) {
113cea80 3265
919699ec 3266 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3267 *container = CONTAINER_TERMINATED;
919699ec
LP
3268 return 0;
3269
113cea80 3270 } else if (status.si_status == SIGHUP) {
113cea80 3271
919699ec 3272 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3273 *container = CONTAINER_REBOOTED;
919699ec 3274 return 0;
113cea80 3275 }
919699ec 3276
113cea80
DH
3277 /* CLD_KILLED fallthrough */
3278
3279 case CLD_DUMPED:
fddbb89c 3280 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3281 return -EIO;
113cea80
DH
3282
3283 default:
fddbb89c 3284 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3285 return -EIO;
113cea80
DH
3286 }
3287
3288 return r;
3289}
3290
e866af3a
DH
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal is delivered (interrupting blocking calls)
 * rather than being ignored. */
static void nop_handler(int sig) {
}
3292
023fb90b
LP
3293static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3294 pid_t pid;
3295
3296 pid = PTR_TO_UINT32(userdata);
3297 if (pid > 0) {
3298 if (kill(pid, SIGRTMIN+3) >= 0) {
3299 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3300 sd_event_source_set_userdata(s, NULL);
3301 return 0;
3302 }
3303 }
3304
3305 sd_event_exit(sd_event_source_get_event(s), 0);
3306 return 0;
3307}
3308
ec16945e 3309static int determine_names(void) {
1b9cebf6 3310 int r;
ec16945e
LP
3311
3312 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3313 if (arg_machine) {
3314 _cleanup_(image_unrefp) Image *i = NULL;
3315
3316 r = image_find(arg_machine, &i);
3317 if (r < 0)
3318 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3319 else if (r == 0) {
3320 log_error("No image for machine '%s': %m", arg_machine);
3321 return -ENOENT;
3322 }
3323
aceac2f0 3324 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3325 r = set_sanitized_path(&arg_image, i->path);
3326 else
3327 r = set_sanitized_path(&arg_directory, i->path);
3328 if (r < 0)
3329 return log_error_errno(r, "Invalid image directory: %m");
3330
3331 arg_read_only = arg_read_only || i->read_only;
3332 } else
ec16945e
LP
3333 arg_directory = get_current_dir_name();
3334
1b9cebf6
LP
3335 if (!arg_directory && !arg_machine) {
3336 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3337 return -EINVAL;
3338 }
3339 }
3340
3341 if (!arg_machine) {
b9ba4dab
LP
3342 if (arg_directory && path_equal(arg_directory, "/"))
3343 arg_machine = gethostname_malloc();
3344 else
3345 arg_machine = strdup(basename(arg_image ?: arg_directory));
3346
ec16945e
LP
3347 if (!arg_machine)
3348 return log_oom();
3349
3350 hostname_cleanup(arg_machine, false);
3351 if (!machine_name_is_valid(arg_machine)) {
3352 log_error("Failed to determine machine name automatically, please use -M.");
3353 return -EINVAL;
3354 }
b9ba4dab
LP
3355
3356 if (arg_ephemeral) {
3357 char *b;
3358
3359 /* Add a random suffix when this is an
3360 * ephemeral machine, so that we can run many
3361 * instances at once without manually having
3362 * to specify -M each time. */
3363
3364 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3365 return log_oom();
3366
3367 free(arg_machine);
3368 arg_machine = b;
3369 }
ec16945e
LP
3370 }
3371
3372 return 0;
3373}
3374
88213476 3375int main(int argc, char *argv[]) {
69c79d3c 3376
611b312b 3377 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
727fd4fd 3378 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 3379 _cleanup_close_ int master = -1, image_fd = -1;
69c79d3c 3380 _cleanup_fdset_free_ FDSet *fds = NULL;
ec16945e 3381 int r, n_fd_passed, loop_nr = -1;
1b9e5b12 3382 char veth_name[IFNAMSIZ];
ec16945e 3383 bool secondary = false, remove_subvol = false;
e866af3a 3384 sigset_t mask, mask_chld;
69c79d3c 3385 pid_t pid = 0;
ec16945e 3386 int ret = EXIT_SUCCESS;
6d0b55c2 3387 union in_addr_union exposed = {};
30535c16 3388 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
88213476
LP
3389
3390 log_parse_environment();
3391 log_open();
3392
ec16945e
LP
3393 r = parse_argv(argc, argv);
3394 if (r <= 0)
88213476 3395 goto finish;
88213476 3396
ec16945e
LP
3397 r = determine_names();
3398 if (r < 0)
3399 goto finish;
7027ff61 3400
88213476
LP
3401 if (geteuid() != 0) {
3402 log_error("Need to be root.");
ec16945e 3403 r = -EPERM;
88213476
LP
3404 goto finish;
3405 }
3406
04d391da
LP
3407 if (sd_booted() <= 0) {
3408 log_error("Not running on a systemd system.");
ec16945e 3409 r = -EINVAL;
04d391da
LP
3410 goto finish;
3411 }
3412
1b9e5b12
LP
3413 log_close();
3414 n_fd_passed = sd_listen_fds(false);
3415 if (n_fd_passed > 0) {
ec16945e
LP
3416 r = fdset_new_listen_fds(&fds, false);
3417 if (r < 0) {
3418 log_error_errno(r, "Failed to collect file descriptors: %m");
1b9e5b12
LP
3419 goto finish;
3420 }
88213476 3421 }
1b9e5b12
LP
3422 fdset_close_others(fds);
3423 log_open();
88213476 3424
1b9e5b12 3425 if (arg_directory) {
ec16945e
LP
3426 assert(!arg_image);
3427
c4e34a61
LP
3428 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3429 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
ec16945e 3430 r = -EINVAL;
6b9132a9
LP
3431 goto finish;
3432 }
1b9e5b12 3433
30535c16
LP
3434 if (arg_ephemeral) {
3435 _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
ec16945e
LP
3436 char *np;
3437
c4e34a61
LP
3438 /* If the specified path is a mount point we
3439 * generate the new snapshot immediately
3440 * inside it under a random name. However if
3441 * the specified is not a mount point we
3442 * create the new snapshot in the parent
3443 * directory, just next to it. */
3444 r = path_is_mount_point(arg_directory, false);
3445 if (r < 0) {
3446 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3447 goto finish;
3448 }
3449 if (r > 0)
3450 r = tempfn_random_child(arg_directory, &np);
3451 else
3452 r = tempfn_random(arg_directory, &np);
ec16945e
LP
3453 if (r < 0) {
3454 log_error_errno(r, "Failed to generate name for snapshot: %m");
3455 goto finish;
3456 }
3457
30535c16
LP
3458 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3459 if (r < 0) {
3460 log_error_errno(r, "Failed to lock %s: %m", np);
3461 goto finish;
3462 }
3463
ec16945e
LP
3464 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3465 if (r < 0) {
3466 free(np);
3467 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3468 goto finish;
3469 }
3470
3471 free(arg_directory);
3472 arg_directory = np;
3473
3474 remove_subvol = true;
30535c16
LP
3475
3476 } else {
3477 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3478 if (r == -EBUSY) {
3479 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3480 goto finish;
3481 }
3482 if (r < 0) {
3483 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3484 return r;
3485 }
3486
3487 if (arg_template) {
3488 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3489 if (r == -EEXIST) {
3490 if (!arg_quiet)
3491 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3492 } else if (r < 0) {
3493 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
3494 goto finish;
3495 } else {
3496 if (!arg_quiet)
3497 log_info("Populated %s from template %s.", arg_directory, arg_template);
3498 }
3499 }
ec16945e
LP
3500 }
3501
1b9e5b12
LP
3502 if (arg_boot) {
3503 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3504 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3505 r = -EINVAL;
1b9e5b12
LP
3506 goto finish;
3507 }
3508 } else {
3509 const char *p;
3510
3511 p = strappenda(arg_directory,
3512 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3513 if (access(p, F_OK) < 0) {
3514 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 3515 r = -EINVAL;
1b9e5b12 3516 goto finish;
1b9e5b12
LP
3517 }
3518 }
ec16945e 3519
6b9132a9 3520 } else {
1b9e5b12 3521 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3522
ec16945e
LP
3523 assert(arg_image);
3524 assert(!arg_template);
3525
30535c16
LP
3526 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3527 if (r == -EBUSY) {
3528 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3529 goto finish;
3530 }
3531 if (r < 0) {
3532 r = log_error_errno(r, "Failed to create image lock: %m");
3533 goto finish;
3534 }
3535
1b9e5b12 3536 if (!mkdtemp(template)) {
56f64d95 3537 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3538 r = -errno;
6b9132a9 3539 goto finish;
1b9e5b12 3540 }
6b9132a9 3541
1b9e5b12
LP
3542 arg_directory = strdup(template);
3543 if (!arg_directory) {
3544 r = log_oom();
3545 goto finish;
6b9132a9 3546 }
88213476 3547
1b9e5b12
LP
3548 image_fd = setup_image(&device_path, &loop_nr);
3549 if (image_fd < 0) {
3550 r = image_fd;
842f3b0f
LP
3551 goto finish;
3552 }
1b9e5b12 3553
4d9f07b4
LP
3554 r = dissect_image(image_fd,
3555 &root_device, &root_device_rw,
3556 &home_device, &home_device_rw,
3557 &srv_device, &srv_device_rw,
3558 &secondary);
1b9e5b12
LP
3559 if (r < 0)
3560 goto finish;
842f3b0f 3561 }
842f3b0f 3562
db7feb7e
LP
3563 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3564 if (master < 0) {
ec16945e 3565 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3566 goto finish;
3567 }
3568
611b312b
LP
3569 r = ptsname_malloc(master, &console);
3570 if (r < 0) {
3571 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3572 goto finish;
3573 }
3574
284c0b91 3575 if (!arg_quiet)
45f1386c 3576 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
ec16945e 3577 arg_machine, arg_image ?: arg_directory);
a258bf26
LP
3578
3579 if (unlockpt(master) < 0) {
ec16945e 3580 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3581 goto finish;
3582 }
3583
a258bf26
LP
3584 assert_se(sigemptyset(&mask) == 0);
3585 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3586 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3587
023fb90b
LP
3588 assert_se(sigemptyset(&mask_chld) == 0);
3589 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3590
d87be9b0 3591 for (;;) {
6d0b55c2 3592 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
113cea80 3593 ContainerStatus container_status;
7566e267 3594 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3595 struct sigaction sa = {
3596 .sa_handler = nop_handler,
3597 .sa_flags = SA_NOCLDSTOP,
3598 };
3599
7566e267 3600 r = barrier_create(&barrier);
a2da110b 3601 if (r < 0) {
da927ba9 3602 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3603 goto finish;
3604 }
3605
6d0b55c2
LP
3606 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3607 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3608 goto finish;
3609 }
3610
3611 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3612 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3613 goto finish;
3614 }
3615
e866af3a
DH
3616 /* Child can be killed before execv(), so handle SIGCHLD
3617 * in order to interrupt parent's blocking calls and
3618 * give it a chance to call wait() and terminate. */
3619 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3620 if (r < 0) {
ec16945e 3621 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3622 goto finish;
3623 }
3624
e866af3a
DH
3625 r = sigaction(SIGCHLD, &sa, NULL);
3626 if (r < 0) {
ec16945e 3627 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3628 goto finish;
3629 }
3630
60e1651a
KW
3631 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3632 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3633 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3634 if (pid < 0) {
3635 if (errno == EINVAL)
ec16945e 3636 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3637 else
ec16945e 3638 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3639
d87be9b0
LP
3640 goto finish;
3641 }
a258bf26 3642
d87be9b0
LP
3643 if (pid == 0) {
3644 /* child */
0cb9fbcd 3645 _cleanup_free_ char *home = NULL;
5674767e 3646 unsigned n_env = 2;
d87be9b0 3647 const char *envp[] = {
e10a55fd 3648 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3649 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3650 NULL, /* TERM */
3651 NULL, /* HOME */
3652 NULL, /* USER */
3653 NULL, /* LOGNAME */
3654 NULL, /* container_uuid */
842f3b0f
LP
3655 NULL, /* LISTEN_FDS */
3656 NULL, /* LISTEN_PID */
d87be9b0
LP
3657 NULL
3658 };
f4889f65 3659 char **env_use;
a258bf26 3660
a2da110b
DH
3661 barrier_set_role(&barrier, BARRIER_CHILD);
3662
5674767e
ZJS
3663 envp[n_env] = strv_find_prefix(environ, "TERM=");
3664 if (envp[n_env])
3665 n_env ++;
a258bf26 3666
03e334a1 3667 master = safe_close(master);
a258bf26 3668
d87be9b0
LP
3669 close_nointr(STDIN_FILENO);
3670 close_nointr(STDOUT_FILENO);
3671 close_nointr(STDERR_FILENO);
db7feb7e 3672
03e334a1 3673 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3674 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
a258bf26 3675
d87be9b0 3676 reset_all_signal_handlers();
1b6d7fa7 3677 reset_signal_mask();
f5c1b9ee 3678
ec16945e
LP
3679 r = open_terminal(console, O_RDWR);
3680 if (r != STDIN_FILENO) {
3681 if (r >= 0) {
3682 safe_close(r);
3683 r = -EINVAL;
842f3b0f
LP
3684 }
3685
ec16945e 3686 log_error_errno(r, "Failed to open console: %m");
a2da110b 3687 _exit(EXIT_FAILURE);
842f3b0f
LP
3688 }
3689
3690 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3691 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
56f64d95 3692 log_error_errno(errno, "Failed to duplicate console: %m");
a2da110b 3693 _exit(EXIT_FAILURE);
842f3b0f 3694 }
bc2f673e 3695
d87be9b0 3696 if (setsid() < 0) {
56f64d95 3697 log_error_errno(errno, "setsid() failed: %m");
a2da110b 3698 _exit(EXIT_FAILURE);
bc2f673e
LP
3699 }
3700
db999e0f 3701 if (reset_audit_loginuid() < 0)
a2da110b 3702 _exit(EXIT_FAILURE);
db999e0f 3703
d87be9b0 3704 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 3705 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 3706 _exit(EXIT_FAILURE);
d87be9b0 3707 }
e58a1277 3708
d87be9b0
LP
3709 /* Mark everything as slave, so that we still
3710 * receive mounts from the real root, but don't
3711 * propagate mounts to the real root. */
3712 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 3713 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 3714 _exit(EXIT_FAILURE);
d87be9b0 3715 }
04bc4a3f 3716
727fd4fd
LP
3717 if (mount_devices(arg_directory,
3718 root_device, root_device_rw,
3719 home_device, home_device_rw,
3720 srv_device, srv_device_rw) < 0)
a2da110b 3721 _exit(EXIT_FAILURE);
1b9e5b12 3722
d87be9b0
LP
3723 /* Turn directory into bind mount */
3724 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 3725 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 3726 _exit(EXIT_FAILURE);
d87be9b0 3727 }
88213476 3728
4d9f07b4
LP
3729 r = setup_volatile(arg_directory);
3730 if (r < 0)
a2da110b 3731 _exit(EXIT_FAILURE);
4d9f07b4
LP
3732
3733 if (setup_volatile_state(arg_directory) < 0)
a2da110b 3734 _exit(EXIT_FAILURE);
4d9f07b4
LP
3735
3736 r = base_filesystem_create(arg_directory);
3737 if (r < 0)
a2da110b 3738 _exit(EXIT_FAILURE);
4d9f07b4 3739
d6797c92 3740 if (arg_read_only) {
ec16945e
LP
3741 r = bind_remount_recursive(arg_directory, true);
3742 if (r < 0) {
3743 log_error_errno(r, "Failed to make tree read-only: %m");
a2da110b 3744 _exit(EXIT_FAILURE);
d87be9b0 3745 }
d6797c92 3746 }
2547bb41 3747
d87be9b0 3748 if (mount_all(arg_directory) < 0)
a2da110b 3749 _exit(EXIT_FAILURE);
57fb9fb5 3750
d87be9b0 3751 if (copy_devnodes(arg_directory) < 0)
a2da110b 3752 _exit(EXIT_FAILURE);
a258bf26 3753
f2d88580 3754 if (setup_ptmx(arg_directory) < 0)
a2da110b 3755 _exit(EXIT_FAILURE);
f2d88580 3756
d87be9b0 3757 dev_setup(arg_directory);
88213476 3758
785890ac
LP
3759 if (setup_propagate(arg_directory) < 0)
3760 _exit(EXIT_FAILURE);
3761
28650077 3762 if (setup_seccomp() < 0)
a2da110b 3763 _exit(EXIT_FAILURE);
24fb1112 3764
d87be9b0 3765 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 3766 _exit(EXIT_FAILURE);
88213476 3767
d87be9b0 3768 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 3769 _exit(EXIT_FAILURE);
03e334a1 3770 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 3771
6d0b55c2
LP
3772 if (send_rtnl(rtnl_socket_pair[1]) < 0)
3773 _exit(EXIT_FAILURE);
3774 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3775
b12afc8c
LP
3776 /* Tell the parent that we are ready, and that
3777 * it can cgroupify us to that we lack access
3778 * to certain devices and resources. */
3779 (void) barrier_place(&barrier);
3780
d87be9b0 3781 if (setup_boot_id(arg_directory) < 0)
a2da110b 3782 _exit(EXIT_FAILURE);
a41fe3a2 3783
d87be9b0 3784 if (setup_timezone(arg_directory) < 0)
a2da110b 3785 _exit(EXIT_FAILURE);
88213476 3786
d87be9b0 3787 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 3788 _exit(EXIT_FAILURE);
687d0825 3789
d87be9b0 3790 if (setup_journal(arg_directory) < 0)
a2da110b 3791 _exit(EXIT_FAILURE);
687d0825 3792
d6797c92 3793 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 3794 _exit(EXIT_FAILURE);
17fe0523 3795
d6797c92 3796 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 3797 _exit(EXIT_FAILURE);
17fe0523 3798
06c17c39 3799 if (mount_tmpfs(arg_directory) < 0)
a2da110b 3800 _exit(EXIT_FAILURE);
06c17c39 3801
b12afc8c
LP
3802 /* Wait until we are cgroup-ified, so that we
3803 * can mount the right cgroup path writable */
3804 (void) barrier_sync_next(&barrier);
3805
3806 if (mount_cgroup(arg_directory) < 0)
3807 _exit(EXIT_FAILURE);
d96c1ecf 3808
d87be9b0 3809 if (chdir(arg_directory) < 0) {
56f64d95 3810 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 3811 _exit(EXIT_FAILURE);
687d0825
MV
3812 }
3813
d87be9b0 3814 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 3815 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 3816 _exit(EXIT_FAILURE);
687d0825
MV
3817 }
3818
d87be9b0 3819 if (chroot(".") < 0) {
56f64d95 3820 log_error_errno(errno, "chroot() failed: %m");
a2da110b 3821 _exit(EXIT_FAILURE);
687d0825
MV
3822 }
3823
d87be9b0 3824 if (chdir("/") < 0) {
56f64d95 3825 log_error_errno(errno, "chdir() failed: %m");
a2da110b 3826 _exit(EXIT_FAILURE);
687d0825
MV
3827 }
3828
d87be9b0
LP
3829 umask(0022);
3830
eb91eb18
LP
3831 if (arg_private_network)
3832 loopback_setup();
d87be9b0
LP
3833
3834 if (drop_capabilities() < 0) {
56f64d95 3835 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 3836 _exit(EXIT_FAILURE);
687d0825 3837 }
687d0825 3838
0cb9fbcd
LP
3839 r = change_uid_gid(&home);
3840 if (r < 0)
a2da110b 3841 _exit(EXIT_FAILURE);
d87be9b0 3842
842f3b0f
LP
3843 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3844 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3845 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 3846 log_oom();
a2da110b 3847 _exit(EXIT_FAILURE);
144f0fc0 3848 }
687d0825 3849
9444b1f2 3850 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
3851 char as_uuid[37];
3852
3853 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 3854 log_oom();
a2da110b 3855 _exit(EXIT_FAILURE);
842f3b0f
LP
3856 }
3857 }
3858
3859 if (fdset_size(fds) > 0) {
ec16945e
LP
3860 r = fdset_cloexec(fds, false);
3861 if (r < 0) {
3862 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 3863 _exit(EXIT_FAILURE);
842f3b0f
LP
3864 }
3865
3866 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 3867 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 3868 log_oom();
a2da110b 3869 _exit(EXIT_FAILURE);
d87be9b0
LP
3870 }
3871 }
3872
3873 setup_hostname();
3874
6afc95b7
LP
3875 if (arg_personality != 0xffffffffLU) {
3876 if (personality(arg_personality) < 0) {
56f64d95 3877 log_error_errno(errno, "personality() failed: %m");
a2da110b 3878 _exit(EXIT_FAILURE);
6afc95b7 3879 }
1b9e5b12
LP
3880 } else if (secondary) {
3881 if (personality(PER_LINUX32) < 0) {
56f64d95 3882 log_error_errno(errno, "personality() failed: %m");
a2da110b 3883 _exit(EXIT_FAILURE);
1b9e5b12 3884 }
6afc95b7
LP
3885 }
3886
d96c1ecf
LP
3887#ifdef HAVE_SELINUX
3888 if (arg_selinux_context)
0cb9fbcd 3889 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
56f64d95 3890 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
a2da110b 3891 _exit(EXIT_FAILURE);
0cb9fbcd 3892 }
d96c1ecf 3893#endif
354bfd2b 3894
f4889f65
LP
3895 if (!strv_isempty(arg_setenv)) {
3896 char **n;
3897
3898 n = strv_env_merge(2, envp, arg_setenv);
3899 if (!n) {
3900 log_oom();
a2da110b 3901 _exit(EXIT_FAILURE);
f4889f65
LP
3902 }
3903
3904 env_use = n;
3905 } else
3906 env_use = (char**) envp;
3907
d96c1ecf 3908 /* Wait until the parent is ready with the setup, too... */
a2da110b
DH
3909 if (!barrier_place_and_sync(&barrier))
3910 _exit(EXIT_FAILURE);
d96c1ecf 3911
d87be9b0
LP
3912 if (arg_boot) {
3913 char **a;
3914 size_t l;
88213476 3915
d87be9b0 3916 /* Automatically search for the init system */
0f0dbc46 3917
d87be9b0
LP
3918 l = 1 + argc - optind;
3919 a = newa(char*, l + 1);
3920 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 3921
d87be9b0 3922 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 3923 execve(a[0], a, env_use);
0f0dbc46 3924
d87be9b0 3925 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 3926 execve(a[0], a, env_use);
0f0dbc46 3927
d87be9b0 3928 a[0] = (char*) "/sbin/init";
f4889f65 3929 execve(a[0], a, env_use);
d87be9b0 3930 } else if (argc > optind)
f4889f65 3931 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
3932 else {
3933 chdir(home ? home : "/root");
f4889f65 3934 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 3935 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
3936 }
3937
56f64d95 3938 log_error_errno(errno, "execv() failed: %m");
d87be9b0 3939 _exit(EXIT_FAILURE);
da5b3bad 3940 }
88213476 3941
a2da110b 3942 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
3943 fdset_free(fds);
3944 fds = NULL;
3945
6d0b55c2
LP
3946 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3947 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3948
b12afc8c
LP
3949 /* Wait for the most basic Child-setup to be done,
3950 * before we add hardware to it, and place it in a
3951 * cgroup. */
3952 if (barrier_sync_next(&barrier)) {
5aa4bb6b 3953 int ifi = 0;
354bfd2b 3954
840295fc
LP
3955 r = move_network_interfaces(pid);
3956 if (r < 0)
3957 goto finish;
aa28aefe 3958
5aa4bb6b 3959 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
3960 if (r < 0)
3961 goto finish;
ab046dde 3962
5aa4bb6b 3963 r = setup_bridge(veth_name, &ifi);
840295fc
LP
3964 if (r < 0)
3965 goto finish;
ab046dde 3966
840295fc
LP
3967 r = setup_macvlan(pid);
3968 if (r < 0)
3969 goto finish;
c74e630d 3970
5aa4bb6b
LP
3971 r = register_machine(pid, ifi);
3972 if (r < 0)
3973 goto finish;
3974
840295fc
LP
3975 /* Block SIGCHLD here, before notifying child.
3976 * process_pty() will handle it with the other signals. */
3977 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3978 if (r < 0)
3979 goto finish;
e866af3a 3980
840295fc
LP
3981 /* Reset signal to default */
3982 r = default_signals(SIGCHLD, -1);
3983 if (r < 0)
3984 goto finish;
e866af3a 3985
840295fc
LP
3986 /* Notify the child that the parent is ready with all
3987 * its setup, and that the child can now hand over
3988 * control to the code to run inside the container. */
814a3fdf
LP
3989 (void) barrier_place(&barrier);
3990
b12afc8c 3991 /* And wait that the child is completely ready now. */
6d0b55c2
LP
3992 if (barrier_place_and_sync(&barrier)) {
3993 _cleanup_event_unref_ sd_event *event = NULL;
3994 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3995 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
3996 char last_char = 0;
b12afc8c 3997
733d15ac
LP
3998 sd_notifyf(false,
3999 "READY=1\n"
4000 "STATUS=Container running.\n"
4001 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4002
6d0b55c2
LP
4003 r = sd_event_new(&event);
4004 if (r < 0) {
4005 log_error_errno(r, "Failed to get default event source: %m");
4006 goto finish;
4007 }
88213476 4008
6d0b55c2
LP
4009 if (arg_boot) {
4010 /* Try to kill the init system on SIGINT or SIGTERM */
4011 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4012 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4013 } else {
4014 /* Immediately exit */
4015 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4016 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4017 }
023fb90b 4018
6d0b55c2
LP
4019 /* simply exit on sigchld */
4020 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4021
6d0b55c2
LP
4022 if (arg_expose_ports) {
4023 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4024 if (r < 0)
4025 goto finish;
023fb90b 4026
6d0b55c2
LP
4027 (void) expose_ports(rtnl, &exposed);
4028 }
023fb90b 4029
6d0b55c2 4030 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4031
6d0b55c2
LP
4032 r = pty_forward_new(event, master, true, &forward);
4033 if (r < 0) {
4034 log_error_errno(r, "Failed to create PTY forwarder: %m");
4035 goto finish;
4036 }
023fb90b 4037
6d0b55c2
LP
4038 r = sd_event_loop(event);
4039 if (r < 0) {
4040 log_error_errno(r, "Failed to run event loop: %m");
4041 goto finish;
4042 }
4043
4044 pty_forward_get_last_char(forward, &last_char);
4045
4046 forward = pty_forward_free(forward);
4047
4048 if (!arg_quiet && last_char != '\n')
4049 putc('\n', stdout);
04d39279 4050
6d0b55c2
LP
4051 /* Kill if it is not dead yet anyway */
4052 terminate_machine(pid);
4053 }
840295fc 4054 }
1f0cd86b 4055
840295fc 4056 /* Normally redundant, but better safe than sorry */
04d39279 4057 kill(pid, SIGKILL);
a258bf26 4058
113cea80 4059 r = wait_for_container(pid, &container_status);
04d39279
LP
4060 pid = 0;
4061
ec16945e 4062 if (r < 0)
ce9f1527
LP
4063 /* We failed to wait for the container, or the
4064 * container exited abnormally */
ec16945e
LP
4065 goto finish;
4066 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4067 /* The container exited with a non-zero
4068 * status, or with zero status and no reboot
4069 * was requested. */
ec16945e 4070 ret = r;
d87be9b0 4071 break;
ec16945e 4072 }
88213476 4073
113cea80 4074 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4075
4076 if (arg_keep_unit) {
4077 /* Special handling if we are running as a
4078 * service: instead of simply restarting the
4079 * machine we want to restart the entire
4080 * service, so let's inform systemd about this
4081 * with the special exit code 133. The service
4082 * file uses RestartForceExitStatus=133 so
4083 * that this results in a full nspawn
4084 * restart. This is necessary since we might
4085 * have cgroup parameters set we want to have
4086 * flushed out. */
ec16945e
LP
4087 ret = 133;
4088 r = 0;
ce38dbc8
LP
4089 break;
4090 }
6d0b55c2
LP
4091
4092 flush_ports(&exposed);
d87be9b0 4093 }
88213476
LP
4094
4095finish:
af4ec430
LP
4096 sd_notify(false,
4097 "STOPPING=1\n"
4098 "STATUS=Terminating...");
4099
1b9e5b12
LP
4100 loop_remove(loop_nr, &image_fd);
4101
9444b1f2
LP
4102 if (pid > 0)
4103 kill(pid, SIGKILL);
88213476 4104
ec16945e
LP
4105 if (remove_subvol && arg_directory) {
4106 int k;
4107
4108 k = btrfs_subvol_remove(arg_directory);
4109 if (k < 0)
4110 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4111 }
4112
785890ac
LP
4113 if (arg_machine) {
4114 const char *p;
4115
8937422f 4116 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
4117 (void) rm_rf(p, false, true, false);
4118 }
4119
04d391da 4120 free(arg_directory);
ec16945e
LP
4121 free(arg_template);
4122 free(arg_image);
7027ff61 4123 free(arg_machine);
c74e630d
LP
4124 free(arg_user);
4125 strv_free(arg_setenv);
4126 strv_free(arg_network_interfaces);
4127 strv_free(arg_network_macvlan);
4128 strv_free(arg_bind);
4129 strv_free(arg_bind_ro);
06c17c39 4130 strv_free(arg_tmpfs);
88213476 4131
6d0b55c2
LP
4132 flush_ports(&exposed);
4133
4134 while (arg_expose_ports) {
4135 ExposePort *p = arg_expose_ports;
4136 LIST_REMOVE(ports, arg_expose_ports, p);
4137 free(p);
4138 }
4139
ec16945e 4140 return r < 0 ? EXIT_FAILURE : ret;
88213476 4141}