]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: remove spurious include of <sys/capability.h>
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
88213476 34#include <getopt.h>
a258bf26
LP
35#include <termios.h>
36#include <sys/signalfd.h>
687d0825 37#include <grp.h>
5ed27dbd 38#include <linux/fs.h>
9537eab0
LP
39#include <sys/un.h>
40#include <sys/socket.h>
aea38d80 41#include <linux/netlink.h>
aa28aefe 42#include <net/if.h>
69c79d3c 43#include <linux/veth.h>
6afc95b7 44#include <sys/personality.h>
1b9e5b12 45#include <linux/loop.h>
aa28aefe 46
5d63309c 47#ifdef HAVE_SELINUX
a8828ed9
DW
48#include <selinux/selinux.h>
49#endif
88213476 50
24fb1112
LP
51#ifdef HAVE_SECCOMP
52#include <seccomp.h>
53#endif
54
1b9e5b12
LP
55#ifdef HAVE_BLKID
56#include <blkid/blkid.h>
57#endif
58
1f0cd86b
LP
59#include "sd-daemon.h"
60#include "sd-bus.h"
61#include "sd-id128.h"
aa28aefe 62#include "sd-rtnl.h"
88213476
LP
63#include "log.h"
64#include "util.h"
49e942b2 65#include "mkdir.h"
6b2d0e85 66#include "macro.h"
d7832d2c 67#include "audit.h"
94d82985 68#include "missing.h"
04d391da 69#include "cgroup-util.h"
a258bf26 70#include "strv.h"
9eb977db 71#include "path-util.h"
a41fe3a2 72#include "loopback-setup.h"
4fc9982c 73#include "dev-setup.h"
842f3b0f 74#include "fdset.h"
acbeb427 75#include "build.h"
a5c32cff 76#include "fileio.h"
40ca29a1 77#include "bus-util.h"
1f0cd86b 78#include "bus-error.h"
4ba93280 79#include "ptyfwd.h"
9bd37b40 80#include "bus-kernel.h"
f4889f65 81#include "env-util.h"
7f112f50 82#include "def.h"
aa28aefe 83#include "rtnl-util.h"
7e227024 84#include "udev-util.h"
1b9e5b12
LP
85#include "blkid-util.h"
86#include "gpt.h"
01dde061 87#include "siphash24.h"
849958d1 88#include "copy.h"
3577de7a 89#include "base-filesystem.h"
a2da110b 90#include "barrier.h"
023fb90b 91#include "event-util.h"
f01ae826 92#include "capability.h"
2822da4f 93#include "cap-list.h"
ec16945e 94#include "btrfs-util.h"
f2d88580 95
e9642be2
LP
96#ifdef HAVE_SECCOMP
97#include "seccomp-util.h"
98#endif
99
113cea80
DH
/* Two possible container outcomes: terminated or rebooted.
 * NOTE(review): the code that produces/consumes these values is not in this part of the file — confirm usage. */
100typedef enum ContainerStatus {
101 CONTAINER_TERMINATED,
102 CONTAINER_REBOOTED
103} ContainerStatus;
104
57fb9fb5
LP
/* Journal linking mode selected with --link-journal= (or -j). parse_argv() maps the strings
 * "no", "auto", "host" and "guest" (plus the "try-" variants of the latter two) onto these values. */
105typedef enum LinkJournal {
106 LINK_NO,
107 LINK_AUTO,
108 LINK_HOST,
109 LINK_GUEST
110} LinkJournal;
88213476 111
4d9f07b4
LP
/* Mode selected with --volatile[=MODE]: a missing or true argument yields VOLATILE_YES, a false
 * argument VOLATILE_NO, and the literal "state" VOLATILE_STATE (see parse_argv()). */
112typedef enum Volatile {
113 VOLATILE_NO,
114 VOLATILE_YES,
115 VOLATILE_STATE,
116} Volatile;
117
/* Command line state. Each variable below is assigned by parse_argv(); the initializers are the
 * defaults that apply when the corresponding option is not given. */
88213476 118static char *arg_directory = NULL;
ec16945e 119static char *arg_template = NULL;
687d0825 120static char *arg_user = NULL;
9444b1f2 121static sd_id128_t arg_uuid = {};
7027ff61 122static char *arg_machine = NULL;
c74e630d
LP
/* These const pointers are set to optarg directly by parse_argv(), i.e. they are not copied. */
123static const char *arg_selinux_context = NULL;
124static const char *arg_selinux_apifs_context = NULL;
9444b1f2 125static const char *arg_slice = NULL;
ff01d048 126static bool arg_private_network = false;
bc2f673e 127static bool arg_read_only = false;
0f0dbc46 128static bool arg_boot = false;
ec16945e 129static bool arg_ephemeral = false;
57fb9fb5 130static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 131static bool arg_link_journal_try = false;
5076f0cc
LP
/* Default capability bit mask, one bit per CAP_* number. At the end of parse_argv() the bits
 * named with --capability= are added (plus CAP_NET_ADMIN if a private network was requested) and
 * the bits named with --drop-capability= are cleared. */
132static uint64_t arg_retain =
133 (1ULL << CAP_CHOWN) |
134 (1ULL << CAP_DAC_OVERRIDE) |
135 (1ULL << CAP_DAC_READ_SEARCH) |
136 (1ULL << CAP_FOWNER) |
137 (1ULL << CAP_FSETID) |
138 (1ULL << CAP_IPC_OWNER) |
139 (1ULL << CAP_KILL) |
140 (1ULL << CAP_LEASE) |
141 (1ULL << CAP_LINUX_IMMUTABLE) |
142 (1ULL << CAP_NET_BIND_SERVICE) |
143 (1ULL << CAP_NET_BROADCAST) |
144 (1ULL << CAP_NET_RAW) |
145 (1ULL << CAP_SETGID) |
146 (1ULL << CAP_SETFCAP) |
147 (1ULL << CAP_SETPCAP) |
148 (1ULL << CAP_SETUID) |
149 (1ULL << CAP_SYS_ADMIN) |
150 (1ULL << CAP_SYS_CHROOT) |
151 (1ULL << CAP_SYS_NICE) |
152 (1ULL << CAP_SYS_PTRACE) |
153 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 154 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
155 (1ULL << CAP_SYS_BOOT) |
156 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
157 (1ULL << CAP_AUDIT_CONTROL) |
158 (1ULL << CAP_MKNOD);
17fe0523
LP
/* arg_bind, arg_bind_ro and arg_tmpfs are string lists that parse_argv() extends two entries at
 * a time: (source, destination) for the bind lists, (path, mount options) for arg_tmpfs. */
159static char **arg_bind = NULL;
160static char **arg_bind_ro = NULL;
06c17c39 161static char **arg_tmpfs = NULL;
f4889f65 162static char **arg_setenv = NULL;
284c0b91 163static bool arg_quiet = false;
8a96d94e 164static bool arg_share_system = false;
eb91eb18 165static bool arg_register = true;
89f7c846 166static bool arg_keep_unit = false;
aa28aefe 167static char **arg_network_interfaces = NULL;
c74e630d 168static char **arg_network_macvlan = NULL;
69c79d3c 169static bool arg_network_veth = false;
c74e630d 170static const char *arg_network_bridge = NULL;
/* parse_argv() rejects --personality= when personality_from_string() returns this same value. */
6afc95b7 171static unsigned long arg_personality = 0xffffffffLU;
ec16945e 172static char *arg_image = NULL;
4d9f07b4 173static Volatile arg_volatile = VOLATILE_NO;
88213476 174
601185b4 175static void help(void) {
88213476
LP
176 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
177 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
178 " -h --help Show this help\n"
179 " --version Print version string\n"
69c79d3c 180 " -q --quiet Do not show status information\n"
1b9e5b12 181 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
182 " --template=PATH Initialize root directory from template directory,\n"
183 " if missing\n"
184 " -x --ephemeral Run container with snapshot of root directory, and\n"
185 " remove it after exit\n"
186 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
187 " -b --boot Boot up full system (i.e. invoke init)\n"
188 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 189 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 190 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 191 " -S --slice=SLICE Place the container in the specified slice\n"
69c79d3c
LP
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
195 " container\n"
c74e630d
LP
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
32457153 199 " --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 200 " and container\n"
ab046dde 201 " --network-bridge=INTERFACE\n"
32457153 202 " Add a virtual ethernet connection between host\n"
ab046dde
TG
203 " and container and add it to an existing bridge on\n"
204 " the host\n"
82adf6af
LP
205 " -Z --selinux-context=SECLABEL\n"
206 " Set the SELinux security context to be used by\n"
207 " processes in the container\n"
208 " -L --selinux-apifs-context=SECLABEL\n"
209 " Set the SELinux security context to be used by\n"
210 " API/tmpfs file systems in the container\n"
a8828ed9
DW
211 " --capability=CAP In addition to the default, retain specified\n"
212 " capability\n"
213 " --drop-capability=CAP Drop the specified capability from the default set\n"
574edc90
MP
214 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
215 " try-guest, try-host\n"
216 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 217 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
218 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
219 " the container\n"
220 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 221 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
284c0b91 222 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 223 " --share-system Share system namespaces with host\n"
eb91eb18 224 " --register=BOOLEAN Register container as machine\n"
89f7c846 225 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4
LP
226 " the service unit nspawn is running in\n"
227 " --volatile[=MODE] Run the system in volatile mode\n",
88213476 228 program_invocation_short_name);
88213476
LP
229}
230
ec16945e
LP
/* Replaces *b with a cleaned-up, newly allocated version of path.
 *
 * The path is canonicalized if possible. If that fails with ENOENT it is merely made absolute
 * relative to the current working directory. The previous *b is freed.
 *
 * Returns 0 on success, -ENOMEM if the absolute path cannot be allocated, or the negated errno
 * of a canonicalization failure other than ENOENT. On failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (resolved == NULL) {
                if (errno != ENOENT)
                        return -errno;

                /* Not there (yet): fall back to a purely lexical absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
251
88213476
LP
252static int parse_argv(int argc, char *argv[]) {
253
a41fe3a2 254 enum {
acbeb427
ZJS
255 ARG_VERSION = 0x100,
256 ARG_PRIVATE_NETWORK,
bc2f673e 257 ARG_UUID,
5076f0cc 258 ARG_READ_ONLY,
57fb9fb5 259 ARG_CAPABILITY,
420c7379 260 ARG_DROP_CAPABILITY,
17fe0523
LP
261 ARG_LINK_JOURNAL,
262 ARG_BIND,
f4889f65 263 ARG_BIND_RO,
06c17c39 264 ARG_TMPFS,
f4889f65 265 ARG_SETENV,
eb91eb18 266 ARG_SHARE_SYSTEM,
89f7c846 267 ARG_REGISTER,
aa28aefe 268 ARG_KEEP_UNIT,
69c79d3c 269 ARG_NETWORK_INTERFACE,
c74e630d 270 ARG_NETWORK_MACVLAN,
69c79d3c 271 ARG_NETWORK_VETH,
ab046dde 272 ARG_NETWORK_BRIDGE,
6afc95b7 273 ARG_PERSONALITY,
4d9f07b4 274 ARG_VOLATILE,
ec16945e 275 ARG_TEMPLATE,
a41fe3a2
LP
276 };
277
88213476 278 static const struct option options[] = {
aa28aefe
LP
279 { "help", no_argument, NULL, 'h' },
280 { "version", no_argument, NULL, ARG_VERSION },
281 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
282 { "template", required_argument, NULL, ARG_TEMPLATE },
283 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
284 { "user", required_argument, NULL, 'u' },
285 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
286 { "boot", no_argument, NULL, 'b' },
287 { "uuid", required_argument, NULL, ARG_UUID },
288 { "read-only", no_argument, NULL, ARG_READ_ONLY },
289 { "capability", required_argument, NULL, ARG_CAPABILITY },
290 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
291 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
292 { "bind", required_argument, NULL, ARG_BIND },
293 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 294 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
295 { "machine", required_argument, NULL, 'M' },
296 { "slice", required_argument, NULL, 'S' },
297 { "setenv", required_argument, NULL, ARG_SETENV },
298 { "selinux-context", required_argument, NULL, 'Z' },
299 { "selinux-apifs-context", required_argument, NULL, 'L' },
300 { "quiet", no_argument, NULL, 'q' },
301 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
302 { "register", required_argument, NULL, ARG_REGISTER },
303 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
304 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 305 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
ab046dde
TG
306 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
307 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 308 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 309 { "image", required_argument, NULL, 'i' },
4d9f07b4 310 { "volatile", optional_argument, NULL, ARG_VOLATILE },
eb9da376 311 {}
88213476
LP
312 };
313
9444b1f2 314 int c, r;
a42c8b54 315 uint64_t plus = 0, minus = 0;
88213476
LP
316
317 assert(argc >= 0);
318 assert(argv);
319
ec16945e 320 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:x", options, NULL)) >= 0)
88213476
LP
321
322 switch (c) {
323
324 case 'h':
601185b4
ZJS
325 help();
326 return 0;
88213476 327
acbeb427
ZJS
328 case ARG_VERSION:
329 puts(PACKAGE_STRING);
330 puts(SYSTEMD_FEATURES);
331 return 0;
332
88213476 333 case 'D':
ec16945e
LP
334 r = set_sanitized_path(&arg_directory, optarg);
335 if (r < 0)
336 return log_error_errno(r, "Invalid root directory: %m");
337
338 break;
339
340 case ARG_TEMPLATE:
341 r = set_sanitized_path(&arg_template, optarg);
342 if (r < 0)
343 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
344
345 break;
346
1b9e5b12 347 case 'i':
ec16945e
LP
348 r = set_sanitized_path(&arg_image, optarg);
349 if (r < 0)
350 return log_error_errno(r, "Invalid image path: %m");
351
352 break;
353
354 case 'x':
355 arg_ephemeral = true;
1b9e5b12
LP
356 break;
357
687d0825
MV
358 case 'u':
359 free(arg_user);
7027ff61
LP
360 arg_user = strdup(optarg);
361 if (!arg_user)
362 return log_oom();
687d0825
MV
363
364 break;
365
ab046dde 366 case ARG_NETWORK_BRIDGE:
c74e630d 367 arg_network_bridge = optarg;
ab046dde
TG
368
369 /* fall through */
370
69c79d3c
LP
371 case ARG_NETWORK_VETH:
372 arg_network_veth = true;
373 arg_private_network = true;
374 break;
375
aa28aefe 376 case ARG_NETWORK_INTERFACE:
c74e630d
LP
377 if (strv_extend(&arg_network_interfaces, optarg) < 0)
378 return log_oom();
379
380 arg_private_network = true;
381 break;
382
383 case ARG_NETWORK_MACVLAN:
384 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
385 return log_oom();
386
387 /* fall through */
388
ff01d048
LP
389 case ARG_PRIVATE_NETWORK:
390 arg_private_network = true;
a41fe3a2
LP
391 break;
392
0f0dbc46
LP
393 case 'b':
394 arg_boot = true;
395 break;
396
144f0fc0 397 case ARG_UUID:
9444b1f2
LP
398 r = sd_id128_from_string(optarg, &arg_uuid);
399 if (r < 0) {
aa96c6cb 400 log_error("Invalid UUID: %s", optarg);
9444b1f2 401 return r;
aa96c6cb 402 }
9444b1f2 403 break;
aa96c6cb 404
9444b1f2 405 case 'S':
c74e630d 406 arg_slice = optarg;
144f0fc0
LP
407 break;
408
7027ff61 409 case 'M':
eb91eb18
LP
410 if (isempty(optarg)) {
411 free(arg_machine);
412 arg_machine = NULL;
413 } else {
0c3c4284 414 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
415 log_error("Invalid machine name: %s", optarg);
416 return -EINVAL;
417 }
7027ff61 418
0c3c4284
LP
419 r = free_and_strdup(&arg_machine, optarg);
420 if (r < 0)
eb91eb18
LP
421 return log_oom();
422
423 break;
424 }
7027ff61 425
82adf6af
LP
426 case 'Z':
427 arg_selinux_context = optarg;
a8828ed9
DW
428 break;
429
82adf6af
LP
430 case 'L':
431 arg_selinux_apifs_context = optarg;
a8828ed9
DW
432 break;
433
bc2f673e
LP
434 case ARG_READ_ONLY:
435 arg_read_only = true;
436 break;
437
420c7379
LP
438 case ARG_CAPABILITY:
439 case ARG_DROP_CAPABILITY: {
a2a5291b 440 const char *state, *word;
5076f0cc
LP
441 size_t length;
442
443 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 444 _cleanup_free_ char *t;
5076f0cc
LP
445
446 t = strndup(word, length);
0d0f0c50
SL
447 if (!t)
448 return log_oom();
5076f0cc 449
39ed67d1
LP
450 if (streq(t, "all")) {
451 if (c == ARG_CAPABILITY)
a42c8b54 452 plus = (uint64_t) -1;
39ed67d1 453 else
a42c8b54 454 minus = (uint64_t) -1;
39ed67d1 455 } else {
2822da4f
LP
456 int cap;
457
458 cap = capability_from_name(t);
459 if (cap < 0) {
39ed67d1
LP
460 log_error("Failed to parse capability %s.", t);
461 return -EINVAL;
462 }
463
464 if (c == ARG_CAPABILITY)
a42c8b54 465 plus |= 1ULL << (uint64_t) cap;
39ed67d1 466 else
a42c8b54 467 minus |= 1ULL << (uint64_t) cap;
5076f0cc 468 }
5076f0cc
LP
469 }
470
471 break;
472 }
473
57fb9fb5
LP
474 case 'j':
475 arg_link_journal = LINK_GUEST;
574edc90 476 arg_link_journal_try = true;
57fb9fb5
LP
477 break;
478
479 case ARG_LINK_JOURNAL:
53e438e3 480 if (streq(optarg, "auto")) {
57fb9fb5 481 arg_link_journal = LINK_AUTO;
53e438e3
LP
482 arg_link_journal_try = false;
483 } else if (streq(optarg, "no")) {
57fb9fb5 484 arg_link_journal = LINK_NO;
53e438e3
LP
485 arg_link_journal_try = false;
486 } else if (streq(optarg, "guest")) {
57fb9fb5 487 arg_link_journal = LINK_GUEST;
53e438e3
LP
488 arg_link_journal_try = false;
489 } else if (streq(optarg, "host")) {
57fb9fb5 490 arg_link_journal = LINK_HOST;
53e438e3
LP
491 arg_link_journal_try = false;
492 } else if (streq(optarg, "try-guest")) {
574edc90
MP
493 arg_link_journal = LINK_GUEST;
494 arg_link_journal_try = true;
495 } else if (streq(optarg, "try-host")) {
496 arg_link_journal = LINK_HOST;
497 arg_link_journal_try = true;
498 } else {
57fb9fb5
LP
499 log_error("Failed to parse link journal mode %s", optarg);
500 return -EINVAL;
501 }
502
503 break;
504
17fe0523
LP
505 case ARG_BIND:
506 case ARG_BIND_RO: {
507 _cleanup_free_ char *a = NULL, *b = NULL;
508 char *e;
509 char ***x;
17fe0523
LP
510
511 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
512
513 e = strchr(optarg, ':');
514 if (e) {
515 a = strndup(optarg, e - optarg);
516 b = strdup(e + 1);
517 } else {
518 a = strdup(optarg);
519 b = strdup(optarg);
520 }
521
522 if (!a || !b)
523 return log_oom();
524
525 if (!path_is_absolute(a) || !path_is_absolute(b)) {
526 log_error("Invalid bind mount specification: %s", optarg);
527 return -EINVAL;
528 }
529
530 r = strv_extend(x, a);
531 if (r < 0)
b3451bed 532 return log_oom();
17fe0523
LP
533
534 r = strv_extend(x, b);
535 if (r < 0)
b3451bed 536 return log_oom();
17fe0523
LP
537
538 break;
539 }
540
06c17c39
LP
541 case ARG_TMPFS: {
542 _cleanup_free_ char *a = NULL, *b = NULL;
543 char *e;
544
545 e = strchr(optarg, ':');
546 if (e) {
547 a = strndup(optarg, e - optarg);
548 b = strdup(e + 1);
549 } else {
550 a = strdup(optarg);
551 b = strdup("mode=0755");
552 }
553
554 if (!a || !b)
555 return log_oom();
556
557 if (!path_is_absolute(a)) {
558 log_error("Invalid tmpfs specification: %s", optarg);
559 return -EINVAL;
560 }
561
562 r = strv_push(&arg_tmpfs, a);
563 if (r < 0)
564 return log_oom();
565
566 a = NULL;
567
568 r = strv_push(&arg_tmpfs, b);
569 if (r < 0)
570 return log_oom();
571
572 b = NULL;
573
574 break;
575 }
576
f4889f65
LP
577 case ARG_SETENV: {
578 char **n;
579
580 if (!env_assignment_is_valid(optarg)) {
581 log_error("Environment variable assignment '%s' is not valid.", optarg);
582 return -EINVAL;
583 }
584
585 n = strv_env_set(arg_setenv, optarg);
586 if (!n)
587 return log_oom();
588
589 strv_free(arg_setenv);
590 arg_setenv = n;
591 break;
592 }
593
284c0b91
LP
594 case 'q':
595 arg_quiet = true;
596 break;
597
8a96d94e
LP
598 case ARG_SHARE_SYSTEM:
599 arg_share_system = true;
600 break;
601
eb91eb18
LP
602 case ARG_REGISTER:
603 r = parse_boolean(optarg);
604 if (r < 0) {
605 log_error("Failed to parse --register= argument: %s", optarg);
606 return r;
607 }
608
609 arg_register = r;
610 break;
611
89f7c846
LP
612 case ARG_KEEP_UNIT:
613 arg_keep_unit = true;
614 break;
615
6afc95b7
LP
616 case ARG_PERSONALITY:
617
ac45f971 618 arg_personality = personality_from_string(optarg);
6afc95b7
LP
619 if (arg_personality == 0xffffffffLU) {
620 log_error("Unknown or unsupported personality '%s'.", optarg);
621 return -EINVAL;
622 }
623
624 break;
625
4d9f07b4
LP
626 case ARG_VOLATILE:
627
628 if (!optarg)
629 arg_volatile = VOLATILE_YES;
630 else {
631 r = parse_boolean(optarg);
632 if (r < 0) {
633 if (streq(optarg, "state"))
634 arg_volatile = VOLATILE_STATE;
635 else {
636 log_error("Failed to parse --volatile= argument: %s", optarg);
637 return r;
638 }
639 } else
640 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
641 }
642
643 break;
644
88213476
LP
645 case '?':
646 return -EINVAL;
647
648 default:
eb9da376 649 assert_not_reached("Unhandled option");
88213476 650 }
88213476 651
eb91eb18
LP
652 if (arg_share_system)
653 arg_register = false;
654
655 if (arg_boot && arg_share_system) {
656 log_error("--boot and --share-system may not be combined.");
657 return -EINVAL;
658 }
659
89f7c846
LP
660 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
661 log_error("--keep-unit may not be used when invoked from a user session.");
662 return -EINVAL;
663 }
664
1b9e5b12
LP
665 if (arg_directory && arg_image) {
666 log_error("--directory= and --image= may not be combined.");
667 return -EINVAL;
668 }
669
ec16945e
LP
670 if (arg_template && arg_image) {
671 log_error("--template= and --image= may not be combined.");
672 return -EINVAL;
673 }
674
675 if (arg_template && !(arg_directory || arg_machine)) {
676 log_error("--template= needs --directory= or --machine=.");
677 return -EINVAL;
678 }
679
680 if (arg_ephemeral && arg_template) {
681 log_error("--ephemeral and --template= may not be combined.");
682 return -EINVAL;
683 }
684
685 if (arg_ephemeral && arg_image) {
686 log_error("--ephemeral and --image= may not be combined.");
687 return -EINVAL;
688 }
689
df9a75e4
LP
690 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
691 log_error("--ephemeral and --link-journal= may not be combined.");
692 return -EINVAL;
693 }
694
4d9f07b4
LP
695 if (arg_volatile != VOLATILE_NO && arg_read_only) {
696 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
697 return -EINVAL;
698 }
699
a42c8b54
LP
700 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
701
88213476
LP
702 return 1;
703}
704
705static int mount_all(const char *dest) {
706
707 typedef struct MountPoint {
708 const char *what;
709 const char *where;
710 const char *type;
711 const char *options;
712 unsigned long flags;
3bd66c05 713 bool fatal;
88213476
LP
714 } MountPoint;
715
716 static const MountPoint mount_table[] = {
06c17c39
LP
717 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
718 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
719 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
720 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
721 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 722 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
723 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
724 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 725#ifdef HAVE_SELINUX
06c17c39
LP
726 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
727 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 728#endif
88213476
LP
729 };
730
731 unsigned k;
732 int r = 0;
733
734 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 735 _cleanup_free_ char *where = NULL;
d002827b 736#ifdef HAVE_SELINUX
a8828ed9 737 _cleanup_free_ char *options = NULL;
d002827b
LP
738#endif
739 const char *o;
88213476
LP
740 int t;
741
17fe0523
LP
742 where = strjoin(dest, "/", mount_table[k].where, NULL);
743 if (!where)
744 return log_oom();
88213476 745
e65aec12 746 t = path_is_mount_point(where, true);
68fb0892 747 if (t < 0) {
da927ba9 748 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
749
750 if (r == 0)
751 r = t;
752
753 continue;
754 }
755
9c1c7f71
LP
756 /* Skip this entry if it is not a remount. */
757 if (mount_table[k].what && t > 0)
014a9c77
LP
758 continue;
759
79d80fc1
TG
760 t = mkdir_p(where, 0755);
761 if (t < 0) {
762 if (mount_table[k].fatal) {
da927ba9 763 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
764
765 if (r == 0)
766 r = t;
767 } else
da927ba9 768 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
769
770 continue;
771 }
88213476 772
a8828ed9 773#ifdef HAVE_SELINUX
82adf6af
LP
774 if (arg_selinux_apifs_context &&
775 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
776 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
777 if (!options)
778 return log_oom();
779
780 o = options;
781 } else
a8828ed9 782#endif
d002827b 783 o = mount_table[k].options;
a8828ed9 784
a8828ed9 785
88213476
LP
786 if (mount(mount_table[k].what,
787 where,
788 mount_table[k].type,
789 mount_table[k].flags,
79d80fc1 790 o) < 0) {
88213476 791
79d80fc1 792 if (mount_table[k].fatal) {
56f64d95 793 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 794
79d80fc1
TG
795 if (r == 0)
796 r = -errno;
797 } else
56f64d95 798 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 799 }
88213476
LP
800 }
801
e58a1277
LP
802 return r;
803}
f8440af5 804
d6797c92 805static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
806 char **x, **y;
807
808 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 809 _cleanup_free_ char *where = NULL;
d2421337 810 struct stat source_st, dest_st;
2ed4e5e0 811 int r;
d2421337 812
4a62c710
MS
813 if (stat(*x, &source_st) < 0)
814 return log_error_errno(errno, "Failed to stat %s: %m", *x);
17fe0523 815
06c17c39
LP
816 where = strappend(dest, *y);
817 if (!where)
818 return log_oom();
819
2ed4e5e0
SL
820 r = stat(where, &dest_st);
821 if (r == 0) {
d2421337 822 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
06c17c39 823 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
d2421337
DR
824 return -EINVAL;
825 }
2ed4e5e0
SL
826 } else if (errno == ENOENT) {
827 r = mkdir_parents_label(where, 0755);
f647962d
MS
828 if (r < 0)
829 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
2ed4e5e0 830 } else {
56f64d95 831 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
832 return -errno;
833 }
06c17c39 834
2ed4e5e0 835 /* Create the mount point, but be conservative -- refuse to create block
4d9f07b4 836 * and char devices. */
79d80fc1
TG
837 if (S_ISDIR(source_st.st_mode)) {
838 r = mkdir_label(where, 0755);
f647962d
MS
839 if (r < 0 && errno != EEXIST)
840 return log_error_errno(r, "Failed to create mount point %s: %m", where);
79d80fc1
TG
841 } else if (S_ISFIFO(source_st.st_mode)) {
842 r = mkfifo(where, 0644);
4a62c710
MS
843 if (r < 0 && errno != EEXIST)
844 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
79d80fc1
TG
845 } else if (S_ISSOCK(source_st.st_mode)) {
846 r = mknod(where, 0644 | S_IFSOCK, 0);
4a62c710
MS
847 if (r < 0 && errno != EEXIST)
848 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
79d80fc1
TG
849 } else if (S_ISREG(source_st.st_mode)) {
850 r = touch(where);
f647962d
MS
851 if (r < 0)
852 return log_error_errno(r, "Failed to create mount point %s: %m", where);
79d80fc1 853 } else {
2ed4e5e0
SL
854 log_error("Refusing to create mountpoint for file: %s", *x);
855 return -ENOTSUP;
d2421337 856 }
17fe0523 857
4a62c710
MS
858 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
859 return log_error_errno(errno, "mount(%s) failed: %m", where);
17fe0523 860
d6797c92
LP
861 if (ro) {
862 r = bind_remount_recursive(where, true);
f647962d
MS
863 if (r < 0)
864 return log_error_errno(r, "Read-Only bind mount failed: %m");
17fe0523
LP
865 }
866 }
867
868 return 0;
869}
870
06c17c39
LP
871static int mount_tmpfs(const char *dest) {
872 char **i, **o;
873
874 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
875 _cleanup_free_ char *where = NULL;
79d80fc1 876 int r;
06c17c39
LP
877
878 where = strappend(dest, *i);
879 if (!where)
880 return log_oom();
881
79d80fc1 882 r = mkdir_label(where, 0755);
04a91939
LP
883 if (r < 0 && r != -EEXIST)
884 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
06c17c39 885
4a62c710
MS
886 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
887 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
06c17c39
LP
888 }
889
890 return 0;
891}
892
e58a1277 893static int setup_timezone(const char *dest) {
d4036145
LP
894 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
895 char *z, *y;
896 int r;
f8440af5 897
e58a1277
LP
898 assert(dest);
899
900 /* Fix the timezone, if possible */
d4036145
LP
901 r = readlink_malloc("/etc/localtime", &p);
902 if (r < 0) {
903 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
904 return 0;
905 }
906
907 z = path_startswith(p, "../usr/share/zoneinfo/");
908 if (!z)
909 z = path_startswith(p, "/usr/share/zoneinfo/");
910 if (!z) {
911 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
912 return 0;
913 }
914
04bc4a3f
LP
915 where = strappend(dest, "/etc/localtime");
916 if (!where)
0d0f0c50 917 return log_oom();
715ac17a 918
d4036145
LP
919 r = readlink_malloc(where, &q);
920 if (r >= 0) {
921 y = path_startswith(q, "../usr/share/zoneinfo/");
922 if (!y)
923 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 924
d4036145
LP
925 /* Already pointing to the right place? Then do nothing .. */
926 if (y && streq(y, z))
927 return 0;
928 }
929
930 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
931 if (!check)
0d0f0c50 932 return log_oom();
4d1c38b8 933
d4036145
LP
934 if (access(check, F_OK) < 0) {
935 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
936 return 0;
937 }
68fb0892 938
d4036145
LP
939 what = strappend("../usr/share/zoneinfo/", z);
940 if (!what)
941 return log_oom();
942
79d80fc1
TG
943 r = mkdir_parents(where, 0755);
944 if (r < 0) {
da927ba9 945 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
946
947 return 0;
948 }
949
950 r = unlink(where);
951 if (r < 0 && errno != ENOENT) {
56f64d95 952 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
953
954 return 0;
955 }
4d9f07b4 956
d4036145 957 if (symlink(what, where) < 0) {
56f64d95 958 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
959 return 0;
960 }
e58a1277
LP
961
962 return 0;
88213476
LP
963}
964
2547bb41 965static int setup_resolv_conf(const char *dest) {
c8b32e11 966 _cleanup_free_ char *where = NULL;
79d80fc1 967 int r;
2547bb41
LP
968
969 assert(dest);
970
971 if (arg_private_network)
972 return 0;
973
974 /* Fix resolv.conf, if possible */
04bc4a3f
LP
975 where = strappend(dest, "/etc/resolv.conf");
976 if (!where)
0d0f0c50 977 return log_oom();
2547bb41 978
77e63faf
LP
979 /* We don't really care for the results of this really. If it
980 * fails, it fails, but meh... */
79d80fc1
TG
981 r = mkdir_parents(where, 0755);
982 if (r < 0) {
da927ba9 983 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
984
985 return 0;
986 }
987
988 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
989 if (r < 0) {
da927ba9 990 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
991
992 return 0;
993 }
2547bb41
LP
994
995 return 0;
996}
997
4d9f07b4
LP
998static int setup_volatile_state(const char *directory) {
999 const char *p;
1000 int r;
1001
1002 assert(directory);
1003
1004 if (arg_volatile != VOLATILE_STATE)
1005 return 0;
1006
1007 /* --volatile=state means we simply overmount /var
1008 with a tmpfs, and the rest read-only. */
1009
1010 r = bind_remount_recursive(directory, true);
f647962d
MS
1011 if (r < 0)
1012 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4
LP
1013
1014 p = strappenda(directory, "/var");
79d80fc1 1015 r = mkdir(p, 0755);
4a62c710
MS
1016 if (r < 0 && errno != EEXIST)
1017 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1018
4a62c710
MS
1019 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1020 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1021
1022 return 0;
1023}
1024
1025static int setup_volatile(const char *directory) {
1026 bool tmpfs_mounted = false, bind_mounted = false;
1027 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1028 const char *f, *t;
1029 int r;
1030
1031 assert(directory);
1032
1033 if (arg_volatile != VOLATILE_YES)
1034 return 0;
1035
1036 /* --volatile=yes means we mount a tmpfs to the root dir, and
1037 the original /usr to use inside it, and that read-only. */
1038
4a62c710
MS
1039 if (!mkdtemp(template))
1040 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
1041
1042 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 1043 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1044 r = -errno;
1045 goto fail;
1046 }
1047
1048 tmpfs_mounted = true;
1049
1050 f = strappenda(directory, "/usr");
1051 t = strappenda(template, "/usr");
1052
79d80fc1
TG
1053 r = mkdir(t, 0755);
1054 if (r < 0 && errno != EEXIST) {
56f64d95 1055 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1056 r = -errno;
1057 goto fail;
1058 }
1059
4d9f07b4 1060 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 1061 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1062 r = -errno;
1063 goto fail;
1064 }
1065
1066 bind_mounted = true;
1067
1068 r = bind_remount_recursive(t, true);
1069 if (r < 0) {
da927ba9 1070 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1071 goto fail;
1072 }
1073
1074 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1075 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1076 r = -errno;
1077 goto fail;
1078 }
1079
1080 rmdir(template);
1081
1082 return 0;
1083
1084fail:
1085 if (bind_mounted)
1086 umount(t);
1087 if (tmpfs_mounted)
1088 umount(template);
1089 rmdir(template);
1090 return r;
1091}
1092
9f24adc2
LP
1093static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1094
1095 snprintf(s, 37,
1096 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1097 SD_ID128_FORMAT_VAL(id));
1098
1099 return s;
1100}
1101
04bc4a3f 1102static int setup_boot_id(const char *dest) {
7fd1b19b 1103 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1104 sd_id128_t rnd = {};
04bc4a3f
LP
1105 char as_uuid[37];
1106 int r;
1107
1108 assert(dest);
1109
eb91eb18
LP
1110 if (arg_share_system)
1111 return 0;
1112
04bc4a3f
LP
1113 /* Generate a new randomized boot ID, so that each boot-up of
1114 * the container gets a new one */
1115
1116 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1117 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1118 if (!from || !to)
1119 return log_oom();
04bc4a3f
LP
1120
1121 r = sd_id128_randomize(&rnd);
f647962d
MS
1122 if (r < 0)
1123 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1124
9f24adc2 1125 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1126
574d5f2d 1127 r = write_string_file(from, as_uuid);
f647962d
MS
1128 if (r < 0)
1129 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f
LP
1130
1131 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
56f64d95 1132 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1133 r = -errno;
10d18763 1134 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1135 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1136
1137 unlink(from);
04bc4a3f
LP
1138 return r;
1139}
1140
e58a1277 1141static int copy_devnodes(const char *dest) {
88213476
LP
1142
1143 static const char devnodes[] =
1144 "null\0"
1145 "zero\0"
1146 "full\0"
1147 "random\0"
1148 "urandom\0"
85614d66
TG
1149 "tty\0"
1150 "net/tun\0";
88213476
LP
1151
1152 const char *d;
e58a1277 1153 int r = 0;
7fd1b19b 1154 _cleanup_umask_ mode_t u;
a258bf26
LP
1155
1156 assert(dest);
124640f1
LP
1157
1158 u = umask(0000);
88213476
LP
1159
1160 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1161 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1162 struct stat st;
88213476 1163
7f112f50
LP
1164 from = strappend("/dev/", d);
1165 to = strjoin(dest, "/dev/", d, NULL);
1166 if (!from || !to)
1167 return log_oom();
88213476
LP
1168
1169 if (stat(from, &st) < 0) {
1170
4a62c710
MS
1171 if (errno != ENOENT)
1172 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1173
a258bf26 1174 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1175
ed8b7a3e 1176 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1177 return -EIO;
a258bf26 1178
85614d66
TG
1179 } else {
1180 r = mkdir_parents(to, 0775);
1181 if (r < 0) {
da927ba9 1182 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1183 return -r;
1184 }
a258bf26 1185
4a62c710
MS
1186 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1187 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
88213476 1188 }
88213476
LP
1189 }
1190
e58a1277
LP
1191 return r;
1192}
88213476 1193
f2d88580
LP
1194static int setup_ptmx(const char *dest) {
1195 _cleanup_free_ char *p = NULL;
1196
1197 p = strappend(dest, "/dev/ptmx");
1198 if (!p)
1199 return log_oom();
1200
4a62c710
MS
1201 if (symlink("pts/ptmx", p) < 0)
1202 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580
LP
1203
1204 return 0;
1205}
1206
e58a1277 1207static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1208 _cleanup_umask_ mode_t u;
1209 const char *to;
e58a1277 1210 struct stat st;
e58a1277 1211 int r;
e58a1277
LP
1212
1213 assert(dest);
1214 assert(console);
1215
1216 u = umask(0000);
1217
4a62c710
MS
1218 if (stat("/dev/null", &st) < 0)
1219 return log_error_errno(errno, "Failed to stat /dev/null: %m");
88213476 1220
e58a1277 1221 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1222 if (r < 0)
1223 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1224
a258bf26
LP
1225 /* We need to bind mount the right tty to /dev/console since
1226 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
1227 * to bind mount things on we create a device node first, and
1228 * use /dev/null for that since we the cgroups device policy
1229 * allows us to create that freely, while we cannot create
1230 * /dev/console. (Note that the major minor doesn't actually
1231 * matter here, since we mount it over anyway). */
a258bf26 1232
eb0f0863 1233 to = strappenda(dest, "/dev/console");
4a62c710
MS
1234 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1235 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
a258bf26 1236
4a62c710
MS
1237 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1238 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1239
25ea79fe 1240 return 0;
e58a1277
LP
1241}
1242
1243static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1244 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 1245 int r, fd, k;
7fd1b19b 1246 _cleanup_umask_ mode_t u;
e58a1277
LP
1247 union {
1248 struct cmsghdr cmsghdr;
1249 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1250 } control = {};
1251 struct msghdr mh = {
1252 .msg_control = &control,
1253 .msg_controllen = sizeof(control),
1254 };
e58a1277
LP
1255 struct cmsghdr *cmsg;
1256
1257 assert(dest);
1258 assert(kmsg_socket >= 0);
a258bf26 1259
e58a1277 1260 u = umask(0000);
a258bf26 1261
f1e5dfe2
LP
1262 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1263 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1264 * on the reading side behave very similar to /proc/kmsg,
1265 * their writing side behaves differently from /dev/kmsg in
1266 * that writing blocks when nothing is reading. In order to
1267 * avoid any problems with containers deadlocking due to this
1268 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1269 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1270 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1271 return log_oom();
e58a1277 1272
4a62c710
MS
1273 if (mkfifo(from, 0600) < 0)
1274 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
e58a1277
LP
1275
1276 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1277 if (r < 0)
1278 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277 1279
4a62c710
MS
1280 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1281 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1282
1283 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1284 if (fd < 0)
1285 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1286
e58a1277
LP
1287 cmsg = CMSG_FIRSTHDR(&mh);
1288 cmsg->cmsg_level = SOL_SOCKET;
1289 cmsg->cmsg_type = SCM_RIGHTS;
1290 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1291 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1292
1293 mh.msg_controllen = cmsg->cmsg_len;
1294
1295 /* Store away the fd in the socket, so that it stays open as
1296 * long as we run the child */
1297 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
03e334a1 1298 safe_close(fd);
e58a1277 1299
4a62c710
MS
1300 if (k < 0)
1301 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1302
f1e5dfe2
LP
1303 /* And now make the FIFO unavailable as /dev/kmsg... */
1304 unlink(from);
25ea79fe 1305 return 0;
88213476
LP
1306}
1307
3a74cea5 1308static int setup_hostname(void) {
3a74cea5 1309
eb91eb18
LP
1310 if (arg_share_system)
1311 return 0;
1312
605f81a8 1313 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1314 return -errno;
3a74cea5 1315
7027ff61 1316 return 0;
3a74cea5
LP
1317}
1318
57fb9fb5 1319static int setup_journal(const char *directory) {
4d680aee 1320 sd_id128_t machine_id, this_id;
7fd1b19b 1321 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1322 char *id;
57fb9fb5
LP
1323 int r;
1324
df9a75e4
LP
1325 /* Don't link journals in ephemeral mode */
1326 if (arg_ephemeral)
1327 return 0;
1328
57fb9fb5 1329 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1330 if (!p)
1331 return log_oom();
57fb9fb5
LP
1332
1333 r = read_one_line_file(p, &b);
27407a01
ZJS
1334 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1335 return 0;
f647962d
MS
1336 else if (r < 0)
1337 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 1338
27407a01
ZJS
1339 id = strstrip(b);
1340 if (isempty(id) && arg_link_journal == LINK_AUTO)
1341 return 0;
57fb9fb5 1342
27407a01
ZJS
1343 /* Verify validity */
1344 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
1345 if (r < 0)
1346 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 1347
4d680aee 1348 r = sd_id128_get_machine(&this_id);
f647962d
MS
1349 if (r < 0)
1350 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1351
1352 if (sd_id128_equal(machine_id, this_id)) {
1353 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1354 "Host and machine ids are equal (%s): refusing to link journals", id);
1355 if (arg_link_journal == LINK_AUTO)
1356 return 0;
df9a75e4 1357 return -EEXIST;
4d680aee
ZJS
1358 }
1359
1360 if (arg_link_journal == LINK_NO)
1361 return 0;
1362
57fb9fb5 1363 free(p);
27407a01
ZJS
1364 p = strappend("/var/log/journal/", id);
1365 q = strjoin(directory, "/var/log/journal/", id, NULL);
1366 if (!p || !q)
1367 return log_oom();
1368
1369 if (path_is_mount_point(p, false) > 0) {
1370 if (arg_link_journal != LINK_AUTO) {
1371 log_error("%s: already a mount point, refusing to use for journal", p);
1372 return -EEXIST;
1373 }
1374
1375 return 0;
57fb9fb5
LP
1376 }
1377
27407a01 1378 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1379 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1380 log_error("%s: already a mount point, refusing to use for journal", q);
1381 return -EEXIST;
57fb9fb5
LP
1382 }
1383
27407a01 1384 return 0;
57fb9fb5
LP
1385 }
1386
1387 r = readlink_and_make_absolute(p, &d);
1388 if (r >= 0) {
1389 if ((arg_link_journal == LINK_GUEST ||
1390 arg_link_journal == LINK_AUTO) &&
1391 path_equal(d, q)) {
1392
27407a01
ZJS
1393 r = mkdir_p(q, 0755);
1394 if (r < 0)
56f64d95 1395 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1396 return 0;
57fb9fb5
LP
1397 }
1398
4a62c710
MS
1399 if (unlink(p) < 0)
1400 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1401 } else if (r == -EINVAL) {
1402
1403 if (arg_link_journal == LINK_GUEST &&
1404 rmdir(p) < 0) {
1405
27407a01
ZJS
1406 if (errno == ENOTDIR) {
1407 log_error("%s already exists and is neither a symlink nor a directory", p);
1408 return r;
1409 } else {
56f64d95 1410 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1411 return -errno;
57fb9fb5 1412 }
57fb9fb5
LP
1413 }
1414 } else if (r != -ENOENT) {
56f64d95 1415 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1416 return r;
57fb9fb5
LP
1417 }
1418
1419 if (arg_link_journal == LINK_GUEST) {
1420
1421 if (symlink(q, p) < 0) {
574edc90 1422 if (arg_link_journal_try) {
56f64d95 1423 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1424 return 0;
1425 } else {
56f64d95 1426 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1427 return -errno;
1428 }
57fb9fb5
LP
1429 }
1430
27407a01
ZJS
1431 r = mkdir_p(q, 0755);
1432 if (r < 0)
56f64d95 1433 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1434 return 0;
57fb9fb5
LP
1435 }
1436
1437 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1438 /* don't create parents here -- if the host doesn't have
1439 * permanent journal set up, don't force it here */
1440 r = mkdir(p, 0755);
57fb9fb5 1441 if (r < 0) {
574edc90 1442 if (arg_link_journal_try) {
56f64d95 1443 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1444 return 0;
1445 } else {
56f64d95 1446 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1447 return r;
1448 }
57fb9fb5
LP
1449 }
1450
27407a01
ZJS
1451 } else if (access(p, F_OK) < 0)
1452 return 0;
57fb9fb5 1453
cdb2b9d0
LP
1454 if (dir_is_empty(q) == 0)
1455 log_warning("%s is not empty, proceeding anyway.", q);
1456
57fb9fb5
LP
1457 r = mkdir_p(q, 0755);
1458 if (r < 0) {
56f64d95 1459 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1460 return r;
57fb9fb5
LP
1461 }
1462
4a62c710
MS
1463 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1464 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1465
27407a01 1466 return 0;
57fb9fb5
LP
1467}
1468
88213476 1469static int drop_capabilities(void) {
5076f0cc 1470 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1471}
1472
5aa4bb6b 1473static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1474 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1475 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1476 int r;
1477
eb91eb18
LP
1478 if (!arg_register)
1479 return 0;
1480
1c03020c 1481 r = sd_bus_default_system(&bus);
f647962d
MS
1482 if (r < 0)
1483 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1484
89f7c846
LP
1485 if (arg_keep_unit) {
1486 r = sd_bus_call_method(
1487 bus,
1488 "org.freedesktop.machine1",
1489 "/org/freedesktop/machine1",
1490 "org.freedesktop.machine1.Manager",
5aa4bb6b 1491 "RegisterMachineWithNetwork",
89f7c846
LP
1492 &error,
1493 NULL,
5aa4bb6b 1494 "sayssusai",
89f7c846
LP
1495 arg_machine,
1496 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1497 "nspawn",
1498 "container",
1499 (uint32_t) pid,
5aa4bb6b
LP
1500 strempty(arg_directory),
1501 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1502 } else {
9457ac5b
LP
1503 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1504
1505 r = sd_bus_message_new_method_call(
89f7c846 1506 bus,
9457ac5b 1507 &m,
89f7c846
LP
1508 "org.freedesktop.machine1",
1509 "/org/freedesktop/machine1",
1510 "org.freedesktop.machine1.Manager",
5aa4bb6b 1511 "CreateMachineWithNetwork");
f647962d
MS
1512 if (r < 0)
1513 return log_error_errno(r, "Failed to create message: %m");
9457ac5b
LP
1514
1515 r = sd_bus_message_append(
1516 m,
5aa4bb6b 1517 "sayssusai",
89f7c846
LP
1518 arg_machine,
1519 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1520 "nspawn",
1521 "container",
1522 (uint32_t) pid,
5aa4bb6b
LP
1523 strempty(arg_directory),
1524 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d
MS
1525 if (r < 0)
1526 return log_error_errno(r, "Failed to append message arguments: %m");
9457ac5b
LP
1527
1528 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d
MS
1529 if (r < 0)
1530 return log_error_errno(r, "Failed to open container: %m");
9457ac5b
LP
1531
1532 if (!isempty(arg_slice)) {
1533 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d
MS
1534 if (r < 0)
1535 return log_error_errno(r, "Failed to append slice: %m");
9457ac5b
LP
1536 }
1537
1538 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d
MS
1539 if (r < 0)
1540 return log_error_errno(r, "Failed to add device policy: %m");
9457ac5b 1541
63cc4c31 1542 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
1543 /* Allow the container to
1544 * access and create the API
1545 * device nodes, so that
1546 * PrivateDevices= in the
1547 * container can work
1548 * fine */
1549 "/dev/null", "rwm",
1550 "/dev/zero", "rwm",
1551 "/dev/full", "rwm",
1552 "/dev/random", "rwm",
1553 "/dev/urandom", "rwm",
1554 "/dev/tty", "rwm",
864e1706 1555 "/dev/net/tun", "rwm",
9457ac5b
LP
1556 /* Allow the container
1557 * access to ptys. However,
1558 * do not permit the
1559 * container to ever create
1560 * these device nodes. */
1561 "/dev/pts/ptmx", "rw",
63cc4c31 1562 "char-pts", "rw");
f647962d
MS
1563 if (r < 0)
1564 return log_error_errno(r, "Failed to add device whitelist: %m");
9457ac5b
LP
1565
1566 r = sd_bus_message_close_container(m);
f647962d
MS
1567 if (r < 0)
1568 return log_error_errno(r, "Failed to close container: %m");
9457ac5b
LP
1569
1570 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1571 }
1572
9444b1f2 1573 if (r < 0) {
1f0cd86b
LP
1574 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1575 return r;
1576 }
1577
1578 return 0;
1579}
1580
1581static int terminate_machine(pid_t pid) {
1582 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1583 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 1584 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
1585 const char *path;
1586 int r;
1587
eb91eb18
LP
1588 if (!arg_register)
1589 return 0;
1590
76b54375 1591 r = sd_bus_default_system(&bus);
f647962d
MS
1592 if (r < 0)
1593 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
1594
1595 r = sd_bus_call_method(
1596 bus,
1597 "org.freedesktop.machine1",
1598 "/org/freedesktop/machine1",
1599 "org.freedesktop.machine1.Manager",
1600 "GetMachineByPID",
1601 &error,
1602 &reply,
1603 "u",
1604 (uint32_t) pid);
1605 if (r < 0) {
1606 /* Note that the machine might already have been
1607 * cleaned up automatically, hence don't consider it a
1608 * failure if we cannot get the machine object. */
1609 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1610 return 0;
1611 }
1612
1613 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1614 if (r < 0)
1615 return bus_log_parse_error(r);
9444b1f2 1616
1f0cd86b
LP
1617 r = sd_bus_call_method(
1618 bus,
1619 "org.freedesktop.machine1",
1620 path,
1621 "org.freedesktop.machine1.Machine",
1622 "Terminate",
1623 &error,
1624 NULL,
1625 NULL);
1626 if (r < 0) {
1627 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1628 return 0;
1629 }
1630
9444b1f2
LP
1631 return 0;
1632}
1633
db999e0f
LP
1634static int reset_audit_loginuid(void) {
1635 _cleanup_free_ char *p = NULL;
1636 int r;
1637
1638 if (arg_share_system)
1639 return 0;
1640
1641 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1642 if (r == -ENOENT)
db999e0f 1643 return 0;
f647962d
MS
1644 if (r < 0)
1645 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1646
1647 /* Already reset? */
1648 if (streq(p, "4294967295"))
1649 return 0;
1650
1651 r = write_string_file("/proc/self/loginuid", "4294967295");
1652 if (r < 0) {
1653 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1654 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1655 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1656 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1657 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1658
db999e0f 1659 sleep(5);
77b6e194 1660 }
db999e0f
LP
1661
1662 return 0;
77b6e194
LP
1663}
1664
4f758c23
LP
/* Fixed 128bit keys passed to generate_mac() as hash key, one per kind
 * of interface a MAC address is generated for. */
#define HOST_HASH_KEY \
        SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY \
        SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY \
        SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 1668
a90e2305 1669static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
1670 uint8_t result[8];
1671 size_t l, sz;
a90e2305
LP
1672 uint8_t *v, *i;
1673 int r;
01dde061
TG
1674
1675 l = strlen(arg_machine);
1676 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
1677 if (idx > 0)
1678 sz += sizeof(idx);
a90e2305 1679
01dde061
TG
1680 v = alloca(sz);
1681
1682 /* fetch some persistent data unique to the host */
1683 r = sd_id128_get_machine((sd_id128_t*) v);
1684 if (r < 0)
1685 return r;
1686
1687 /* combine with some data unique (on this host) to this
1688 * container instance */
a90e2305
LP
1689 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1690 if (idx > 0) {
1691 idx = htole64(idx);
1692 memcpy(i, &idx, sizeof(idx));
1693 }
01dde061
TG
1694
1695 /* Let's hash the host machine ID plus the container name. We
1696 * use a fixed, but originally randomly created hash key here. */
4f758c23 1697 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
1698
1699 assert_cc(ETH_ALEN <= sizeof(result));
1700 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1701
1702 /* see eth_random_addr in the kernel */
1703 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1704 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1705
1706 return 0;
1707}
1708
5aa4bb6b 1709static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 1710 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 1711 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 1712 struct ether_addr mac_host, mac_container;
5aa4bb6b 1713 int r, i;
69c79d3c
LP
1714
1715 if (!arg_private_network)
1716 return 0;
1717
1718 if (!arg_network_veth)
1719 return 0;
1720
08af0da2
LP
1721 /* Use two different interface name prefixes depending whether
1722 * we are in bridge mode or not. */
c00524c9 1723 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 1724 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 1725
e867ceb6
LP
1726 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1727 if (r < 0)
1728 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 1729
e867ceb6
LP
1730 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1731 if (r < 0)
1732 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 1733
151b9b96 1734 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1735 if (r < 0)
1736 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 1737
151b9b96 1738 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
1739 if (r < 0)
1740 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 1741
ab046dde 1742 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
1743 if (r < 0)
1744 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 1745
4f758c23 1746 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
1747 if (r < 0)
1748 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 1749
ee3a6a51 1750 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
1751 if (r < 0)
1752 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1753
d8e538ec 1754 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
1755 if (r < 0)
1756 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1757
ee3a6a51 1758 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
1759 if (r < 0)
1760 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1761
ab046dde 1762 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
1763 if (r < 0)
1764 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 1765
4f758c23 1766 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
1767 if (r < 0)
1768 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 1769
ab046dde 1770 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1771 if (r < 0)
1772 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
1773
1774 r = sd_rtnl_message_close_container(m);
f647962d
MS
1775 if (r < 0)
1776 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1777
1778 r = sd_rtnl_message_close_container(m);
f647962d
MS
1779 if (r < 0)
1780 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1781
1782 r = sd_rtnl_message_close_container(m);
f647962d
MS
1783 if (r < 0)
1784 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1785
1786 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1787 if (r < 0)
1788 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 1789
5aa4bb6b 1790 i = (int) if_nametoindex(iface_name);
4a62c710
MS
1791 if (i <= 0)
1792 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
1793
1794 *ifi = i;
1795
69c79d3c
LP
1796 return 0;
1797}
1798
5aa4bb6b 1799static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
1800 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1801 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1802 int r, bridge;
1803
1804 if (!arg_private_network)
1805 return 0;
1806
1807 if (!arg_network_veth)
1808 return 0;
1809
1810 if (!arg_network_bridge)
1811 return 0;
1812
1813 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
1814 if (bridge <= 0)
1815 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 1816
5aa4bb6b
LP
1817 *ifi = bridge;
1818
151b9b96 1819 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1820 if (r < 0)
1821 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 1822
151b9b96 1823 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
1824 if (r < 0)
1825 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 1826
039dd4af 1827 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
1828 if (r < 0)
1829 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 1830
ab046dde 1831 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
1832 if (r < 0)
1833 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
1834
1835 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
1836 if (r < 0)
1837 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
1838
1839 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1840 if (r < 0)
1841 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
1842
1843 return 0;
1844}
1845
c74e630d
LP
1846static int parse_interface(struct udev *udev, const char *name) {
1847 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1848 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1849 int ifi;
1850
1851 ifi = (int) if_nametoindex(name);
4a62c710
MS
1852 if (ifi <= 0)
1853 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
1854
1855 sprintf(ifi_str, "n%i", ifi);
1856 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
1857 if (!d)
1858 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
1859
1860 if (udev_device_get_is_initialized(d) <= 0) {
1861 log_error("Network interface %s is not initialized yet.", name);
1862 return -EBUSY;
1863 }
1864
1865 return ifi;
1866}
1867
69c79d3c 1868static int move_network_interfaces(pid_t pid) {
7e227024 1869 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 1870 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1871 char **i;
1872 int r;
1873
1874 if (!arg_private_network)
1875 return 0;
1876
1877 if (strv_isempty(arg_network_interfaces))
1878 return 0;
1879
151b9b96 1880 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1881 if (r < 0)
1882 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 1883
7e227024
LP
1884 udev = udev_new();
1885 if (!udev) {
1886 log_error("Failed to connect to udev.");
1887 return -ENOMEM;
1888 }
1889
aa28aefe 1890 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1891 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 1892 int ifi;
aa28aefe 1893
c74e630d
LP
1894 ifi = parse_interface(udev, *i);
1895 if (ifi < 0)
1896 return ifi;
1897
3125b3ef 1898 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
1899 if (r < 0)
1900 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 1901
c74e630d 1902 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1903 if (r < 0)
1904 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 1905
c74e630d 1906 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1907 if (r < 0)
1908 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 1909 }
7e227024 1910
c74e630d
LP
1911 return 0;
1912}
1913
1914static int setup_macvlan(pid_t pid) {
1915 _cleanup_udev_unref_ struct udev *udev = NULL;
1916 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
e867ceb6 1917 unsigned idx = 0;
c74e630d
LP
1918 char **i;
1919 int r;
1920
1921 if (!arg_private_network)
1922 return 0;
1923
1924 if (strv_isempty(arg_network_macvlan))
1925 return 0;
1926
1927 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1928 if (r < 0)
1929 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
1930
1931 udev = udev_new();
1932 if (!udev) {
1933 log_error("Failed to connect to udev.");
1934 return -ENOMEM;
1935 }
1936
1937 STRV_FOREACH(i, arg_network_macvlan) {
1938 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1939 _cleanup_free_ char *n = NULL;
e867ceb6 1940 struct ether_addr mac;
c74e630d
LP
1941 int ifi;
1942
1943 ifi = parse_interface(udev, *i);
1944 if (ifi < 0)
1945 return ifi;
1946
e867ceb6
LP
1947 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1948 if (r < 0)
1949 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1950
c74e630d 1951 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
1952 if (r < 0)
1953 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 1954
c74e630d 1955 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
1956 if (r < 0)
1957 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
1958
1959 n = strappend("mv-", *i);
1960 if (!n)
1961 return log_oom();
1962
1963 strshorten(n, IFNAMSIZ-1);
1964
1965 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
1966 if (r < 0)
1967 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 1968
e867ceb6
LP
1969 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1970 if (r < 0)
1971 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1972
aa28aefe 1973 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1974 if (r < 0)
1975 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
1976
1977 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
1978 if (r < 0)
1979 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 1980
d8e538ec 1981 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
1982 if (r < 0)
1983 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
1984
1985 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
1986 if (r < 0)
1987 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
1988
1989 r = sd_rtnl_message_close_container(m);
f647962d
MS
1990 if (r < 0)
1991 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
1992
1993 r = sd_rtnl_message_close_container(m);
f647962d
MS
1994 if (r < 0)
1995 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
1996
1997 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1998 if (r < 0)
1999 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2000 }
2001
2002 return 0;
2003}
2004
/* Installs the seccomp filter used for the container payload. The filter
 * allows everything by default, makes a fixed set of system calls fail with
 * EPERM and makes creation of NETLINK_AUDIT sockets fail with EAFNOSUPPORT.
 *
 * Returns 0 on success (or when built without seccomp support), a negative
 * errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work inside containers, and much of the audit
         * userspace hookup fails when run in one. We do not care and simply
         * disable creation of audit sockets: socket(AF_NETLINK, *,
         * NETLINK_AUDIT) is made to fail with EAFNOSUPPORT, which audit
         * userspace takes as the indication that audit is disabled in the
         * kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Do not let loading the filter turn on NO_NEW_PRIVS */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path, success or failure */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2084
785890ac
LP
2085static int setup_propagate(const char *root) {
2086 const char *p, *q;
2087
2088 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2089 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2090 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2091 (void) mkdir_p(p, 0600);
2092
2093 q = strappenda(root, "/run/systemd/nspawn/incoming");
2094 mkdir_parents(q, 0755);
2095 mkdir_p(q, 0600);
2096
2097 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2098 return log_error_errno(errno, "Failed to install propagation bind mount.");
2099
2100 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2101 return log_error_errno(errno, "Failed to make propagation mount read-only");
2102
2103 return 0;
2104}
2105
1b9e5b12
LP
2106static int setup_image(char **device_path, int *loop_nr) {
2107 struct loop_info64 info = {
2108 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2109 };
2110 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2111 _cleanup_free_ char* loopdev = NULL;
2112 struct stat st;
2113 int r, nr;
2114
2115 assert(device_path);
2116 assert(loop_nr);
ec16945e 2117 assert(arg_image);
1b9e5b12
LP
2118
2119 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2120 if (fd < 0)
2121 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2122
4a62c710
MS
2123 if (fstat(fd, &st) < 0)
2124 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2125
2126 if (S_ISBLK(st.st_mode)) {
2127 char *p;
2128
2129 p = strdup(arg_image);
2130 if (!p)
2131 return log_oom();
2132
2133 *device_path = p;
2134
2135 *loop_nr = -1;
2136
2137 r = fd;
2138 fd = -1;
2139
2140 return r;
2141 }
2142
2143 if (!S_ISREG(st.st_mode)) {
56f64d95 2144 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2145 return -EINVAL;
2146 }
2147
2148 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
2149 if (control < 0)
2150 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2151
2152 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
2153 if (nr < 0)
2154 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2155
2156 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2157 return log_oom();
2158
2159 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2160 if (loop < 0)
2161 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 2162
4a62c710
MS
2163 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2164 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2165
2166 if (arg_read_only)
2167 info.lo_flags |= LO_FLAGS_READ_ONLY;
2168
4a62c710
MS
2169 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2170 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2171
2172 *device_path = loopdev;
2173 loopdev = NULL;
2174
2175 *loop_nr = nr;
2176
2177 r = loop;
2178 loop = -1;
2179
2180 return r;
2181}
2182
/* Inspects the partition table of the block device open as "fd" and picks
 * the partitions to use as root, /home and /srv file systems.
 *
 * The device must carry a GPT partition table. Partitions are matched by
 * their GPT type UUID; partitions carrying GPT_FLAG_NO_AUTO are ignored, and
 * if several partitions share a type the one with the lowest partition
 * number is used. A native root partition is preferred over a secondary
 * one; *secondary reports which of the two was picked.
 *
 * On success *root_device (always), *home_device and *srv_device (only if
 * such a partition exists) receive newly allocated device node paths owned
 * by the caller, and the matching *_rw booleans are set to false if the
 * partition carries GPT_FLAG_READ_ONLY. Returns 0 on success, a negative
 * errno-style value on failure, -ENOTSUP when built without blkid. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *whole_dev = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist partlist;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        /* errno is cleared before each blkid call; an errno of 0 after a
         * failure is treated as out-of-memory */
        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partlist = blkid_probe_get_partitions(probe);
        if (!partlist) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        whole_dev = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!whole_dev)
                return log_oom();

        /* Enumerate all udev devices that have the whole-disk device as parent */
        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, whole_dev);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *part_dev = NULL;
                const char *type_str, *devnode;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition partition;
                dev_t part_devnum;
                int partno;

                errno = 0;
                part_dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!part_dev) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number ... */
                part_devnum = udev_device_get_devnum(part_dev);
                if (major(part_devnum) == 0)
                        continue;

                /* ... the whole-disk device itself ... */
                if (st.st_rdev == part_devnum)
                        continue;

                /* ... and entries without a device node */
                devnode = udev_device_get_devnode(part_dev);
                if (!devnode)
                        continue;

                /* Map the device back to its partition table entry */
                partition = blkid_partlist_devno_to_partition(partlist, part_devnum);
                if (!partition)
                        continue;

                flags = blkid_partition_get_flags(partition);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                partno = blkid_partition_get_partno(partition);
                if (partno < 0)
                        continue;

                type_str = blkid_partition_get_type_string(partition);
                if (!type_str)
                        continue;

                if (sd_id128_from_string(type_str, &type_id) < 0)
                        continue;

                /* For each type, only a partition with a lower number than
                 * the one remembered so far replaces it */
                if (sd_id128_equal(type_id, GPT_HOME)) {

                        if (home && partno >= home_nr)
                                continue;

                        home_nr = partno;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(devnode);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && partno >= srv_nr)
                                continue;

                        srv_nr = partno;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(devnode);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && partno >= root_nr)
                                continue;

                        root_nr = partno;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(devnode);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && partno >= secondary_root_nr)
                                continue;

                        secondary_root_nr = partno;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(secondary_root);
                        secondary_root = strdup(devnode);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the results over to the caller. Pointers are reset to NULL
         * afterwards so that the cleanup handlers do not free them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                /* At least one of the two is set at this point, hence this
                 * is the secondary root */
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2436
/* Mounts the block device "what" at "where" (or at "where" with "directory"
 * appended, if "directory" is non-NULL). The file system type is detected
 * with libblkid; LUKS containers are refused. The mount is done with
 * MS_NODEV, and read-only unless "rw" is true and arg_read_only is unset.
 *
 * Returns 0 on success, a negative errno-style value on failure, -ENOTSUP
 * when built without blkid. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally requested read-only container overrides the
         * per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if
         * the result is ambivalent; both mean the type cannot be determined.
         * -1 is a real probing error and is handled by the branch below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2500
727fd4fd
LP
2501static int mount_devices(
2502 const char *where,
2503 const char *root_device, bool root_device_rw,
2504 const char *home_device, bool home_device_rw,
2505 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
2506 int r;
2507
2508 assert(where);
2509
2510 if (root_device) {
727fd4fd 2511 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
2512 if (r < 0)
2513 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
2514 }
2515
2516 if (home_device) {
727fd4fd 2517 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
2518 if (r < 0)
2519 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
2520 }
2521
2522 if (srv_device) {
727fd4fd 2523 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
2524 if (r < 0)
2525 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
2526 }
2527
2528 return 0;
2529}
2530
2531static void loop_remove(int nr, int *image_fd) {
2532 _cleanup_close_ int control = -1;
e8c8ddcc 2533 int r;
1b9e5b12
LP
2534
2535 if (nr < 0)
2536 return;
2537
2538 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2539 r = ioctl(*image_fd, LOOP_CLR_FD);
2540 if (r < 0)
56f64d95 2541 log_warning_errno(errno, "Failed to close loop image: %m");
03e334a1 2542 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2543 }
2544
2545 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2546 if (control < 0) {
56f64d95 2547 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2548 return;
e8c8ddcc 2549 }
1b9e5b12 2550
e8c8ddcc
TG
2551 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2552 if (r < 0)
56f64d95 2553 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2554}
2555
0cb9fbcd
LP
2556static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2557 int pipe_fds[2];
2558 pid_t pid;
2559
2560 assert(database);
2561 assert(key);
2562 assert(rpid);
2563
4a62c710
MS
2564 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2565 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
2566
2567 pid = fork();
4a62c710
MS
2568 if (pid < 0)
2569 return log_error_errno(errno, "Failed to fork getent child: %m");
2570 else if (pid == 0) {
0cb9fbcd
LP
2571 int nullfd;
2572 char *empty_env = NULL;
2573
2574 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2575 _exit(EXIT_FAILURE);
2576
2577 if (pipe_fds[0] > 2)
03e334a1 2578 safe_close(pipe_fds[0]);
0cb9fbcd 2579 if (pipe_fds[1] > 2)
03e334a1 2580 safe_close(pipe_fds[1]);
0cb9fbcd
LP
2581
2582 nullfd = open("/dev/null", O_RDWR);
2583 if (nullfd < 0)
2584 _exit(EXIT_FAILURE);
2585
2586 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2587 _exit(EXIT_FAILURE);
2588
2589 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2590 _exit(EXIT_FAILURE);
2591
2592 if (nullfd > 2)
03e334a1 2593 safe_close(nullfd);
0cb9fbcd
LP
2594
2595 reset_all_signal_handlers();
2596 close_all_fds(NULL, 0);
2597
4de82926
MM
2598 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2599 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
2600 _exit(EXIT_FAILURE);
2601 }
2602
03e334a1 2603 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
2604
2605 *rpid = pid;
2606
2607 return pipe_fds[0];
2608}
2609
2610static int change_uid_gid(char **_home) {
a2a5291b
ZJS
2611 char line[LINE_MAX], *x, *u, *g, *h;
2612 const char *word, *state;
0cb9fbcd
LP
2613 _cleanup_free_ uid_t *uids = NULL;
2614 _cleanup_free_ char *home = NULL;
2615 _cleanup_fclose_ FILE *f = NULL;
2616 _cleanup_close_ int fd = -1;
2617 unsigned n_uids = 0;
70f539ca 2618 size_t sz = 0, l;
0cb9fbcd
LP
2619 uid_t uid;
2620 gid_t gid;
2621 pid_t pid;
2622 int r;
2623
2624 assert(_home);
2625
2626 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2627 /* Reset everything fully to 0, just in case */
2628
4a62c710
MS
2629 if (setgroups(0, NULL) < 0)
2630 return log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd 2631
4a62c710
MS
2632 if (setresgid(0, 0, 0) < 0)
2633 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 2634
4a62c710
MS
2635 if (setresuid(0, 0, 0) < 0)
2636 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
2637
2638 *_home = NULL;
2639 return 0;
2640 }
2641
2642 /* First, get user credentials */
2643 fd = spawn_getent("passwd", arg_user, &pid);
2644 if (fd < 0)
2645 return fd;
2646
2647 f = fdopen(fd, "r");
2648 if (!f)
2649 return log_oom();
2650 fd = -1;
2651
2652 if (!fgets(line, sizeof(line), f)) {
2653
2654 if (!ferror(f)) {
2655 log_error("Failed to resolve user %s.", arg_user);
2656 return -ESRCH;
2657 }
2658
56f64d95 2659 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
2660 return -errno;
2661 }
2662
2663 truncate_nl(line);
2664
820d3acf 2665 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
2666
2667 x = strchr(line, ':');
2668 if (!x) {
2669 log_error("/etc/passwd entry has invalid user field.");
2670 return -EIO;
2671 }
2672
2673 u = strchr(x+1, ':');
2674 if (!u) {
2675 log_error("/etc/passwd entry has invalid password field.");
2676 return -EIO;
2677 }
2678
2679 u++;
2680 g = strchr(u, ':');
2681 if (!g) {
2682 log_error("/etc/passwd entry has invalid UID field.");
2683 return -EIO;
2684 }
2685
2686 *g = 0;
2687 g++;
2688 x = strchr(g, ':');
2689 if (!x) {
2690 log_error("/etc/passwd entry has invalid GID field.");
2691 return -EIO;
2692 }
2693
2694 *x = 0;
2695 h = strchr(x+1, ':');
2696 if (!h) {
2697 log_error("/etc/passwd entry has invalid GECOS field.");
2698 return -EIO;
2699 }
2700
2701 h++;
2702 x = strchr(h, ':');
2703 if (!x) {
2704 log_error("/etc/passwd entry has invalid home directory field.");
2705 return -EIO;
2706 }
2707
2708 *x = 0;
2709
2710 r = parse_uid(u, &uid);
2711 if (r < 0) {
2712 log_error("Failed to parse UID of user.");
2713 return -EIO;
2714 }
2715
2716 r = parse_gid(g, &gid);
2717 if (r < 0) {
2718 log_error("Failed to parse GID of user.");
2719 return -EIO;
2720 }
2721
2722 home = strdup(h);
2723 if (!home)
2724 return log_oom();
2725
2726 /* Second, get group memberships */
2727 fd = spawn_getent("initgroups", arg_user, &pid);
2728 if (fd < 0)
2729 return fd;
2730
2731 fclose(f);
2732 f = fdopen(fd, "r");
2733 if (!f)
2734 return log_oom();
2735 fd = -1;
2736
2737 if (!fgets(line, sizeof(line), f)) {
2738 if (!ferror(f)) {
2739 log_error("Failed to resolve user %s.", arg_user);
2740 return -ESRCH;
2741 }
2742
56f64d95 2743 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
2744 return -errno;
2745 }
2746
2747 truncate_nl(line);
2748
820d3acf 2749 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
2750
2751 /* Skip over the username and subsequent separator whitespace */
2752 x = line;
2753 x += strcspn(x, WHITESPACE);
2754 x += strspn(x, WHITESPACE);
2755
a2a5291b 2756 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
2757 char c[l+1];
2758
a2a5291b 2759 memcpy(c, word, l);
0cb9fbcd
LP
2760 c[l] = 0;
2761
2762 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2763 return log_oom();
2764
2765 r = parse_uid(c, &uids[n_uids++]);
2766 if (r < 0) {
2767 log_error("Failed to parse group data from getent.");
2768 return -EIO;
2769 }
2770 }
2771
2772 r = mkdir_parents(home, 0775);
f647962d
MS
2773 if (r < 0)
2774 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
2775
2776 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
2777 if (r < 0 && r != -EEXIST)
2778 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
2779
2780 fchown(STDIN_FILENO, uid, gid);
2781 fchown(STDOUT_FILENO, uid, gid);
2782 fchown(STDERR_FILENO, uid, gid);
2783
4a62c710
MS
2784 if (setgroups(n_uids, uids) < 0)
2785 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 2786
4a62c710
MS
2787 if (setresgid(gid, gid, gid) < 0)
2788 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 2789
4a62c710
MS
2790 if (setresuid(uid, uid, uid) < 0)
2791 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
2792
2793 if (_home) {
2794 *_home = home;
2795 home = NULL;
2796 }
2797
2798 return 0;
2799}
2800
113cea80 2801/*
6d416b9c
LS
2802 * Return values:
2803 * < 0 : wait_for_terminate() failed to get the state of the
2804 * container, the container was terminated by a signal, or
2805 * failed for an unknown reason. No change is made to the
2806 * container argument.
2807 * > 0 : The program executed in the container terminated with an
2808 * error. The exit code of the program executed in the
919699ec
LP
2809 * container is returned. The container argument has been set
2810 * to CONTAINER_TERMINATED.
6d416b9c
LS
2811 * 0 : The container is being rebooted, has been shut down or exited
2812 * successfully. The container argument has been set to either
2813 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2814 *
6d416b9c
LS
2815 * That is, success is indicated by a return value of zero, and an
2816 * error is indicated by a non-zero value.
113cea80
DH
2817 */
2818static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2819 siginfo_t status;
919699ec 2820 int r;
113cea80
DH
2821
2822 r = wait_for_terminate(pid, &status);
f647962d
MS
2823 if (r < 0)
2824 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2825
2826 switch (status.si_code) {
fddbb89c 2827
113cea80 2828 case CLD_EXITED:
919699ec
LP
2829 if (status.si_status == 0) {
2830 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2831
fddbb89c 2832 } else
919699ec 2833 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2834
919699ec
LP
2835 *container = CONTAINER_TERMINATED;
2836 return status.si_status;
113cea80
DH
2837
2838 case CLD_KILLED:
2839 if (status.si_status == SIGINT) {
113cea80 2840
919699ec 2841 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2842 *container = CONTAINER_TERMINATED;
919699ec
LP
2843 return 0;
2844
113cea80 2845 } else if (status.si_status == SIGHUP) {
113cea80 2846
919699ec 2847 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2848 *container = CONTAINER_REBOOTED;
919699ec 2849 return 0;
113cea80 2850 }
919699ec 2851
113cea80
DH
2852 /* CLD_KILLED fallthrough */
2853
2854 case CLD_DUMPED:
fddbb89c 2855 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2856 return -EIO;
113cea80
DH
2857
2858 default:
fddbb89c 2859 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2860 return -EIO;
113cea80
DH
2861 }
2862
2863 return r;
2864}
2865
e866af3a
DH
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
}
2867
023fb90b
LP
2868static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2869 pid_t pid;
2870
2871 pid = PTR_TO_UINT32(userdata);
2872 if (pid > 0) {
2873 if (kill(pid, SIGRTMIN+3) >= 0) {
2874 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2875 sd_event_source_set_userdata(s, NULL);
2876 return 0;
2877 }
2878 }
2879
2880 sd_event_exit(sd_event_source_get_event(s), 0);
2881 return 0;
2882}
2883
ec16945e
LP
2884static int determine_names(void) {
2885
2886 if (!arg_image && !arg_directory) {
2887 if (arg_machine)
2888 arg_directory = strappend("/var/lib/container/", arg_machine);
2889 else
2890 arg_directory = get_current_dir_name();
2891
2892 if (!arg_directory) {
2893 log_error("Failed to determine path, please use -D.");
2894 return -EINVAL;
2895 }
2896 }
2897
2898 if (!arg_machine) {
b9ba4dab
LP
2899 if (arg_directory && path_equal(arg_directory, "/"))
2900 arg_machine = gethostname_malloc();
2901 else
2902 arg_machine = strdup(basename(arg_image ?: arg_directory));
2903
ec16945e
LP
2904 if (!arg_machine)
2905 return log_oom();
2906
2907 hostname_cleanup(arg_machine, false);
2908 if (!machine_name_is_valid(arg_machine)) {
2909 log_error("Failed to determine machine name automatically, please use -M.");
2910 return -EINVAL;
2911 }
b9ba4dab
LP
2912
2913 if (arg_ephemeral) {
2914 char *b;
2915
2916 /* Add a random suffix when this is an
2917 * ephemeral machine, so that we can run many
2918 * instances at once without manually having
2919 * to specify -M each time. */
2920
2921 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2922 return log_oom();
2923
2924 free(arg_machine);
2925 arg_machine = b;
2926 }
ec16945e
LP
2927 }
2928
2929 return 0;
2930}
2931
88213476 2932int main(int argc, char *argv[]) {
69c79d3c 2933
611b312b 2934 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
727fd4fd 2935 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 2936 _cleanup_close_ int master = -1, image_fd = -1;
3d94f76c 2937 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
69c79d3c 2938 _cleanup_fdset_free_ FDSet *fds = NULL;
ec16945e 2939 int r, n_fd_passed, loop_nr = -1;
1b9e5b12 2940 char veth_name[IFNAMSIZ];
ec16945e 2941 bool secondary = false, remove_subvol = false;
e866af3a 2942 sigset_t mask, mask_chld;
69c79d3c 2943 pid_t pid = 0;
ec16945e 2944 int ret = EXIT_SUCCESS;
88213476
LP
2945
2946 log_parse_environment();
2947 log_open();
2948
ec16945e
LP
2949 r = parse_argv(argc, argv);
2950 if (r <= 0)
88213476 2951 goto finish;
88213476 2952
ec16945e
LP
2953 r = determine_names();
2954 if (r < 0)
2955 goto finish;
7027ff61 2956
88213476
LP
2957 if (geteuid() != 0) {
2958 log_error("Need to be root.");
ec16945e 2959 r = -EPERM;
88213476
LP
2960 goto finish;
2961 }
2962
04d391da
LP
2963 if (sd_booted() <= 0) {
2964 log_error("Not running on a systemd system.");
ec16945e 2965 r = -EINVAL;
04d391da
LP
2966 goto finish;
2967 }
2968
1b9e5b12
LP
2969 log_close();
2970 n_fd_passed = sd_listen_fds(false);
2971 if (n_fd_passed > 0) {
ec16945e
LP
2972 r = fdset_new_listen_fds(&fds, false);
2973 if (r < 0) {
2974 log_error_errno(r, "Failed to collect file descriptors: %m");
1b9e5b12
LP
2975 goto finish;
2976 }
88213476 2977 }
1b9e5b12
LP
2978 fdset_close_others(fds);
2979 log_open();
88213476 2980
1b9e5b12 2981 if (arg_directory) {
ec16945e
LP
2982 assert(!arg_image);
2983
c4e34a61
LP
2984 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
2985 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
ec16945e 2986 r = -EINVAL;
6b9132a9
LP
2987 goto finish;
2988 }
1b9e5b12 2989
ec16945e
LP
2990 if (arg_template) {
2991 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
2992 if (r == -EEXIST) {
2993 if (!arg_quiet)
2994 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
2995 } else if (r < 0) {
2996 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
2997 goto finish;
2998 } else {
2999 if (!arg_quiet)
3000 log_info("Populated %s from template %s.", arg_directory, arg_template);
3001 }
3002
3003 } else if (arg_ephemeral) {
3004 char *np;
3005
c4e34a61
LP
3006 /* If the specified path is a mount point we
3007 * generate the new snapshot immediately
3008 * inside it under a random name. However if
3009 * the specified is not a mount point we
3010 * create the new snapshot in the parent
3011 * directory, just next to it. */
3012 r = path_is_mount_point(arg_directory, false);
3013 if (r < 0) {
3014 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3015 goto finish;
3016 }
3017 if (r > 0)
3018 r = tempfn_random_child(arg_directory, &np);
3019 else
3020 r = tempfn_random(arg_directory, &np);
ec16945e
LP
3021 if (r < 0) {
3022 log_error_errno(r, "Failed to generate name for snapshot: %m");
3023 goto finish;
3024 }
3025
3026 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3027 if (r < 0) {
3028 free(np);
3029 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3030 goto finish;
3031 }
3032
3033 free(arg_directory);
3034 arg_directory = np;
3035
3036 remove_subvol = true;
3037 }
3038
1b9e5b12
LP
3039 if (arg_boot) {
3040 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3041 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3042 r = -EINVAL;
1b9e5b12
LP
3043 goto finish;
3044 }
3045 } else {
3046 const char *p;
3047
3048 p = strappenda(arg_directory,
3049 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3050 if (access(p, F_OK) < 0) {
3051 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 3052 r = -EINVAL;
1b9e5b12 3053 goto finish;
1b9e5b12
LP
3054 }
3055 }
ec16945e 3056
6b9132a9 3057 } else {
1b9e5b12 3058 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3059
ec16945e
LP
3060 assert(arg_image);
3061 assert(!arg_template);
3062
1b9e5b12 3063 if (!mkdtemp(template)) {
56f64d95 3064 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3065 r = -errno;
6b9132a9 3066 goto finish;
1b9e5b12 3067 }
6b9132a9 3068
1b9e5b12
LP
3069 arg_directory = strdup(template);
3070 if (!arg_directory) {
3071 r = log_oom();
3072 goto finish;
6b9132a9 3073 }
88213476 3074
1b9e5b12
LP
3075 image_fd = setup_image(&device_path, &loop_nr);
3076 if (image_fd < 0) {
3077 r = image_fd;
842f3b0f
LP
3078 goto finish;
3079 }
1b9e5b12 3080
4d9f07b4
LP
3081 r = dissect_image(image_fd,
3082 &root_device, &root_device_rw,
3083 &home_device, &home_device_rw,
3084 &srv_device, &srv_device_rw,
3085 &secondary);
1b9e5b12
LP
3086 if (r < 0)
3087 goto finish;
842f3b0f 3088 }
842f3b0f 3089
db7feb7e
LP
3090 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3091 if (master < 0) {
ec16945e 3092 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3093 goto finish;
3094 }
3095
611b312b
LP
3096 r = ptsname_malloc(master, &console);
3097 if (r < 0) {
3098 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3099 goto finish;
3100 }
3101
284c0b91 3102 if (!arg_quiet)
45f1386c 3103 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
ec16945e 3104 arg_machine, arg_image ?: arg_directory);
a258bf26
LP
3105
3106 if (unlockpt(master) < 0) {
ec16945e 3107 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3108 goto finish;
3109 }
3110
e58a1277 3111 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
ec16945e 3112 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
354bfd2b
LP
3113 goto finish;
3114 }
3115
af4ec430
LP
3116 sd_notify(false,
3117 "READY=1\n"
3118 "STATUS=Container running.");
05947bef 3119
a258bf26
LP
3120 assert_se(sigemptyset(&mask) == 0);
3121 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3122 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3123
023fb90b
LP
3124 assert_se(sigemptyset(&mask_chld) == 0);
3125 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3126
d87be9b0 3127 for (;;) {
113cea80 3128 ContainerStatus container_status;
7566e267 3129 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3130 struct sigaction sa = {
3131 .sa_handler = nop_handler,
3132 .sa_flags = SA_NOCLDSTOP,
3133 };
3134
7566e267 3135 r = barrier_create(&barrier);
a2da110b 3136 if (r < 0) {
da927ba9 3137 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3138 goto finish;
3139 }
3140
e866af3a
DH
3141 /* Child can be killed before execv(), so handle SIGCHLD
3142 * in order to interrupt parent's blocking calls and
3143 * give it a chance to call wait() and terminate. */
3144 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3145 if (r < 0) {
ec16945e 3146 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3147 goto finish;
3148 }
3149
e866af3a
DH
3150 r = sigaction(SIGCHLD, &sa, NULL);
3151 if (r < 0) {
ec16945e 3152 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3153 goto finish;
3154 }
3155
60e1651a
KW
3156 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3157 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3158 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3159 if (pid < 0) {
3160 if (errno == EINVAL)
ec16945e 3161 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3162 else
ec16945e 3163 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3164
d87be9b0
LP
3165 goto finish;
3166 }
a258bf26 3167
d87be9b0
LP
3168 if (pid == 0) {
3169 /* child */
0cb9fbcd 3170 _cleanup_free_ char *home = NULL;
5674767e 3171 unsigned n_env = 2;
d87be9b0 3172 const char *envp[] = {
e10a55fd 3173 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3174 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3175 NULL, /* TERM */
3176 NULL, /* HOME */
3177 NULL, /* USER */
3178 NULL, /* LOGNAME */
3179 NULL, /* container_uuid */
842f3b0f
LP
3180 NULL, /* LISTEN_FDS */
3181 NULL, /* LISTEN_PID */
d87be9b0
LP
3182 NULL
3183 };
f4889f65 3184 char **env_use;
a258bf26 3185
a2da110b
DH
3186 barrier_set_role(&barrier, BARRIER_CHILD);
3187
5674767e
ZJS
3188 envp[n_env] = strv_find_prefix(environ, "TERM=");
3189 if (envp[n_env])
3190 n_env ++;
a258bf26 3191
03e334a1 3192 master = safe_close(master);
a258bf26 3193
d87be9b0
LP
3194 close_nointr(STDIN_FILENO);
3195 close_nointr(STDOUT_FILENO);
3196 close_nointr(STDERR_FILENO);
db7feb7e 3197
03e334a1 3198 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
a258bf26 3199
d87be9b0 3200 reset_all_signal_handlers();
1b6d7fa7 3201 reset_signal_mask();
f5c1b9ee 3202
ec16945e
LP
3203 r = open_terminal(console, O_RDWR);
3204 if (r != STDIN_FILENO) {
3205 if (r >= 0) {
3206 safe_close(r);
3207 r = -EINVAL;
842f3b0f
LP
3208 }
3209
ec16945e 3210 log_error_errno(r, "Failed to open console: %m");
a2da110b 3211 _exit(EXIT_FAILURE);
842f3b0f
LP
3212 }
3213
3214 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3215 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
56f64d95 3216 log_error_errno(errno, "Failed to duplicate console: %m");
a2da110b 3217 _exit(EXIT_FAILURE);
842f3b0f 3218 }
bc2f673e 3219
d87be9b0 3220 if (setsid() < 0) {
56f64d95 3221 log_error_errno(errno, "setsid() failed: %m");
a2da110b 3222 _exit(EXIT_FAILURE);
bc2f673e
LP
3223 }
3224
db999e0f 3225 if (reset_audit_loginuid() < 0)
a2da110b 3226 _exit(EXIT_FAILURE);
db999e0f 3227
d87be9b0 3228 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 3229 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 3230 _exit(EXIT_FAILURE);
d87be9b0 3231 }
e58a1277 3232
d87be9b0
LP
3233 /* Mark everything as slave, so that we still
3234 * receive mounts from the real root, but don't
3235 * propagate mounts to the real root. */
3236 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 3237 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 3238 _exit(EXIT_FAILURE);
d87be9b0 3239 }
04bc4a3f 3240
727fd4fd
LP
3241 if (mount_devices(arg_directory,
3242 root_device, root_device_rw,
3243 home_device, home_device_rw,
3244 srv_device, srv_device_rw) < 0)
a2da110b 3245 _exit(EXIT_FAILURE);
1b9e5b12 3246
d87be9b0
LP
3247 /* Turn directory into bind mount */
3248 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 3249 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 3250 _exit(EXIT_FAILURE);
d87be9b0 3251 }
88213476 3252
4d9f07b4
LP
3253 r = setup_volatile(arg_directory);
3254 if (r < 0)
a2da110b 3255 _exit(EXIT_FAILURE);
4d9f07b4
LP
3256
3257 if (setup_volatile_state(arg_directory) < 0)
a2da110b 3258 _exit(EXIT_FAILURE);
4d9f07b4
LP
3259
3260 r = base_filesystem_create(arg_directory);
3261 if (r < 0)
a2da110b 3262 _exit(EXIT_FAILURE);
4d9f07b4 3263
d6797c92 3264 if (arg_read_only) {
ec16945e
LP
3265 r = bind_remount_recursive(arg_directory, true);
3266 if (r < 0) {
3267 log_error_errno(r, "Failed to make tree read-only: %m");
a2da110b 3268 _exit(EXIT_FAILURE);
d87be9b0 3269 }
d6797c92 3270 }
2547bb41 3271
d87be9b0 3272 if (mount_all(arg_directory) < 0)
a2da110b 3273 _exit(EXIT_FAILURE);
57fb9fb5 3274
d87be9b0 3275 if (copy_devnodes(arg_directory) < 0)
a2da110b 3276 _exit(EXIT_FAILURE);
a258bf26 3277
f2d88580 3278 if (setup_ptmx(arg_directory) < 0)
a2da110b 3279 _exit(EXIT_FAILURE);
f2d88580 3280
d87be9b0 3281 dev_setup(arg_directory);
88213476 3282
785890ac
LP
3283 if (setup_propagate(arg_directory) < 0)
3284 _exit(EXIT_FAILURE);
3285
28650077 3286 if (setup_seccomp() < 0)
a2da110b 3287 _exit(EXIT_FAILURE);
24fb1112 3288
d87be9b0 3289 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 3290 _exit(EXIT_FAILURE);
88213476 3291
d87be9b0 3292 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 3293 _exit(EXIT_FAILURE);
88213476 3294
03e334a1 3295 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 3296
d87be9b0 3297 if (setup_boot_id(arg_directory) < 0)
a2da110b 3298 _exit(EXIT_FAILURE);
a41fe3a2 3299
d87be9b0 3300 if (setup_timezone(arg_directory) < 0)
a2da110b 3301 _exit(EXIT_FAILURE);
88213476 3302
d87be9b0 3303 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 3304 _exit(EXIT_FAILURE);
687d0825 3305
d87be9b0 3306 if (setup_journal(arg_directory) < 0)
a2da110b 3307 _exit(EXIT_FAILURE);
687d0825 3308
d6797c92 3309 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 3310 _exit(EXIT_FAILURE);
17fe0523 3311
d6797c92 3312 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 3313 _exit(EXIT_FAILURE);
17fe0523 3314
06c17c39 3315 if (mount_tmpfs(arg_directory) < 0)
a2da110b 3316 _exit(EXIT_FAILURE);
06c17c39 3317
d96c1ecf
LP
3318 /* Tell the parent that we are ready, and that
3319 * it can cgroupify us so that we lack access
3320 * to certain devices and resources. */
dfb05a1c 3321 (void)barrier_place(&barrier);
d96c1ecf 3322
d87be9b0 3323 if (chdir(arg_directory) < 0) {
56f64d95 3324 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 3325 _exit(EXIT_FAILURE);
687d0825
MV
3326 }
3327
d87be9b0 3328 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 3329 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 3330 _exit(EXIT_FAILURE);
687d0825
MV
3331 }
3332
d87be9b0 3333 if (chroot(".") < 0) {
56f64d95 3334 log_error_errno(errno, "chroot() failed: %m");
a2da110b 3335 _exit(EXIT_FAILURE);
687d0825
MV
3336 }
3337
d87be9b0 3338 if (chdir("/") < 0) {
56f64d95 3339 log_error_errno(errno, "chdir() failed: %m");
a2da110b 3340 _exit(EXIT_FAILURE);
687d0825
MV
3341 }
3342
d87be9b0
LP
3343 umask(0022);
3344
eb91eb18
LP
3345 if (arg_private_network)
3346 loopback_setup();
d87be9b0
LP
3347
3348 if (drop_capabilities() < 0) {
56f64d95 3349 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 3350 _exit(EXIT_FAILURE);
687d0825 3351 }
687d0825 3352
0cb9fbcd
LP
3353 r = change_uid_gid(&home);
3354 if (r < 0)
a2da110b 3355 _exit(EXIT_FAILURE);
d87be9b0 3356
842f3b0f
LP
3357 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3358 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3359 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 3360 log_oom();
a2da110b 3361 _exit(EXIT_FAILURE);
144f0fc0 3362 }
687d0825 3363
9444b1f2 3364 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
3365 char as_uuid[37];
3366
3367 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 3368 log_oom();
a2da110b 3369 _exit(EXIT_FAILURE);
842f3b0f
LP
3370 }
3371 }
3372
3373 if (fdset_size(fds) > 0) {
ec16945e
LP
3374 r = fdset_cloexec(fds, false);
3375 if (r < 0) {
3376 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 3377 _exit(EXIT_FAILURE);
842f3b0f
LP
3378 }
3379
3380 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 3381 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 3382 log_oom();
a2da110b 3383 _exit(EXIT_FAILURE);
d87be9b0
LP
3384 }
3385 }
3386
3387 setup_hostname();
3388
6afc95b7
LP
3389 if (arg_personality != 0xffffffffLU) {
3390 if (personality(arg_personality) < 0) {
56f64d95 3391 log_error_errno(errno, "personality() failed: %m");
a2da110b 3392 _exit(EXIT_FAILURE);
6afc95b7 3393 }
1b9e5b12
LP
3394 } else if (secondary) {
3395 if (personality(PER_LINUX32) < 0) {
56f64d95 3396 log_error_errno(errno, "personality() failed: %m");
a2da110b 3397 _exit(EXIT_FAILURE);
1b9e5b12 3398 }
6afc95b7
LP
3399 }
3400
d96c1ecf
LP
3401#ifdef HAVE_SELINUX
3402 if (arg_selinux_context)
0cb9fbcd 3403 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
56f64d95 3404 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
a2da110b 3405 _exit(EXIT_FAILURE);
0cb9fbcd 3406 }
d96c1ecf 3407#endif
354bfd2b 3408
f4889f65
LP
3409 if (!strv_isempty(arg_setenv)) {
3410 char **n;
3411
3412 n = strv_env_merge(2, envp, arg_setenv);
3413 if (!n) {
3414 log_oom();
a2da110b 3415 _exit(EXIT_FAILURE);
f4889f65
LP
3416 }
3417
3418 env_use = n;
3419 } else
3420 env_use = (char**) envp;
3421
d96c1ecf 3422 /* Wait until the parent is ready with the setup, too... */
a2da110b
DH
3423 if (!barrier_place_and_sync(&barrier))
3424 _exit(EXIT_FAILURE);
d96c1ecf 3425
d87be9b0
LP
3426 if (arg_boot) {
3427 char **a;
3428 size_t l;
88213476 3429
d87be9b0 3430 /* Automatically search for the init system */
0f0dbc46 3431
d87be9b0
LP
3432 l = 1 + argc - optind;
3433 a = newa(char*, l + 1);
3434 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 3435
d87be9b0 3436 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 3437 execve(a[0], a, env_use);
0f0dbc46 3438
d87be9b0 3439 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 3440 execve(a[0], a, env_use);
0f0dbc46 3441
d87be9b0 3442 a[0] = (char*) "/sbin/init";
f4889f65 3443 execve(a[0], a, env_use);
d87be9b0 3444 } else if (argc > optind)
f4889f65 3445 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
3446 else {
3447 chdir(home ? home : "/root");
f4889f65 3448 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 3449 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
3450 }
3451
56f64d95 3452 log_error_errno(errno, "execv() failed: %m");
d87be9b0 3453 _exit(EXIT_FAILURE);
da5b3bad 3454 }
88213476 3455
a2da110b 3456 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
3457 fdset_free(fds);
3458 fds = NULL;
3459
a2da110b
DH
3460 /* wait for child-setup to be done */
3461 if (barrier_place_and_sync(&barrier)) {
023fb90b
LP
3462 _cleanup_event_unref_ sd_event *event = NULL;
3463 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
c7b7d449 3464 char last_char = 0;
5aa4bb6b 3465 int ifi = 0;
354bfd2b 3466
840295fc
LP
3467 r = move_network_interfaces(pid);
3468 if (r < 0)
3469 goto finish;
aa28aefe 3470
5aa4bb6b 3471 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
3472 if (r < 0)
3473 goto finish;
ab046dde 3474
5aa4bb6b 3475 r = setup_bridge(veth_name, &ifi);
840295fc
LP
3476 if (r < 0)
3477 goto finish;
ab046dde 3478
840295fc
LP
3479 r = setup_macvlan(pid);
3480 if (r < 0)
3481 goto finish;
c74e630d 3482
5aa4bb6b
LP
3483 r = register_machine(pid, ifi);
3484 if (r < 0)
3485 goto finish;
3486
840295fc
LP
3487 /* Block SIGCHLD here, before notifying child.
3488 * process_pty() will handle it with the other signals. */
3489 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3490 if (r < 0)
3491 goto finish;
e866af3a 3492
840295fc
LP
3493 /* Reset signal to default */
3494 r = default_signals(SIGCHLD, -1);
3495 if (r < 0)
3496 goto finish;
e866af3a 3497
840295fc
LP
3498 /* Notify the child that the parent is ready with all
3499 * its setup, and that the child can now hand over
3500 * control to the code to run inside the container. */
dfb05a1c 3501 (void)barrier_place(&barrier);
354bfd2b 3502
023fb90b
LP
3503 r = sd_event_new(&event);
3504 if (r < 0) {
da927ba9 3505 log_error_errno(r, "Failed to get default event source: %m");
023fb90b 3506 goto finish;
840295fc 3507 }
88213476 3508
023fb90b
LP
3509 if (arg_boot) {
3510 /* Try to kill the init system on SIGINT or SIGTERM */
3511 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3512 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3513 } else {
3514 /* Immediately exit */
3515 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3516 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3517 }
3518
3519 /* simply exit on sigchld */
3520 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3521
9b15b784 3522 r = pty_forward_new(event, master, true, &forward);
023fb90b 3523 if (r < 0) {
da927ba9 3524 log_error_errno(r, "Failed to create PTY forwarder: %m");
023fb90b
LP
3525 goto finish;
3526 }
3527
3528 r = sd_event_loop(event);
ec16945e
LP
3529 if (r < 0) {
3530 log_error_errno(r, "Failed to run event loop: %m");
3531 goto finish;
3532 }
023fb90b 3533
c7b7d449
LP
3534 pty_forward_last_char(forward, &last_char);
3535
023fb90b
LP
3536 forward = pty_forward_free(forward);
3537
c7b7d449 3538 if (!arg_quiet && last_char != '\n')
840295fc 3539 putc('\n', stdout);
04d39279 3540
840295fc
LP
3541 /* Kill if it is not dead yet anyway */
3542 terminate_machine(pid);
3543 }
1f0cd86b 3544
840295fc 3545 /* Normally redundant, but better safe than sorry */
04d39279 3546 kill(pid, SIGKILL);
a258bf26 3547
113cea80 3548 r = wait_for_container(pid, &container_status);
04d39279
LP
3549 pid = 0;
3550
ec16945e 3551 if (r < 0)
ce9f1527
LP
3552 /* We failed to wait for the container, or the
3553 * container exited abnormally */
ec16945e
LP
3554 goto finish;
3555 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
3556 /* The container exited with a non-zero
3557 * status, or with zero status and no reboot
3558 * was requested. */
ec16945e 3559 ret = r;
d87be9b0 3560 break;
ec16945e 3561 }
88213476 3562
113cea80 3563 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3564
3565 if (arg_keep_unit) {
3566 /* Special handling if we are running as a
3567 * service: instead of simply restarting the
3568 * machine we want to restart the entire
3569 * service, so let's inform systemd about this
3570 * with the special exit code 133. The service
3571 * file uses RestartForceExitStatus=133 so
3572 * that this results in a full nspawn
3573 * restart. This is necessary since we might
3574 * have cgroup parameters set we want to have
3575 * flushed out. */
ec16945e
LP
3576 ret = 133;
3577 r = 0;
ce38dbc8
LP
3578 break;
3579 }
d87be9b0 3580 }
88213476
LP
3581
3582finish:
af4ec430
LP
3583 sd_notify(false,
3584 "STOPPING=1\n"
3585 "STATUS=Terminating...");
3586
1b9e5b12
LP
3587 loop_remove(loop_nr, &image_fd);
3588
9444b1f2
LP
3589 if (pid > 0)
3590 kill(pid, SIGKILL);
88213476 3591
ec16945e
LP
3592 if (remove_subvol && arg_directory) {
3593 int k;
3594
3595 k = btrfs_subvol_remove(arg_directory);
3596 if (k < 0)
3597 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3598 }
3599
785890ac
LP
3600 if (arg_machine) {
3601 const char *p;
3602
3603 p = strappenda("/run/systemd/nspawn/propagate", arg_machine);
3604 (void) rm_rf(p, false, true, false);
3605 }
3606
04d391da 3607 free(arg_directory);
ec16945e
LP
3608 free(arg_template);
3609 free(arg_image);
7027ff61 3610 free(arg_machine);
c74e630d
LP
3611 free(arg_user);
3612 strv_free(arg_setenv);
3613 strv_free(arg_network_interfaces);
3614 strv_free(arg_network_macvlan);
3615 strv_free(arg_bind);
3616 strv_free(arg_bind_ro);
06c17c39 3617 strv_free(arg_tmpfs);
88213476 3618
ec16945e 3619 return r < 0 ? EXIT_FAILURE : ret;
88213476 3620}