]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
treewide: use log_*_errno whenever %m is in the format string
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
aa28aefe 43#include <net/if.h>
69c79d3c 44#include <linux/veth.h>
6afc95b7 45#include <sys/personality.h>
1b9e5b12 46#include <linux/loop.h>
aa28aefe 47
5d63309c 48#ifdef HAVE_SELINUX
a8828ed9
DW
49#include <selinux/selinux.h>
50#endif
88213476 51
24fb1112
LP
52#ifdef HAVE_SECCOMP
53#include <seccomp.h>
54#endif
55
1b9e5b12
LP
56#ifdef HAVE_BLKID
57#include <blkid/blkid.h>
58#endif
59
1f0cd86b
LP
60#include "sd-daemon.h"
61#include "sd-bus.h"
62#include "sd-id128.h"
aa28aefe 63#include "sd-rtnl.h"
88213476
LP
64#include "log.h"
65#include "util.h"
49e942b2 66#include "mkdir.h"
6b2d0e85 67#include "macro.h"
d7832d2c 68#include "audit.h"
94d82985 69#include "missing.h"
04d391da 70#include "cgroup-util.h"
a258bf26 71#include "strv.h"
9eb977db 72#include "path-util.h"
a41fe3a2 73#include "loopback-setup.h"
4fc9982c 74#include "dev-setup.h"
842f3b0f 75#include "fdset.h"
acbeb427 76#include "build.h"
a5c32cff 77#include "fileio.h"
40ca29a1 78#include "bus-util.h"
1f0cd86b 79#include "bus-error.h"
4ba93280 80#include "ptyfwd.h"
9bd37b40 81#include "bus-kernel.h"
f4889f65 82#include "env-util.h"
7f112f50 83#include "def.h"
aa28aefe 84#include "rtnl-util.h"
7e227024 85#include "udev-util.h"
1b9e5b12
LP
86#include "blkid-util.h"
87#include "gpt.h"
01dde061 88#include "siphash24.h"
849958d1 89#include "copy.h"
3577de7a 90#include "base-filesystem.h"
a2da110b 91#include "barrier.h"
023fb90b 92#include "event-util.h"
f2d88580 93
e9642be2
LP
94#ifdef HAVE_SECCOMP
95#include "seccomp-util.h"
96#endif
97
113cea80
DH
/* Outcome of a container run.
 * NOTE(review): the code consuming these values is outside this part of the
 * file; the meanings below are read off the enumerator names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* presumably: the container exited for good */
        CONTAINER_REBOOTED      /* presumably: the container asked to be restarted */
} ContainerStatus;
102
57fb9fb5
LP
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of the option map onto LINK_GUEST/LINK_HOST plus the separate
 * arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default) */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest / -j */
} LinkJournal;
88213476 109
4d9f07b4
LP
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,            /* option absent or a false boolean: nothing is overmounted */
        VOLATILE_YES,           /* tmpfs as root directory, original /usr bind-mounted read-only into it */
        VOLATILE_STATE,         /* tree remounted read-only, /var overmounted with a tmpfs */
} Volatile;
115
88213476 116static char *arg_directory = NULL;
687d0825 117static char *arg_user = NULL;
9444b1f2 118static sd_id128_t arg_uuid = {};
7027ff61 119static char *arg_machine = NULL;
c74e630d
LP
120static const char *arg_selinux_context = NULL;
121static const char *arg_selinux_apifs_context = NULL;
9444b1f2 122static const char *arg_slice = NULL;
ff01d048 123static bool arg_private_network = false;
bc2f673e 124static bool arg_read_only = false;
0f0dbc46 125static bool arg_boot = false;
57fb9fb5 126static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 127static bool arg_link_journal_try = false;
5076f0cc
LP
128static uint64_t arg_retain =
129 (1ULL << CAP_CHOWN) |
130 (1ULL << CAP_DAC_OVERRIDE) |
131 (1ULL << CAP_DAC_READ_SEARCH) |
132 (1ULL << CAP_FOWNER) |
133 (1ULL << CAP_FSETID) |
134 (1ULL << CAP_IPC_OWNER) |
135 (1ULL << CAP_KILL) |
136 (1ULL << CAP_LEASE) |
137 (1ULL << CAP_LINUX_IMMUTABLE) |
138 (1ULL << CAP_NET_BIND_SERVICE) |
139 (1ULL << CAP_NET_BROADCAST) |
140 (1ULL << CAP_NET_RAW) |
141 (1ULL << CAP_SETGID) |
142 (1ULL << CAP_SETFCAP) |
143 (1ULL << CAP_SETPCAP) |
144 (1ULL << CAP_SETUID) |
145 (1ULL << CAP_SYS_ADMIN) |
146 (1ULL << CAP_SYS_CHROOT) |
147 (1ULL << CAP_SYS_NICE) |
148 (1ULL << CAP_SYS_PTRACE) |
149 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 150 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
151 (1ULL << CAP_SYS_BOOT) |
152 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
153 (1ULL << CAP_AUDIT_CONTROL) |
154 (1ULL << CAP_MKNOD);
17fe0523
LP
155static char **arg_bind = NULL;
156static char **arg_bind_ro = NULL;
06c17c39 157static char **arg_tmpfs = NULL;
f4889f65 158static char **arg_setenv = NULL;
284c0b91 159static bool arg_quiet = false;
8a96d94e 160static bool arg_share_system = false;
eb91eb18 161static bool arg_register = true;
89f7c846 162static bool arg_keep_unit = false;
aa28aefe 163static char **arg_network_interfaces = NULL;
c74e630d 164static char **arg_network_macvlan = NULL;
69c79d3c 165static bool arg_network_veth = false;
c74e630d 166static const char *arg_network_bridge = NULL;
6afc95b7 167static unsigned long arg_personality = 0xffffffffLU;
1b9e5b12 168static const char *arg_image = NULL;
4d9f07b4 169static Volatile arg_volatile = VOLATILE_NO;
88213476 170
601185b4 171static void help(void) {
88213476
LP
172 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
173 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
174 " -h --help Show this help\n"
175 " --version Print version string\n"
69c79d3c 176 " -q --quiet Do not show status information\n"
1b9e5b12
LP
177 " -D --directory=PATH Root directory for the container\n"
178 " -i --image=PATH File system device or image for the container\n"
a8828ed9
DW
179 " -b --boot Boot up full system (i.e. invoke init)\n"
180 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 181 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 182 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 183 " -S --slice=SLICE Place the container in the specified slice\n"
69c79d3c
LP
184 " --private-network Disable network in container\n"
185 " --network-interface=INTERFACE\n"
186 " Assign an existing network interface to the\n"
187 " container\n"
c74e630d
LP
188 " --network-macvlan=INTERFACE\n"
189 " Create a macvlan network interface based on an\n"
190 " existing network interface to the container\n"
32457153 191 " --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 192 " and container\n"
ab046dde 193 " --network-bridge=INTERFACE\n"
32457153 194 " Add a virtual ethernet connection between host\n"
ab046dde
TG
195 " and container and add it to an existing bridge on\n"
196 " the host\n"
82adf6af
LP
197 " -Z --selinux-context=SECLABEL\n"
198 " Set the SELinux security context to be used by\n"
199 " processes in the container\n"
200 " -L --selinux-apifs-context=SECLABEL\n"
201 " Set the SELinux security context to be used by\n"
202 " API/tmpfs file systems in the container\n"
a8828ed9
DW
203 " --capability=CAP In addition to the default, retain specified\n"
204 " capability\n"
205 " --drop-capability=CAP Drop the specified capability from the default set\n"
574edc90
MP
206 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
207 " try-guest, try-host\n"
208 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 209 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
210 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
211 " the container\n"
212 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 213 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
284c0b91 214 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 215 " --share-system Share system namespaces with host\n"
eb91eb18 216 " --register=BOOLEAN Register container as machine\n"
89f7c846 217 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4
LP
218 " the service unit nspawn is running in\n"
219 " --volatile[=MODE] Run the system in volatile mode\n",
88213476 220 program_invocation_short_name);
88213476
LP
221}
222
223static int parse_argv(int argc, char *argv[]) {
224
a41fe3a2 225 enum {
acbeb427
ZJS
226 ARG_VERSION = 0x100,
227 ARG_PRIVATE_NETWORK,
bc2f673e 228 ARG_UUID,
5076f0cc 229 ARG_READ_ONLY,
57fb9fb5 230 ARG_CAPABILITY,
420c7379 231 ARG_DROP_CAPABILITY,
17fe0523
LP
232 ARG_LINK_JOURNAL,
233 ARG_BIND,
f4889f65 234 ARG_BIND_RO,
06c17c39 235 ARG_TMPFS,
f4889f65 236 ARG_SETENV,
eb91eb18 237 ARG_SHARE_SYSTEM,
89f7c846 238 ARG_REGISTER,
aa28aefe 239 ARG_KEEP_UNIT,
69c79d3c 240 ARG_NETWORK_INTERFACE,
c74e630d 241 ARG_NETWORK_MACVLAN,
69c79d3c 242 ARG_NETWORK_VETH,
ab046dde 243 ARG_NETWORK_BRIDGE,
6afc95b7 244 ARG_PERSONALITY,
4d9f07b4 245 ARG_VOLATILE,
a41fe3a2
LP
246 };
247
88213476 248 static const struct option options[] = {
aa28aefe
LP
249 { "help", no_argument, NULL, 'h' },
250 { "version", no_argument, NULL, ARG_VERSION },
251 { "directory", required_argument, NULL, 'D' },
252 { "user", required_argument, NULL, 'u' },
253 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
254 { "boot", no_argument, NULL, 'b' },
255 { "uuid", required_argument, NULL, ARG_UUID },
256 { "read-only", no_argument, NULL, ARG_READ_ONLY },
257 { "capability", required_argument, NULL, ARG_CAPABILITY },
258 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
259 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
260 { "bind", required_argument, NULL, ARG_BIND },
261 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 262 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
263 { "machine", required_argument, NULL, 'M' },
264 { "slice", required_argument, NULL, 'S' },
265 { "setenv", required_argument, NULL, ARG_SETENV },
266 { "selinux-context", required_argument, NULL, 'Z' },
267 { "selinux-apifs-context", required_argument, NULL, 'L' },
268 { "quiet", no_argument, NULL, 'q' },
269 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
270 { "register", required_argument, NULL, ARG_REGISTER },
271 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
272 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 273 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
ab046dde
TG
274 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
275 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 276 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 277 { "image", required_argument, NULL, 'i' },
4d9f07b4 278 { "volatile", optional_argument, NULL, ARG_VOLATILE },
eb9da376 279 {}
88213476
LP
280 };
281
9444b1f2 282 int c, r;
a42c8b54 283 uint64_t plus = 0, minus = 0;
88213476
LP
284
285 assert(argc >= 0);
286 assert(argv);
287
601185b4 288 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
88213476
LP
289
290 switch (c) {
291
292 case 'h':
601185b4
ZJS
293 help();
294 return 0;
88213476 295
acbeb427
ZJS
296 case ARG_VERSION:
297 puts(PACKAGE_STRING);
298 puts(SYSTEMD_FEATURES);
299 return 0;
300
88213476
LP
301 case 'D':
302 free(arg_directory);
3a74cea5
LP
303 arg_directory = canonicalize_file_name(optarg);
304 if (!arg_directory) {
56f64d95 305 log_error_errno(errno, "Invalid root directory: %m");
88213476
LP
306 return -ENOMEM;
307 }
308
309 break;
310
1b9e5b12
LP
311 case 'i':
312 arg_image = optarg;
313 break;
314
687d0825
MV
315 case 'u':
316 free(arg_user);
7027ff61
LP
317 arg_user = strdup(optarg);
318 if (!arg_user)
319 return log_oom();
687d0825
MV
320
321 break;
322
ab046dde 323 case ARG_NETWORK_BRIDGE:
c74e630d 324 arg_network_bridge = optarg;
ab046dde
TG
325
326 /* fall through */
327
69c79d3c
LP
328 case ARG_NETWORK_VETH:
329 arg_network_veth = true;
330 arg_private_network = true;
331 break;
332
aa28aefe 333 case ARG_NETWORK_INTERFACE:
c74e630d
LP
334 if (strv_extend(&arg_network_interfaces, optarg) < 0)
335 return log_oom();
336
337 arg_private_network = true;
338 break;
339
340 case ARG_NETWORK_MACVLAN:
341 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
342 return log_oom();
343
344 /* fall through */
345
ff01d048
LP
346 case ARG_PRIVATE_NETWORK:
347 arg_private_network = true;
a41fe3a2
LP
348 break;
349
0f0dbc46
LP
350 case 'b':
351 arg_boot = true;
352 break;
353
144f0fc0 354 case ARG_UUID:
9444b1f2
LP
355 r = sd_id128_from_string(optarg, &arg_uuid);
356 if (r < 0) {
aa96c6cb 357 log_error("Invalid UUID: %s", optarg);
9444b1f2 358 return r;
aa96c6cb 359 }
9444b1f2 360 break;
aa96c6cb 361
9444b1f2 362 case 'S':
c74e630d 363 arg_slice = optarg;
144f0fc0
LP
364 break;
365
7027ff61 366 case 'M':
eb91eb18
LP
367 if (isempty(optarg)) {
368 free(arg_machine);
369 arg_machine = NULL;
370 } else {
7027ff61 371
eb91eb18
LP
372 if (!hostname_is_valid(optarg)) {
373 log_error("Invalid machine name: %s", optarg);
374 return -EINVAL;
375 }
7027ff61 376
eb91eb18
LP
377 free(arg_machine);
378 arg_machine = strdup(optarg);
379 if (!arg_machine)
380 return log_oom();
381
382 break;
383 }
7027ff61 384
82adf6af
LP
385 case 'Z':
386 arg_selinux_context = optarg;
a8828ed9
DW
387 break;
388
82adf6af
LP
389 case 'L':
390 arg_selinux_apifs_context = optarg;
a8828ed9
DW
391 break;
392
bc2f673e
LP
393 case ARG_READ_ONLY:
394 arg_read_only = true;
395 break;
396
420c7379
LP
397 case ARG_CAPABILITY:
398 case ARG_DROP_CAPABILITY: {
a2a5291b 399 const char *state, *word;
5076f0cc
LP
400 size_t length;
401
402 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 403 _cleanup_free_ char *t;
5076f0cc 404 cap_value_t cap;
5076f0cc
LP
405
406 t = strndup(word, length);
0d0f0c50
SL
407 if (!t)
408 return log_oom();
5076f0cc 409
39ed67d1
LP
410 if (streq(t, "all")) {
411 if (c == ARG_CAPABILITY)
a42c8b54 412 plus = (uint64_t) -1;
39ed67d1 413 else
a42c8b54 414 minus = (uint64_t) -1;
39ed67d1
LP
415 } else {
416 if (cap_from_name(t, &cap) < 0) {
417 log_error("Failed to parse capability %s.", t);
418 return -EINVAL;
419 }
420
421 if (c == ARG_CAPABILITY)
a42c8b54 422 plus |= 1ULL << (uint64_t) cap;
39ed67d1 423 else
a42c8b54 424 minus |= 1ULL << (uint64_t) cap;
5076f0cc 425 }
5076f0cc
LP
426 }
427
428 break;
429 }
430
57fb9fb5
LP
431 case 'j':
432 arg_link_journal = LINK_GUEST;
574edc90 433 arg_link_journal_try = true;
57fb9fb5
LP
434 break;
435
436 case ARG_LINK_JOURNAL:
437 if (streq(optarg, "auto"))
438 arg_link_journal = LINK_AUTO;
439 else if (streq(optarg, "no"))
440 arg_link_journal = LINK_NO;
441 else if (streq(optarg, "guest"))
442 arg_link_journal = LINK_GUEST;
443 else if (streq(optarg, "host"))
444 arg_link_journal = LINK_HOST;
574edc90
MP
445 else if (streq(optarg, "try-guest")) {
446 arg_link_journal = LINK_GUEST;
447 arg_link_journal_try = true;
448 } else if (streq(optarg, "try-host")) {
449 arg_link_journal = LINK_HOST;
450 arg_link_journal_try = true;
451 } else {
57fb9fb5
LP
452 log_error("Failed to parse link journal mode %s", optarg);
453 return -EINVAL;
454 }
455
456 break;
457
17fe0523
LP
458 case ARG_BIND:
459 case ARG_BIND_RO: {
460 _cleanup_free_ char *a = NULL, *b = NULL;
461 char *e;
462 char ***x;
17fe0523
LP
463
464 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
465
466 e = strchr(optarg, ':');
467 if (e) {
468 a = strndup(optarg, e - optarg);
469 b = strdup(e + 1);
470 } else {
471 a = strdup(optarg);
472 b = strdup(optarg);
473 }
474
475 if (!a || !b)
476 return log_oom();
477
478 if (!path_is_absolute(a) || !path_is_absolute(b)) {
479 log_error("Invalid bind mount specification: %s", optarg);
480 return -EINVAL;
481 }
482
483 r = strv_extend(x, a);
484 if (r < 0)
b3451bed 485 return log_oom();
17fe0523
LP
486
487 r = strv_extend(x, b);
488 if (r < 0)
b3451bed 489 return log_oom();
17fe0523
LP
490
491 break;
492 }
493
06c17c39
LP
494 case ARG_TMPFS: {
495 _cleanup_free_ char *a = NULL, *b = NULL;
496 char *e;
497
498 e = strchr(optarg, ':');
499 if (e) {
500 a = strndup(optarg, e - optarg);
501 b = strdup(e + 1);
502 } else {
503 a = strdup(optarg);
504 b = strdup("mode=0755");
505 }
506
507 if (!a || !b)
508 return log_oom();
509
510 if (!path_is_absolute(a)) {
511 log_error("Invalid tmpfs specification: %s", optarg);
512 return -EINVAL;
513 }
514
515 r = strv_push(&arg_tmpfs, a);
516 if (r < 0)
517 return log_oom();
518
519 a = NULL;
520
521 r = strv_push(&arg_tmpfs, b);
522 if (r < 0)
523 return log_oom();
524
525 b = NULL;
526
527 break;
528 }
529
f4889f65
LP
530 case ARG_SETENV: {
531 char **n;
532
533 if (!env_assignment_is_valid(optarg)) {
534 log_error("Environment variable assignment '%s' is not valid.", optarg);
535 return -EINVAL;
536 }
537
538 n = strv_env_set(arg_setenv, optarg);
539 if (!n)
540 return log_oom();
541
542 strv_free(arg_setenv);
543 arg_setenv = n;
544 break;
545 }
546
284c0b91
LP
547 case 'q':
548 arg_quiet = true;
549 break;
550
8a96d94e
LP
551 case ARG_SHARE_SYSTEM:
552 arg_share_system = true;
553 break;
554
eb91eb18
LP
555 case ARG_REGISTER:
556 r = parse_boolean(optarg);
557 if (r < 0) {
558 log_error("Failed to parse --register= argument: %s", optarg);
559 return r;
560 }
561
562 arg_register = r;
563 break;
564
89f7c846
LP
565 case ARG_KEEP_UNIT:
566 arg_keep_unit = true;
567 break;
568
6afc95b7
LP
569 case ARG_PERSONALITY:
570
ac45f971 571 arg_personality = personality_from_string(optarg);
6afc95b7
LP
572 if (arg_personality == 0xffffffffLU) {
573 log_error("Unknown or unsupported personality '%s'.", optarg);
574 return -EINVAL;
575 }
576
577 break;
578
4d9f07b4
LP
579 case ARG_VOLATILE:
580
581 if (!optarg)
582 arg_volatile = VOLATILE_YES;
583 else {
584 r = parse_boolean(optarg);
585 if (r < 0) {
586 if (streq(optarg, "state"))
587 arg_volatile = VOLATILE_STATE;
588 else {
589 log_error("Failed to parse --volatile= argument: %s", optarg);
590 return r;
591 }
592 } else
593 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
594 }
595
596 break;
597
88213476
LP
598 case '?':
599 return -EINVAL;
600
601 default:
eb9da376 602 assert_not_reached("Unhandled option");
88213476 603 }
88213476 604
eb91eb18
LP
605 if (arg_share_system)
606 arg_register = false;
607
608 if (arg_boot && arg_share_system) {
609 log_error("--boot and --share-system may not be combined.");
610 return -EINVAL;
611 }
612
89f7c846
LP
613 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
614 log_error("--keep-unit may not be used when invoked from a user session.");
615 return -EINVAL;
616 }
617
1b9e5b12
LP
618 if (arg_directory && arg_image) {
619 log_error("--directory= and --image= may not be combined.");
620 return -EINVAL;
621 }
622
4d9f07b4
LP
623 if (arg_volatile != VOLATILE_NO && arg_read_only) {
624 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
625 return -EINVAL;
626 }
627
a42c8b54
LP
628 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
629
88213476
LP
630 return 1;
631}
632
633static int mount_all(const char *dest) {
634
635 typedef struct MountPoint {
636 const char *what;
637 const char *where;
638 const char *type;
639 const char *options;
640 unsigned long flags;
3bd66c05 641 bool fatal;
88213476
LP
642 } MountPoint;
643
644 static const MountPoint mount_table[] = {
06c17c39
LP
645 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
646 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
647 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
648 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
649 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 650 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
651 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
652 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 653#ifdef HAVE_SELINUX
06c17c39
LP
654 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
655 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 656#endif
88213476
LP
657 };
658
659 unsigned k;
660 int r = 0;
661
662 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 663 _cleanup_free_ char *where = NULL;
d002827b 664#ifdef HAVE_SELINUX
a8828ed9 665 _cleanup_free_ char *options = NULL;
d002827b
LP
666#endif
667 const char *o;
88213476
LP
668 int t;
669
17fe0523
LP
670 where = strjoin(dest, "/", mount_table[k].where, NULL);
671 if (!where)
672 return log_oom();
88213476 673
e65aec12 674 t = path_is_mount_point(where, true);
68fb0892 675 if (t < 0) {
da927ba9 676 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
677
678 if (r == 0)
679 r = t;
680
681 continue;
682 }
683
9c1c7f71
LP
684 /* Skip this entry if it is not a remount. */
685 if (mount_table[k].what && t > 0)
014a9c77
LP
686 continue;
687
79d80fc1
TG
688 t = mkdir_p(where, 0755);
689 if (t < 0) {
690 if (mount_table[k].fatal) {
da927ba9 691 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
692
693 if (r == 0)
694 r = t;
695 } else
da927ba9 696 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
697
698 continue;
699 }
88213476 700
a8828ed9 701#ifdef HAVE_SELINUX
82adf6af
LP
702 if (arg_selinux_apifs_context &&
703 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
704 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
705 if (!options)
706 return log_oom();
707
708 o = options;
709 } else
a8828ed9 710#endif
d002827b 711 o = mount_table[k].options;
a8828ed9 712
a8828ed9 713
88213476
LP
714 if (mount(mount_table[k].what,
715 where,
716 mount_table[k].type,
717 mount_table[k].flags,
79d80fc1 718 o) < 0) {
88213476 719
79d80fc1 720 if (mount_table[k].fatal) {
56f64d95 721 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 722
79d80fc1
TG
723 if (r == 0)
724 r = -errno;
725 } else
56f64d95 726 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 727 }
88213476
LP
728 }
729
e58a1277
LP
730 return r;
731}
f8440af5 732
d6797c92 733static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
734 char **x, **y;
735
736 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 737 _cleanup_free_ char *where = NULL;
d2421337 738 struct stat source_st, dest_st;
2ed4e5e0 739 int r;
d2421337
DR
740
741 if (stat(*x, &source_st) < 0) {
56f64d95 742 log_error_errno(errno, "Failed to stat %s: %m", *x);
d2421337
DR
743 return -errno;
744 }
17fe0523 745
06c17c39
LP
746 where = strappend(dest, *y);
747 if (!where)
748 return log_oom();
749
2ed4e5e0
SL
750 r = stat(where, &dest_st);
751 if (r == 0) {
d2421337 752 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
06c17c39 753 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
d2421337
DR
754 return -EINVAL;
755 }
2ed4e5e0
SL
756 } else if (errno == ENOENT) {
757 r = mkdir_parents_label(where, 0755);
f647962d
MS
758 if (r < 0)
759 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
2ed4e5e0 760 } else {
56f64d95 761 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
762 return -errno;
763 }
06c17c39 764
2ed4e5e0 765 /* Create the mount point, but be conservative -- refuse to create block
4d9f07b4 766 * and char devices. */
79d80fc1
TG
767 if (S_ISDIR(source_st.st_mode)) {
768 r = mkdir_label(where, 0755);
f647962d
MS
769 if (r < 0 && errno != EEXIST)
770 return log_error_errno(r, "Failed to create mount point %s: %m", where);
79d80fc1
TG
771 } else if (S_ISFIFO(source_st.st_mode)) {
772 r = mkfifo(where, 0644);
773 if (r < 0 && errno != EEXIST) {
56f64d95 774 log_error_errno(errno, "Failed to create mount point %s: %m", where);
79d80fc1
TG
775
776 return -errno;
777 }
778 } else if (S_ISSOCK(source_st.st_mode)) {
779 r = mknod(where, 0644 | S_IFSOCK, 0);
780 if (r < 0 && errno != EEXIST) {
56f64d95 781 log_error_errno(errno, "Failed to create mount point %s: %m", where);
79d80fc1
TG
782
783 return -errno;
784 }
785 } else if (S_ISREG(source_st.st_mode)) {
786 r = touch(where);
f647962d
MS
787 if (r < 0)
788 return log_error_errno(r, "Failed to create mount point %s: %m", where);
79d80fc1 789 } else {
2ed4e5e0
SL
790 log_error("Refusing to create mountpoint for file: %s", *x);
791 return -ENOTSUP;
d2421337 792 }
17fe0523
LP
793
794 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
56f64d95 795 log_error_errno(errno, "mount(%s) failed: %m", where);
17fe0523
LP
796 return -errno;
797 }
798
d6797c92
LP
799 if (ro) {
800 r = bind_remount_recursive(where, true);
f647962d
MS
801 if (r < 0)
802 return log_error_errno(r, "Read-Only bind mount failed: %m");
17fe0523
LP
803 }
804 }
805
806 return 0;
807}
808
06c17c39
LP
809static int mount_tmpfs(const char *dest) {
810 char **i, **o;
811
812 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
813 _cleanup_free_ char *where = NULL;
79d80fc1 814 int r;
06c17c39
LP
815
816 where = strappend(dest, *i);
817 if (!where)
818 return log_oom();
819
79d80fc1 820 r = mkdir_label(where, 0755);
f647962d
MS
821 if (r < 0 && errno != EEXIST)
822 return log_error_errno(r, "creating mount point for tmpfs %s failed: %m", where);
06c17c39
LP
823
824 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
56f64d95 825 log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
06c17c39
LP
826 return -errno;
827 }
828 }
829
830 return 0;
831}
832
e58a1277 833static int setup_timezone(const char *dest) {
d4036145
LP
834 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
835 char *z, *y;
836 int r;
f8440af5 837
e58a1277
LP
838 assert(dest);
839
840 /* Fix the timezone, if possible */
d4036145
LP
841 r = readlink_malloc("/etc/localtime", &p);
842 if (r < 0) {
843 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
844 return 0;
845 }
846
847 z = path_startswith(p, "../usr/share/zoneinfo/");
848 if (!z)
849 z = path_startswith(p, "/usr/share/zoneinfo/");
850 if (!z) {
851 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
852 return 0;
853 }
854
04bc4a3f
LP
855 where = strappend(dest, "/etc/localtime");
856 if (!where)
0d0f0c50 857 return log_oom();
715ac17a 858
d4036145
LP
859 r = readlink_malloc(where, &q);
860 if (r >= 0) {
861 y = path_startswith(q, "../usr/share/zoneinfo/");
862 if (!y)
863 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 864
d4036145
LP
865 /* Already pointing to the right place? Then do nothing .. */
866 if (y && streq(y, z))
867 return 0;
868 }
869
870 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
871 if (!check)
0d0f0c50 872 return log_oom();
4d1c38b8 873
d4036145
LP
874 if (access(check, F_OK) < 0) {
875 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
876 return 0;
877 }
68fb0892 878
d4036145
LP
879 what = strappend("../usr/share/zoneinfo/", z);
880 if (!what)
881 return log_oom();
882
79d80fc1
TG
883 r = mkdir_parents(where, 0755);
884 if (r < 0) {
da927ba9 885 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
886
887 return 0;
888 }
889
890 r = unlink(where);
891 if (r < 0 && errno != ENOENT) {
56f64d95 892 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
893
894 return 0;
895 }
4d9f07b4 896
d4036145 897 if (symlink(what, where) < 0) {
56f64d95 898 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
899 return 0;
900 }
e58a1277
LP
901
902 return 0;
88213476
LP
903}
904
2547bb41 905static int setup_resolv_conf(const char *dest) {
c8b32e11 906 _cleanup_free_ char *where = NULL;
79d80fc1 907 int r;
2547bb41
LP
908
909 assert(dest);
910
911 if (arg_private_network)
912 return 0;
913
914 /* Fix resolv.conf, if possible */
04bc4a3f
LP
915 where = strappend(dest, "/etc/resolv.conf");
916 if (!where)
0d0f0c50 917 return log_oom();
2547bb41 918
77e63faf
LP
919 /* We don't really care for the results of this really. If it
920 * fails, it fails, but meh... */
79d80fc1
TG
921 r = mkdir_parents(where, 0755);
922 if (r < 0) {
da927ba9 923 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
924
925 return 0;
926 }
927
928 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
929 if (r < 0) {
da927ba9 930 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
931
932 return 0;
933 }
2547bb41
LP
934
935 return 0;
936}
937
4d9f07b4
LP
938static int setup_volatile_state(const char *directory) {
939 const char *p;
940 int r;
941
942 assert(directory);
943
944 if (arg_volatile != VOLATILE_STATE)
945 return 0;
946
947 /* --volatile=state means we simply overmount /var
948 with a tmpfs, and the rest read-only. */
949
950 r = bind_remount_recursive(directory, true);
f647962d
MS
951 if (r < 0)
952 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4
LP
953
954 p = strappenda(directory, "/var");
79d80fc1
TG
955 r = mkdir(p, 0755);
956 if (r < 0 && errno != EEXIST) {
56f64d95 957 log_error_errno(errno, "Failed to create %s: %m", directory);
79d80fc1
TG
958 return -errno;
959 }
4d9f07b4
LP
960
961 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 962 log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
963 return -errno;
964 }
965
966 return 0;
967}
968
969static int setup_volatile(const char *directory) {
970 bool tmpfs_mounted = false, bind_mounted = false;
971 char template[] = "/tmp/nspawn-volatile-XXXXXX";
972 const char *f, *t;
973 int r;
974
975 assert(directory);
976
977 if (arg_volatile != VOLATILE_YES)
978 return 0;
979
980 /* --volatile=yes means we mount a tmpfs to the root dir, and
981 the original /usr to use inside it, and that read-only. */
982
983 if (!mkdtemp(template)) {
56f64d95 984 log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
985 return -errno;
986 }
987
988 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 989 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
990 r = -errno;
991 goto fail;
992 }
993
994 tmpfs_mounted = true;
995
996 f = strappenda(directory, "/usr");
997 t = strappenda(template, "/usr");
998
79d80fc1
TG
999 r = mkdir(t, 0755);
1000 if (r < 0 && errno != EEXIST) {
56f64d95 1001 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1002 r = -errno;
1003 goto fail;
1004 }
1005
4d9f07b4 1006 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 1007 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1008 r = -errno;
1009 goto fail;
1010 }
1011
1012 bind_mounted = true;
1013
1014 r = bind_remount_recursive(t, true);
1015 if (r < 0) {
da927ba9 1016 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1017 goto fail;
1018 }
1019
1020 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1021 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1022 r = -errno;
1023 goto fail;
1024 }
1025
1026 rmdir(template);
1027
1028 return 0;
1029
1030fail:
1031 if (bind_mounted)
1032 umount(t);
1033 if (tmpfs_mounted)
1034 umount(template);
1035 rmdir(template);
1036 return r;
1037}
1038
9f24adc2
LP
1039static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1040
1041 snprintf(s, 37,
1042 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1043 SD_ID128_FORMAT_VAL(id));
1044
1045 return s;
1046}
1047
04bc4a3f 1048static int setup_boot_id(const char *dest) {
7fd1b19b 1049 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1050 sd_id128_t rnd = {};
04bc4a3f
LP
1051 char as_uuid[37];
1052 int r;
1053
1054 assert(dest);
1055
eb91eb18
LP
1056 if (arg_share_system)
1057 return 0;
1058
04bc4a3f
LP
1059 /* Generate a new randomized boot ID, so that each boot-up of
1060 * the container gets a new one */
1061
1062 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1063 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1064 if (!from || !to)
1065 return log_oom();
04bc4a3f
LP
1066
1067 r = sd_id128_randomize(&rnd);
f647962d
MS
1068 if (r < 0)
1069 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1070
9f24adc2 1071 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1072
574d5f2d 1073 r = write_string_file(from, as_uuid);
f647962d
MS
1074 if (r < 0)
1075 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f
LP
1076
1077 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
56f64d95 1078 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1079 r = -errno;
10d18763 1080 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1081 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1082
1083 unlink(from);
04bc4a3f
LP
1084 return r;
1085}
1086
e58a1277 1087static int copy_devnodes(const char *dest) {
88213476
LP
1088
1089 static const char devnodes[] =
1090 "null\0"
1091 "zero\0"
1092 "full\0"
1093 "random\0"
1094 "urandom\0"
85614d66
TG
1095 "tty\0"
1096 "net/tun\0";
88213476
LP
1097
1098 const char *d;
e58a1277 1099 int r = 0;
7fd1b19b 1100 _cleanup_umask_ mode_t u;
a258bf26
LP
1101
1102 assert(dest);
124640f1
LP
1103
1104 u = umask(0000);
88213476
LP
1105
1106 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1107 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1108 struct stat st;
88213476 1109
7f112f50
LP
1110 from = strappend("/dev/", d);
1111 to = strjoin(dest, "/dev/", d, NULL);
1112 if (!from || !to)
1113 return log_oom();
88213476
LP
1114
1115 if (stat(from, &st) < 0) {
1116
1117 if (errno != ENOENT) {
56f64d95 1118 log_error_errno(errno, "Failed to stat %s: %m", from);
7f112f50 1119 return -errno;
88213476
LP
1120 }
1121
a258bf26 1122 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1123
ed8b7a3e 1124 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1125 return -EIO;
a258bf26 1126
85614d66
TG
1127 } else {
1128 r = mkdir_parents(to, 0775);
1129 if (r < 0) {
da927ba9 1130 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1131 return -r;
1132 }
a258bf26 1133
85614d66 1134 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
56f64d95 1135 log_error_errno(errno, "mknod(%s) failed: %m", dest);
85614d66
TG
1136 return -errno;
1137 }
88213476 1138 }
88213476
LP
1139 }
1140
e58a1277
LP
1141 return r;
1142}
88213476 1143
f2d88580
LP
1144static int setup_ptmx(const char *dest) {
1145 _cleanup_free_ char *p = NULL;
1146
1147 p = strappend(dest, "/dev/ptmx");
1148 if (!p)
1149 return log_oom();
1150
1151 if (symlink("pts/ptmx", p) < 0) {
56f64d95 1152 log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580
LP
1153 return -errno;
1154 }
1155
1156 return 0;
1157}
1158
e58a1277 1159static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1160 _cleanup_umask_ mode_t u;
1161 const char *to;
e58a1277 1162 struct stat st;
e58a1277 1163 int r;
e58a1277
LP
1164
1165 assert(dest);
1166 assert(console);
1167
1168 u = umask(0000);
1169
eb0f0863 1170 if (stat("/dev/null", &st) < 0) {
56f64d95 1171 log_error_errno(errno, "Failed to stat /dev/null: %m");
25ea79fe 1172 return -errno;
e58a1277 1173 }
88213476 1174
e58a1277 1175 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1176 if (r < 0)
1177 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1178
a258bf26
LP
1179 /* We need to bind mount the right tty to /dev/console since
1180 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
1181 * to bind mount things on we create a device node first, and
1182 * use /dev/null for that since we the cgroups device policy
1183 * allows us to create that freely, while we cannot create
1184 * /dev/console. (Note that the major minor doesn't actually
1185 * matter here, since we mount it over anyway). */
a258bf26 1186
eb0f0863 1187 to = strappenda(dest, "/dev/console");
e58a1277 1188 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
56f64d95 1189 log_error_errno(errno, "mknod() for /dev/console failed: %m");
25ea79fe 1190 return -errno;
e58a1277 1191 }
a258bf26
LP
1192
1193 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
56f64d95 1194 log_error_errno(errno, "Bind mount for /dev/console failed: %m");
25ea79fe 1195 return -errno;
a258bf26
LP
1196 }
1197
25ea79fe 1198 return 0;
e58a1277
LP
1199}
1200
1201static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1202 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 1203 int r, fd, k;
7fd1b19b 1204 _cleanup_umask_ mode_t u;
e58a1277
LP
1205 union {
1206 struct cmsghdr cmsghdr;
1207 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1208 } control = {};
1209 struct msghdr mh = {
1210 .msg_control = &control,
1211 .msg_controllen = sizeof(control),
1212 };
e58a1277
LP
1213 struct cmsghdr *cmsg;
1214
1215 assert(dest);
1216 assert(kmsg_socket >= 0);
a258bf26 1217
e58a1277 1218 u = umask(0000);
a258bf26 1219
f1e5dfe2
LP
1220 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1221 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1222 * on the reading side behave very similar to /proc/kmsg,
1223 * their writing side behaves differently from /dev/kmsg in
1224 * that writing blocks when nothing is reading. In order to
1225 * avoid any problems with containers deadlocking due to this
1226 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1227 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1228 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1229 return log_oom();
e58a1277
LP
1230
1231 if (mkfifo(from, 0600) < 0) {
56f64d95 1232 log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
25ea79fe 1233 return -errno;
e58a1277
LP
1234 }
1235
1236 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1237 if (r < 0)
1238 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277
LP
1239
1240 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
56f64d95 1241 log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
25ea79fe 1242 return -errno;
e58a1277
LP
1243 }
1244
1245 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1246 if (fd < 0) {
56f64d95 1247 log_error_errno(errno, "Failed to open fifo: %m");
25ea79fe 1248 return -errno;
e58a1277
LP
1249 }
1250
e58a1277
LP
1251 cmsg = CMSG_FIRSTHDR(&mh);
1252 cmsg->cmsg_level = SOL_SOCKET;
1253 cmsg->cmsg_type = SCM_RIGHTS;
1254 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1255 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1256
1257 mh.msg_controllen = cmsg->cmsg_len;
1258
1259 /* Store away the fd in the socket, so that it stays open as
1260 * long as we run the child */
1261 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
03e334a1 1262 safe_close(fd);
e58a1277
LP
1263
1264 if (k < 0) {
56f64d95 1265 log_error_errno(errno, "Failed to send FIFO fd: %m");
25ea79fe 1266 return -errno;
a258bf26
LP
1267 }
1268
f1e5dfe2
LP
1269 /* And now make the FIFO unavailable as /dev/kmsg... */
1270 unlink(from);
25ea79fe 1271 return 0;
88213476
LP
1272}
1273
3a74cea5 1274static int setup_hostname(void) {
3a74cea5 1275
eb91eb18
LP
1276 if (arg_share_system)
1277 return 0;
1278
605f81a8 1279 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1280 return -errno;
3a74cea5 1281
7027ff61 1282 return 0;
3a74cea5
LP
1283}
1284
57fb9fb5 1285static int setup_journal(const char *directory) {
4d680aee 1286 sd_id128_t machine_id, this_id;
7fd1b19b 1287 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1288 char *id;
57fb9fb5
LP
1289 int r;
1290
57fb9fb5 1291 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1292 if (!p)
1293 return log_oom();
57fb9fb5
LP
1294
1295 r = read_one_line_file(p, &b);
27407a01
ZJS
1296 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1297 return 0;
f647962d
MS
1298 else if (r < 0)
1299 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 1300
27407a01
ZJS
1301 id = strstrip(b);
1302 if (isempty(id) && arg_link_journal == LINK_AUTO)
1303 return 0;
57fb9fb5 1304
27407a01
ZJS
1305 /* Verify validity */
1306 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
1307 if (r < 0)
1308 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 1309
4d680aee 1310 r = sd_id128_get_machine(&this_id);
f647962d
MS
1311 if (r < 0)
1312 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1313
1314 if (sd_id128_equal(machine_id, this_id)) {
1315 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1316 "Host and machine ids are equal (%s): refusing to link journals", id);
1317 if (arg_link_journal == LINK_AUTO)
1318 return 0;
1319 return
1320 -EEXIST;
1321 }
1322
1323 if (arg_link_journal == LINK_NO)
1324 return 0;
1325
57fb9fb5 1326 free(p);
27407a01
ZJS
1327 p = strappend("/var/log/journal/", id);
1328 q = strjoin(directory, "/var/log/journal/", id, NULL);
1329 if (!p || !q)
1330 return log_oom();
1331
1332 if (path_is_mount_point(p, false) > 0) {
1333 if (arg_link_journal != LINK_AUTO) {
1334 log_error("%s: already a mount point, refusing to use for journal", p);
1335 return -EEXIST;
1336 }
1337
1338 return 0;
57fb9fb5
LP
1339 }
1340
27407a01 1341 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1342 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1343 log_error("%s: already a mount point, refusing to use for journal", q);
1344 return -EEXIST;
57fb9fb5
LP
1345 }
1346
27407a01 1347 return 0;
57fb9fb5
LP
1348 }
1349
1350 r = readlink_and_make_absolute(p, &d);
1351 if (r >= 0) {
1352 if ((arg_link_journal == LINK_GUEST ||
1353 arg_link_journal == LINK_AUTO) &&
1354 path_equal(d, q)) {
1355
27407a01
ZJS
1356 r = mkdir_p(q, 0755);
1357 if (r < 0)
56f64d95 1358 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1359 return 0;
57fb9fb5
LP
1360 }
1361
1362 if (unlink(p) < 0) {
56f64d95 1363 log_error_errno(errno, "Failed to remove symlink %s: %m", p);
27407a01 1364 return -errno;
57fb9fb5
LP
1365 }
1366 } else if (r == -EINVAL) {
1367
1368 if (arg_link_journal == LINK_GUEST &&
1369 rmdir(p) < 0) {
1370
27407a01
ZJS
1371 if (errno == ENOTDIR) {
1372 log_error("%s already exists and is neither a symlink nor a directory", p);
1373 return r;
1374 } else {
56f64d95 1375 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1376 return -errno;
57fb9fb5 1377 }
57fb9fb5
LP
1378 }
1379 } else if (r != -ENOENT) {
56f64d95 1380 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1381 return r;
57fb9fb5
LP
1382 }
1383
1384 if (arg_link_journal == LINK_GUEST) {
1385
1386 if (symlink(q, p) < 0) {
574edc90 1387 if (arg_link_journal_try) {
56f64d95 1388 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1389 return 0;
1390 } else {
56f64d95 1391 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1392 return -errno;
1393 }
57fb9fb5
LP
1394 }
1395
27407a01
ZJS
1396 r = mkdir_p(q, 0755);
1397 if (r < 0)
56f64d95 1398 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1399 return 0;
57fb9fb5
LP
1400 }
1401
1402 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1403 /* don't create parents here -- if the host doesn't have
1404 * permanent journal set up, don't force it here */
1405 r = mkdir(p, 0755);
57fb9fb5 1406 if (r < 0) {
574edc90 1407 if (arg_link_journal_try) {
56f64d95 1408 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1409 return 0;
1410 } else {
56f64d95 1411 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1412 return r;
1413 }
57fb9fb5
LP
1414 }
1415
27407a01
ZJS
1416 } else if (access(p, F_OK) < 0)
1417 return 0;
57fb9fb5 1418
cdb2b9d0
LP
1419 if (dir_is_empty(q) == 0)
1420 log_warning("%s is not empty, proceeding anyway.", q);
1421
57fb9fb5
LP
1422 r = mkdir_p(q, 0755);
1423 if (r < 0) {
56f64d95 1424 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1425 return r;
57fb9fb5
LP
1426 }
1427
1428 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
56f64d95 1429 log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
27407a01 1430 return -errno;
57fb9fb5
LP
1431 }
1432
27407a01 1433 return 0;
57fb9fb5
LP
1434}
1435
88213476 1436static int drop_capabilities(void) {
5076f0cc 1437 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1438}
1439
5aa4bb6b 1440static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1441 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1442 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1443 int r;
1444
eb91eb18
LP
1445 if (!arg_register)
1446 return 0;
1447
1c03020c 1448 r = sd_bus_default_system(&bus);
f647962d
MS
1449 if (r < 0)
1450 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1451
89f7c846
LP
1452 if (arg_keep_unit) {
1453 r = sd_bus_call_method(
1454 bus,
1455 "org.freedesktop.machine1",
1456 "/org/freedesktop/machine1",
1457 "org.freedesktop.machine1.Manager",
5aa4bb6b 1458 "RegisterMachineWithNetwork",
89f7c846
LP
1459 &error,
1460 NULL,
5aa4bb6b 1461 "sayssusai",
89f7c846
LP
1462 arg_machine,
1463 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1464 "nspawn",
1465 "container",
1466 (uint32_t) pid,
5aa4bb6b
LP
1467 strempty(arg_directory),
1468 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1469 } else {
9457ac5b
LP
1470 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1471
1472 r = sd_bus_message_new_method_call(
89f7c846 1473 bus,
9457ac5b 1474 &m,
89f7c846
LP
1475 "org.freedesktop.machine1",
1476 "/org/freedesktop/machine1",
1477 "org.freedesktop.machine1.Manager",
5aa4bb6b 1478 "CreateMachineWithNetwork");
f647962d
MS
1479 if (r < 0)
1480 return log_error_errno(r, "Failed to create message: %m");
9457ac5b
LP
1481
1482 r = sd_bus_message_append(
1483 m,
5aa4bb6b 1484 "sayssusai",
89f7c846
LP
1485 arg_machine,
1486 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1487 "nspawn",
1488 "container",
1489 (uint32_t) pid,
5aa4bb6b
LP
1490 strempty(arg_directory),
1491 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d
MS
1492 if (r < 0)
1493 return log_error_errno(r, "Failed to append message arguments: %m");
9457ac5b
LP
1494
1495 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d
MS
1496 if (r < 0)
1497 return log_error_errno(r, "Failed to open container: %m");
9457ac5b
LP
1498
1499 if (!isempty(arg_slice)) {
1500 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d
MS
1501 if (r < 0)
1502 return log_error_errno(r, "Failed to append slice: %m");
9457ac5b
LP
1503 }
1504
1505 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d
MS
1506 if (r < 0)
1507 return log_error_errno(r, "Failed to add device policy: %m");
9457ac5b 1508
63cc4c31 1509 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
1510 /* Allow the container to
1511 * access and create the API
1512 * device nodes, so that
1513 * PrivateDevices= in the
1514 * container can work
1515 * fine */
1516 "/dev/null", "rwm",
1517 "/dev/zero", "rwm",
1518 "/dev/full", "rwm",
1519 "/dev/random", "rwm",
1520 "/dev/urandom", "rwm",
1521 "/dev/tty", "rwm",
864e1706 1522 "/dev/net/tun", "rwm",
9457ac5b
LP
1523 /* Allow the container
1524 * access to ptys. However,
1525 * do not permit the
1526 * container to ever create
1527 * these device nodes. */
1528 "/dev/pts/ptmx", "rw",
63cc4c31 1529 "char-pts", "rw");
f647962d
MS
1530 if (r < 0)
1531 return log_error_errno(r, "Failed to add device whitelist: %m");
9457ac5b
LP
1532
1533 r = sd_bus_message_close_container(m);
f647962d
MS
1534 if (r < 0)
1535 return log_error_errno(r, "Failed to close container: %m");
9457ac5b
LP
1536
1537 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1538 }
1539
9444b1f2 1540 if (r < 0) {
1f0cd86b
LP
1541 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1542 return r;
1543 }
1544
1545 return 0;
1546}
1547
1548static int terminate_machine(pid_t pid) {
1549 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1550 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 1551 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
1552 const char *path;
1553 int r;
1554
eb91eb18
LP
1555 if (!arg_register)
1556 return 0;
1557
76b54375 1558 r = sd_bus_default_system(&bus);
f647962d
MS
1559 if (r < 0)
1560 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
1561
1562 r = sd_bus_call_method(
1563 bus,
1564 "org.freedesktop.machine1",
1565 "/org/freedesktop/machine1",
1566 "org.freedesktop.machine1.Manager",
1567 "GetMachineByPID",
1568 &error,
1569 &reply,
1570 "u",
1571 (uint32_t) pid);
1572 if (r < 0) {
1573 /* Note that the machine might already have been
1574 * cleaned up automatically, hence don't consider it a
1575 * failure if we cannot get the machine object. */
1576 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1577 return 0;
1578 }
1579
1580 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1581 if (r < 0)
1582 return bus_log_parse_error(r);
9444b1f2 1583
1f0cd86b
LP
1584 r = sd_bus_call_method(
1585 bus,
1586 "org.freedesktop.machine1",
1587 path,
1588 "org.freedesktop.machine1.Machine",
1589 "Terminate",
1590 &error,
1591 NULL,
1592 NULL);
1593 if (r < 0) {
1594 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1595 return 0;
1596 }
1597
9444b1f2
LP
1598 return 0;
1599}
1600
db999e0f
LP
1601static int reset_audit_loginuid(void) {
1602 _cleanup_free_ char *p = NULL;
1603 int r;
1604
1605 if (arg_share_system)
1606 return 0;
1607
1608 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1609 if (r == -ENOENT)
db999e0f 1610 return 0;
f647962d
MS
1611 if (r < 0)
1612 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1613
1614 /* Already reset? */
1615 if (streq(p, "4294967295"))
1616 return 0;
1617
1618 r = write_string_file("/proc/self/loginuid", "4294967295");
1619 if (r < 0) {
1620 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1621 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1622 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1623 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1624 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1625
db999e0f 1626 sleep(5);
77b6e194 1627 }
db999e0f
LP
1628
1629 return 0;
77b6e194
LP
1630}
1631
4f758c23
LP
1632#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1633#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
01dde061 1634
4f758c23 1635static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
01dde061
TG
1636 int r;
1637
1638 uint8_t result[8];
1639 size_t l, sz;
1640 uint8_t *v;
1641
1642 l = strlen(arg_machine);
1643 sz = sizeof(sd_id128_t) + l;
1644 v = alloca(sz);
1645
1646 /* fetch some persistent data unique to the host */
1647 r = sd_id128_get_machine((sd_id128_t*) v);
1648 if (r < 0)
1649 return r;
1650
1651 /* combine with some data unique (on this host) to this
1652 * container instance */
1653 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1654
1655 /* Let's hash the host machine ID plus the container name. We
1656 * use a fixed, but originally randomly created hash key here. */
4f758c23 1657 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
1658
1659 assert_cc(ETH_ALEN <= sizeof(result));
1660 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1661
1662 /* see eth_random_addr in the kernel */
1663 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1664 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1665
1666 return 0;
1667}
1668
5aa4bb6b 1669static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 1670 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 1671 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 1672 struct ether_addr mac_host, mac_container;
5aa4bb6b 1673 int r, i;
69c79d3c
LP
1674
1675 if (!arg_private_network)
1676 return 0;
1677
1678 if (!arg_network_veth)
1679 return 0;
1680
08af0da2
LP
1681 /* Use two different interface name prefixes depending whether
1682 * we are in bridge mode or not. */
c00524c9 1683 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 1684 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 1685
4f758c23 1686 r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
01dde061 1687 if (r < 0) {
4f758c23
LP
1688 log_error("Failed to generate predictable MAC address for container side");
1689 return r;
1690 }
1691
1692 r = generate_mac(&mac_host, HOST_HASH_KEY);
1693 if (r < 0) {
1694 log_error("Failed to generate predictable MAC address for host side");
01dde061
TG
1695 return r;
1696 }
1697
151b9b96 1698 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1699 if (r < 0)
1700 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 1701
151b9b96 1702 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
1703 if (r < 0)
1704 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 1705
ab046dde 1706 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
1707 if (r < 0)
1708 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 1709
4f758c23 1710 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
1711 if (r < 0)
1712 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 1713
ee3a6a51 1714 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
1715 if (r < 0)
1716 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1717
d8e538ec 1718 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
1719 if (r < 0)
1720 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1721
ee3a6a51 1722 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
1723 if (r < 0)
1724 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1725
ab046dde 1726 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
1727 if (r < 0)
1728 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 1729
4f758c23 1730 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
1731 if (r < 0)
1732 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 1733
ab046dde 1734 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1735 if (r < 0)
1736 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
1737
1738 r = sd_rtnl_message_close_container(m);
f647962d
MS
1739 if (r < 0)
1740 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1741
1742 r = sd_rtnl_message_close_container(m);
f647962d
MS
1743 if (r < 0)
1744 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1745
1746 r = sd_rtnl_message_close_container(m);
f647962d
MS
1747 if (r < 0)
1748 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1749
1750 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1751 if (r < 0)
1752 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 1753
5aa4bb6b
LP
1754 i = (int) if_nametoindex(iface_name);
1755 if (i <= 0) {
56f64d95 1756 log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
1757 return -errno;
1758 }
1759
1760 *ifi = i;
1761
69c79d3c
LP
1762 return 0;
1763}
1764
5aa4bb6b 1765static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
1766 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1767 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1768 int r, bridge;
1769
1770 if (!arg_private_network)
1771 return 0;
1772
1773 if (!arg_network_veth)
1774 return 0;
1775
1776 if (!arg_network_bridge)
1777 return 0;
1778
1779 bridge = (int) if_nametoindex(arg_network_bridge);
1780 if (bridge <= 0) {
56f64d95 1781 log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde
TG
1782 return -errno;
1783 }
1784
5aa4bb6b
LP
1785 *ifi = bridge;
1786
151b9b96 1787 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1788 if (r < 0)
1789 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 1790
151b9b96 1791 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
1792 if (r < 0)
1793 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 1794
039dd4af 1795 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
1796 if (r < 0)
1797 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 1798
ab046dde 1799 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
1800 if (r < 0)
1801 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
1802
1803 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
1804 if (r < 0)
1805 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
1806
1807 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1808 if (r < 0)
1809 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
1810
1811 return 0;
1812}
1813
c74e630d
LP
1814static int parse_interface(struct udev *udev, const char *name) {
1815 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1816 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1817 int ifi;
1818
1819 ifi = (int) if_nametoindex(name);
1820 if (ifi <= 0) {
56f64d95 1821 log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
1822 return -errno;
1823 }
1824
1825 sprintf(ifi_str, "n%i", ifi);
1826 d = udev_device_new_from_device_id(udev, ifi_str);
1827 if (!d) {
56f64d95 1828 log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
1829 return -errno;
1830 }
1831
1832 if (udev_device_get_is_initialized(d) <= 0) {
1833 log_error("Network interface %s is not initialized yet.", name);
1834 return -EBUSY;
1835 }
1836
1837 return ifi;
1838}
1839
69c79d3c 1840static int move_network_interfaces(pid_t pid) {
7e227024 1841 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 1842 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1843 char **i;
1844 int r;
1845
1846 if (!arg_private_network)
1847 return 0;
1848
1849 if (strv_isempty(arg_network_interfaces))
1850 return 0;
1851
151b9b96 1852 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1853 if (r < 0)
1854 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 1855
7e227024
LP
1856 udev = udev_new();
1857 if (!udev) {
1858 log_error("Failed to connect to udev.");
1859 return -ENOMEM;
1860 }
1861
aa28aefe 1862 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1863 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 1864 int ifi;
aa28aefe 1865
c74e630d
LP
1866 ifi = parse_interface(udev, *i);
1867 if (ifi < 0)
1868 return ifi;
1869
3125b3ef 1870 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
1871 if (r < 0)
1872 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 1873
c74e630d 1874 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1875 if (r < 0)
1876 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 1877
c74e630d 1878 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1879 if (r < 0)
1880 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 1881 }
7e227024 1882
c74e630d
LP
1883 return 0;
1884}
1885
1886static int setup_macvlan(pid_t pid) {
1887 _cleanup_udev_unref_ struct udev *udev = NULL;
1888 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1889 char **i;
1890 int r;
1891
1892 if (!arg_private_network)
1893 return 0;
1894
1895 if (strv_isempty(arg_network_macvlan))
1896 return 0;
1897
1898 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1899 if (r < 0)
1900 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
1901
1902 udev = udev_new();
1903 if (!udev) {
1904 log_error("Failed to connect to udev.");
1905 return -ENOMEM;
1906 }
1907
1908 STRV_FOREACH(i, arg_network_macvlan) {
1909 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1910 _cleanup_free_ char *n = NULL;
1911 int ifi;
1912
1913 ifi = parse_interface(udev, *i);
1914 if (ifi < 0)
1915 return ifi;
1916
1917 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
1918 if (r < 0)
1919 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 1920
c74e630d 1921 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
1922 if (r < 0)
1923 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
1924
1925 n = strappend("mv-", *i);
1926 if (!n)
1927 return log_oom();
1928
1929 strshorten(n, IFNAMSIZ-1);
1930
1931 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
1932 if (r < 0)
1933 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 1934
aa28aefe 1935 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1936 if (r < 0)
1937 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
1938
1939 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
1940 if (r < 0)
1941 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 1942
d8e538ec 1943 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
1944 if (r < 0)
1945 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
1946
1947 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
1948 if (r < 0)
1949 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
1950
1951 r = sd_rtnl_message_close_container(m);
f647962d
MS
1952 if (r < 0)
1953 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
1954
1955 r = sd_rtnl_message_close_container(m);
f647962d
MS
1956 if (r < 0)
1957 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
1958
1959 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1960 if (r < 0)
1961 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
1962 }
1963
1964 return 0;
1965}
1966
/* Installs a seccomp filter with default action "allow" that
 *   - makes a fixed list of system calls fail with EPERM, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Compiled to a no-op returning 0 if HAVE_SECCOMP is not defined.
 * Returns the result of the last seccomp operation otherwise, which is
 * negative on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blocked_syscalls); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[n], 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit
         * hookup fails when run inside of one. We do not care, and
         * simply turn off the creation of audit sockets.
         *
         * With this rule socket(AF_NETLINK, *, NETLINK_AUDIT) fails
         * with EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The context is released on success and on failure alike. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2046
1b9e5b12
LP
2047static int setup_image(char **device_path, int *loop_nr) {
2048 struct loop_info64 info = {
2049 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2050 };
2051 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2052 _cleanup_free_ char* loopdev = NULL;
2053 struct stat st;
2054 int r, nr;
2055
2056 assert(device_path);
2057 assert(loop_nr);
2058
2059 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2060 if (fd < 0) {
56f64d95 2061 log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12
LP
2062 return -errno;
2063 }
2064
2065 if (fstat(fd, &st) < 0) {
56f64d95 2066 log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2067 return -errno;
2068 }
2069
2070 if (S_ISBLK(st.st_mode)) {
2071 char *p;
2072
2073 p = strdup(arg_image);
2074 if (!p)
2075 return log_oom();
2076
2077 *device_path = p;
2078
2079 *loop_nr = -1;
2080
2081 r = fd;
2082 fd = -1;
2083
2084 return r;
2085 }
2086
2087 if (!S_ISREG(st.st_mode)) {
56f64d95 2088 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2089 return -EINVAL;
2090 }
2091
2092 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2093 if (control < 0) {
56f64d95 2094 log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2095 return -errno;
2096 }
2097
2098 nr = ioctl(control, LOOP_CTL_GET_FREE);
2099 if (nr < 0) {
56f64d95 2100 log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2101 return -errno;
2102 }
2103
2104 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2105 return log_oom();
2106
2107 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2108 if (loop < 0) {
56f64d95 2109 log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12
LP
2110 return -errno;
2111 }
2112
2113 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
56f64d95 2114 log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2115 return -errno;
2116 }
2117
2118 if (arg_read_only)
2119 info.lo_flags |= LO_FLAGS_READ_ONLY;
2120
2121 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
56f64d95 2122 log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2123 return -errno;
2124 }
2125
2126 *device_path = loopdev;
2127 loopdev = NULL;
2128
2129 *loop_nr = nr;
2130
2131 r = loop;
2132 loop = -1;
2133
2134 return r;
2135}
2136
/*
 * Inspects the GPT partition table of the block device open on fd and picks
 * the partitions to use for the container.
 *
 * Partitions are matched by GPT type UUID against GPT_HOME, GPT_SRV and, if
 * defined at build time, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY. Partitions
 * carrying GPT_FLAG_NO_AUTO are ignored. If several partitions of the same
 * type exist, the one with the lowest partition number wins.
 *
 * On success returns 0 and:
 *   - *root_device / *root_device_rw describe the native root partition, or
 *     the secondary one if no native root was found (*secondary tells
 *     which);
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw are
 *     written only if such a partition was found.
 * The device node strings are newly allocated and owned by the caller; the
 * *_rw flags are false when the partition has GPT_FLAG_READ_ONLY set.
 *
 * Returns a negative value on failure, and -ENOTSUP when built without
 * HAVE_BLKID.
 */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *whole_disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        /* errno is cleared first so that "failed without setting errno"
         * can be told apart and reported as out-of-memory */
        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error_errno(errno, "Failed to stat block device: %m");
                return -errno;
        }

        whole_disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!whole_disk)
                return log_oom();

        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        /* Restrict the enumeration to devices below the image's device */
        r = udev_enumerate_add_match_parent(enumerate, whole_disk);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *part_dev = NULL;
                const char *type_string, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition part;
                dev_t part_devnum;
                int nr;

                errno = 0;
                part_dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!part_dev) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, and the image's
                 * own device itself */
                part_devnum = udev_device_get_devnum(part_dev);
                if (major(part_devnum) == 0 || st.st_rdev == part_devnum)
                        continue;

                node = udev_device_get_devnode(part_dev);
                if (!node)
                        continue;

                part = blkid_partlist_devno_to_partition(partitions, part_devnum);
                if (!part)
                        continue;

                flags = blkid_partition_get_flags(part);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(part);
                if (nr < 0)
                        continue;

                type_string = blkid_partition_get_type_string(part);
                if (!type_string)
                        continue;

                if (sd_id128_from_string(type_string, &type_id) < 0)
                        continue;

                /* In every branch below: keep the candidate already found
                 * unless this partition has a lower number */
                if (sd_id128_equal(type_id, GPT_HOME)) {

                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Transfer ownership of the chosen strings to the caller; the
         * locals are reset so the cleanup handlers do not free them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                /* Only reachable with secondary_root set, see the check above */
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2384
/*
 * Mounts the file system found on block device `what`.
 *
 * what:      device node to mount.
 * where:     base directory of the mount.
 * directory: optional path that is appended to `where` to form the mount
 *            point; NULL mounts directly on `where`.
 * rw:        whether to mount writable. Forced to false when arg_read_only
 *            is set.
 *
 * The file system type is detected with libblkid. Devices of type
 * "crypto_LUKS" are refused with -ENOTSUP. The mount is always done with
 * MS_NODEV.
 *
 * Returns 0 on success, a negative value on failure, and -ENOTSUP when
 * built without HAVE_BLKID.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        /* strappenda() allocates on the stack, hence it has to be called
         * from this function directly */
        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was detected
                 * and -2 if the result is ambivalent; both mean the type
                 * cannot be determined. A plain error (-1) is handled
                 * below so that its errno is reported. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error_errno(errno, "Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2450
727fd4fd
LP
/*
 * Mounts the partitions selected for the container below `where`:
 * the root device on `where` itself, the home device on `where`/home and
 * the server data device on `where`/srv. Any of the device arguments may be
 * NULL, in which case that mount is skipped. The *_rw flags are passed on
 * to mount_device() for the respective device.
 *
 * Returns 0 on success, or the negative value returned by the first
 * mount_device() call that failed.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the directory the caller asked for rather than
         * reaching for the arg_directory global. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
2480
2481static void loop_remove(int nr, int *image_fd) {
2482 _cleanup_close_ int control = -1;
e8c8ddcc 2483 int r;
1b9e5b12
LP
2484
2485 if (nr < 0)
2486 return;
2487
2488 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2489 r = ioctl(*image_fd, LOOP_CLR_FD);
2490 if (r < 0)
56f64d95 2491 log_warning_errno(errno, "Failed to close loop image: %m");
03e334a1 2492 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2493 }
2494
2495 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2496 if (control < 0) {
56f64d95 2497 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2498 return;
e8c8ddcc 2499 }
1b9e5b12 2500
e8c8ddcc
TG
2501 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2502 if (r < 0)
56f64d95 2503 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2504}
2505
0cb9fbcd
LP
2506static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2507 int pipe_fds[2];
2508 pid_t pid;
2509
2510 assert(database);
2511 assert(key);
2512 assert(rpid);
2513
2514 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
56f64d95 2515 log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
2516 return -errno;
2517 }
2518
2519 pid = fork();
2520 if (pid < 0) {
56f64d95 2521 log_error_errno(errno, "Failed to fork getent child: %m");
0cb9fbcd
LP
2522 return -errno;
2523 } else if (pid == 0) {
2524 int nullfd;
2525 char *empty_env = NULL;
2526
2527 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2528 _exit(EXIT_FAILURE);
2529
2530 if (pipe_fds[0] > 2)
03e334a1 2531 safe_close(pipe_fds[0]);
0cb9fbcd 2532 if (pipe_fds[1] > 2)
03e334a1 2533 safe_close(pipe_fds[1]);
0cb9fbcd
LP
2534
2535 nullfd = open("/dev/null", O_RDWR);
2536 if (nullfd < 0)
2537 _exit(EXIT_FAILURE);
2538
2539 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2540 _exit(EXIT_FAILURE);
2541
2542 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2543 _exit(EXIT_FAILURE);
2544
2545 if (nullfd > 2)
03e334a1 2546 safe_close(nullfd);
0cb9fbcd
LP
2547
2548 reset_all_signal_handlers();
2549 close_all_fds(NULL, 0);
2550
4de82926
MM
2551 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2552 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
2553 _exit(EXIT_FAILURE);
2554 }
2555
03e334a1 2556 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
2557
2558 *rpid = pid;
2559
2560 return pipe_fds[0];
2561}
2562
2563static int change_uid_gid(char **_home) {
a2a5291b
ZJS
2564 char line[LINE_MAX], *x, *u, *g, *h;
2565 const char *word, *state;
0cb9fbcd
LP
2566 _cleanup_free_ uid_t *uids = NULL;
2567 _cleanup_free_ char *home = NULL;
2568 _cleanup_fclose_ FILE *f = NULL;
2569 _cleanup_close_ int fd = -1;
2570 unsigned n_uids = 0;
70f539ca 2571 size_t sz = 0, l;
0cb9fbcd
LP
2572 uid_t uid;
2573 gid_t gid;
2574 pid_t pid;
2575 int r;
2576
2577 assert(_home);
2578
2579 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2580 /* Reset everything fully to 0, just in case */
2581
2582 if (setgroups(0, NULL) < 0) {
56f64d95 2583 log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd
LP
2584 return -errno;
2585 }
2586
2587 if (setresgid(0, 0, 0) < 0) {
56f64d95 2588 log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd
LP
2589 return -errno;
2590 }
2591
2592 if (setresuid(0, 0, 0) < 0) {
56f64d95 2593 log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
2594 return -errno;
2595 }
2596
2597 *_home = NULL;
2598 return 0;
2599 }
2600
2601 /* First, get user credentials */
2602 fd = spawn_getent("passwd", arg_user, &pid);
2603 if (fd < 0)
2604 return fd;
2605
2606 f = fdopen(fd, "r");
2607 if (!f)
2608 return log_oom();
2609 fd = -1;
2610
2611 if (!fgets(line, sizeof(line), f)) {
2612
2613 if (!ferror(f)) {
2614 log_error("Failed to resolve user %s.", arg_user);
2615 return -ESRCH;
2616 }
2617
56f64d95 2618 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
2619 return -errno;
2620 }
2621
2622 truncate_nl(line);
2623
2624 wait_for_terminate_and_warn("getent passwd", pid);
2625
2626 x = strchr(line, ':');
2627 if (!x) {
2628 log_error("/etc/passwd entry has invalid user field.");
2629 return -EIO;
2630 }
2631
2632 u = strchr(x+1, ':');
2633 if (!u) {
2634 log_error("/etc/passwd entry has invalid password field.");
2635 return -EIO;
2636 }
2637
2638 u++;
2639 g = strchr(u, ':');
2640 if (!g) {
2641 log_error("/etc/passwd entry has invalid UID field.");
2642 return -EIO;
2643 }
2644
2645 *g = 0;
2646 g++;
2647 x = strchr(g, ':');
2648 if (!x) {
2649 log_error("/etc/passwd entry has invalid GID field.");
2650 return -EIO;
2651 }
2652
2653 *x = 0;
2654 h = strchr(x+1, ':');
2655 if (!h) {
2656 log_error("/etc/passwd entry has invalid GECOS field.");
2657 return -EIO;
2658 }
2659
2660 h++;
2661 x = strchr(h, ':');
2662 if (!x) {
2663 log_error("/etc/passwd entry has invalid home directory field.");
2664 return -EIO;
2665 }
2666
2667 *x = 0;
2668
2669 r = parse_uid(u, &uid);
2670 if (r < 0) {
2671 log_error("Failed to parse UID of user.");
2672 return -EIO;
2673 }
2674
2675 r = parse_gid(g, &gid);
2676 if (r < 0) {
2677 log_error("Failed to parse GID of user.");
2678 return -EIO;
2679 }
2680
2681 home = strdup(h);
2682 if (!home)
2683 return log_oom();
2684
2685 /* Second, get group memberships */
2686 fd = spawn_getent("initgroups", arg_user, &pid);
2687 if (fd < 0)
2688 return fd;
2689
2690 fclose(f);
2691 f = fdopen(fd, "r");
2692 if (!f)
2693 return log_oom();
2694 fd = -1;
2695
2696 if (!fgets(line, sizeof(line), f)) {
2697 if (!ferror(f)) {
2698 log_error("Failed to resolve user %s.", arg_user);
2699 return -ESRCH;
2700 }
2701
56f64d95 2702 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
2703 return -errno;
2704 }
2705
2706 truncate_nl(line);
2707
2708 wait_for_terminate_and_warn("getent initgroups", pid);
2709
2710 /* Skip over the username and subsequent separator whitespace */
2711 x = line;
2712 x += strcspn(x, WHITESPACE);
2713 x += strspn(x, WHITESPACE);
2714
a2a5291b 2715 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
2716 char c[l+1];
2717
a2a5291b 2718 memcpy(c, word, l);
0cb9fbcd
LP
2719 c[l] = 0;
2720
2721 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2722 return log_oom();
2723
2724 r = parse_uid(c, &uids[n_uids++]);
2725 if (r < 0) {
2726 log_error("Failed to parse group data from getent.");
2727 return -EIO;
2728 }
2729 }
2730
2731 r = mkdir_parents(home, 0775);
f647962d
MS
2732 if (r < 0)
2733 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
2734
2735 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
2736 if (r < 0 && r != -EEXIST)
2737 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
2738
2739 fchown(STDIN_FILENO, uid, gid);
2740 fchown(STDOUT_FILENO, uid, gid);
2741 fchown(STDERR_FILENO, uid, gid);
2742
2743 if (setgroups(n_uids, uids) < 0) {
56f64d95 2744 log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd
LP
2745 return -errno;
2746 }
2747
2748 if (setresgid(gid, gid, gid) < 0) {
56f64d95 2749 log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd
LP
2750 return -errno;
2751 }
2752
2753 if (setresuid(uid, uid, uid) < 0) {
56f64d95 2754 log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
2755 return -errno;
2756 }
2757
2758 if (_home) {
2759 *_home = home;
2760 home = NULL;
2761 }
2762
2763 return 0;
2764}
2765
113cea80 2766/*
6d416b9c
LS
2767 * Return values:
2768 * < 0 : wait_for_terminate() failed to get the state of the
2769 * container, the container was terminated by a signal, or
2770 * failed for an unknown reason. No change is made to the
2771 * container argument.
2772 * > 0 : The program executed in the container terminated with an
2773 * error. The exit code of the program executed in the
919699ec
LP
2774 * container is returned. The container argument has been set
2775 * to CONTAINER_TERMINATED.
6d416b9c
LS
2776 * 0 : The container is being rebooted, has been shut down or exited
2777 * successfully. The container argument has been set to either
2778 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2779 *
6d416b9c
LS
2780 * That is, success is indicated by a return value of zero, and an
2781 * error is indicated by a non-zero value.
113cea80
DH
2782 */
2783static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2784 siginfo_t status;
919699ec 2785 int r;
113cea80
DH
2786
2787 r = wait_for_terminate(pid, &status);
f647962d
MS
2788 if (r < 0)
2789 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2790
2791 switch (status.si_code) {
fddbb89c 2792
113cea80 2793 case CLD_EXITED:
919699ec
LP
2794 if (status.si_status == 0) {
2795 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2796
fddbb89c 2797 } else
919699ec 2798 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2799
919699ec
LP
2800 *container = CONTAINER_TERMINATED;
2801 return status.si_status;
113cea80
DH
2802
2803 case CLD_KILLED:
2804 if (status.si_status == SIGINT) {
113cea80 2805
919699ec 2806 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2807 *container = CONTAINER_TERMINATED;
919699ec
LP
2808 return 0;
2809
113cea80 2810 } else if (status.si_status == SIGHUP) {
113cea80 2811
919699ec 2812 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2813 *container = CONTAINER_REBOOTED;
919699ec 2814 return 0;
113cea80 2815 }
919699ec 2816
113cea80
DH
2817 /* CLD_KILLED fallthrough */
2818
2819 case CLD_DUMPED:
fddbb89c 2820 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2821 return -EIO;
113cea80
DH
2822
2823 default:
fddbb89c 2824 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2825 return -EIO;
113cea80
DH
2826 }
2827
2828 return r;
2829}
2830
e866af3a
DH
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts the parent's blocking calls. */
static void nop_handler(int sig) {
        (void) sig;
}
2832
023fb90b
LP
2833static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2834 pid_t pid;
2835
2836 pid = PTR_TO_UINT32(userdata);
2837 if (pid > 0) {
2838 if (kill(pid, SIGRTMIN+3) >= 0) {
2839 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2840 sd_event_source_set_userdata(s, NULL);
2841 return 0;
2842 }
2843 }
2844
2845 sd_event_exit(sd_event_source_get_event(s), 0);
2846 return 0;
2847}
2848
88213476 2849int main(int argc, char *argv[]) {
69c79d3c 2850
63cc4c31 2851 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
727fd4fd 2852 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 2853 _cleanup_close_ int master = -1, image_fd = -1;
3d94f76c 2854 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
69c79d3c 2855 _cleanup_fdset_free_ FDSet *fds = NULL;
1b9e5b12 2856 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
69c79d3c 2857 const char *console = NULL;
1b9e5b12
LP
2858 char veth_name[IFNAMSIZ];
2859 bool secondary = false;
e866af3a 2860 sigset_t mask, mask_chld;
69c79d3c 2861 pid_t pid = 0;
88213476
LP
2862
2863 log_parse_environment();
2864 log_open();
2865
05947bef
LP
2866 k = parse_argv(argc, argv);
2867 if (k < 0)
88213476 2868 goto finish;
05947bef
LP
2869 else if (k == 0) {
2870 r = EXIT_SUCCESS;
2871 goto finish;
2872 }
88213476 2873
1b9e5b12
LP
2874 if (!arg_image) {
2875 if (arg_directory) {
2876 char *p;
88213476 2877
1b9e5b12
LP
2878 p = path_make_absolute_cwd(arg_directory);
2879 free(arg_directory);
2880 arg_directory = p;
2881 } else
2882 arg_directory = get_current_dir_name();
88213476 2883
1b9e5b12
LP
2884 if (!arg_directory) {
2885 log_error("Failed to determine path, please use -D.");
2886 goto finish;
2887 }
2888 path_kill_slashes(arg_directory);
88213476
LP
2889 }
2890
7027ff61 2891 if (!arg_machine) {
1b9e5b12 2892 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
7027ff61
LP
2893 if (!arg_machine) {
2894 log_oom();
2895 goto finish;
2896 }
2897
e724b063 2898 hostname_cleanup(arg_machine, false);
7027ff61
LP
2899 if (isempty(arg_machine)) {
2900 log_error("Failed to determine machine name automatically, please use -M.");
2901 goto finish;
2902 }
2903 }
2904
88213476
LP
2905 if (geteuid() != 0) {
2906 log_error("Need to be root.");
2907 goto finish;
2908 }
2909
04d391da
LP
2910 if (sd_booted() <= 0) {
2911 log_error("Not running on a systemd system.");
2912 goto finish;
2913 }
2914
1b9e5b12
LP
2915 log_close();
2916 n_fd_passed = sd_listen_fds(false);
2917 if (n_fd_passed > 0) {
2918 k = fdset_new_listen_fds(&fds, false);
2919 if (k < 0) {
da927ba9 2920 log_error_errno(k, "Failed to collect file descriptors: %m");
1b9e5b12
LP
2921 goto finish;
2922 }
88213476 2923 }
1b9e5b12
LP
2924 fdset_close_others(fds);
2925 log_open();
88213476 2926
1b9e5b12
LP
2927 if (arg_directory) {
2928 if (path_equal(arg_directory, "/")) {
2929 log_error("Spawning container on root directory not supported.");
6b9132a9
LP
2930 goto finish;
2931 }
1b9e5b12
LP
2932
2933 if (arg_boot) {
2934 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 2935 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
1b9e5b12
LP
2936 goto finish;
2937 }
2938 } else {
2939 const char *p;
2940
2941 p = strappenda(arg_directory,
2942 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2943 if (access(p, F_OK) < 0) {
2944 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2945 goto finish;
2946
2947 }
2948 }
6b9132a9 2949 } else {
1b9e5b12 2950 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 2951
1b9e5b12 2952 if (!mkdtemp(template)) {
56f64d95 2953 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 2954 r = -errno;
6b9132a9 2955 goto finish;
1b9e5b12 2956 }
6b9132a9 2957
1b9e5b12
LP
2958 arg_directory = strdup(template);
2959 if (!arg_directory) {
2960 r = log_oom();
2961 goto finish;
6b9132a9 2962 }
88213476 2963
1b9e5b12
LP
2964 image_fd = setup_image(&device_path, &loop_nr);
2965 if (image_fd < 0) {
2966 r = image_fd;
842f3b0f
LP
2967 goto finish;
2968 }
1b9e5b12 2969
4d9f07b4
LP
2970 r = dissect_image(image_fd,
2971 &root_device, &root_device_rw,
2972 &home_device, &home_device_rw,
2973 &srv_device, &srv_device_rw,
2974 &secondary);
1b9e5b12
LP
2975 if (r < 0)
2976 goto finish;
842f3b0f 2977 }
842f3b0f 2978
db7feb7e
LP
2979 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2980 if (master < 0) {
56f64d95 2981 log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
2982 goto finish;
2983 }
2984
db7feb7e
LP
2985 console = ptsname(master);
2986 if (!console) {
56f64d95 2987 log_error_errno(errno, "Failed to determine tty name: %m");
a258bf26
LP
2988 goto finish;
2989 }
2990
284c0b91 2991 if (!arg_quiet)
45f1386c
ZJS
2992 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2993 arg_machine, arg_image ? arg_image : arg_directory);
a258bf26
LP
2994
2995 if (unlockpt(master) < 0) {
56f64d95 2996 log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
2997 goto finish;
2998 }
2999
e58a1277 3000 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
56f64d95 3001 log_error_errno(errno, "Failed to create kmsg socket pair: %m");
354bfd2b
LP
3002 goto finish;
3003 }
3004
af4ec430
LP
3005 sd_notify(false,
3006 "READY=1\n"
3007 "STATUS=Container running.");
05947bef 3008
a258bf26
LP
3009 assert_se(sigemptyset(&mask) == 0);
3010 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3011 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3012
023fb90b
LP
3013 assert_se(sigemptyset(&mask_chld) == 0);
3014 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3015
d87be9b0 3016 for (;;) {
113cea80 3017 ContainerStatus container_status;
7566e267 3018 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3019 struct sigaction sa = {
3020 .sa_handler = nop_handler,
3021 .sa_flags = SA_NOCLDSTOP,
3022 };
3023
7566e267 3024 r = barrier_create(&barrier);
a2da110b 3025 if (r < 0) {
da927ba9 3026 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3027 goto finish;
3028 }
3029
e866af3a
DH
3030 /* Child can be killed before execv(), so handle SIGCHLD
3031 * in order to interrupt parent's blocking calls and
3032 * give it a chance to call wait() and terminate. */
3033 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3034 if (r < 0) {
56f64d95 3035 log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3036 goto finish;
3037 }
3038
e866af3a
DH
3039 r = sigaction(SIGCHLD, &sa, NULL);
3040 if (r < 0) {
56f64d95 3041 log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3042 goto finish;
3043 }
3044
a2da110b
DH
3045 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3046 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3047 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3048 if (pid < 0) {
3049 if (errno == EINVAL)
56f64d95 3050 log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3051 else
56f64d95 3052 log_error_errno(errno, "clone() failed: %m");
a258bf26 3053
e866af3a 3054 r = pid;
d87be9b0
LP
3055 goto finish;
3056 }
a258bf26 3057
d87be9b0
LP
3058 if (pid == 0) {
3059 /* child */
0cb9fbcd 3060 _cleanup_free_ char *home = NULL;
5674767e 3061 unsigned n_env = 2;
d87be9b0 3062 const char *envp[] = {
e10a55fd 3063 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3064 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3065 NULL, /* TERM */
3066 NULL, /* HOME */
3067 NULL, /* USER */
3068 NULL, /* LOGNAME */
3069 NULL, /* container_uuid */
842f3b0f
LP
3070 NULL, /* LISTEN_FDS */
3071 NULL, /* LISTEN_PID */
d87be9b0
LP
3072 NULL
3073 };
f4889f65 3074 char **env_use;
a258bf26 3075
a2da110b
DH
3076 barrier_set_role(&barrier, BARRIER_CHILD);
3077
5674767e
ZJS
3078 envp[n_env] = strv_find_prefix(environ, "TERM=");
3079 if (envp[n_env])
3080 n_env ++;
a258bf26 3081
03e334a1 3082 master = safe_close(master);
a258bf26 3083
d87be9b0
LP
3084 close_nointr(STDIN_FILENO);
3085 close_nointr(STDOUT_FILENO);
3086 close_nointr(STDERR_FILENO);
db7feb7e 3087
03e334a1 3088 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
a258bf26 3089
d87be9b0 3090 reset_all_signal_handlers();
1b6d7fa7 3091 reset_signal_mask();
f5c1b9ee 3092
842f3b0f
LP
3093 k = open_terminal(console, O_RDWR);
3094 if (k != STDIN_FILENO) {
3095 if (k >= 0) {
03e334a1 3096 safe_close(k);
842f3b0f
LP
3097 k = -EINVAL;
3098 }
3099
da927ba9 3100 log_error_errno(k, "Failed to open console: %m");
a2da110b 3101 _exit(EXIT_FAILURE);
842f3b0f
LP
3102 }
3103
3104 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3105 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
56f64d95 3106 log_error_errno(errno, "Failed to duplicate console: %m");
a2da110b 3107 _exit(EXIT_FAILURE);
842f3b0f 3108 }
bc2f673e 3109
d87be9b0 3110 if (setsid() < 0) {
56f64d95 3111 log_error_errno(errno, "setsid() failed: %m");
a2da110b 3112 _exit(EXIT_FAILURE);
bc2f673e
LP
3113 }
3114
db999e0f 3115 if (reset_audit_loginuid() < 0)
a2da110b 3116 _exit(EXIT_FAILURE);
db999e0f 3117
d87be9b0 3118 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 3119 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 3120 _exit(EXIT_FAILURE);
d87be9b0 3121 }
e58a1277 3122
d87be9b0
LP
3123 /* Mark everything as slave, so that we still
3124 * receive mounts from the real root, but don't
3125 * propagate mounts to the real root. */
3126 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 3127 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 3128 _exit(EXIT_FAILURE);
d87be9b0 3129 }
04bc4a3f 3130
727fd4fd
LP
3131 if (mount_devices(arg_directory,
3132 root_device, root_device_rw,
3133 home_device, home_device_rw,
3134 srv_device, srv_device_rw) < 0)
a2da110b 3135 _exit(EXIT_FAILURE);
1b9e5b12 3136
d87be9b0
LP
3137 /* Turn directory into bind mount */
3138 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 3139 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 3140 _exit(EXIT_FAILURE);
d87be9b0 3141 }
88213476 3142
4d9f07b4
LP
3143 r = setup_volatile(arg_directory);
3144 if (r < 0)
a2da110b 3145 _exit(EXIT_FAILURE);
4d9f07b4
LP
3146
3147 if (setup_volatile_state(arg_directory) < 0)
a2da110b 3148 _exit(EXIT_FAILURE);
4d9f07b4
LP
3149
3150 r = base_filesystem_create(arg_directory);
3151 if (r < 0)
a2da110b 3152 _exit(EXIT_FAILURE);
4d9f07b4 3153
d6797c92
LP
3154 if (arg_read_only) {
3155 k = bind_remount_recursive(arg_directory, true);
3156 if (k < 0) {
da927ba9 3157 log_error_errno(k, "Failed to make tree read-only: %m");
a2da110b 3158 _exit(EXIT_FAILURE);
d87be9b0 3159 }
d6797c92 3160 }
2547bb41 3161
d87be9b0 3162 if (mount_all(arg_directory) < 0)
a2da110b 3163 _exit(EXIT_FAILURE);
57fb9fb5 3164
d87be9b0 3165 if (copy_devnodes(arg_directory) < 0)
a2da110b 3166 _exit(EXIT_FAILURE);
a258bf26 3167
f2d88580 3168 if (setup_ptmx(arg_directory) < 0)
a2da110b 3169 _exit(EXIT_FAILURE);
f2d88580 3170
d87be9b0 3171 dev_setup(arg_directory);
88213476 3172
28650077 3173 if (setup_seccomp() < 0)
a2da110b 3174 _exit(EXIT_FAILURE);
24fb1112 3175
d87be9b0 3176 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 3177 _exit(EXIT_FAILURE);
88213476 3178
d87be9b0 3179 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 3180 _exit(EXIT_FAILURE);
88213476 3181
03e334a1 3182 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 3183
d87be9b0 3184 if (setup_boot_id(arg_directory) < 0)
a2da110b 3185 _exit(EXIT_FAILURE);
a41fe3a2 3186
d87be9b0 3187 if (setup_timezone(arg_directory) < 0)
a2da110b 3188 _exit(EXIT_FAILURE);
88213476 3189
d87be9b0 3190 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 3191 _exit(EXIT_FAILURE);
687d0825 3192
d87be9b0 3193 if (setup_journal(arg_directory) < 0)
a2da110b 3194 _exit(EXIT_FAILURE);
687d0825 3195
d6797c92 3196 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 3197 _exit(EXIT_FAILURE);
17fe0523 3198
d6797c92 3199 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 3200 _exit(EXIT_FAILURE);
17fe0523 3201
06c17c39 3202 if (mount_tmpfs(arg_directory) < 0)
a2da110b 3203 _exit(EXIT_FAILURE);
06c17c39 3204
d96c1ecf
LP
3205 /* Tell the parent that we are ready, and that
3206 * it can cgroupify us so that we lack access
3207 * to certain devices and resources. */
dfb05a1c 3208 (void)barrier_place(&barrier);
d96c1ecf 3209
d87be9b0 3210 if (chdir(arg_directory) < 0) {
56f64d95 3211 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 3212 _exit(EXIT_FAILURE);
687d0825
MV
3213 }
3214
d87be9b0 3215 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 3216 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 3217 _exit(EXIT_FAILURE);
687d0825
MV
3218 }
3219
d87be9b0 3220 if (chroot(".") < 0) {
56f64d95 3221 log_error_errno(errno, "chroot() failed: %m");
a2da110b 3222 _exit(EXIT_FAILURE);
687d0825
MV
3223 }
3224
d87be9b0 3225 if (chdir("/") < 0) {
56f64d95 3226 log_error_errno(errno, "chdir() failed: %m");
a2da110b 3227 _exit(EXIT_FAILURE);
687d0825
MV
3228 }
3229
d87be9b0
LP
3230 umask(0022);
3231
eb91eb18
LP
3232 if (arg_private_network)
3233 loopback_setup();
d87be9b0
LP
3234
3235 if (drop_capabilities() < 0) {
56f64d95 3236 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 3237 _exit(EXIT_FAILURE);
687d0825 3238 }
687d0825 3239
0cb9fbcd
LP
3240 r = change_uid_gid(&home);
3241 if (r < 0)
a2da110b 3242 _exit(EXIT_FAILURE);
d87be9b0 3243
842f3b0f
LP
3244 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3245 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3246 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 3247 log_oom();
a2da110b 3248 _exit(EXIT_FAILURE);
144f0fc0 3249 }
687d0825 3250
9444b1f2 3251 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
3252 char as_uuid[37];
3253
3254 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 3255 log_oom();
a2da110b 3256 _exit(EXIT_FAILURE);
842f3b0f
LP
3257 }
3258 }
3259
3260 if (fdset_size(fds) > 0) {
3261 k = fdset_cloexec(fds, false);
3262 if (k < 0) {
3263 log_error("Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 3264 _exit(EXIT_FAILURE);
842f3b0f
LP
3265 }
3266
3267 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 3268 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 3269 log_oom();
a2da110b 3270 _exit(EXIT_FAILURE);
d87be9b0
LP
3271 }
3272 }
3273
3274 setup_hostname();
3275
6afc95b7
LP
3276 if (arg_personality != 0xffffffffLU) {
3277 if (personality(arg_personality) < 0) {
56f64d95 3278 log_error_errno(errno, "personality() failed: %m");
a2da110b 3279 _exit(EXIT_FAILURE);
6afc95b7 3280 }
1b9e5b12
LP
3281 } else if (secondary) {
3282 if (personality(PER_LINUX32) < 0) {
56f64d95 3283 log_error_errno(errno, "personality() failed: %m");
a2da110b 3284 _exit(EXIT_FAILURE);
1b9e5b12 3285 }
6afc95b7
LP
3286 }
3287
d96c1ecf
LP
3288#ifdef HAVE_SELINUX
3289 if (arg_selinux_context)
0cb9fbcd 3290 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
56f64d95 3291 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
a2da110b 3292 _exit(EXIT_FAILURE);
0cb9fbcd 3293 }
d96c1ecf 3294#endif
354bfd2b 3295
f4889f65
LP
3296 if (!strv_isempty(arg_setenv)) {
3297 char **n;
3298
3299 n = strv_env_merge(2, envp, arg_setenv);
3300 if (!n) {
3301 log_oom();
a2da110b 3302 _exit(EXIT_FAILURE);
f4889f65
LP
3303 }
3304
3305 env_use = n;
3306 } else
3307 env_use = (char**) envp;
3308
d96c1ecf 3309 /* Wait until the parent is ready with the setup, too... */
a2da110b
DH
3310 if (!barrier_place_and_sync(&barrier))
3311 _exit(EXIT_FAILURE);
d96c1ecf 3312
d87be9b0
LP
3313 if (arg_boot) {
3314 char **a;
3315 size_t l;
88213476 3316
d87be9b0 3317 /* Automatically search for the init system */
0f0dbc46 3318
d87be9b0
LP
3319 l = 1 + argc - optind;
3320 a = newa(char*, l + 1);
3321 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 3322
d87be9b0 3323 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 3324 execve(a[0], a, env_use);
0f0dbc46 3325
d87be9b0 3326 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 3327 execve(a[0], a, env_use);
0f0dbc46 3328
d87be9b0 3329 a[0] = (char*) "/sbin/init";
f4889f65 3330 execve(a[0], a, env_use);
d87be9b0 3331 } else if (argc > optind)
f4889f65 3332 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
3333 else {
3334 chdir(home ? home : "/root");
f4889f65 3335 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 3336 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
3337 }
3338
56f64d95 3339 log_error_errno(errno, "execv() failed: %m");
d87be9b0 3340 _exit(EXIT_FAILURE);
da5b3bad 3341 }
88213476 3342
a2da110b 3343 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
3344 fdset_free(fds);
3345 fds = NULL;
3346
a2da110b
DH
3347 /* wait for child-setup to be done */
3348 if (barrier_place_and_sync(&barrier)) {
023fb90b
LP
3349 _cleanup_event_unref_ sd_event *event = NULL;
3350 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
5aa4bb6b 3351 int ifi = 0;
354bfd2b 3352
840295fc
LP
3353 r = move_network_interfaces(pid);
3354 if (r < 0)
3355 goto finish;
aa28aefe 3356
5aa4bb6b 3357 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
3358 if (r < 0)
3359 goto finish;
ab046dde 3360
5aa4bb6b 3361 r = setup_bridge(veth_name, &ifi);
840295fc
LP
3362 if (r < 0)
3363 goto finish;
ab046dde 3364
840295fc
LP
3365 r = setup_macvlan(pid);
3366 if (r < 0)
3367 goto finish;
c74e630d 3368
5aa4bb6b
LP
3369 r = register_machine(pid, ifi);
3370 if (r < 0)
3371 goto finish;
3372
840295fc
LP
3373 /* Block SIGCHLD here, before notifying child.
3374 * process_pty() will handle it with the other signals. */
3375 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3376 if (r < 0)
3377 goto finish;
e866af3a 3378
840295fc
LP
3379 /* Reset signal to default */
3380 r = default_signals(SIGCHLD, -1);
3381 if (r < 0)
3382 goto finish;
e866af3a 3383
840295fc
LP
3384 /* Notify the child that the parent is ready with all
3385 * its setup, and that the child can now hand over
3386 * control to the code to run inside the container. */
dfb05a1c 3387 (void)barrier_place(&barrier);
354bfd2b 3388
023fb90b
LP
3389 r = sd_event_new(&event);
3390 if (r < 0) {
da927ba9 3391 log_error_errno(r, "Failed to get default event source: %m");
023fb90b 3392 goto finish;
840295fc 3393 }
88213476 3394
023fb90b
LP
3395 if (arg_boot) {
3396 /* Try to kill the init system on SIGINT or SIGTERM */
3397 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3398 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3399 } else {
3400 /* Immediately exit */
3401 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3402 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3403 }
3404
3405 /* simply exit on sigchld */
3406 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3407
3408 r = pty_forward_new(event, master, &forward);
3409 if (r < 0) {
da927ba9 3410 log_error_errno(r, "Failed to create PTY forwarder: %m");
023fb90b
LP
3411 goto finish;
3412 }
3413
3414 r = sd_event_loop(event);
f647962d
MS
3415 if (r < 0)
3416 return log_error_errno(r, "Failed to run event loop: %m");
023fb90b
LP
3417
3418 forward = pty_forward_free(forward);
3419
840295fc
LP
3420 if (!arg_quiet)
3421 putc('\n', stdout);
04d39279 3422
840295fc
LP
3423 /* Kill if it is not dead yet anyway */
3424 terminate_machine(pid);
3425 }
1f0cd86b 3426
840295fc 3427 /* Normally redundant, but better safe than sorry */
04d39279 3428 kill(pid, SIGKILL);
a258bf26 3429
113cea80 3430 r = wait_for_container(pid, &container_status);
04d39279
LP
3431 pid = 0;
3432
ce9f1527
LP
3433 if (r < 0) {
3434 /* We failed to wait for the container, or the
3435 * container exited abnormally */
3436 r = EXIT_FAILURE;
d87be9b0 3437 break;
ce9f1527
LP
3438 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3439 /* The container exited with a non-zero
3440 * status, or with zero status and no reboot
3441 * was requested. */
d87be9b0 3442 break;
88213476 3443
113cea80 3444 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3445
3446 if (arg_keep_unit) {
3447 /* Special handling if we are running as a
3448 * service: instead of simply restarting the
3449 * machine we want to restart the entire
3450 * service, so let's inform systemd about this
3451 * with the special exit code 133. The service
3452 * file uses RestartForceExitStatus=133 so
3453 * that this results in a full nspawn
3454 * restart. This is necessary since we might
3455 * have cgroup parameters set we want to have
3456 * flushed out. */
3457 r = 133;
3458 break;
3459 }
d87be9b0 3460 }
88213476
LP
3461
3462finish:
af4ec430
LP
3463 sd_notify(false,
3464 "STOPPING=1\n"
3465 "STATUS=Terminating...");
3466
1b9e5b12
LP
3467 loop_remove(loop_nr, &image_fd);
3468
9444b1f2
LP
3469 if (pid > 0)
3470 kill(pid, SIGKILL);
88213476 3471
04d391da 3472 free(arg_directory);
7027ff61 3473 free(arg_machine);
c74e630d
LP
3474 free(arg_user);
3475 strv_free(arg_setenv);
3476 strv_free(arg_network_interfaces);
3477 strv_free(arg_network_macvlan);
3478 strv_free(arg_bind);
3479 strv_free(arg_bind_ro);
06c17c39 3480 strv_free(arg_tmpfs);
88213476
LP
3481
3482 return r;
3483}