]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
seccomp-util.h: make sure seccomp-util.h can be included alone
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
aa28aefe 43#include <net/if.h>
69c79d3c 44#include <linux/veth.h>
6afc95b7 45#include <sys/personality.h>
1b9e5b12 46#include <linux/loop.h>
aa28aefe 47
5d63309c 48#ifdef HAVE_SELINUX
a8828ed9
DW
49#include <selinux/selinux.h>
50#endif
88213476 51
24fb1112
LP
52#ifdef HAVE_SECCOMP
53#include <seccomp.h>
54#endif
55
1b9e5b12
LP
56#ifdef HAVE_BLKID
57#include <blkid/blkid.h>
58#endif
59
1f0cd86b
LP
60#include "sd-daemon.h"
61#include "sd-bus.h"
62#include "sd-id128.h"
aa28aefe 63#include "sd-rtnl.h"
88213476
LP
64#include "log.h"
65#include "util.h"
49e942b2 66#include "mkdir.h"
6b2d0e85 67#include "macro.h"
d7832d2c 68#include "audit.h"
94d82985 69#include "missing.h"
04d391da 70#include "cgroup-util.h"
a258bf26 71#include "strv.h"
9eb977db 72#include "path-util.h"
a41fe3a2 73#include "loopback-setup.h"
4fc9982c 74#include "dev-setup.h"
842f3b0f 75#include "fdset.h"
acbeb427 76#include "build.h"
a5c32cff 77#include "fileio.h"
40ca29a1 78#include "bus-util.h"
1f0cd86b 79#include "bus-error.h"
4ba93280 80#include "ptyfwd.h"
9bd37b40 81#include "bus-kernel.h"
f4889f65 82#include "env-util.h"
7f112f50 83#include "def.h"
aa28aefe 84#include "rtnl-util.h"
7e227024 85#include "udev-util.h"
1b9e5b12
LP
86#include "blkid-util.h"
87#include "gpt.h"
01dde061 88#include "siphash24.h"
849958d1 89#include "copy.h"
3577de7a 90#include "base-filesystem.h"
a2da110b 91#include "barrier.h"
023fb90b 92#include "event-util.h"
2822da4f 93#include "cap-list.h"
f2d88580 94
e9642be2
LP
95#ifdef HAVE_SECCOMP
96#include "seccomp-util.h"
97#endif
98
113cea80
DH
/* Outcome of a container run. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
103
57fb9fb5
LP
/* Journal linking mode selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO = 0,
        LINK_AUTO = 1,
        LINK_HOST = 2,
        LINK_GUEST = 3
} LinkJournal;
88213476 110
4d9f07b4
LP
/* Mode selected with --volatile[=MODE]: off, fully volatile root, or
 * volatile state (/var) only. */
typedef enum Volatile {
        VOLATILE_NO = 0,
        VOLATILE_YES = 1,
        VOLATILE_STATE = 2,
} Volatile;
116
88213476 117static char *arg_directory = NULL;
687d0825 118static char *arg_user = NULL;
9444b1f2 119static sd_id128_t arg_uuid = {};
7027ff61 120static char *arg_machine = NULL;
c74e630d
LP
121static const char *arg_selinux_context = NULL;
122static const char *arg_selinux_apifs_context = NULL;
9444b1f2 123static const char *arg_slice = NULL;
ff01d048 124static bool arg_private_network = false;
bc2f673e 125static bool arg_read_only = false;
0f0dbc46 126static bool arg_boot = false;
57fb9fb5 127static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 128static bool arg_link_journal_try = false;
5076f0cc
LP
129static uint64_t arg_retain =
130 (1ULL << CAP_CHOWN) |
131 (1ULL << CAP_DAC_OVERRIDE) |
132 (1ULL << CAP_DAC_READ_SEARCH) |
133 (1ULL << CAP_FOWNER) |
134 (1ULL << CAP_FSETID) |
135 (1ULL << CAP_IPC_OWNER) |
136 (1ULL << CAP_KILL) |
137 (1ULL << CAP_LEASE) |
138 (1ULL << CAP_LINUX_IMMUTABLE) |
139 (1ULL << CAP_NET_BIND_SERVICE) |
140 (1ULL << CAP_NET_BROADCAST) |
141 (1ULL << CAP_NET_RAW) |
142 (1ULL << CAP_SETGID) |
143 (1ULL << CAP_SETFCAP) |
144 (1ULL << CAP_SETPCAP) |
145 (1ULL << CAP_SETUID) |
146 (1ULL << CAP_SYS_ADMIN) |
147 (1ULL << CAP_SYS_CHROOT) |
148 (1ULL << CAP_SYS_NICE) |
149 (1ULL << CAP_SYS_PTRACE) |
150 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 151 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
152 (1ULL << CAP_SYS_BOOT) |
153 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
154 (1ULL << CAP_AUDIT_CONTROL) |
155 (1ULL << CAP_MKNOD);
17fe0523
LP
156static char **arg_bind = NULL;
157static char **arg_bind_ro = NULL;
06c17c39 158static char **arg_tmpfs = NULL;
f4889f65 159static char **arg_setenv = NULL;
284c0b91 160static bool arg_quiet = false;
8a96d94e 161static bool arg_share_system = false;
eb91eb18 162static bool arg_register = true;
89f7c846 163static bool arg_keep_unit = false;
aa28aefe 164static char **arg_network_interfaces = NULL;
c74e630d 165static char **arg_network_macvlan = NULL;
69c79d3c 166static bool arg_network_veth = false;
c74e630d 167static const char *arg_network_bridge = NULL;
6afc95b7 168static unsigned long arg_personality = 0xffffffffLU;
1b9e5b12 169static const char *arg_image = NULL;
4d9f07b4 170static Volatile arg_volatile = VOLATILE_NO;
88213476 171
/* Prints the usage summary for all supported command line options to
 * standard output. Keep this in sync with the option table in
 * parse_argv(). */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " --version Print version string\n"
               " -q --quiet Do not show status information\n"
               " -D --directory=PATH Root directory for the container\n"
               " -i --image=PATH File system device or image for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -M --machine=NAME Set the machine name for the container\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " -S --slice=SLICE Place the container in the specified slice\n"
               " --private-network Disable network in container\n"
               " --network-interface=INTERFACE\n"
               " Assign an existing network interface to the\n"
               " container\n"
               " --network-macvlan=INTERFACE\n"
               " Create a macvlan network interface based on an\n"
               " existing network interface to the container\n"
               " --network-veth Add a virtual ethernet connection between host\n"
               " and container\n"
               " --network-bridge=INTERFACE\n"
               " Add a virtual ethernet connection between host\n"
               " and container and add it to an existing bridge on\n"
               " the host\n"
               " -Z --selinux-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " processes in the container\n"
               " -L --selinux-apifs-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " API/tmpfs file systems in the container\n"
               " --capability=CAP In addition to the default, retain specified\n"
               " capability\n"
               " --drop-capability=CAP Drop the specified capability from the default set\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
               " try-guest, try-host\n"
               " -j Equivalent to --link-journal=try-guest\n"
               " --read-only Mount the root directory read-only\n"
               " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
               " the container\n"
               " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
               " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
               " --share-system Share system namespaces with host\n"
               " --register=BOOLEAN Register container as machine\n"
               " --keep-unit Do not register a scope for the machine, reuse\n"
               " the service unit nspawn is running in\n"
               /* --personality= is parsed by parse_argv() and hence listed here too */
               " --personality=ARCH Pick personality for this container\n"
               " --volatile[=MODE] Run the system in volatile mode\n",
               program_invocation_short_name);
}
223
224static int parse_argv(int argc, char *argv[]) {
225
a41fe3a2 226 enum {
acbeb427
ZJS
227 ARG_VERSION = 0x100,
228 ARG_PRIVATE_NETWORK,
bc2f673e 229 ARG_UUID,
5076f0cc 230 ARG_READ_ONLY,
57fb9fb5 231 ARG_CAPABILITY,
420c7379 232 ARG_DROP_CAPABILITY,
17fe0523
LP
233 ARG_LINK_JOURNAL,
234 ARG_BIND,
f4889f65 235 ARG_BIND_RO,
06c17c39 236 ARG_TMPFS,
f4889f65 237 ARG_SETENV,
eb91eb18 238 ARG_SHARE_SYSTEM,
89f7c846 239 ARG_REGISTER,
aa28aefe 240 ARG_KEEP_UNIT,
69c79d3c 241 ARG_NETWORK_INTERFACE,
c74e630d 242 ARG_NETWORK_MACVLAN,
69c79d3c 243 ARG_NETWORK_VETH,
ab046dde 244 ARG_NETWORK_BRIDGE,
6afc95b7 245 ARG_PERSONALITY,
4d9f07b4 246 ARG_VOLATILE,
a41fe3a2
LP
247 };
248
88213476 249 static const struct option options[] = {
aa28aefe
LP
250 { "help", no_argument, NULL, 'h' },
251 { "version", no_argument, NULL, ARG_VERSION },
252 { "directory", required_argument, NULL, 'D' },
253 { "user", required_argument, NULL, 'u' },
254 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
255 { "boot", no_argument, NULL, 'b' },
256 { "uuid", required_argument, NULL, ARG_UUID },
257 { "read-only", no_argument, NULL, ARG_READ_ONLY },
258 { "capability", required_argument, NULL, ARG_CAPABILITY },
259 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
260 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
261 { "bind", required_argument, NULL, ARG_BIND },
262 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 263 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
264 { "machine", required_argument, NULL, 'M' },
265 { "slice", required_argument, NULL, 'S' },
266 { "setenv", required_argument, NULL, ARG_SETENV },
267 { "selinux-context", required_argument, NULL, 'Z' },
268 { "selinux-apifs-context", required_argument, NULL, 'L' },
269 { "quiet", no_argument, NULL, 'q' },
270 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
271 { "register", required_argument, NULL, ARG_REGISTER },
272 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
273 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 274 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
ab046dde
TG
275 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
276 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 277 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 278 { "image", required_argument, NULL, 'i' },
4d9f07b4 279 { "volatile", optional_argument, NULL, ARG_VOLATILE },
eb9da376 280 {}
88213476
LP
281 };
282
9444b1f2 283 int c, r;
a42c8b54 284 uint64_t plus = 0, minus = 0;
88213476
LP
285
286 assert(argc >= 0);
287 assert(argv);
288
601185b4 289 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
88213476
LP
290
291 switch (c) {
292
293 case 'h':
601185b4
ZJS
294 help();
295 return 0;
88213476 296
acbeb427
ZJS
297 case ARG_VERSION:
298 puts(PACKAGE_STRING);
299 puts(SYSTEMD_FEATURES);
300 return 0;
301
88213476
LP
302 case 'D':
303 free(arg_directory);
3a74cea5
LP
304 arg_directory = canonicalize_file_name(optarg);
305 if (!arg_directory) {
56f64d95 306 log_error_errno(errno, "Invalid root directory: %m");
88213476
LP
307 return -ENOMEM;
308 }
309
310 break;
311
1b9e5b12
LP
312 case 'i':
313 arg_image = optarg;
314 break;
315
687d0825
MV
316 case 'u':
317 free(arg_user);
7027ff61
LP
318 arg_user = strdup(optarg);
319 if (!arg_user)
320 return log_oom();
687d0825
MV
321
322 break;
323
ab046dde 324 case ARG_NETWORK_BRIDGE:
c74e630d 325 arg_network_bridge = optarg;
ab046dde
TG
326
327 /* fall through */
328
69c79d3c
LP
329 case ARG_NETWORK_VETH:
330 arg_network_veth = true;
331 arg_private_network = true;
332 break;
333
aa28aefe 334 case ARG_NETWORK_INTERFACE:
c74e630d
LP
335 if (strv_extend(&arg_network_interfaces, optarg) < 0)
336 return log_oom();
337
338 arg_private_network = true;
339 break;
340
341 case ARG_NETWORK_MACVLAN:
342 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
343 return log_oom();
344
345 /* fall through */
346
ff01d048
LP
347 case ARG_PRIVATE_NETWORK:
348 arg_private_network = true;
a41fe3a2
LP
349 break;
350
0f0dbc46
LP
351 case 'b':
352 arg_boot = true;
353 break;
354
144f0fc0 355 case ARG_UUID:
9444b1f2
LP
356 r = sd_id128_from_string(optarg, &arg_uuid);
357 if (r < 0) {
aa96c6cb 358 log_error("Invalid UUID: %s", optarg);
9444b1f2 359 return r;
aa96c6cb 360 }
9444b1f2 361 break;
aa96c6cb 362
9444b1f2 363 case 'S':
c74e630d 364 arg_slice = optarg;
144f0fc0
LP
365 break;
366
7027ff61 367 case 'M':
eb91eb18
LP
368 if (isempty(optarg)) {
369 free(arg_machine);
370 arg_machine = NULL;
371 } else {
7027ff61 372
eb91eb18
LP
373 if (!hostname_is_valid(optarg)) {
374 log_error("Invalid machine name: %s", optarg);
375 return -EINVAL;
376 }
7027ff61 377
eb91eb18
LP
378 free(arg_machine);
379 arg_machine = strdup(optarg);
380 if (!arg_machine)
381 return log_oom();
382
383 break;
384 }
7027ff61 385
82adf6af
LP
386 case 'Z':
387 arg_selinux_context = optarg;
a8828ed9
DW
388 break;
389
82adf6af
LP
390 case 'L':
391 arg_selinux_apifs_context = optarg;
a8828ed9
DW
392 break;
393
bc2f673e
LP
394 case ARG_READ_ONLY:
395 arg_read_only = true;
396 break;
397
420c7379
LP
398 case ARG_CAPABILITY:
399 case ARG_DROP_CAPABILITY: {
a2a5291b 400 const char *state, *word;
5076f0cc
LP
401 size_t length;
402
403 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 404 _cleanup_free_ char *t;
5076f0cc
LP
405
406 t = strndup(word, length);
0d0f0c50
SL
407 if (!t)
408 return log_oom();
5076f0cc 409
39ed67d1
LP
410 if (streq(t, "all")) {
411 if (c == ARG_CAPABILITY)
a42c8b54 412 plus = (uint64_t) -1;
39ed67d1 413 else
a42c8b54 414 minus = (uint64_t) -1;
39ed67d1 415 } else {
2822da4f
LP
416 int cap;
417
418 cap = capability_from_name(t);
419 if (cap < 0) {
39ed67d1
LP
420 log_error("Failed to parse capability %s.", t);
421 return -EINVAL;
422 }
423
424 if (c == ARG_CAPABILITY)
a42c8b54 425 plus |= 1ULL << (uint64_t) cap;
39ed67d1 426 else
a42c8b54 427 minus |= 1ULL << (uint64_t) cap;
5076f0cc 428 }
5076f0cc
LP
429 }
430
431 break;
432 }
433
57fb9fb5
LP
434 case 'j':
435 arg_link_journal = LINK_GUEST;
574edc90 436 arg_link_journal_try = true;
57fb9fb5
LP
437 break;
438
439 case ARG_LINK_JOURNAL:
440 if (streq(optarg, "auto"))
441 arg_link_journal = LINK_AUTO;
442 else if (streq(optarg, "no"))
443 arg_link_journal = LINK_NO;
444 else if (streq(optarg, "guest"))
445 arg_link_journal = LINK_GUEST;
446 else if (streq(optarg, "host"))
447 arg_link_journal = LINK_HOST;
574edc90
MP
448 else if (streq(optarg, "try-guest")) {
449 arg_link_journal = LINK_GUEST;
450 arg_link_journal_try = true;
451 } else if (streq(optarg, "try-host")) {
452 arg_link_journal = LINK_HOST;
453 arg_link_journal_try = true;
454 } else {
57fb9fb5
LP
455 log_error("Failed to parse link journal mode %s", optarg);
456 return -EINVAL;
457 }
458
459 break;
460
17fe0523
LP
461 case ARG_BIND:
462 case ARG_BIND_RO: {
463 _cleanup_free_ char *a = NULL, *b = NULL;
464 char *e;
465 char ***x;
17fe0523
LP
466
467 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
468
469 e = strchr(optarg, ':');
470 if (e) {
471 a = strndup(optarg, e - optarg);
472 b = strdup(e + 1);
473 } else {
474 a = strdup(optarg);
475 b = strdup(optarg);
476 }
477
478 if (!a || !b)
479 return log_oom();
480
481 if (!path_is_absolute(a) || !path_is_absolute(b)) {
482 log_error("Invalid bind mount specification: %s", optarg);
483 return -EINVAL;
484 }
485
486 r = strv_extend(x, a);
487 if (r < 0)
b3451bed 488 return log_oom();
17fe0523
LP
489
490 r = strv_extend(x, b);
491 if (r < 0)
b3451bed 492 return log_oom();
17fe0523
LP
493
494 break;
495 }
496
06c17c39
LP
497 case ARG_TMPFS: {
498 _cleanup_free_ char *a = NULL, *b = NULL;
499 char *e;
500
501 e = strchr(optarg, ':');
502 if (e) {
503 a = strndup(optarg, e - optarg);
504 b = strdup(e + 1);
505 } else {
506 a = strdup(optarg);
507 b = strdup("mode=0755");
508 }
509
510 if (!a || !b)
511 return log_oom();
512
513 if (!path_is_absolute(a)) {
514 log_error("Invalid tmpfs specification: %s", optarg);
515 return -EINVAL;
516 }
517
518 r = strv_push(&arg_tmpfs, a);
519 if (r < 0)
520 return log_oom();
521
522 a = NULL;
523
524 r = strv_push(&arg_tmpfs, b);
525 if (r < 0)
526 return log_oom();
527
528 b = NULL;
529
530 break;
531 }
532
f4889f65
LP
533 case ARG_SETENV: {
534 char **n;
535
536 if (!env_assignment_is_valid(optarg)) {
537 log_error("Environment variable assignment '%s' is not valid.", optarg);
538 return -EINVAL;
539 }
540
541 n = strv_env_set(arg_setenv, optarg);
542 if (!n)
543 return log_oom();
544
545 strv_free(arg_setenv);
546 arg_setenv = n;
547 break;
548 }
549
284c0b91
LP
550 case 'q':
551 arg_quiet = true;
552 break;
553
8a96d94e
LP
554 case ARG_SHARE_SYSTEM:
555 arg_share_system = true;
556 break;
557
eb91eb18
LP
558 case ARG_REGISTER:
559 r = parse_boolean(optarg);
560 if (r < 0) {
561 log_error("Failed to parse --register= argument: %s", optarg);
562 return r;
563 }
564
565 arg_register = r;
566 break;
567
89f7c846
LP
568 case ARG_KEEP_UNIT:
569 arg_keep_unit = true;
570 break;
571
6afc95b7
LP
572 case ARG_PERSONALITY:
573
ac45f971 574 arg_personality = personality_from_string(optarg);
6afc95b7
LP
575 if (arg_personality == 0xffffffffLU) {
576 log_error("Unknown or unsupported personality '%s'.", optarg);
577 return -EINVAL;
578 }
579
580 break;
581
4d9f07b4
LP
582 case ARG_VOLATILE:
583
584 if (!optarg)
585 arg_volatile = VOLATILE_YES;
586 else {
587 r = parse_boolean(optarg);
588 if (r < 0) {
589 if (streq(optarg, "state"))
590 arg_volatile = VOLATILE_STATE;
591 else {
592 log_error("Failed to parse --volatile= argument: %s", optarg);
593 return r;
594 }
595 } else
596 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
597 }
598
599 break;
600
88213476
LP
601 case '?':
602 return -EINVAL;
603
604 default:
eb9da376 605 assert_not_reached("Unhandled option");
88213476 606 }
88213476 607
eb91eb18
LP
608 if (arg_share_system)
609 arg_register = false;
610
611 if (arg_boot && arg_share_system) {
612 log_error("--boot and --share-system may not be combined.");
613 return -EINVAL;
614 }
615
89f7c846
LP
616 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
617 log_error("--keep-unit may not be used when invoked from a user session.");
618 return -EINVAL;
619 }
620
1b9e5b12
LP
621 if (arg_directory && arg_image) {
622 log_error("--directory= and --image= may not be combined.");
623 return -EINVAL;
624 }
625
4d9f07b4
LP
626 if (arg_volatile != VOLATILE_NO && arg_read_only) {
627 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
628 return -EINVAL;
629 }
630
a42c8b54
LP
631 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
632
88213476
LP
633 return 1;
634}
635
636static int mount_all(const char *dest) {
637
638 typedef struct MountPoint {
639 const char *what;
640 const char *where;
641 const char *type;
642 const char *options;
643 unsigned long flags;
3bd66c05 644 bool fatal;
88213476
LP
645 } MountPoint;
646
647 static const MountPoint mount_table[] = {
06c17c39
LP
648 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
649 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
650 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
651 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
652 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 653 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
654 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
655 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 656#ifdef HAVE_SELINUX
06c17c39
LP
657 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
658 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 659#endif
88213476
LP
660 };
661
662 unsigned k;
663 int r = 0;
664
665 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 666 _cleanup_free_ char *where = NULL;
d002827b 667#ifdef HAVE_SELINUX
a8828ed9 668 _cleanup_free_ char *options = NULL;
d002827b
LP
669#endif
670 const char *o;
88213476
LP
671 int t;
672
17fe0523
LP
673 where = strjoin(dest, "/", mount_table[k].where, NULL);
674 if (!where)
675 return log_oom();
88213476 676
e65aec12 677 t = path_is_mount_point(where, true);
68fb0892 678 if (t < 0) {
da927ba9 679 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
680
681 if (r == 0)
682 r = t;
683
684 continue;
685 }
686
9c1c7f71
LP
687 /* Skip this entry if it is not a remount. */
688 if (mount_table[k].what && t > 0)
014a9c77
LP
689 continue;
690
79d80fc1
TG
691 t = mkdir_p(where, 0755);
692 if (t < 0) {
693 if (mount_table[k].fatal) {
da927ba9 694 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
695
696 if (r == 0)
697 r = t;
698 } else
da927ba9 699 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
700
701 continue;
702 }
88213476 703
a8828ed9 704#ifdef HAVE_SELINUX
82adf6af
LP
705 if (arg_selinux_apifs_context &&
706 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
707 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
708 if (!options)
709 return log_oom();
710
711 o = options;
712 } else
a8828ed9 713#endif
d002827b 714 o = mount_table[k].options;
a8828ed9 715
a8828ed9 716
88213476
LP
717 if (mount(mount_table[k].what,
718 where,
719 mount_table[k].type,
720 mount_table[k].flags,
79d80fc1 721 o) < 0) {
88213476 722
79d80fc1 723 if (mount_table[k].fatal) {
56f64d95 724 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 725
79d80fc1
TG
726 if (r == 0)
727 r = -errno;
728 } else
56f64d95 729 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 730 }
88213476
LP
731 }
732
e58a1277
LP
733 return r;
734}
f8440af5 735
d6797c92 736static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
737 char **x, **y;
738
739 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 740 _cleanup_free_ char *where = NULL;
d2421337 741 struct stat source_st, dest_st;
2ed4e5e0 742 int r;
d2421337 743
4a62c710
MS
744 if (stat(*x, &source_st) < 0)
745 return log_error_errno(errno, "Failed to stat %s: %m", *x);
17fe0523 746
06c17c39
LP
747 where = strappend(dest, *y);
748 if (!where)
749 return log_oom();
750
2ed4e5e0
SL
751 r = stat(where, &dest_st);
752 if (r == 0) {
d2421337 753 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
06c17c39 754 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
d2421337
DR
755 return -EINVAL;
756 }
2ed4e5e0
SL
757 } else if (errno == ENOENT) {
758 r = mkdir_parents_label(where, 0755);
f647962d
MS
759 if (r < 0)
760 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
2ed4e5e0 761 } else {
56f64d95 762 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
763 return -errno;
764 }
06c17c39 765
2ed4e5e0 766 /* Create the mount point, but be conservative -- refuse to create block
4d9f07b4 767 * and char devices. */
79d80fc1
TG
768 if (S_ISDIR(source_st.st_mode)) {
769 r = mkdir_label(where, 0755);
f647962d
MS
770 if (r < 0 && errno != EEXIST)
771 return log_error_errno(r, "Failed to create mount point %s: %m", where);
79d80fc1
TG
772 } else if (S_ISFIFO(source_st.st_mode)) {
773 r = mkfifo(where, 0644);
4a62c710
MS
774 if (r < 0 && errno != EEXIST)
775 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
79d80fc1
TG
776 } else if (S_ISSOCK(source_st.st_mode)) {
777 r = mknod(where, 0644 | S_IFSOCK, 0);
4a62c710
MS
778 if (r < 0 && errno != EEXIST)
779 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
79d80fc1
TG
780 } else if (S_ISREG(source_st.st_mode)) {
781 r = touch(where);
f647962d
MS
782 if (r < 0)
783 return log_error_errno(r, "Failed to create mount point %s: %m", where);
79d80fc1 784 } else {
2ed4e5e0
SL
785 log_error("Refusing to create mountpoint for file: %s", *x);
786 return -ENOTSUP;
d2421337 787 }
17fe0523 788
4a62c710
MS
789 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
790 return log_error_errno(errno, "mount(%s) failed: %m", where);
17fe0523 791
d6797c92
LP
792 if (ro) {
793 r = bind_remount_recursive(where, true);
f647962d
MS
794 if (r < 0)
795 return log_error_errno(r, "Read-Only bind mount failed: %m");
17fe0523
LP
796 }
797 }
798
799 return 0;
800}
801
06c17c39
LP
802static int mount_tmpfs(const char *dest) {
803 char **i, **o;
804
805 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
806 _cleanup_free_ char *where = NULL;
79d80fc1 807 int r;
06c17c39
LP
808
809 where = strappend(dest, *i);
810 if (!where)
811 return log_oom();
812
79d80fc1 813 r = mkdir_label(where, 0755);
04a91939
LP
814 if (r < 0 && r != -EEXIST)
815 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
06c17c39 816
4a62c710
MS
817 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
818 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
06c17c39
LP
819 }
820
821 return 0;
822}
823
e58a1277 824static int setup_timezone(const char *dest) {
d4036145
LP
825 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
826 char *z, *y;
827 int r;
f8440af5 828
e58a1277
LP
829 assert(dest);
830
831 /* Fix the timezone, if possible */
d4036145
LP
832 r = readlink_malloc("/etc/localtime", &p);
833 if (r < 0) {
834 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
835 return 0;
836 }
837
838 z = path_startswith(p, "../usr/share/zoneinfo/");
839 if (!z)
840 z = path_startswith(p, "/usr/share/zoneinfo/");
841 if (!z) {
842 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
843 return 0;
844 }
845
04bc4a3f
LP
846 where = strappend(dest, "/etc/localtime");
847 if (!where)
0d0f0c50 848 return log_oom();
715ac17a 849
d4036145
LP
850 r = readlink_malloc(where, &q);
851 if (r >= 0) {
852 y = path_startswith(q, "../usr/share/zoneinfo/");
853 if (!y)
854 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 855
d4036145
LP
856 /* Already pointing to the right place? Then do nothing .. */
857 if (y && streq(y, z))
858 return 0;
859 }
860
861 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
862 if (!check)
0d0f0c50 863 return log_oom();
4d1c38b8 864
d4036145
LP
865 if (access(check, F_OK) < 0) {
866 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
867 return 0;
868 }
68fb0892 869
d4036145
LP
870 what = strappend("../usr/share/zoneinfo/", z);
871 if (!what)
872 return log_oom();
873
79d80fc1
TG
874 r = mkdir_parents(where, 0755);
875 if (r < 0) {
da927ba9 876 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
877
878 return 0;
879 }
880
881 r = unlink(where);
882 if (r < 0 && errno != ENOENT) {
56f64d95 883 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
884
885 return 0;
886 }
4d9f07b4 887
d4036145 888 if (symlink(what, where) < 0) {
56f64d95 889 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
890 return 0;
891 }
e58a1277
LP
892
893 return 0;
88213476
LP
894}
895
2547bb41 896static int setup_resolv_conf(const char *dest) {
c8b32e11 897 _cleanup_free_ char *where = NULL;
79d80fc1 898 int r;
2547bb41
LP
899
900 assert(dest);
901
902 if (arg_private_network)
903 return 0;
904
905 /* Fix resolv.conf, if possible */
04bc4a3f
LP
906 where = strappend(dest, "/etc/resolv.conf");
907 if (!where)
0d0f0c50 908 return log_oom();
2547bb41 909
77e63faf
LP
910 /* We don't really care for the results of this really. If it
911 * fails, it fails, but meh... */
79d80fc1
TG
912 r = mkdir_parents(where, 0755);
913 if (r < 0) {
da927ba9 914 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
915
916 return 0;
917 }
918
919 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
920 if (r < 0) {
da927ba9 921 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
922
923 return 0;
924 }
2547bb41
LP
925
926 return 0;
927}
928
4d9f07b4
LP
929static int setup_volatile_state(const char *directory) {
930 const char *p;
931 int r;
932
933 assert(directory);
934
935 if (arg_volatile != VOLATILE_STATE)
936 return 0;
937
938 /* --volatile=state means we simply overmount /var
939 with a tmpfs, and the rest read-only. */
940
941 r = bind_remount_recursive(directory, true);
f647962d
MS
942 if (r < 0)
943 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4
LP
944
945 p = strappenda(directory, "/var");
79d80fc1 946 r = mkdir(p, 0755);
4a62c710
MS
947 if (r < 0 && errno != EEXIST)
948 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 949
4a62c710
MS
950 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
951 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
952
953 return 0;
954}
955
956static int setup_volatile(const char *directory) {
957 bool tmpfs_mounted = false, bind_mounted = false;
958 char template[] = "/tmp/nspawn-volatile-XXXXXX";
959 const char *f, *t;
960 int r;
961
962 assert(directory);
963
964 if (arg_volatile != VOLATILE_YES)
965 return 0;
966
967 /* --volatile=yes means we mount a tmpfs to the root dir, and
968 the original /usr to use inside it, and that read-only. */
969
4a62c710
MS
970 if (!mkdtemp(template))
971 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
972
973 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 974 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
975 r = -errno;
976 goto fail;
977 }
978
979 tmpfs_mounted = true;
980
981 f = strappenda(directory, "/usr");
982 t = strappenda(template, "/usr");
983
79d80fc1
TG
984 r = mkdir(t, 0755);
985 if (r < 0 && errno != EEXIST) {
56f64d95 986 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
987 r = -errno;
988 goto fail;
989 }
990
4d9f07b4 991 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 992 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
993 r = -errno;
994 goto fail;
995 }
996
997 bind_mounted = true;
998
999 r = bind_remount_recursive(t, true);
1000 if (r < 0) {
da927ba9 1001 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1002 goto fail;
1003 }
1004
1005 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1006 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1007 r = -errno;
1008 goto fail;
1009 }
1010
1011 rmdir(template);
1012
1013 return 0;
1014
1015fail:
1016 if (bind_mounted)
1017 umount(t);
1018 if (tmpfs_mounted)
1019 umount(template);
1020 rmdir(template);
1021 return r;
1022}
1023
9f24adc2
LP
1024static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1025
1026 snprintf(s, 37,
1027 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1028 SD_ID128_FORMAT_VAL(id));
1029
1030 return s;
1031}
1032
04bc4a3f 1033static int setup_boot_id(const char *dest) {
7fd1b19b 1034 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1035 sd_id128_t rnd = {};
04bc4a3f
LP
1036 char as_uuid[37];
1037 int r;
1038
1039 assert(dest);
1040
eb91eb18
LP
1041 if (arg_share_system)
1042 return 0;
1043
04bc4a3f
LP
1044 /* Generate a new randomized boot ID, so that each boot-up of
1045 * the container gets a new one */
1046
1047 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1048 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1049 if (!from || !to)
1050 return log_oom();
04bc4a3f
LP
1051
1052 r = sd_id128_randomize(&rnd);
f647962d
MS
1053 if (r < 0)
1054 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1055
9f24adc2 1056 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1057
574d5f2d 1058 r = write_string_file(from, as_uuid);
f647962d
MS
1059 if (r < 0)
1060 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f
LP
1061
1062 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
56f64d95 1063 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1064 r = -errno;
10d18763 1065 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1066 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1067
1068 unlink(from);
04bc4a3f
LP
1069 return r;
1070}
1071
e58a1277 1072static int copy_devnodes(const char *dest) {
88213476
LP
1073
1074 static const char devnodes[] =
1075 "null\0"
1076 "zero\0"
1077 "full\0"
1078 "random\0"
1079 "urandom\0"
85614d66
TG
1080 "tty\0"
1081 "net/tun\0";
88213476
LP
1082
1083 const char *d;
e58a1277 1084 int r = 0;
7fd1b19b 1085 _cleanup_umask_ mode_t u;
a258bf26
LP
1086
1087 assert(dest);
124640f1
LP
1088
1089 u = umask(0000);
88213476
LP
1090
1091 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1092 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1093 struct stat st;
88213476 1094
7f112f50
LP
1095 from = strappend("/dev/", d);
1096 to = strjoin(dest, "/dev/", d, NULL);
1097 if (!from || !to)
1098 return log_oom();
88213476
LP
1099
1100 if (stat(from, &st) < 0) {
1101
4a62c710
MS
1102 if (errno != ENOENT)
1103 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1104
a258bf26 1105 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1106
ed8b7a3e 1107 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1108 return -EIO;
a258bf26 1109
85614d66
TG
1110 } else {
1111 r = mkdir_parents(to, 0775);
1112 if (r < 0) {
da927ba9 1113 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1114 return -r;
1115 }
a258bf26 1116
4a62c710
MS
1117 if (mknod(to, st.st_mode, st.st_rdev) < 0)
1118 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
88213476 1119 }
88213476
LP
1120 }
1121
e58a1277
LP
1122 return r;
1123}
88213476 1124
f2d88580
LP
1125static int setup_ptmx(const char *dest) {
1126 _cleanup_free_ char *p = NULL;
1127
1128 p = strappend(dest, "/dev/ptmx");
1129 if (!p)
1130 return log_oom();
1131
4a62c710
MS
1132 if (symlink("pts/ptmx", p) < 0)
1133 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580
LP
1134
1135 return 0;
1136}
1137
e58a1277 1138static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1139 _cleanup_umask_ mode_t u;
1140 const char *to;
e58a1277 1141 struct stat st;
e58a1277 1142 int r;
e58a1277
LP
1143
1144 assert(dest);
1145 assert(console);
1146
1147 u = umask(0000);
1148
4a62c710
MS
1149 if (stat("/dev/null", &st) < 0)
1150 return log_error_errno(errno, "Failed to stat /dev/null: %m");
88213476 1151
e58a1277 1152 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1153 if (r < 0)
1154 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1155
a258bf26
LP
1156 /* We need to bind mount the right tty to /dev/console since
1157 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
1158 * to bind mount things on we create a device node first, and
1159 * use /dev/null for that since we the cgroups device policy
1160 * allows us to create that freely, while we cannot create
1161 * /dev/console. (Note that the major minor doesn't actually
1162 * matter here, since we mount it over anyway). */
a258bf26 1163
eb0f0863 1164 to = strappenda(dest, "/dev/console");
4a62c710
MS
1165 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1166 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
a258bf26 1167
4a62c710
MS
1168 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1169 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1170
25ea79fe 1171 return 0;
e58a1277
LP
1172}
1173
1174static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1175 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 1176 int r, fd, k;
7fd1b19b 1177 _cleanup_umask_ mode_t u;
e58a1277
LP
1178 union {
1179 struct cmsghdr cmsghdr;
1180 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1181 } control = {};
1182 struct msghdr mh = {
1183 .msg_control = &control,
1184 .msg_controllen = sizeof(control),
1185 };
e58a1277
LP
1186 struct cmsghdr *cmsg;
1187
1188 assert(dest);
1189 assert(kmsg_socket >= 0);
a258bf26 1190
e58a1277 1191 u = umask(0000);
a258bf26 1192
f1e5dfe2
LP
1193 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1194 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1195 * on the reading side behave very similar to /proc/kmsg,
1196 * their writing side behaves differently from /dev/kmsg in
1197 * that writing blocks when nothing is reading. In order to
1198 * avoid any problems with containers deadlocking due to this
1199 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1200 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1201 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1202 return log_oom();
e58a1277 1203
4a62c710
MS
1204 if (mkfifo(from, 0600) < 0)
1205 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
e58a1277
LP
1206
1207 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1208 if (r < 0)
1209 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277 1210
4a62c710
MS
1211 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1212 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1213
1214 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1215 if (fd < 0)
1216 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1217
e58a1277
LP
1218 cmsg = CMSG_FIRSTHDR(&mh);
1219 cmsg->cmsg_level = SOL_SOCKET;
1220 cmsg->cmsg_type = SCM_RIGHTS;
1221 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1222 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1223
1224 mh.msg_controllen = cmsg->cmsg_len;
1225
1226 /* Store away the fd in the socket, so that it stays open as
1227 * long as we run the child */
1228 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
03e334a1 1229 safe_close(fd);
e58a1277 1230
4a62c710
MS
1231 if (k < 0)
1232 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1233
f1e5dfe2
LP
1234 /* And now make the FIFO unavailable as /dev/kmsg... */
1235 unlink(from);
25ea79fe 1236 return 0;
88213476
LP
1237}
1238
3a74cea5 1239static int setup_hostname(void) {
3a74cea5 1240
eb91eb18
LP
1241 if (arg_share_system)
1242 return 0;
1243
605f81a8 1244 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1245 return -errno;
3a74cea5 1246
7027ff61 1247 return 0;
3a74cea5
LP
1248}
1249
57fb9fb5 1250static int setup_journal(const char *directory) {
4d680aee 1251 sd_id128_t machine_id, this_id;
7fd1b19b 1252 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1253 char *id;
57fb9fb5
LP
1254 int r;
1255
57fb9fb5 1256 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1257 if (!p)
1258 return log_oom();
57fb9fb5
LP
1259
1260 r = read_one_line_file(p, &b);
27407a01
ZJS
1261 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1262 return 0;
f647962d
MS
1263 else if (r < 0)
1264 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 1265
27407a01
ZJS
1266 id = strstrip(b);
1267 if (isempty(id) && arg_link_journal == LINK_AUTO)
1268 return 0;
57fb9fb5 1269
27407a01
ZJS
1270 /* Verify validity */
1271 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
1272 if (r < 0)
1273 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 1274
4d680aee 1275 r = sd_id128_get_machine(&this_id);
f647962d
MS
1276 if (r < 0)
1277 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1278
1279 if (sd_id128_equal(machine_id, this_id)) {
1280 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1281 "Host and machine ids are equal (%s): refusing to link journals", id);
1282 if (arg_link_journal == LINK_AUTO)
1283 return 0;
1284 return
1285 -EEXIST;
1286 }
1287
1288 if (arg_link_journal == LINK_NO)
1289 return 0;
1290
57fb9fb5 1291 free(p);
27407a01
ZJS
1292 p = strappend("/var/log/journal/", id);
1293 q = strjoin(directory, "/var/log/journal/", id, NULL);
1294 if (!p || !q)
1295 return log_oom();
1296
1297 if (path_is_mount_point(p, false) > 0) {
1298 if (arg_link_journal != LINK_AUTO) {
1299 log_error("%s: already a mount point, refusing to use for journal", p);
1300 return -EEXIST;
1301 }
1302
1303 return 0;
57fb9fb5
LP
1304 }
1305
27407a01 1306 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1307 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1308 log_error("%s: already a mount point, refusing to use for journal", q);
1309 return -EEXIST;
57fb9fb5
LP
1310 }
1311
27407a01 1312 return 0;
57fb9fb5
LP
1313 }
1314
1315 r = readlink_and_make_absolute(p, &d);
1316 if (r >= 0) {
1317 if ((arg_link_journal == LINK_GUEST ||
1318 arg_link_journal == LINK_AUTO) &&
1319 path_equal(d, q)) {
1320
27407a01
ZJS
1321 r = mkdir_p(q, 0755);
1322 if (r < 0)
56f64d95 1323 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1324 return 0;
57fb9fb5
LP
1325 }
1326
4a62c710
MS
1327 if (unlink(p) < 0)
1328 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1329 } else if (r == -EINVAL) {
1330
1331 if (arg_link_journal == LINK_GUEST &&
1332 rmdir(p) < 0) {
1333
27407a01
ZJS
1334 if (errno == ENOTDIR) {
1335 log_error("%s already exists and is neither a symlink nor a directory", p);
1336 return r;
1337 } else {
56f64d95 1338 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1339 return -errno;
57fb9fb5 1340 }
57fb9fb5
LP
1341 }
1342 } else if (r != -ENOENT) {
56f64d95 1343 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1344 return r;
57fb9fb5
LP
1345 }
1346
1347 if (arg_link_journal == LINK_GUEST) {
1348
1349 if (symlink(q, p) < 0) {
574edc90 1350 if (arg_link_journal_try) {
56f64d95 1351 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1352 return 0;
1353 } else {
56f64d95 1354 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1355 return -errno;
1356 }
57fb9fb5
LP
1357 }
1358
27407a01
ZJS
1359 r = mkdir_p(q, 0755);
1360 if (r < 0)
56f64d95 1361 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1362 return 0;
57fb9fb5
LP
1363 }
1364
1365 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1366 /* don't create parents here -- if the host doesn't have
1367 * permanent journal set up, don't force it here */
1368 r = mkdir(p, 0755);
57fb9fb5 1369 if (r < 0) {
574edc90 1370 if (arg_link_journal_try) {
56f64d95 1371 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1372 return 0;
1373 } else {
56f64d95 1374 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1375 return r;
1376 }
57fb9fb5
LP
1377 }
1378
27407a01
ZJS
1379 } else if (access(p, F_OK) < 0)
1380 return 0;
57fb9fb5 1381
cdb2b9d0
LP
1382 if (dir_is_empty(q) == 0)
1383 log_warning("%s is not empty, proceeding anyway.", q);
1384
57fb9fb5
LP
1385 r = mkdir_p(q, 0755);
1386 if (r < 0) {
56f64d95 1387 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1388 return r;
57fb9fb5
LP
1389 }
1390
4a62c710
MS
1391 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1392 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1393
27407a01 1394 return 0;
57fb9fb5
LP
1395}
1396
88213476 1397static int drop_capabilities(void) {
5076f0cc 1398 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1399}
1400
5aa4bb6b 1401static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1402 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1403 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1404 int r;
1405
eb91eb18
LP
1406 if (!arg_register)
1407 return 0;
1408
1c03020c 1409 r = sd_bus_default_system(&bus);
f647962d
MS
1410 if (r < 0)
1411 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1412
89f7c846
LP
1413 if (arg_keep_unit) {
1414 r = sd_bus_call_method(
1415 bus,
1416 "org.freedesktop.machine1",
1417 "/org/freedesktop/machine1",
1418 "org.freedesktop.machine1.Manager",
5aa4bb6b 1419 "RegisterMachineWithNetwork",
89f7c846
LP
1420 &error,
1421 NULL,
5aa4bb6b 1422 "sayssusai",
89f7c846
LP
1423 arg_machine,
1424 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1425 "nspawn",
1426 "container",
1427 (uint32_t) pid,
5aa4bb6b
LP
1428 strempty(arg_directory),
1429 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1430 } else {
9457ac5b
LP
1431 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1432
1433 r = sd_bus_message_new_method_call(
89f7c846 1434 bus,
9457ac5b 1435 &m,
89f7c846
LP
1436 "org.freedesktop.machine1",
1437 "/org/freedesktop/machine1",
1438 "org.freedesktop.machine1.Manager",
5aa4bb6b 1439 "CreateMachineWithNetwork");
f647962d
MS
1440 if (r < 0)
1441 return log_error_errno(r, "Failed to create message: %m");
9457ac5b
LP
1442
1443 r = sd_bus_message_append(
1444 m,
5aa4bb6b 1445 "sayssusai",
89f7c846
LP
1446 arg_machine,
1447 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1448 "nspawn",
1449 "container",
1450 (uint32_t) pid,
5aa4bb6b
LP
1451 strempty(arg_directory),
1452 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d
MS
1453 if (r < 0)
1454 return log_error_errno(r, "Failed to append message arguments: %m");
9457ac5b
LP
1455
1456 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d
MS
1457 if (r < 0)
1458 return log_error_errno(r, "Failed to open container: %m");
9457ac5b
LP
1459
1460 if (!isempty(arg_slice)) {
1461 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d
MS
1462 if (r < 0)
1463 return log_error_errno(r, "Failed to append slice: %m");
9457ac5b
LP
1464 }
1465
1466 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d
MS
1467 if (r < 0)
1468 return log_error_errno(r, "Failed to add device policy: %m");
9457ac5b 1469
63cc4c31 1470 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
1471 /* Allow the container to
1472 * access and create the API
1473 * device nodes, so that
1474 * PrivateDevices= in the
1475 * container can work
1476 * fine */
1477 "/dev/null", "rwm",
1478 "/dev/zero", "rwm",
1479 "/dev/full", "rwm",
1480 "/dev/random", "rwm",
1481 "/dev/urandom", "rwm",
1482 "/dev/tty", "rwm",
864e1706 1483 "/dev/net/tun", "rwm",
9457ac5b
LP
1484 /* Allow the container
1485 * access to ptys. However,
1486 * do not permit the
1487 * container to ever create
1488 * these device nodes. */
1489 "/dev/pts/ptmx", "rw",
63cc4c31 1490 "char-pts", "rw");
f647962d
MS
1491 if (r < 0)
1492 return log_error_errno(r, "Failed to add device whitelist: %m");
9457ac5b
LP
1493
1494 r = sd_bus_message_close_container(m);
f647962d
MS
1495 if (r < 0)
1496 return log_error_errno(r, "Failed to close container: %m");
9457ac5b
LP
1497
1498 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1499 }
1500
9444b1f2 1501 if (r < 0) {
1f0cd86b
LP
1502 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1503 return r;
1504 }
1505
1506 return 0;
1507}
1508
1509static int terminate_machine(pid_t pid) {
1510 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1511 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 1512 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
1513 const char *path;
1514 int r;
1515
eb91eb18
LP
1516 if (!arg_register)
1517 return 0;
1518
76b54375 1519 r = sd_bus_default_system(&bus);
f647962d
MS
1520 if (r < 0)
1521 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
1522
1523 r = sd_bus_call_method(
1524 bus,
1525 "org.freedesktop.machine1",
1526 "/org/freedesktop/machine1",
1527 "org.freedesktop.machine1.Manager",
1528 "GetMachineByPID",
1529 &error,
1530 &reply,
1531 "u",
1532 (uint32_t) pid);
1533 if (r < 0) {
1534 /* Note that the machine might already have been
1535 * cleaned up automatically, hence don't consider it a
1536 * failure if we cannot get the machine object. */
1537 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1538 return 0;
1539 }
1540
1541 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1542 if (r < 0)
1543 return bus_log_parse_error(r);
9444b1f2 1544
1f0cd86b
LP
1545 r = sd_bus_call_method(
1546 bus,
1547 "org.freedesktop.machine1",
1548 path,
1549 "org.freedesktop.machine1.Machine",
1550 "Terminate",
1551 &error,
1552 NULL,
1553 NULL);
1554 if (r < 0) {
1555 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1556 return 0;
1557 }
1558
9444b1f2
LP
1559 return 0;
1560}
1561
db999e0f
LP
1562static int reset_audit_loginuid(void) {
1563 _cleanup_free_ char *p = NULL;
1564 int r;
1565
1566 if (arg_share_system)
1567 return 0;
1568
1569 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1570 if (r == -ENOENT)
db999e0f 1571 return 0;
f647962d
MS
1572 if (r < 0)
1573 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1574
1575 /* Already reset? */
1576 if (streq(p, "4294967295"))
1577 return 0;
1578
1579 r = write_string_file("/proc/self/loginuid", "4294967295");
1580 if (r < 0) {
1581 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1582 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1583 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1584 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1585 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1586
db999e0f 1587 sleep(5);
77b6e194 1588 }
db999e0f
LP
1589
1590 return 0;
77b6e194
LP
1591}
1592
4f758c23
LP
/* Fixed hash keys passed to generate_mac() as its hash_key argument: one each for the host side
 * and the container side of the veth link, and one for MACVLAN interfaces. */
1593#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1594#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 1595#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 1596
a90e2305 1597static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
1598 uint8_t result[8];
1599 size_t l, sz;
a90e2305
LP
1600 uint8_t *v, *i;
1601 int r;
01dde061
TG
1602
1603 l = strlen(arg_machine);
1604 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
1605 if (idx > 0)
1606 sz += sizeof(idx);
a90e2305 1607
01dde061
TG
1608 v = alloca(sz);
1609
1610 /* fetch some persistent data unique to the host */
1611 r = sd_id128_get_machine((sd_id128_t*) v);
1612 if (r < 0)
1613 return r;
1614
1615 /* combine with some data unique (on this host) to this
1616 * container instance */
a90e2305
LP
1617 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1618 if (idx > 0) {
1619 idx = htole64(idx);
1620 memcpy(i, &idx, sizeof(idx));
1621 }
01dde061
TG
1622
1623 /* Let's hash the host machine ID plus the container name. We
1624 * use a fixed, but originally randomly created hash key here. */
4f758c23 1625 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
1626
1627 assert_cc(ETH_ALEN <= sizeof(result));
1628 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1629
1630 /* see eth_random_addr in the kernel */
1631 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1632 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1633
1634 return 0;
1635}
1636
5aa4bb6b 1637static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 1638 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 1639 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 1640 struct ether_addr mac_host, mac_container;
5aa4bb6b 1641 int r, i;
69c79d3c
LP
1642
1643 if (!arg_private_network)
1644 return 0;
1645
1646 if (!arg_network_veth)
1647 return 0;
1648
08af0da2
LP
1649 /* Use two different interface name prefixes depending whether
1650 * we are in bridge mode or not. */
c00524c9 1651 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 1652 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 1653
e867ceb6
LP
1654 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1655 if (r < 0)
1656 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 1657
e867ceb6
LP
1658 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1659 if (r < 0)
1660 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 1661
151b9b96 1662 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1663 if (r < 0)
1664 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 1665
151b9b96 1666 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
1667 if (r < 0)
1668 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 1669
ab046dde 1670 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
1671 if (r < 0)
1672 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 1673
4f758c23 1674 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
1675 if (r < 0)
1676 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 1677
ee3a6a51 1678 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
1679 if (r < 0)
1680 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1681
d8e538ec 1682 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
1683 if (r < 0)
1684 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1685
ee3a6a51 1686 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
1687 if (r < 0)
1688 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 1689
ab046dde 1690 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
1691 if (r < 0)
1692 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 1693
4f758c23 1694 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
1695 if (r < 0)
1696 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 1697
ab046dde 1698 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1699 if (r < 0)
1700 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
1701
1702 r = sd_rtnl_message_close_container(m);
f647962d
MS
1703 if (r < 0)
1704 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1705
1706 r = sd_rtnl_message_close_container(m);
f647962d
MS
1707 if (r < 0)
1708 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1709
1710 r = sd_rtnl_message_close_container(m);
f647962d
MS
1711 if (r < 0)
1712 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
1713
1714 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1715 if (r < 0)
1716 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 1717
5aa4bb6b 1718 i = (int) if_nametoindex(iface_name);
4a62c710
MS
1719 if (i <= 0)
1720 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
1721
1722 *ifi = i;
1723
69c79d3c
LP
1724 return 0;
1725}
1726
5aa4bb6b 1727static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
1728 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1729 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1730 int r, bridge;
1731
1732 if (!arg_private_network)
1733 return 0;
1734
1735 if (!arg_network_veth)
1736 return 0;
1737
1738 if (!arg_network_bridge)
1739 return 0;
1740
1741 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
1742 if (bridge <= 0)
1743 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 1744
5aa4bb6b
LP
1745 *ifi = bridge;
1746
151b9b96 1747 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1748 if (r < 0)
1749 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 1750
151b9b96 1751 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
1752 if (r < 0)
1753 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 1754
039dd4af 1755 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
1756 if (r < 0)
1757 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 1758
ab046dde 1759 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
1760 if (r < 0)
1761 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
1762
1763 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
1764 if (r < 0)
1765 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
1766
1767 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1768 if (r < 0)
1769 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
1770
1771 return 0;
1772}
1773
c74e630d
LP
1774static int parse_interface(struct udev *udev, const char *name) {
1775 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1776 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1777 int ifi;
1778
1779 ifi = (int) if_nametoindex(name);
4a62c710
MS
1780 if (ifi <= 0)
1781 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
1782
1783 sprintf(ifi_str, "n%i", ifi);
1784 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
1785 if (!d)
1786 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
1787
1788 if (udev_device_get_is_initialized(d) <= 0) {
1789 log_error("Network interface %s is not initialized yet.", name);
1790 return -EBUSY;
1791 }
1792
1793 return ifi;
1794}
1795
69c79d3c 1796static int move_network_interfaces(pid_t pid) {
7e227024 1797 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 1798 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1799 char **i;
1800 int r;
1801
1802 if (!arg_private_network)
1803 return 0;
1804
1805 if (strv_isempty(arg_network_interfaces))
1806 return 0;
1807
151b9b96 1808 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1809 if (r < 0)
1810 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 1811
7e227024
LP
1812 udev = udev_new();
1813 if (!udev) {
1814 log_error("Failed to connect to udev.");
1815 return -ENOMEM;
1816 }
1817
aa28aefe 1818 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1819 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 1820 int ifi;
aa28aefe 1821
c74e630d
LP
1822 ifi = parse_interface(udev, *i);
1823 if (ifi < 0)
1824 return ifi;
1825
3125b3ef 1826 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
1827 if (r < 0)
1828 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 1829
c74e630d 1830 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1831 if (r < 0)
1832 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 1833
c74e630d 1834 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1835 if (r < 0)
1836 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 1837 }
7e227024 1838
c74e630d
LP
1839 return 0;
1840}
1841
1842static int setup_macvlan(pid_t pid) {
1843 _cleanup_udev_unref_ struct udev *udev = NULL;
1844 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
e867ceb6 1845 unsigned idx = 0;
c74e630d
LP
1846 char **i;
1847 int r;
1848
1849 if (!arg_private_network)
1850 return 0;
1851
1852 if (strv_isempty(arg_network_macvlan))
1853 return 0;
1854
1855 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
1856 if (r < 0)
1857 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
1858
1859 udev = udev_new();
1860 if (!udev) {
1861 log_error("Failed to connect to udev.");
1862 return -ENOMEM;
1863 }
1864
1865 STRV_FOREACH(i, arg_network_macvlan) {
1866 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1867 _cleanup_free_ char *n = NULL;
e867ceb6 1868 struct ether_addr mac;
c74e630d
LP
1869 int ifi;
1870
1871 ifi = parse_interface(udev, *i);
1872 if (ifi < 0)
1873 return ifi;
1874
e867ceb6
LP
1875 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1876 if (r < 0)
1877 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1878
c74e630d 1879 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
1880 if (r < 0)
1881 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 1882
c74e630d 1883 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
1884 if (r < 0)
1885 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
1886
1887 n = strappend("mv-", *i);
1888 if (!n)
1889 return log_oom();
1890
1891 strshorten(n, IFNAMSIZ-1);
1892
1893 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
1894 if (r < 0)
1895 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 1896
e867ceb6
LP
1897 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1898 if (r < 0)
1899 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1900
aa28aefe 1901 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
1902 if (r < 0)
1903 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
1904
1905 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
1906 if (r < 0)
1907 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 1908
d8e538ec 1909 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
1910 if (r < 0)
1911 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
1912
1913 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
1914 if (r < 0)
1915 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
1916
1917 r = sd_rtnl_message_close_container(m);
f647962d
MS
1918 if (r < 0)
1919 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
1920
1921 r = sd_rtnl_message_close_container(m);
f647962d
MS
1922 if (r < 0)
1923 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
1924
1925 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
1926 if (r < 0)
1927 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
1928 }
1929
1930 return 0;
1931}
1932
/* Installs a seccomp filter that allows everything by default, but makes
 * a fixed list of system calls fail with EPERM and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Compiles to a no-op returning 0 if HAVE_SECCOMP is not defined.
 * Otherwise returns the result of the last libseccomp call executed,
 * which is negative on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blocked_syscalls); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[n], 0);
                if (r >= 0 || r == -EFAULT)
                        continue; /* rule added, or unknown syscall */

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on success and on failure alike */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2012
1b9e5b12
LP
2013static int setup_image(char **device_path, int *loop_nr) {
2014 struct loop_info64 info = {
2015 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2016 };
2017 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2018 _cleanup_free_ char* loopdev = NULL;
2019 struct stat st;
2020 int r, nr;
2021
2022 assert(device_path);
2023 assert(loop_nr);
2024
2025 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2026 if (fd < 0)
2027 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2028
4a62c710
MS
2029 if (fstat(fd, &st) < 0)
2030 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2031
2032 if (S_ISBLK(st.st_mode)) {
2033 char *p;
2034
2035 p = strdup(arg_image);
2036 if (!p)
2037 return log_oom();
2038
2039 *device_path = p;
2040
2041 *loop_nr = -1;
2042
2043 r = fd;
2044 fd = -1;
2045
2046 return r;
2047 }
2048
2049 if (!S_ISREG(st.st_mode)) {
56f64d95 2050 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2051 return -EINVAL;
2052 }
2053
2054 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
2055 if (control < 0)
2056 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2057
2058 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
2059 if (nr < 0)
2060 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2061
2062 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2063 return log_oom();
2064
2065 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2066 if (loop < 0)
2067 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 2068
4a62c710
MS
2069 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2070 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2071
2072 if (arg_read_only)
2073 info.lo_flags |= LO_FLAGS_READ_ONLY;
2074
4a62c710
MS
2075 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2076 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2077
2078 *device_path = loopdev;
2079 loopdev = NULL;
2080
2081 *loop_nr = nr;
2082
2083 r = loop;
2084 loop = -1;
2085
2086 return r;
2087}
2088
#ifdef HAVE_BLKID
/*
 * Remembers the partition device node "node" as the candidate for one
 * partition type, unless a candidate with a lower or equal partition number
 * is already stored in *device.
 *
 * Returns 1 if the candidate was taken over, 0 if it was skipped and -ENOMEM
 * if the node path could not be duplicated (in which case nothing is changed).
 */
static int dissect_image_pick_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, unsigned long long flags) {

        char *copy;

        /* Among several partitions of the same type the lowest partition number wins. */
        if (*device && nr >= *device_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return -ENOMEM;

        free(*device);
        *device = copy;
        *device_nr = nr;
        *device_rw = !(flags & GPT_FLAG_READ_ONLY);

        return 1;
}
#endif

/*
 * Looks for the root, /home and /srv partitions on the block device "fd".
 *
 * The device is probed with libblkid and must carry a GPT partition table.
 * The partition block devices are enumerated through udev; partitions flagged
 * GPT_FLAG_NO_AUTO are ignored, and among partitions of the same type the one
 * with the lowest partition number is used.
 *
 * On success 0 is returned. *root_device (newly allocated, to be freed by the
 * caller), *root_device_rw and *secondary are always set; *secondary tells
 * whether the GPT_ROOT_SECONDARY partition had to be used because no
 * GPT_ROOT_NATIVE one was found. The home and srv outputs are only written
 * if a matching partition exists. The *_rw outputs are false for partitions
 * carrying GPT_FLAG_READ_ONLY.
 *
 * On failure a negative errno-style value is returned (-ENOTSUP when built
 * without HAVE_BLKID) and no output parameter is written.
 */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(root_device_rw);
        assert(home_device);
        assert(home_device_rw);
        assert(srv_device);
        assert(srv_device_rw);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* A failure that leaves errno untouched is treated as OOM. */
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0)
                return log_error_errno(errno != 0 ? errno : EIO, "Failed to probe: %m");

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized, so that the cleanup handler never sees an indeterminate pointer. */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q)
                        return log_error_errno(errno != 0 ? errno : ENOMEM,
                                               "Failed to get partition device of %s: %m", arg_image);

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself. */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_image_pick_partition(&home, &home_nr, &home_rw, node, nr, flags);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_image_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, flags);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_image_pick_partition(&root, &root_nr, &root_rw, node, nr, flags);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_image_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, flags);
#endif
                else
                        r = 0;

                if (r < 0)
                        return log_oom();
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Ownership of the strings moves to the caller; reset the locals so
         * that the cleanup handlers do not free them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2341
/*
 * Mounts the block device "what" on "where", or on "where" with "directory"
 * appended if "directory" is non-NULL.
 *
 * The file system type is detected with libblkid. LUKS containers are refused
 * with -ENOTSUP. The mount is always MS_NODEV, and read-only if "rw" is false
 * or arg_read_only is set.
 *
 * Returns 0 on success and a negative errno-style value on failure (-ENOTSUP
 * when built without HAVE_BLKID).
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* A failure that leaves errno untouched is treated as OOM. */
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing detected, -2: ambivalent result. A plain -1 is
                 * a real probing error and is reported with its errno below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0)
                return log_error_errno(errno != 0 ? errno : EIO, "Failed to probe %s: %m", what);

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2405
727fd4fd
LP
/*
 * Mounts the dissected partitions below "where": the root device on "where"
 * itself, the home device on "/home" and the srv device on "/srv" beneath it.
 * Devices passed as NULL are skipped. The *_rw flags are handed through to
 * mount_device().
 *
 * Returns 0 on success, or the negative value of the first mount that failed.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the "where" argument rather than to the global
         * arg_directory, so that the parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
2435
2436static void loop_remove(int nr, int *image_fd) {
2437 _cleanup_close_ int control = -1;
e8c8ddcc 2438 int r;
1b9e5b12
LP
2439
2440 if (nr < 0)
2441 return;
2442
2443 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
2444 r = ioctl(*image_fd, LOOP_CLR_FD);
2445 if (r < 0)
56f64d95 2446 log_warning_errno(errno, "Failed to close loop image: %m");
03e334a1 2447 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2448 }
2449
2450 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 2451 if (control < 0) {
56f64d95 2452 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 2453 return;
e8c8ddcc 2454 }
1b9e5b12 2455
e8c8ddcc
TG
2456 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2457 if (r < 0)
56f64d95 2458 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
2459}
2460
0cb9fbcd
LP
2461static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2462 int pipe_fds[2];
2463 pid_t pid;
2464
2465 assert(database);
2466 assert(key);
2467 assert(rpid);
2468
4a62c710
MS
2469 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2470 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
2471
2472 pid = fork();
4a62c710
MS
2473 if (pid < 0)
2474 return log_error_errno(errno, "Failed to fork getent child: %m");
2475 else if (pid == 0) {
0cb9fbcd
LP
2476 int nullfd;
2477 char *empty_env = NULL;
2478
2479 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2480 _exit(EXIT_FAILURE);
2481
2482 if (pipe_fds[0] > 2)
03e334a1 2483 safe_close(pipe_fds[0]);
0cb9fbcd 2484 if (pipe_fds[1] > 2)
03e334a1 2485 safe_close(pipe_fds[1]);
0cb9fbcd
LP
2486
2487 nullfd = open("/dev/null", O_RDWR);
2488 if (nullfd < 0)
2489 _exit(EXIT_FAILURE);
2490
2491 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2492 _exit(EXIT_FAILURE);
2493
2494 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2495 _exit(EXIT_FAILURE);
2496
2497 if (nullfd > 2)
03e334a1 2498 safe_close(nullfd);
0cb9fbcd
LP
2499
2500 reset_all_signal_handlers();
2501 close_all_fds(NULL, 0);
2502
4de82926
MM
2503 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2504 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
2505 _exit(EXIT_FAILURE);
2506 }
2507
03e334a1 2508 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
2509
2510 *rpid = pid;
2511
2512 return pipe_fds[0];
2513}
2514
2515static int change_uid_gid(char **_home) {
a2a5291b
ZJS
2516 char line[LINE_MAX], *x, *u, *g, *h;
2517 const char *word, *state;
0cb9fbcd
LP
2518 _cleanup_free_ uid_t *uids = NULL;
2519 _cleanup_free_ char *home = NULL;
2520 _cleanup_fclose_ FILE *f = NULL;
2521 _cleanup_close_ int fd = -1;
2522 unsigned n_uids = 0;
70f539ca 2523 size_t sz = 0, l;
0cb9fbcd
LP
2524 uid_t uid;
2525 gid_t gid;
2526 pid_t pid;
2527 int r;
2528
2529 assert(_home);
2530
2531 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2532 /* Reset everything fully to 0, just in case */
2533
4a62c710
MS
2534 if (setgroups(0, NULL) < 0)
2535 return log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd 2536
4a62c710
MS
2537 if (setresgid(0, 0, 0) < 0)
2538 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 2539
4a62c710
MS
2540 if (setresuid(0, 0, 0) < 0)
2541 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
2542
2543 *_home = NULL;
2544 return 0;
2545 }
2546
2547 /* First, get user credentials */
2548 fd = spawn_getent("passwd", arg_user, &pid);
2549 if (fd < 0)
2550 return fd;
2551
2552 f = fdopen(fd, "r");
2553 if (!f)
2554 return log_oom();
2555 fd = -1;
2556
2557 if (!fgets(line, sizeof(line), f)) {
2558
2559 if (!ferror(f)) {
2560 log_error("Failed to resolve user %s.", arg_user);
2561 return -ESRCH;
2562 }
2563
56f64d95 2564 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
2565 return -errno;
2566 }
2567
2568 truncate_nl(line);
2569
820d3acf 2570 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
2571
2572 x = strchr(line, ':');
2573 if (!x) {
2574 log_error("/etc/passwd entry has invalid user field.");
2575 return -EIO;
2576 }
2577
2578 u = strchr(x+1, ':');
2579 if (!u) {
2580 log_error("/etc/passwd entry has invalid password field.");
2581 return -EIO;
2582 }
2583
2584 u++;
2585 g = strchr(u, ':');
2586 if (!g) {
2587 log_error("/etc/passwd entry has invalid UID field.");
2588 return -EIO;
2589 }
2590
2591 *g = 0;
2592 g++;
2593 x = strchr(g, ':');
2594 if (!x) {
2595 log_error("/etc/passwd entry has invalid GID field.");
2596 return -EIO;
2597 }
2598
2599 *x = 0;
2600 h = strchr(x+1, ':');
2601 if (!h) {
2602 log_error("/etc/passwd entry has invalid GECOS field.");
2603 return -EIO;
2604 }
2605
2606 h++;
2607 x = strchr(h, ':');
2608 if (!x) {
2609 log_error("/etc/passwd entry has invalid home directory field.");
2610 return -EIO;
2611 }
2612
2613 *x = 0;
2614
2615 r = parse_uid(u, &uid);
2616 if (r < 0) {
2617 log_error("Failed to parse UID of user.");
2618 return -EIO;
2619 }
2620
2621 r = parse_gid(g, &gid);
2622 if (r < 0) {
2623 log_error("Failed to parse GID of user.");
2624 return -EIO;
2625 }
2626
2627 home = strdup(h);
2628 if (!home)
2629 return log_oom();
2630
2631 /* Second, get group memberships */
2632 fd = spawn_getent("initgroups", arg_user, &pid);
2633 if (fd < 0)
2634 return fd;
2635
2636 fclose(f);
2637 f = fdopen(fd, "r");
2638 if (!f)
2639 return log_oom();
2640 fd = -1;
2641
2642 if (!fgets(line, sizeof(line), f)) {
2643 if (!ferror(f)) {
2644 log_error("Failed to resolve user %s.", arg_user);
2645 return -ESRCH;
2646 }
2647
56f64d95 2648 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
2649 return -errno;
2650 }
2651
2652 truncate_nl(line);
2653
820d3acf 2654 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
2655
2656 /* Skip over the username and subsequent separator whitespace */
2657 x = line;
2658 x += strcspn(x, WHITESPACE);
2659 x += strspn(x, WHITESPACE);
2660
a2a5291b 2661 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
2662 char c[l+1];
2663
a2a5291b 2664 memcpy(c, word, l);
0cb9fbcd
LP
2665 c[l] = 0;
2666
2667 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2668 return log_oom();
2669
2670 r = parse_uid(c, &uids[n_uids++]);
2671 if (r < 0) {
2672 log_error("Failed to parse group data from getent.");
2673 return -EIO;
2674 }
2675 }
2676
2677 r = mkdir_parents(home, 0775);
f647962d
MS
2678 if (r < 0)
2679 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
2680
2681 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
2682 if (r < 0 && r != -EEXIST)
2683 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
2684
2685 fchown(STDIN_FILENO, uid, gid);
2686 fchown(STDOUT_FILENO, uid, gid);
2687 fchown(STDERR_FILENO, uid, gid);
2688
4a62c710
MS
2689 if (setgroups(n_uids, uids) < 0)
2690 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 2691
4a62c710
MS
2692 if (setresgid(gid, gid, gid) < 0)
2693 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 2694
4a62c710
MS
2695 if (setresuid(uid, uid, uid) < 0)
2696 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
2697
2698 if (_home) {
2699 *_home = home;
2700 home = NULL;
2701 }
2702
2703 return 0;
2704}
2705
113cea80 2706/*
6d416b9c
LS
2707 * Return values:
2708 * < 0 : wait_for_terminate() failed to get the state of the
2709 * container, the container was terminated by a signal, or
2710 * failed for an unknown reason. No change is made to the
2711 * container argument.
2712 * > 0 : The program executed in the container terminated with an
2713 * error. The exit code of the program executed in the
919699ec
LP
2714 * container is returned. The container argument has been set
2715 * to CONTAINER_TERMINATED.
6d416b9c
LS
2716 * 0 : The container is being rebooted, has been shut down or exited
2717 * successfully. The container argument has been set to either
2718 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2719 *
6d416b9c
LS
2720 * That is, success is indicated by a return value of zero, and an
2721 * error is indicated by a non-zero value.
113cea80
DH
2722 */
2723static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 2724 siginfo_t status;
919699ec 2725 int r;
113cea80
DH
2726
2727 r = wait_for_terminate(pid, &status);
f647962d
MS
2728 if (r < 0)
2729 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
2730
2731 switch (status.si_code) {
fddbb89c 2732
113cea80 2733 case CLD_EXITED:
919699ec
LP
2734 if (status.si_status == 0) {
2735 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 2736
fddbb89c 2737 } else
919699ec 2738 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 2739
919699ec
LP
2740 *container = CONTAINER_TERMINATED;
2741 return status.si_status;
113cea80
DH
2742
2743 case CLD_KILLED:
2744 if (status.si_status == SIGINT) {
113cea80 2745
919699ec 2746 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 2747 *container = CONTAINER_TERMINATED;
919699ec
LP
2748 return 0;
2749
113cea80 2750 } else if (status.si_status == SIGHUP) {
113cea80 2751
919699ec 2752 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 2753 *container = CONTAINER_REBOOTED;
919699ec 2754 return 0;
113cea80 2755 }
919699ec 2756
113cea80
DH
2757 /* CLD_KILLED fallthrough */
2758
2759 case CLD_DUMPED:
fddbb89c 2760 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 2761 return -EIO;
113cea80
DH
2762
2763 default:
fddbb89c 2764 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 2765 return -EIO;
113cea80
DH
2766 }
2767
2768 return r;
2769}
2770
e866af3a
DH
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
2772
023fb90b
LP
2773static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2774 pid_t pid;
2775
2776 pid = PTR_TO_UINT32(userdata);
2777 if (pid > 0) {
2778 if (kill(pid, SIGRTMIN+3) >= 0) {
2779 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2780 sd_event_source_set_userdata(s, NULL);
2781 return 0;
2782 }
2783 }
2784
2785 sd_event_exit(sd_event_source_get_event(s), 0);
2786 return 0;
2787}
2788
88213476 2789int main(int argc, char *argv[]) {
69c79d3c 2790
63cc4c31 2791 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
727fd4fd 2792 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 2793 _cleanup_close_ int master = -1, image_fd = -1;
3d94f76c 2794 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
69c79d3c 2795 _cleanup_fdset_free_ FDSet *fds = NULL;
1b9e5b12 2796 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
69c79d3c 2797 const char *console = NULL;
1b9e5b12
LP
2798 char veth_name[IFNAMSIZ];
2799 bool secondary = false;
e866af3a 2800 sigset_t mask, mask_chld;
69c79d3c 2801 pid_t pid = 0;
88213476
LP
2802
2803 log_parse_environment();
2804 log_open();
2805
05947bef
LP
2806 k = parse_argv(argc, argv);
2807 if (k < 0)
88213476 2808 goto finish;
05947bef
LP
2809 else if (k == 0) {
2810 r = EXIT_SUCCESS;
2811 goto finish;
2812 }
88213476 2813
1b9e5b12
LP
2814 if (!arg_image) {
2815 if (arg_directory) {
2816 char *p;
88213476 2817
1b9e5b12
LP
2818 p = path_make_absolute_cwd(arg_directory);
2819 free(arg_directory);
2820 arg_directory = p;
2821 } else
2822 arg_directory = get_current_dir_name();
88213476 2823
1b9e5b12
LP
2824 if (!arg_directory) {
2825 log_error("Failed to determine path, please use -D.");
2826 goto finish;
2827 }
2828 path_kill_slashes(arg_directory);
88213476
LP
2829 }
2830
7027ff61 2831 if (!arg_machine) {
1b9e5b12 2832 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
7027ff61
LP
2833 if (!arg_machine) {
2834 log_oom();
2835 goto finish;
2836 }
2837
e724b063 2838 hostname_cleanup(arg_machine, false);
7027ff61
LP
2839 if (isempty(arg_machine)) {
2840 log_error("Failed to determine machine name automatically, please use -M.");
2841 goto finish;
2842 }
2843 }
2844
88213476
LP
2845 if (geteuid() != 0) {
2846 log_error("Need to be root.");
2847 goto finish;
2848 }
2849
04d391da
LP
2850 if (sd_booted() <= 0) {
2851 log_error("Not running on a systemd system.");
2852 goto finish;
2853 }
2854
1b9e5b12
LP
2855 log_close();
2856 n_fd_passed = sd_listen_fds(false);
2857 if (n_fd_passed > 0) {
2858 k = fdset_new_listen_fds(&fds, false);
2859 if (k < 0) {
da927ba9 2860 log_error_errno(k, "Failed to collect file descriptors: %m");
1b9e5b12
LP
2861 goto finish;
2862 }
88213476 2863 }
1b9e5b12
LP
2864 fdset_close_others(fds);
2865 log_open();
88213476 2866
1b9e5b12
LP
2867 if (arg_directory) {
2868 if (path_equal(arg_directory, "/")) {
2869 log_error("Spawning container on root directory not supported.");
6b9132a9
LP
2870 goto finish;
2871 }
1b9e5b12
LP
2872
2873 if (arg_boot) {
2874 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 2875 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
1b9e5b12
LP
2876 goto finish;
2877 }
2878 } else {
2879 const char *p;
2880
2881 p = strappenda(arg_directory,
2882 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2883 if (access(p, F_OK) < 0) {
2884 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2885 goto finish;
2886
2887 }
2888 }
6b9132a9 2889 } else {
1b9e5b12 2890 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 2891
1b9e5b12 2892 if (!mkdtemp(template)) {
56f64d95 2893 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 2894 r = -errno;
6b9132a9 2895 goto finish;
1b9e5b12 2896 }
6b9132a9 2897
1b9e5b12
LP
2898 arg_directory = strdup(template);
2899 if (!arg_directory) {
2900 r = log_oom();
2901 goto finish;
6b9132a9 2902 }
88213476 2903
1b9e5b12
LP
2904 image_fd = setup_image(&device_path, &loop_nr);
2905 if (image_fd < 0) {
2906 r = image_fd;
842f3b0f
LP
2907 goto finish;
2908 }
1b9e5b12 2909
4d9f07b4
LP
2910 r = dissect_image(image_fd,
2911 &root_device, &root_device_rw,
2912 &home_device, &home_device_rw,
2913 &srv_device, &srv_device_rw,
2914 &secondary);
1b9e5b12
LP
2915 if (r < 0)
2916 goto finish;
842f3b0f 2917 }
842f3b0f 2918
db7feb7e
LP
2919 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2920 if (master < 0) {
56f64d95 2921 log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
2922 goto finish;
2923 }
2924
db7feb7e
LP
2925 console = ptsname(master);
2926 if (!console) {
56f64d95 2927 log_error_errno(errno, "Failed to determine tty name: %m");
a258bf26
LP
2928 goto finish;
2929 }
2930
284c0b91 2931 if (!arg_quiet)
45f1386c
ZJS
2932 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2933 arg_machine, arg_image ? arg_image : arg_directory);
a258bf26
LP
2934
2935 if (unlockpt(master) < 0) {
56f64d95 2936 log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
2937 goto finish;
2938 }
2939
e58a1277 2940 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
56f64d95 2941 log_error_errno(errno, "Failed to create kmsg socket pair: %m");
354bfd2b
LP
2942 goto finish;
2943 }
2944
af4ec430
LP
2945 sd_notify(false,
2946 "READY=1\n"
2947 "STATUS=Container running.");
05947bef 2948
a258bf26
LP
2949 assert_se(sigemptyset(&mask) == 0);
2950 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2951 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2952
023fb90b
LP
2953 assert_se(sigemptyset(&mask_chld) == 0);
2954 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
2955
d87be9b0 2956 for (;;) {
113cea80 2957 ContainerStatus container_status;
7566e267 2958 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
2959 struct sigaction sa = {
2960 .sa_handler = nop_handler,
2961 .sa_flags = SA_NOCLDSTOP,
2962 };
2963
7566e267 2964 r = barrier_create(&barrier);
a2da110b 2965 if (r < 0) {
da927ba9 2966 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
2967 goto finish;
2968 }
2969
e866af3a
DH
2970 /* Child can be killed before execv(), so handle SIGCHLD
2971 * in order to interrupt parent's blocking calls and
2972 * give it a chance to call wait() and terminate. */
2973 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2974 if (r < 0) {
56f64d95 2975 log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
2976 goto finish;
2977 }
2978
e866af3a
DH
2979 r = sigaction(SIGCHLD, &sa, NULL);
2980 if (r < 0) {
56f64d95 2981 log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
2982 goto finish;
2983 }
2984
a2da110b
DH
2985 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
2986 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2987 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
2988 if (pid < 0) {
2989 if (errno == EINVAL)
56f64d95 2990 log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 2991 else
56f64d95 2992 log_error_errno(errno, "clone() failed: %m");
a258bf26 2993
e866af3a 2994 r = pid;
d87be9b0
LP
2995 goto finish;
2996 }
a258bf26 2997
d87be9b0
LP
2998 if (pid == 0) {
2999 /* child */
0cb9fbcd 3000 _cleanup_free_ char *home = NULL;
5674767e 3001 unsigned n_env = 2;
d87be9b0 3002 const char *envp[] = {
e10a55fd 3003 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3004 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3005 NULL, /* TERM */
3006 NULL, /* HOME */
3007 NULL, /* USER */
3008 NULL, /* LOGNAME */
3009 NULL, /* container_uuid */
842f3b0f
LP
3010 NULL, /* LISTEN_FDS */
3011 NULL, /* LISTEN_PID */
d87be9b0
LP
3012 NULL
3013 };
f4889f65 3014 char **env_use;
a258bf26 3015
a2da110b
DH
3016 barrier_set_role(&barrier, BARRIER_CHILD);
3017
5674767e
ZJS
3018 envp[n_env] = strv_find_prefix(environ, "TERM=");
3019 if (envp[n_env])
3020 n_env ++;
a258bf26 3021
03e334a1 3022 master = safe_close(master);
a258bf26 3023
d87be9b0
LP
3024 close_nointr(STDIN_FILENO);
3025 close_nointr(STDOUT_FILENO);
3026 close_nointr(STDERR_FILENO);
db7feb7e 3027
03e334a1 3028 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
a258bf26 3029
d87be9b0 3030 reset_all_signal_handlers();
1b6d7fa7 3031 reset_signal_mask();
f5c1b9ee 3032
842f3b0f
LP
3033 k = open_terminal(console, O_RDWR);
3034 if (k != STDIN_FILENO) {
3035 if (k >= 0) {
03e334a1 3036 safe_close(k);
842f3b0f
LP
3037 k = -EINVAL;
3038 }
3039
da927ba9 3040 log_error_errno(k, "Failed to open console: %m");
a2da110b 3041 _exit(EXIT_FAILURE);
842f3b0f
LP
3042 }
3043
3044 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3045 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
56f64d95 3046 log_error_errno(errno, "Failed to duplicate console: %m");
a2da110b 3047 _exit(EXIT_FAILURE);
842f3b0f 3048 }
bc2f673e 3049
d87be9b0 3050 if (setsid() < 0) {
56f64d95 3051 log_error_errno(errno, "setsid() failed: %m");
a2da110b 3052 _exit(EXIT_FAILURE);
bc2f673e
LP
3053 }
3054
db999e0f 3055 if (reset_audit_loginuid() < 0)
a2da110b 3056 _exit(EXIT_FAILURE);
db999e0f 3057
d87be9b0 3058 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 3059 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 3060 _exit(EXIT_FAILURE);
d87be9b0 3061 }
e58a1277 3062
d87be9b0
LP
3063 /* Mark everything as slave, so that we still
3064 * receive mounts from the real root, but don't
3065 * propagate mounts to the real root. */
3066 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 3067 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 3068 _exit(EXIT_FAILURE);
d87be9b0 3069 }
04bc4a3f 3070
727fd4fd
LP
3071 if (mount_devices(arg_directory,
3072 root_device, root_device_rw,
3073 home_device, home_device_rw,
3074 srv_device, srv_device_rw) < 0)
a2da110b 3075 _exit(EXIT_FAILURE);
1b9e5b12 3076
d87be9b0
LP
3077 /* Turn directory into bind mount */
3078 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 3079 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 3080 _exit(EXIT_FAILURE);
d87be9b0 3081 }
88213476 3082
4d9f07b4
LP
3083 r = setup_volatile(arg_directory);
3084 if (r < 0)
a2da110b 3085 _exit(EXIT_FAILURE);
4d9f07b4
LP
3086
3087 if (setup_volatile_state(arg_directory) < 0)
a2da110b 3088 _exit(EXIT_FAILURE);
4d9f07b4
LP
3089
3090 r = base_filesystem_create(arg_directory);
3091 if (r < 0)
a2da110b 3092 _exit(EXIT_FAILURE);
4d9f07b4 3093
d6797c92
LP
3094 if (arg_read_only) {
3095 k = bind_remount_recursive(arg_directory, true);
3096 if (k < 0) {
da927ba9 3097 log_error_errno(k, "Failed to make tree read-only: %m");
a2da110b 3098 _exit(EXIT_FAILURE);
d87be9b0 3099 }
d6797c92 3100 }
2547bb41 3101
d87be9b0 3102 if (mount_all(arg_directory) < 0)
a2da110b 3103 _exit(EXIT_FAILURE);
57fb9fb5 3104
d87be9b0 3105 if (copy_devnodes(arg_directory) < 0)
a2da110b 3106 _exit(EXIT_FAILURE);
a258bf26 3107
f2d88580 3108 if (setup_ptmx(arg_directory) < 0)
a2da110b 3109 _exit(EXIT_FAILURE);
f2d88580 3110
d87be9b0 3111 dev_setup(arg_directory);
88213476 3112
28650077 3113 if (setup_seccomp() < 0)
a2da110b 3114 _exit(EXIT_FAILURE);
24fb1112 3115
d87be9b0 3116 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 3117 _exit(EXIT_FAILURE);
88213476 3118
d87be9b0 3119 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 3120 _exit(EXIT_FAILURE);
88213476 3121
03e334a1 3122 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 3123
d87be9b0 3124 if (setup_boot_id(arg_directory) < 0)
a2da110b 3125 _exit(EXIT_FAILURE);
a41fe3a2 3126
d87be9b0 3127 if (setup_timezone(arg_directory) < 0)
a2da110b 3128 _exit(EXIT_FAILURE);
88213476 3129
d87be9b0 3130 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 3131 _exit(EXIT_FAILURE);
687d0825 3132
d87be9b0 3133 if (setup_journal(arg_directory) < 0)
a2da110b 3134 _exit(EXIT_FAILURE);
687d0825 3135
d6797c92 3136 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 3137 _exit(EXIT_FAILURE);
17fe0523 3138
d6797c92 3139 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 3140 _exit(EXIT_FAILURE);
17fe0523 3141
06c17c39 3142 if (mount_tmpfs(arg_directory) < 0)
a2da110b 3143 _exit(EXIT_FAILURE);
06c17c39 3144
d96c1ecf
LP
3145 /* Tell the parent that we are ready, and that
3146 * it can cgroupify us so that we lack access
3147 * to certain devices and resources. */
dfb05a1c 3148 (void)barrier_place(&barrier);
d96c1ecf 3149
d87be9b0 3150 if (chdir(arg_directory) < 0) {
56f64d95 3151 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 3152 _exit(EXIT_FAILURE);
687d0825
MV
3153 }
3154
d87be9b0 3155 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 3156 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 3157 _exit(EXIT_FAILURE);
687d0825
MV
3158 }
3159
d87be9b0 3160 if (chroot(".") < 0) {
56f64d95 3161 log_error_errno(errno, "chroot() failed: %m");
a2da110b 3162 _exit(EXIT_FAILURE);
687d0825
MV
3163 }
3164
d87be9b0 3165 if (chdir("/") < 0) {
56f64d95 3166 log_error_errno(errno, "chdir() failed: %m");
a2da110b 3167 _exit(EXIT_FAILURE);
687d0825
MV
3168 }
3169
d87be9b0
LP
3170 umask(0022);
3171
eb91eb18
LP
3172 if (arg_private_network)
3173 loopback_setup();
d87be9b0
LP
3174
3175 if (drop_capabilities() < 0) {
56f64d95 3176 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 3177 _exit(EXIT_FAILURE);
687d0825 3178 }
687d0825 3179
0cb9fbcd
LP
3180 r = change_uid_gid(&home);
3181 if (r < 0)
a2da110b 3182 _exit(EXIT_FAILURE);
d87be9b0 3183
842f3b0f
LP
3184 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3185 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3186 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 3187 log_oom();
a2da110b 3188 _exit(EXIT_FAILURE);
144f0fc0 3189 }
687d0825 3190
9444b1f2 3191 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
3192 char as_uuid[37];
3193
3194 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 3195 log_oom();
a2da110b 3196 _exit(EXIT_FAILURE);
842f3b0f
LP
3197 }
3198 }
3199
3200 if (fdset_size(fds) > 0) {
3201 k = fdset_cloexec(fds, false);
3202 if (k < 0) {
3203 log_error("Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 3204 _exit(EXIT_FAILURE);
842f3b0f
LP
3205 }
3206
3207 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 3208 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 3209 log_oom();
a2da110b 3210 _exit(EXIT_FAILURE);
d87be9b0
LP
3211 }
3212 }
3213
3214 setup_hostname();
3215
6afc95b7
LP
3216 if (arg_personality != 0xffffffffLU) {
3217 if (personality(arg_personality) < 0) {
56f64d95 3218 log_error_errno(errno, "personality() failed: %m");
a2da110b 3219 _exit(EXIT_FAILURE);
6afc95b7 3220 }
1b9e5b12
LP
3221 } else if (secondary) {
3222 if (personality(PER_LINUX32) < 0) {
56f64d95 3223 log_error_errno(errno, "personality() failed: %m");
a2da110b 3224 _exit(EXIT_FAILURE);
1b9e5b12 3225 }
6afc95b7
LP
3226 }
3227
d96c1ecf
LP
3228#ifdef HAVE_SELINUX
3229 if (arg_selinux_context)
0cb9fbcd 3230 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
56f64d95 3231 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
a2da110b 3232 _exit(EXIT_FAILURE);
0cb9fbcd 3233 }
d96c1ecf 3234#endif
354bfd2b 3235
f4889f65
LP
3236 if (!strv_isempty(arg_setenv)) {
3237 char **n;
3238
3239 n = strv_env_merge(2, envp, arg_setenv);
3240 if (!n) {
3241 log_oom();
a2da110b 3242 _exit(EXIT_FAILURE);
f4889f65
LP
3243 }
3244
3245 env_use = n;
3246 } else
3247 env_use = (char**) envp;
3248
d96c1ecf 3249 /* Wait until the parent is ready with the setup, too... */
a2da110b
DH
3250 if (!barrier_place_and_sync(&barrier))
3251 _exit(EXIT_FAILURE);
d96c1ecf 3252
d87be9b0
LP
3253 if (arg_boot) {
3254 char **a;
3255 size_t l;
88213476 3256
d87be9b0 3257 /* Automatically search for the init system */
0f0dbc46 3258
d87be9b0
LP
3259 l = 1 + argc - optind;
3260 a = newa(char*, l + 1);
3261 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 3262
d87be9b0 3263 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 3264 execve(a[0], a, env_use);
0f0dbc46 3265
d87be9b0 3266 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 3267 execve(a[0], a, env_use);
0f0dbc46 3268
d87be9b0 3269 a[0] = (char*) "/sbin/init";
f4889f65 3270 execve(a[0], a, env_use);
d87be9b0 3271 } else if (argc > optind)
f4889f65 3272 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
3273 else {
3274 chdir(home ? home : "/root");
f4889f65 3275 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 3276 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
3277 }
3278
56f64d95 3279 log_error_errno(errno, "execv() failed: %m");
d87be9b0 3280 _exit(EXIT_FAILURE);
da5b3bad 3281 }
88213476 3282
a2da110b 3283 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
3284 fdset_free(fds);
3285 fds = NULL;
3286
a2da110b
DH
3287 /* wait for child-setup to be done */
3288 if (barrier_place_and_sync(&barrier)) {
023fb90b
LP
3289 _cleanup_event_unref_ sd_event *event = NULL;
3290 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
5aa4bb6b 3291 int ifi = 0;
354bfd2b 3292
840295fc
LP
3293 r = move_network_interfaces(pid);
3294 if (r < 0)
3295 goto finish;
aa28aefe 3296
5aa4bb6b 3297 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
3298 if (r < 0)
3299 goto finish;
ab046dde 3300
5aa4bb6b 3301 r = setup_bridge(veth_name, &ifi);
840295fc
LP
3302 if (r < 0)
3303 goto finish;
ab046dde 3304
840295fc
LP
3305 r = setup_macvlan(pid);
3306 if (r < 0)
3307 goto finish;
c74e630d 3308
5aa4bb6b
LP
3309 r = register_machine(pid, ifi);
3310 if (r < 0)
3311 goto finish;
3312
840295fc
LP
3313 /* Block SIGCHLD here, before notifying child.
3314 * process_pty() will handle it with the other signals. */
3315 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3316 if (r < 0)
3317 goto finish;
e866af3a 3318
840295fc
LP
3319 /* Reset signal to default */
3320 r = default_signals(SIGCHLD, -1);
3321 if (r < 0)
3322 goto finish;
e866af3a 3323
840295fc
LP
3324 /* Notify the child that the parent is ready with all
3325 * its setup, and that the child can now hand over
3326 * control to the code to run inside the container. */
dfb05a1c 3327 (void)barrier_place(&barrier);
354bfd2b 3328
023fb90b
LP
3329 r = sd_event_new(&event);
3330 if (r < 0) {
da927ba9 3331 log_error_errno(r, "Failed to get default event source: %m");
023fb90b 3332 goto finish;
840295fc 3333 }
88213476 3334
023fb90b
LP
3335 if (arg_boot) {
3336 /* Try to kill the init system on SIGINT or SIGTERM */
3337 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3338 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3339 } else {
3340 /* Immediately exit */
3341 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3342 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3343 }
3344
3345 /* simply exit on sigchld */
3346 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3347
3348 r = pty_forward_new(event, master, &forward);
3349 if (r < 0) {
da927ba9 3350 log_error_errno(r, "Failed to create PTY forwarder: %m");
023fb90b
LP
3351 goto finish;
3352 }
3353
3354 r = sd_event_loop(event);
f647962d
MS
3355 if (r < 0)
3356 return log_error_errno(r, "Failed to run event loop: %m");
023fb90b
LP
3357
3358 forward = pty_forward_free(forward);
3359
840295fc
LP
3360 if (!arg_quiet)
3361 putc('\n', stdout);
04d39279 3362
840295fc
LP
3363 /* Kill if it is not dead yet anyway */
3364 terminate_machine(pid);
3365 }
1f0cd86b 3366
840295fc 3367 /* Normally redundant, but better safe than sorry */
04d39279 3368 kill(pid, SIGKILL);
a258bf26 3369
113cea80 3370 r = wait_for_container(pid, &container_status);
04d39279
LP
3371 pid = 0;
3372
ce9f1527
LP
3373 if (r < 0) {
3374 /* We failed to wait for the container, or the
3375 * container exited abnormally */
3376 r = EXIT_FAILURE;
d87be9b0 3377 break;
ce9f1527
LP
3378 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3379 /* The container exited with a non-zero
3380 * status, or with zero status and no reboot
3381 * was requested. */
d87be9b0 3382 break;
88213476 3383
113cea80 3384 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3385
3386 if (arg_keep_unit) {
3387 /* Special handling if we are running as a
3388 * service: instead of simply restarting the
3389 * machine we want to restart the entire
3390 * service, so let's inform systemd about this
3391 * with the special exit code 133. The service
3392 * file uses RestartForceExitStatus=133 so
3393 * that this results in a full nspawn
3394 * restart. This is necessary since we might
3395 * have cgroup parameters set we want to have
3396 * flushed out. */
3397 r = 133;
3398 break;
3399 }
d87be9b0 3400 }
88213476
LP
3401
3402finish:
af4ec430
LP
3403 sd_notify(false,
3404 "STOPPING=1\n"
3405 "STATUS=Terminating...");
3406
1b9e5b12
LP
3407 loop_remove(loop_nr, &image_fd);
3408
9444b1f2
LP
3409 if (pid > 0)
3410 kill(pid, SIGKILL);
88213476 3411
04d391da 3412 free(arg_directory);
7027ff61 3413 free(arg_machine);
c74e630d
LP
3414 free(arg_user);
3415 strv_free(arg_setenv);
3416 strv_free(arg_network_interfaces);
3417 strv_free(arg_network_macvlan);
3418 strv_free(arg_bind);
3419 strv_free(arg_bind_ro);
06c17c39 3420 strv_free(arg_tmpfs);
88213476
LP
3421
3422 return r;
3423}