]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
main: change check whether /etc is unpopulated to look for /etc/machine-id
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
354bfd2b 43#include <sys/eventfd.h>
aa28aefe 44#include <net/if.h>
69c79d3c 45#include <linux/veth.h>
6afc95b7 46#include <sys/personality.h>
1b9e5b12 47#include <linux/loop.h>
aa28aefe 48
5d63309c 49#ifdef HAVE_SELINUX
a8828ed9
DW
50#include <selinux/selinux.h>
51#endif
88213476 52
24fb1112
LP
53#ifdef HAVE_SECCOMP
54#include <seccomp.h>
55#endif
56
1b9e5b12
LP
57#ifdef HAVE_BLKID
58#include <blkid/blkid.h>
59#endif
60
1f0cd86b
LP
61#include "sd-daemon.h"
62#include "sd-bus.h"
63#include "sd-id128.h"
aa28aefe 64#include "sd-rtnl.h"
88213476
LP
65#include "log.h"
66#include "util.h"
49e942b2 67#include "mkdir.h"
6b2d0e85 68#include "macro.h"
d7832d2c 69#include "audit.h"
94d82985 70#include "missing.h"
04d391da 71#include "cgroup-util.h"
a258bf26 72#include "strv.h"
9eb977db 73#include "path-util.h"
a41fe3a2 74#include "loopback-setup.h"
4fc9982c 75#include "dev-setup.h"
842f3b0f 76#include "fdset.h"
acbeb427 77#include "build.h"
a5c32cff 78#include "fileio.h"
40ca29a1 79#include "bus-util.h"
1f0cd86b 80#include "bus-error.h"
4ba93280 81#include "ptyfwd.h"
9bd37b40 82#include "bus-kernel.h"
f4889f65 83#include "env-util.h"
7f112f50 84#include "def.h"
aa28aefe 85#include "rtnl-util.h"
7e227024 86#include "udev-util.h"
e866af3a 87#include "eventfd-util.h"
1b9e5b12
LP
88#include "blkid-util.h"
89#include "gpt.h"
01dde061 90#include "siphash24.h"
849958d1 91#include "copy.h"
3577de7a 92#include "base-filesystem.h"
f2d88580 93
e9642be2
LP
94#ifdef HAVE_SECCOMP
95#include "seccomp-util.h"
96#endif
97
113cea80
DH
/* Outcome of a container run. NOTE(review): meaning inferred from the
 * enumerator names only -- confirm against the code that consumes it. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* first enumerator, value 0 */
        CONTAINER_REBOOTED      /* value 1 */
} ContainerStatus;
102
57fb9fb5
LP
/* Modes accepted by --link-journal= (see parse_argv()): "no", "auto",
 * "host" and "guest" map to the enumerators below. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also selected by -j */
} LinkJournal;
88213476
LP
109
110static char *arg_directory = NULL;
687d0825 111static char *arg_user = NULL;
9444b1f2 112static sd_id128_t arg_uuid = {};
7027ff61 113static char *arg_machine = NULL;
c74e630d
LP
114static const char *arg_selinux_context = NULL;
115static const char *arg_selinux_apifs_context = NULL;
9444b1f2 116static const char *arg_slice = NULL;
ff01d048 117static bool arg_private_network = false;
bc2f673e 118static bool arg_read_only = false;
0f0dbc46 119static bool arg_boot = false;
57fb9fb5 120static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
121static uint64_t arg_retain =
122 (1ULL << CAP_CHOWN) |
123 (1ULL << CAP_DAC_OVERRIDE) |
124 (1ULL << CAP_DAC_READ_SEARCH) |
125 (1ULL << CAP_FOWNER) |
126 (1ULL << CAP_FSETID) |
127 (1ULL << CAP_IPC_OWNER) |
128 (1ULL << CAP_KILL) |
129 (1ULL << CAP_LEASE) |
130 (1ULL << CAP_LINUX_IMMUTABLE) |
131 (1ULL << CAP_NET_BIND_SERVICE) |
132 (1ULL << CAP_NET_BROADCAST) |
133 (1ULL << CAP_NET_RAW) |
134 (1ULL << CAP_SETGID) |
135 (1ULL << CAP_SETFCAP) |
136 (1ULL << CAP_SETPCAP) |
137 (1ULL << CAP_SETUID) |
138 (1ULL << CAP_SYS_ADMIN) |
139 (1ULL << CAP_SYS_CHROOT) |
140 (1ULL << CAP_SYS_NICE) |
141 (1ULL << CAP_SYS_PTRACE) |
142 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 143 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
144 (1ULL << CAP_SYS_BOOT) |
145 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
146 (1ULL << CAP_AUDIT_CONTROL) |
147 (1ULL << CAP_MKNOD);
17fe0523
LP
148static char **arg_bind = NULL;
149static char **arg_bind_ro = NULL;
06c17c39 150static char **arg_tmpfs = NULL;
f4889f65 151static char **arg_setenv = NULL;
284c0b91 152static bool arg_quiet = false;
8a96d94e 153static bool arg_share_system = false;
eb91eb18 154static bool arg_register = true;
89f7c846 155static bool arg_keep_unit = false;
aa28aefe 156static char **arg_network_interfaces = NULL;
c74e630d 157static char **arg_network_macvlan = NULL;
69c79d3c 158static bool arg_network_veth = false;
c74e630d 159static const char *arg_network_bridge = NULL;
6afc95b7 160static unsigned long arg_personality = 0xffffffffLU;
1b9e5b12 161static const char *arg_image = NULL;
88213476
LP
162
163static int help(void) {
164
165 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
166 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
167 " -h --help Show this help\n"
168 " --version Print version string\n"
69c79d3c 169 " -q --quiet Do not show status information\n"
1b9e5b12
LP
170 " -D --directory=PATH Root directory for the container\n"
171 " -i --image=PATH File system device or image for the container\n"
a8828ed9
DW
172 " -b --boot Boot up full system (i.e. invoke init)\n"
173 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 174 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 175 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 176 " -S --slice=SLICE Place the container in the specified slice\n"
69c79d3c
LP
177 " --private-network Disable network in container\n"
178 " --network-interface=INTERFACE\n"
179 " Assign an existing network interface to the\n"
180 " container\n"
c74e630d
LP
181 " --network-macvlan=INTERFACE\n"
182 " Create a macvlan network interface based on an\n"
183 " existing network interface to the container\n"
32457153 184 " --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 185 " and container\n"
ab046dde 186 " --network-bridge=INTERFACE\n"
32457153 187 " Add a virtual ethernet connection between host\n"
ab046dde
TG
188 " and container and add it to an existing bridge on\n"
189 " the host\n"
82adf6af
LP
190 " -Z --selinux-context=SECLABEL\n"
191 " Set the SELinux security context to be used by\n"
192 " processes in the container\n"
193 " -L --selinux-apifs-context=SECLABEL\n"
194 " Set the SELinux security context to be used by\n"
195 " API/tmpfs file systems in the container\n"
a8828ed9
DW
196 " --capability=CAP In addition to the default, retain specified\n"
197 " capability\n"
198 " --drop-capability=CAP Drop the specified capability from the default set\n"
199 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
200 " -j Equivalent to --link-journal=host\n"
69c79d3c 201 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
202 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
203 " the container\n"
204 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 205 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
284c0b91 206 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 207 " --share-system Share system namespaces with host\n"
eb91eb18 208 " --register=BOOLEAN Register container as machine\n"
89f7c846 209 " --keep-unit Do not register a scope for the machine, reuse\n"
69c79d3c 210 " the service unit nspawn is running in\n",
88213476
LP
211 program_invocation_short_name);
212
213 return 0;
214}
215
216static int parse_argv(int argc, char *argv[]) {
217
a41fe3a2 218 enum {
acbeb427
ZJS
219 ARG_VERSION = 0x100,
220 ARG_PRIVATE_NETWORK,
bc2f673e 221 ARG_UUID,
5076f0cc 222 ARG_READ_ONLY,
57fb9fb5 223 ARG_CAPABILITY,
420c7379 224 ARG_DROP_CAPABILITY,
17fe0523
LP
225 ARG_LINK_JOURNAL,
226 ARG_BIND,
f4889f65 227 ARG_BIND_RO,
06c17c39 228 ARG_TMPFS,
f4889f65 229 ARG_SETENV,
eb91eb18 230 ARG_SHARE_SYSTEM,
89f7c846 231 ARG_REGISTER,
aa28aefe 232 ARG_KEEP_UNIT,
69c79d3c 233 ARG_NETWORK_INTERFACE,
c74e630d 234 ARG_NETWORK_MACVLAN,
69c79d3c 235 ARG_NETWORK_VETH,
ab046dde 236 ARG_NETWORK_BRIDGE,
6afc95b7 237 ARG_PERSONALITY,
a41fe3a2
LP
238 };
239
88213476 240 static const struct option options[] = {
aa28aefe
LP
241 { "help", no_argument, NULL, 'h' },
242 { "version", no_argument, NULL, ARG_VERSION },
243 { "directory", required_argument, NULL, 'D' },
244 { "user", required_argument, NULL, 'u' },
245 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
246 { "boot", no_argument, NULL, 'b' },
247 { "uuid", required_argument, NULL, ARG_UUID },
248 { "read-only", no_argument, NULL, ARG_READ_ONLY },
249 { "capability", required_argument, NULL, ARG_CAPABILITY },
250 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
251 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
252 { "bind", required_argument, NULL, ARG_BIND },
253 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 254 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
255 { "machine", required_argument, NULL, 'M' },
256 { "slice", required_argument, NULL, 'S' },
257 { "setenv", required_argument, NULL, ARG_SETENV },
258 { "selinux-context", required_argument, NULL, 'Z' },
259 { "selinux-apifs-context", required_argument, NULL, 'L' },
260 { "quiet", no_argument, NULL, 'q' },
261 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
262 { "register", required_argument, NULL, ARG_REGISTER },
263 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
264 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 265 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
ab046dde
TG
266 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
267 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 268 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 269 { "image", required_argument, NULL, 'i' },
eb9da376 270 {}
88213476
LP
271 };
272
9444b1f2 273 int c, r;
a42c8b54 274 uint64_t plus = 0, minus = 0;
88213476
LP
275
276 assert(argc >= 0);
277 assert(argv);
278
1b9e5b12 279 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
88213476
LP
280
281 switch (c) {
282
283 case 'h':
eb9da376 284 return help();
88213476 285
acbeb427
ZJS
286 case ARG_VERSION:
287 puts(PACKAGE_STRING);
288 puts(SYSTEMD_FEATURES);
289 return 0;
290
88213476
LP
291 case 'D':
292 free(arg_directory);
3a74cea5
LP
293 arg_directory = canonicalize_file_name(optarg);
294 if (!arg_directory) {
898d5c91 295 log_error("Invalid root directory: %m");
88213476
LP
296 return -ENOMEM;
297 }
298
299 break;
300
1b9e5b12
LP
301 case 'i':
302 arg_image = optarg;
303 break;
304
687d0825
MV
305 case 'u':
306 free(arg_user);
7027ff61
LP
307 arg_user = strdup(optarg);
308 if (!arg_user)
309 return log_oom();
687d0825
MV
310
311 break;
312
ab046dde 313 case ARG_NETWORK_BRIDGE:
c74e630d 314 arg_network_bridge = optarg;
ab046dde
TG
315
316 /* fall through */
317
69c79d3c
LP
318 case ARG_NETWORK_VETH:
319 arg_network_veth = true;
320 arg_private_network = true;
321 break;
322
aa28aefe 323 case ARG_NETWORK_INTERFACE:
c74e630d
LP
324 if (strv_extend(&arg_network_interfaces, optarg) < 0)
325 return log_oom();
326
327 arg_private_network = true;
328 break;
329
330 case ARG_NETWORK_MACVLAN:
331 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
332 return log_oom();
333
334 /* fall through */
335
ff01d048
LP
336 case ARG_PRIVATE_NETWORK:
337 arg_private_network = true;
a41fe3a2
LP
338 break;
339
0f0dbc46
LP
340 case 'b':
341 arg_boot = true;
342 break;
343
144f0fc0 344 case ARG_UUID:
9444b1f2
LP
345 r = sd_id128_from_string(optarg, &arg_uuid);
346 if (r < 0) {
aa96c6cb 347 log_error("Invalid UUID: %s", optarg);
9444b1f2 348 return r;
aa96c6cb 349 }
9444b1f2 350 break;
aa96c6cb 351
9444b1f2 352 case 'S':
c74e630d 353 arg_slice = optarg;
144f0fc0
LP
354 break;
355
7027ff61 356 case 'M':
eb91eb18
LP
357 if (isempty(optarg)) {
358 free(arg_machine);
359 arg_machine = NULL;
360 } else {
7027ff61 361
eb91eb18
LP
362 if (!hostname_is_valid(optarg)) {
363 log_error("Invalid machine name: %s", optarg);
364 return -EINVAL;
365 }
7027ff61 366
eb91eb18
LP
367 free(arg_machine);
368 arg_machine = strdup(optarg);
369 if (!arg_machine)
370 return log_oom();
371
372 break;
373 }
7027ff61 374
82adf6af
LP
375 case 'Z':
376 arg_selinux_context = optarg;
a8828ed9
DW
377 break;
378
82adf6af
LP
379 case 'L':
380 arg_selinux_apifs_context = optarg;
a8828ed9
DW
381 break;
382
bc2f673e
LP
383 case ARG_READ_ONLY:
384 arg_read_only = true;
385 break;
386
420c7379
LP
387 case ARG_CAPABILITY:
388 case ARG_DROP_CAPABILITY: {
5076f0cc
LP
389 char *state, *word;
390 size_t length;
391
392 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 393 _cleanup_free_ char *t;
5076f0cc 394 cap_value_t cap;
5076f0cc
LP
395
396 t = strndup(word, length);
0d0f0c50
SL
397 if (!t)
398 return log_oom();
5076f0cc 399
39ed67d1
LP
400 if (streq(t, "all")) {
401 if (c == ARG_CAPABILITY)
a42c8b54 402 plus = (uint64_t) -1;
39ed67d1 403 else
a42c8b54 404 minus = (uint64_t) -1;
39ed67d1
LP
405 } else {
406 if (cap_from_name(t, &cap) < 0) {
407 log_error("Failed to parse capability %s.", t);
408 return -EINVAL;
409 }
410
411 if (c == ARG_CAPABILITY)
a42c8b54 412 plus |= 1ULL << (uint64_t) cap;
39ed67d1 413 else
a42c8b54 414 minus |= 1ULL << (uint64_t) cap;
5076f0cc 415 }
5076f0cc
LP
416 }
417
418 break;
419 }
420
57fb9fb5
LP
421 case 'j':
422 arg_link_journal = LINK_GUEST;
423 break;
424
425 case ARG_LINK_JOURNAL:
426 if (streq(optarg, "auto"))
427 arg_link_journal = LINK_AUTO;
428 else if (streq(optarg, "no"))
429 arg_link_journal = LINK_NO;
430 else if (streq(optarg, "guest"))
431 arg_link_journal = LINK_GUEST;
432 else if (streq(optarg, "host"))
433 arg_link_journal = LINK_HOST;
434 else {
435 log_error("Failed to parse link journal mode %s", optarg);
436 return -EINVAL;
437 }
438
439 break;
440
17fe0523
LP
441 case ARG_BIND:
442 case ARG_BIND_RO: {
443 _cleanup_free_ char *a = NULL, *b = NULL;
444 char *e;
445 char ***x;
17fe0523
LP
446
447 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
448
449 e = strchr(optarg, ':');
450 if (e) {
451 a = strndup(optarg, e - optarg);
452 b = strdup(e + 1);
453 } else {
454 a = strdup(optarg);
455 b = strdup(optarg);
456 }
457
458 if (!a || !b)
459 return log_oom();
460
461 if (!path_is_absolute(a) || !path_is_absolute(b)) {
462 log_error("Invalid bind mount specification: %s", optarg);
463 return -EINVAL;
464 }
465
466 r = strv_extend(x, a);
467 if (r < 0)
b3451bed 468 return log_oom();
17fe0523
LP
469
470 r = strv_extend(x, b);
471 if (r < 0)
b3451bed 472 return log_oom();
17fe0523
LP
473
474 break;
475 }
476
06c17c39
LP
477 case ARG_TMPFS: {
478 _cleanup_free_ char *a = NULL, *b = NULL;
479 char *e;
480
481 e = strchr(optarg, ':');
482 if (e) {
483 a = strndup(optarg, e - optarg);
484 b = strdup(e + 1);
485 } else {
486 a = strdup(optarg);
487 b = strdup("mode=0755");
488 }
489
490 if (!a || !b)
491 return log_oom();
492
493 if (!path_is_absolute(a)) {
494 log_error("Invalid tmpfs specification: %s", optarg);
495 return -EINVAL;
496 }
497
498 r = strv_push(&arg_tmpfs, a);
499 if (r < 0)
500 return log_oom();
501
502 a = NULL;
503
504 r = strv_push(&arg_tmpfs, b);
505 if (r < 0)
506 return log_oom();
507
508 b = NULL;
509
510 break;
511 }
512
f4889f65
LP
513 case ARG_SETENV: {
514 char **n;
515
516 if (!env_assignment_is_valid(optarg)) {
517 log_error("Environment variable assignment '%s' is not valid.", optarg);
518 return -EINVAL;
519 }
520
521 n = strv_env_set(arg_setenv, optarg);
522 if (!n)
523 return log_oom();
524
525 strv_free(arg_setenv);
526 arg_setenv = n;
527 break;
528 }
529
284c0b91
LP
530 case 'q':
531 arg_quiet = true;
532 break;
533
8a96d94e
LP
534 case ARG_SHARE_SYSTEM:
535 arg_share_system = true;
536 break;
537
eb91eb18
LP
538 case ARG_REGISTER:
539 r = parse_boolean(optarg);
540 if (r < 0) {
541 log_error("Failed to parse --register= argument: %s", optarg);
542 return r;
543 }
544
545 arg_register = r;
546 break;
547
89f7c846
LP
548 case ARG_KEEP_UNIT:
549 arg_keep_unit = true;
550 break;
551
6afc95b7
LP
552 case ARG_PERSONALITY:
553
ac45f971 554 arg_personality = personality_from_string(optarg);
6afc95b7
LP
555 if (arg_personality == 0xffffffffLU) {
556 log_error("Unknown or unsupported personality '%s'.", optarg);
557 return -EINVAL;
558 }
559
560 break;
561
88213476
LP
562 case '?':
563 return -EINVAL;
564
565 default:
eb9da376 566 assert_not_reached("Unhandled option");
88213476
LP
567 }
568 }
569
eb91eb18
LP
570 if (arg_share_system)
571 arg_register = false;
572
573 if (arg_boot && arg_share_system) {
574 log_error("--boot and --share-system may not be combined.");
575 return -EINVAL;
576 }
577
89f7c846
LP
578 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
579 log_error("--keep-unit may not be used when invoked from a user session.");
580 return -EINVAL;
581 }
582
1b9e5b12
LP
583 if (arg_directory && arg_image) {
584 log_error("--directory= and --image= may not be combined.");
585 return -EINVAL;
586 }
587
a42c8b54
LP
588 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
589
88213476
LP
590 return 1;
591}
592
593static int mount_all(const char *dest) {
594
595 typedef struct MountPoint {
596 const char *what;
597 const char *where;
598 const char *type;
599 const char *options;
600 unsigned long flags;
3bd66c05 601 bool fatal;
88213476
LP
602 } MountPoint;
603
604 static const MountPoint mount_table[] = {
06c17c39
LP
605 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
606 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
607 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
608 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
609 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 610 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
611 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
612 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 613#ifdef HAVE_SELINUX
06c17c39
LP
614 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
615 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 616#endif
88213476
LP
617 };
618
619 unsigned k;
620 int r = 0;
621
622 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 623 _cleanup_free_ char *where = NULL;
d002827b 624#ifdef HAVE_SELINUX
a8828ed9 625 _cleanup_free_ char *options = NULL;
d002827b
LP
626#endif
627 const char *o;
88213476
LP
628 int t;
629
17fe0523
LP
630 where = strjoin(dest, "/", mount_table[k].where, NULL);
631 if (!where)
632 return log_oom();
88213476 633
e65aec12 634 t = path_is_mount_point(where, true);
68fb0892 635 if (t < 0) {
88213476 636 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
637
638 if (r == 0)
639 r = t;
640
641 continue;
642 }
643
9c1c7f71
LP
644 /* Skip this entry if it is not a remount. */
645 if (mount_table[k].what && t > 0)
014a9c77
LP
646 continue;
647
17fe0523 648 mkdir_p(where, 0755);
88213476 649
a8828ed9 650#ifdef HAVE_SELINUX
82adf6af
LP
651 if (arg_selinux_apifs_context &&
652 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
653 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
654 if (!options)
655 return log_oom();
656
657 o = options;
658 } else
a8828ed9 659#endif
d002827b 660 o = mount_table[k].options;
a8828ed9 661
a8828ed9 662
88213476
LP
663 if (mount(mount_table[k].what,
664 where,
665 mount_table[k].type,
666 mount_table[k].flags,
d002827b 667 o) < 0 &&
3bd66c05 668 mount_table[k].fatal) {
88213476
LP
669
670 log_error("mount(%s) failed: %m", where);
671
672 if (r == 0)
673 r = -errno;
674 }
88213476
LP
675 }
676
e58a1277
LP
677 return r;
678}
f8440af5 679
d6797c92 680static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
681 char **x, **y;
682
683 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 684 _cleanup_free_ char *where = NULL;
d2421337 685 struct stat source_st, dest_st;
2ed4e5e0 686 int r;
d2421337
DR
687
688 if (stat(*x, &source_st) < 0) {
1b9e5b12 689 log_error("Failed to stat %s: %m", *x);
d2421337
DR
690 return -errno;
691 }
17fe0523 692
06c17c39
LP
693 where = strappend(dest, *y);
694 if (!where)
695 return log_oom();
696
2ed4e5e0
SL
697 r = stat(where, &dest_st);
698 if (r == 0) {
d2421337 699 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
06c17c39 700 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
d2421337
DR
701 return -EINVAL;
702 }
2ed4e5e0
SL
703 } else if (errno == ENOENT) {
704 r = mkdir_parents_label(where, 0755);
705 if (r < 0) {
706 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
707 return r;
d2421337 708 }
2ed4e5e0 709 } else {
590b6b91 710 log_error("Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
711 return -errno;
712 }
06c17c39 713
2ed4e5e0
SL
714 /* Create the mount point, but be conservative -- refuse to create block
715 * and char devices. */
716 if (S_ISDIR(source_st.st_mode))
717 mkdir_label(where, 0755);
718 else if (S_ISFIFO(source_st.st_mode))
719 mkfifo(where, 0644);
720 else if (S_ISSOCK(source_st.st_mode))
721 mknod(where, 0644 | S_IFSOCK, 0);
722 else if (S_ISREG(source_st.st_mode))
723 touch(where);
724 else {
725 log_error("Refusing to create mountpoint for file: %s", *x);
726 return -ENOTSUP;
d2421337 727 }
17fe0523
LP
728
729 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
730 log_error("mount(%s) failed: %m", where);
731 return -errno;
732 }
733
d6797c92
LP
734 if (ro) {
735 r = bind_remount_recursive(where, true);
736 if (r < 0) {
737 log_error("Read-Only bind mount failed: %s", strerror(-r));
738 return r;
739 }
17fe0523
LP
740 }
741 }
742
743 return 0;
744}
745
06c17c39
LP
746static int mount_tmpfs(const char *dest) {
747 char **i, **o;
748
749 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
750 _cleanup_free_ char *where = NULL;
751
752 where = strappend(dest, *i);
753 if (!where)
754 return log_oom();
755
756 mkdir_label(where, 0755);
757
758 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
759 log_error("tmpfs mount to %s failed: %m", where);
760 return -errno;
761 }
762 }
763
764 return 0;
765}
766
e58a1277 767static int setup_timezone(const char *dest) {
d4036145
LP
768 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
769 char *z, *y;
770 int r;
f8440af5 771
e58a1277
LP
772 assert(dest);
773
774 /* Fix the timezone, if possible */
d4036145
LP
775 r = readlink_malloc("/etc/localtime", &p);
776 if (r < 0) {
777 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
778 return 0;
779 }
780
781 z = path_startswith(p, "../usr/share/zoneinfo/");
782 if (!z)
783 z = path_startswith(p, "/usr/share/zoneinfo/");
784 if (!z) {
785 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
786 return 0;
787 }
788
04bc4a3f
LP
789 where = strappend(dest, "/etc/localtime");
790 if (!where)
0d0f0c50 791 return log_oom();
715ac17a 792
d4036145
LP
793 r = readlink_malloc(where, &q);
794 if (r >= 0) {
795 y = path_startswith(q, "../usr/share/zoneinfo/");
796 if (!y)
797 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 798
4d1c38b8 799
d4036145
LP
800 /* Already pointing to the right place? Then do nothing .. */
801 if (y && streq(y, z))
802 return 0;
803 }
804
805 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
806 if (!check)
0d0f0c50 807 return log_oom();
4d1c38b8 808
d4036145
LP
809 if (access(check, F_OK) < 0) {
810 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
811 return 0;
812 }
68fb0892 813
d4036145
LP
814 what = strappend("../usr/share/zoneinfo/", z);
815 if (!what)
816 return log_oom();
817
818 unlink(where);
819 if (symlink(what, where) < 0) {
820 log_error("Failed to correct timezone of container: %m");
821 return 0;
822 }
e58a1277
LP
823
824 return 0;
88213476
LP
825}
826
2547bb41 827static int setup_resolv_conf(const char *dest) {
c8b32e11 828 _cleanup_free_ char *where = NULL;
2547bb41
LP
829
830 assert(dest);
831
832 if (arg_private_network)
833 return 0;
834
835 /* Fix resolv.conf, if possible */
04bc4a3f
LP
836 where = strappend(dest, "/etc/resolv.conf");
837 if (!where)
0d0f0c50 838 return log_oom();
2547bb41 839
77e63faf
LP
840 /* We don't really care for the results of this really. If it
841 * fails, it fails, but meh... */
849958d1 842 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
2547bb41
LP
843
844 return 0;
845}
846
9f24adc2
LP
847static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
848
849 snprintf(s, 37,
850 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
851 SD_ID128_FORMAT_VAL(id));
852
853 return s;
854}
855
04bc4a3f 856static int setup_boot_id(const char *dest) {
7fd1b19b 857 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 858 sd_id128_t rnd = {};
04bc4a3f
LP
859 char as_uuid[37];
860 int r;
861
862 assert(dest);
863
eb91eb18
LP
864 if (arg_share_system)
865 return 0;
866
04bc4a3f
LP
867 /* Generate a new randomized boot ID, so that each boot-up of
868 * the container gets a new one */
869
870 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 871 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
872 if (!from || !to)
873 return log_oom();
04bc4a3f
LP
874
875 r = sd_id128_randomize(&rnd);
876 if (r < 0) {
877 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 878 return r;
04bc4a3f
LP
879 }
880
9f24adc2 881 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 882
574d5f2d 883 r = write_string_file(from, as_uuid);
04bc4a3f
LP
884 if (r < 0) {
885 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 886 return r;
04bc4a3f
LP
887 }
888
889 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
890 log_error("Failed to bind mount boot id: %m");
891 r = -errno;
10d18763
ZJS
892 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
893 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
894
895 unlink(from);
04bc4a3f
LP
896 return r;
897}
898
e58a1277 899static int copy_devnodes(const char *dest) {
88213476
LP
900
901 static const char devnodes[] =
902 "null\0"
903 "zero\0"
904 "full\0"
905 "random\0"
906 "urandom\0"
f2d88580 907 "tty\0";
88213476
LP
908
909 const char *d;
e58a1277 910 int r = 0;
7fd1b19b 911 _cleanup_umask_ mode_t u;
a258bf26
LP
912
913 assert(dest);
124640f1
LP
914
915 u = umask(0000);
88213476
LP
916
917 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 918 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 919 struct stat st;
88213476 920
7f112f50
LP
921 from = strappend("/dev/", d);
922 to = strjoin(dest, "/dev/", d, NULL);
923 if (!from || !to)
924 return log_oom();
88213476
LP
925
926 if (stat(from, &st) < 0) {
927
928 if (errno != ENOENT) {
929 log_error("Failed to stat %s: %m", from);
7f112f50 930 return -errno;
88213476
LP
931 }
932
a258bf26 933 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 934
ed8b7a3e 935 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 936 return -EIO;
a258bf26
LP
937
938 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
939
940 log_error("mknod(%s) failed: %m", dest);
7f112f50 941 return -errno;
88213476 942 }
88213476
LP
943 }
944
e58a1277
LP
945 return r;
946}
88213476 947
f2d88580
LP
948static int setup_ptmx(const char *dest) {
949 _cleanup_free_ char *p = NULL;
950
951 p = strappend(dest, "/dev/ptmx");
952 if (!p)
953 return log_oom();
954
955 if (symlink("pts/ptmx", p) < 0) {
956 log_error("Failed to create /dev/ptmx symlink: %m");
957 return -errno;
958 }
959
960 return 0;
961}
962
e58a1277 963static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
964 _cleanup_umask_ mode_t u;
965 const char *to;
e58a1277 966 struct stat st;
e58a1277 967 int r;
e58a1277
LP
968
969 assert(dest);
970 assert(console);
971
972 u = umask(0000);
973
eb0f0863
LP
974 if (stat("/dev/null", &st) < 0) {
975 log_error("Failed to stat /dev/null: %m");
25ea79fe 976 return -errno;
e58a1277 977 }
88213476 978
e58a1277
LP
979 r = chmod_and_chown(console, 0600, 0, 0);
980 if (r < 0) {
981 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 982 return r;
a258bf26 983 }
88213476 984
a258bf26
LP
985 /* We need to bind mount the right tty to /dev/console since
986 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
987 * to bind mount things on we create a device node first, and
988 * use /dev/null for that since we the cgroups device policy
989 * allows us to create that freely, while we cannot create
990 * /dev/console. (Note that the major minor doesn't actually
991 * matter here, since we mount it over anyway). */
a258bf26 992
eb0f0863 993 to = strappenda(dest, "/dev/console");
e58a1277
LP
994 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
995 log_error("mknod() for /dev/console failed: %m");
25ea79fe 996 return -errno;
e58a1277 997 }
a258bf26
LP
998
999 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 1000 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 1001 return -errno;
a258bf26
LP
1002 }
1003
25ea79fe 1004 return 0;
e58a1277
LP
1005}
1006
1007static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1008 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 1009 int r, fd, k;
7fd1b19b 1010 _cleanup_umask_ mode_t u;
e58a1277
LP
1011 union {
1012 struct cmsghdr cmsghdr;
1013 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1014 } control = {};
1015 struct msghdr mh = {
1016 .msg_control = &control,
1017 .msg_controllen = sizeof(control),
1018 };
e58a1277
LP
1019 struct cmsghdr *cmsg;
1020
1021 assert(dest);
1022 assert(kmsg_socket >= 0);
a258bf26 1023
e58a1277 1024 u = umask(0000);
a258bf26 1025
f1e5dfe2
LP
1026 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1027 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1028 * on the reading side behave very similar to /proc/kmsg,
1029 * their writing side behaves differently from /dev/kmsg in
1030 * that writing blocks when nothing is reading. In order to
1031 * avoid any problems with containers deadlocking due to this
1032 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1033 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1034 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1035 return log_oom();
e58a1277
LP
1036
1037 if (mkfifo(from, 0600) < 0) {
1038 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 1039 return -errno;
e58a1277
LP
1040 }
1041
1042 r = chmod_and_chown(from, 0600, 0, 0);
1043 if (r < 0) {
1044 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 1045 return r;
e58a1277
LP
1046 }
1047
1048 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1049 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 1050 return -errno;
e58a1277
LP
1051 }
1052
1053 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1054 if (fd < 0) {
1055 log_error("Failed to open fifo: %m");
25ea79fe 1056 return -errno;
e58a1277
LP
1057 }
1058
e58a1277
LP
1059 cmsg = CMSG_FIRSTHDR(&mh);
1060 cmsg->cmsg_level = SOL_SOCKET;
1061 cmsg->cmsg_type = SCM_RIGHTS;
1062 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1063 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1064
1065 mh.msg_controllen = cmsg->cmsg_len;
1066
1067 /* Store away the fd in the socket, so that it stays open as
1068 * long as we run the child */
1069 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
03e334a1 1070 safe_close(fd);
e58a1277
LP
1071
1072 if (k < 0) {
1073 log_error("Failed to send FIFO fd: %m");
25ea79fe 1074 return -errno;
a258bf26
LP
1075 }
1076
f1e5dfe2
LP
1077 /* And now make the FIFO unavailable as /dev/kmsg... */
1078 unlink(from);
25ea79fe 1079 return 0;
88213476
LP
1080}
1081
3a74cea5 1082static int setup_hostname(void) {
3a74cea5 1083
eb91eb18
LP
1084 if (arg_share_system)
1085 return 0;
1086
7027ff61
LP
1087 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1088 return -errno;
3a74cea5 1089
7027ff61 1090 return 0;
3a74cea5
LP
1091}
1092
57fb9fb5 1093static int setup_journal(const char *directory) {
4d680aee 1094 sd_id128_t machine_id, this_id;
7fd1b19b 1095 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1096 char *id;
57fb9fb5
LP
1097 int r;
1098
57fb9fb5 1099 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1100 if (!p)
1101 return log_oom();
57fb9fb5
LP
1102
1103 r = read_one_line_file(p, &b);
27407a01
ZJS
1104 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1105 return 0;
1106 else if (r < 0) {
1107 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
1108 return r;
1109 }
1110
27407a01
ZJS
1111 id = strstrip(b);
1112 if (isempty(id) && arg_link_journal == LINK_AUTO)
1113 return 0;
57fb9fb5 1114
27407a01
ZJS
1115 /* Verify validity */
1116 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 1117 if (r < 0) {
27407a01
ZJS
1118 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1119 return r;
57fb9fb5
LP
1120 }
1121
4d680aee
ZJS
1122 r = sd_id128_get_machine(&this_id);
1123 if (r < 0) {
1124 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1125 return r;
1126 }
1127
1128 if (sd_id128_equal(machine_id, this_id)) {
1129 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1130 "Host and machine ids are equal (%s): refusing to link journals", id);
1131 if (arg_link_journal == LINK_AUTO)
1132 return 0;
1133 return
1134 -EEXIST;
1135 }
1136
1137 if (arg_link_journal == LINK_NO)
1138 return 0;
1139
57fb9fb5 1140 free(p);
27407a01
ZJS
1141 p = strappend("/var/log/journal/", id);
1142 q = strjoin(directory, "/var/log/journal/", id, NULL);
1143 if (!p || !q)
1144 return log_oom();
1145
1146 if (path_is_mount_point(p, false) > 0) {
1147 if (arg_link_journal != LINK_AUTO) {
1148 log_error("%s: already a mount point, refusing to use for journal", p);
1149 return -EEXIST;
1150 }
1151
1152 return 0;
57fb9fb5
LP
1153 }
1154
27407a01 1155 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1156 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1157 log_error("%s: already a mount point, refusing to use for journal", q);
1158 return -EEXIST;
57fb9fb5
LP
1159 }
1160
27407a01 1161 return 0;
57fb9fb5
LP
1162 }
1163
1164 r = readlink_and_make_absolute(p, &d);
1165 if (r >= 0) {
1166 if ((arg_link_journal == LINK_GUEST ||
1167 arg_link_journal == LINK_AUTO) &&
1168 path_equal(d, q)) {
1169
27407a01
ZJS
1170 r = mkdir_p(q, 0755);
1171 if (r < 0)
1172 log_warning("failed to create directory %s: %m", q);
1173 return 0;
57fb9fb5
LP
1174 }
1175
1176 if (unlink(p) < 0) {
1177 log_error("Failed to remove symlink %s: %m", p);
27407a01 1178 return -errno;
57fb9fb5
LP
1179 }
1180 } else if (r == -EINVAL) {
1181
1182 if (arg_link_journal == LINK_GUEST &&
1183 rmdir(p) < 0) {
1184
27407a01
ZJS
1185 if (errno == ENOTDIR) {
1186 log_error("%s already exists and is neither a symlink nor a directory", p);
1187 return r;
1188 } else {
57fb9fb5 1189 log_error("Failed to remove %s: %m", p);
27407a01 1190 return -errno;
57fb9fb5 1191 }
57fb9fb5
LP
1192 }
1193 } else if (r != -ENOENT) {
1194 log_error("readlink(%s) failed: %m", p);
27407a01 1195 return r;
57fb9fb5
LP
1196 }
1197
1198 if (arg_link_journal == LINK_GUEST) {
1199
1200 if (symlink(q, p) < 0) {
1201 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 1202 return -errno;
57fb9fb5
LP
1203 }
1204
27407a01
ZJS
1205 r = mkdir_p(q, 0755);
1206 if (r < 0)
1207 log_warning("failed to create directory %s: %m", q);
1208 return 0;
57fb9fb5
LP
1209 }
1210
1211 if (arg_link_journal == LINK_HOST) {
1212 r = mkdir_p(p, 0755);
1213 if (r < 0) {
1214 log_error("Failed to create %s: %m", p);
27407a01 1215 return r;
57fb9fb5
LP
1216 }
1217
27407a01
ZJS
1218 } else if (access(p, F_OK) < 0)
1219 return 0;
57fb9fb5 1220
cdb2b9d0
LP
1221 if (dir_is_empty(q) == 0)
1222 log_warning("%s is not empty, proceeding anyway.", q);
1223
57fb9fb5
LP
1224 r = mkdir_p(q, 0755);
1225 if (r < 0) {
1226 log_error("Failed to create %s: %m", q);
27407a01 1227 return r;
57fb9fb5
LP
1228 }
1229
1230 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1231 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 1232 return -errno;
57fb9fb5
LP
1233 }
1234
27407a01 1235 return 0;
57fb9fb5
LP
1236}
1237
9bd37b40
LP
1238static int setup_kdbus(const char *dest, const char *path) {
1239 const char *p;
1240
1241 if (!path)
1242 return 0;
1243
1244 p = strappenda(dest, "/dev/kdbus");
1245 if (mkdir(p, 0755) < 0) {
1246 log_error("Failed to create kdbus path: %m");
1247 return -errno;
1248 }
1249
1250 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
486e99a3 1251 log_error("Failed to mount kdbus domain path: %m");
9bd37b40
LP
1252 return -errno;
1253 }
1254
1255 return 0;
1256}
1257
88213476 1258static int drop_capabilities(void) {
5076f0cc 1259 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1260}
1261
354bfd2b 1262static int register_machine(pid_t pid) {
9444b1f2
LP
1263 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1264 _cleanup_bus_unref_ sd_bus *bus = NULL;
1265 int r;
1266
eb91eb18
LP
1267 if (!arg_register)
1268 return 0;
1269
1c03020c 1270 r = sd_bus_default_system(&bus);
9444b1f2
LP
1271 if (r < 0) {
1272 log_error("Failed to open system bus: %s", strerror(-r));
1273 return r;
1274 }
1275
89f7c846
LP
1276 if (arg_keep_unit) {
1277 r = sd_bus_call_method(
1278 bus,
1279 "org.freedesktop.machine1",
1280 "/org/freedesktop/machine1",
1281 "org.freedesktop.machine1.Manager",
1282 "RegisterMachine",
1283 &error,
1284 NULL,
1285 "sayssus",
1286 arg_machine,
1287 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1288 "nspawn",
1289 "container",
1290 (uint32_t) pid,
1291 strempty(arg_directory));
1292 } else {
9457ac5b
LP
1293 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1294
1295 r = sd_bus_message_new_method_call(
89f7c846 1296 bus,
9457ac5b 1297 &m,
89f7c846
LP
1298 "org.freedesktop.machine1",
1299 "/org/freedesktop/machine1",
1300 "org.freedesktop.machine1.Manager",
9457ac5b
LP
1301 "CreateMachine");
1302 if (r < 0) {
1303 log_error("Failed to create message: %s", strerror(-r));
1304 return r;
1305 }
1306
1307 r = sd_bus_message_append(
1308 m,
1309 "sayssus",
89f7c846
LP
1310 arg_machine,
1311 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1312 "nspawn",
1313 "container",
1314 (uint32_t) pid,
9457ac5b
LP
1315 strempty(arg_directory));
1316 if (r < 0) {
1317 log_error("Failed to append message arguments: %s", strerror(-r));
1318 return r;
1319 }
1320
1321 r = sd_bus_message_open_container(m, 'a', "(sv)");
1322 if (r < 0) {
1323 log_error("Failed to open container: %s", strerror(-r));
1324 return r;
1325 }
1326
1327 if (!isempty(arg_slice)) {
1328 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1329 if (r < 0) {
1330 log_error("Failed to append slice: %s", strerror(-r));
1331 return r;
1332 }
1333 }
1334
1335 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1336 if (r < 0) {
1337 log_error("Failed to add device policy: %s", strerror(-r));
1338 return r;
1339 }
1340
a07f961e 1341 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
9457ac5b
LP
1342 /* Allow the container to
1343 * access and create the API
1344 * device nodes, so that
1345 * PrivateDevices= in the
1346 * container can work
1347 * fine */
1348 "/dev/null", "rwm",
1349 "/dev/zero", "rwm",
1350 "/dev/full", "rwm",
1351 "/dev/random", "rwm",
1352 "/dev/urandom", "rwm",
1353 "/dev/tty", "rwm",
1354 /* Allow the container
1355 * access to ptys. However,
1356 * do not permit the
1357 * container to ever create
1358 * these device nodes. */
1359 "/dev/pts/ptmx", "rw",
a07f961e
LP
1360 "char-pts", "rw",
1361 /* Allow the container
1362 * access to all kdbus
1363 * devices. Again, the
1364 * container cannot create
1365 * these nodes, only use
1366 * them. We use a pretty
1367 * open match here, so that
1368 * the kernel API can still
1369 * change. */
1370 "char-kdbus", "rw",
1371 "char-kdbus/*", "rw");
9457ac5b
LP
1372 if (r < 0) {
1373 log_error("Failed to add device whitelist: %s", strerror(-r));
1374 return r;
1375 }
1376
1377 r = sd_bus_message_close_container(m);
1378 if (r < 0) {
1379 log_error("Failed to close container: %s", strerror(-r));
1380 return r;
1381 }
1382
1383 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1384 }
1385
9444b1f2 1386 if (r < 0) {
1f0cd86b
LP
1387 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1388 return r;
1389 }
1390
1391 return 0;
1392}
1393
1394static int terminate_machine(pid_t pid) {
1395 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1396 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1397 _cleanup_bus_unref_ sd_bus *bus = NULL;
1398 const char *path;
1399 int r;
1400
eb91eb18
LP
1401 if (!arg_register)
1402 return 0;
1403
76b54375 1404 r = sd_bus_default_system(&bus);
1f0cd86b
LP
1405 if (r < 0) {
1406 log_error("Failed to open system bus: %s", strerror(-r));
1407 return r;
1408 }
1409
1410 r = sd_bus_call_method(
1411 bus,
1412 "org.freedesktop.machine1",
1413 "/org/freedesktop/machine1",
1414 "org.freedesktop.machine1.Manager",
1415 "GetMachineByPID",
1416 &error,
1417 &reply,
1418 "u",
1419 (uint32_t) pid);
1420 if (r < 0) {
1421 /* Note that the machine might already have been
1422 * cleaned up automatically, hence don't consider it a
1423 * failure if we cannot get the machine object. */
1424 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1425 return 0;
1426 }
1427
1428 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1429 if (r < 0)
1430 return bus_log_parse_error(r);
9444b1f2 1431
1f0cd86b
LP
1432 r = sd_bus_call_method(
1433 bus,
1434 "org.freedesktop.machine1",
1435 path,
1436 "org.freedesktop.machine1.Machine",
1437 "Terminate",
1438 &error,
1439 NULL,
1440 NULL);
1441 if (r < 0) {
1442 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1443 return 0;
1444 }
1445
9444b1f2
LP
1446 return 0;
1447}
1448
db999e0f
LP
1449static int reset_audit_loginuid(void) {
1450 _cleanup_free_ char *p = NULL;
1451 int r;
1452
1453 if (arg_share_system)
1454 return 0;
1455
1456 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1457 if (r == -ENOENT)
db999e0f
LP
1458 return 0;
1459 if (r < 0) {
1460 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1461 return r;
1462 }
1463
1464 /* Already reset? */
1465 if (streq(p, "4294967295"))
1466 return 0;
1467
1468 r = write_string_file("/proc/self/loginuid", "4294967295");
1469 if (r < 0) {
1470 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1471 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1472 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1473 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1474 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1475
db999e0f 1476 sleep(5);
77b6e194 1477 }
db999e0f
LP
1478
1479 return 0;
77b6e194
LP
1480}
1481
01dde061
TG
1482#define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1483
1484static int get_mac(struct ether_addr *mac) {
1485 int r;
1486
1487 uint8_t result[8];
1488 size_t l, sz;
1489 uint8_t *v;
1490
1491 l = strlen(arg_machine);
1492 sz = sizeof(sd_id128_t) + l;
1493 v = alloca(sz);
1494
1495 /* fetch some persistent data unique to the host */
1496 r = sd_id128_get_machine((sd_id128_t*) v);
1497 if (r < 0)
1498 return r;
1499
1500 /* combine with some data unique (on this host) to this
1501 * container instance */
1502 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1503
1504 /* Let's hash the host machine ID plus the container name. We
1505 * use a fixed, but originally randomly created hash key here. */
1506 siphash24(result, v, sz, HASH_KEY.bytes);
1507
1508 assert_cc(ETH_ALEN <= sizeof(result));
1509 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1510
1511 /* see eth_random_addr in the kernel */
1512 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1513 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1514
1515 return 0;
1516}
1517
08af0da2 1518static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
69c79d3c 1519 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 1520 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
01dde061 1521 struct ether_addr mac;
69c79d3c
LP
1522 int r;
1523
1524 if (!arg_private_network)
1525 return 0;
1526
1527 if (!arg_network_veth)
1528 return 0;
1529
08af0da2
LP
1530 /* Use two different interface name prefixes depending whether
1531 * we are in bridge mode or not. */
1532 if (arg_network_bridge)
1533 memcpy(iface_name, "vb-", 3);
1534 else
1535 memcpy(iface_name, "ve-", 3);
ab046dde 1536 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
69c79d3c 1537
01dde061
TG
1538 r = get_mac(&mac);
1539 if (r < 0) {
1540 log_error("Failed to generate predictable MAC address for host0");
1541 return r;
1542 }
1543
151b9b96 1544 r = sd_rtnl_open(&rtnl, 0);
69c79d3c
LP
1545 if (r < 0) {
1546 log_error("Failed to connect to netlink: %s", strerror(-r));
1547 return r;
1548 }
1549
151b9b96 1550 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
69c79d3c
LP
1551 if (r < 0) {
1552 log_error("Failed to allocate netlink message: %s", strerror(-r));
1553 return r;
1554 }
1555
ab046dde 1556 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
69c79d3c 1557 if (r < 0) {
ab046dde 1558 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1559 return r;
1560 }
1561
ee3a6a51 1562 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
69c79d3c
LP
1563 if (r < 0) {
1564 log_error("Failed to open netlink container: %s", strerror(-r));
1565 return r;
1566 }
1567
d8e538ec 1568 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
69c79d3c
LP
1569 if (r < 0) {
1570 log_error("Failed to open netlink container: %s", strerror(-r));
1571 return r;
1572 }
1573
ee3a6a51 1574 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
69c79d3c 1575 if (r < 0) {
ab046dde 1576 log_error("Failed to open netlink container: %s", strerror(-r));
69c79d3c
LP
1577 return r;
1578 }
1579
ab046dde 1580 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
69c79d3c 1581 if (r < 0) {
ab046dde 1582 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1583 return r;
1584 }
01dde061
TG
1585
1586 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1587 if (r < 0) {
1588 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1589 return r;
1590 }
69c79d3c 1591
ab046dde 1592 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
69c79d3c
LP
1593 if (r < 0) {
1594 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1595 return r;
1596 }
1597
1598 r = sd_rtnl_message_close_container(m);
1599 if (r < 0) {
1600 log_error("Failed to close netlink container: %s", strerror(-r));
1601 return r;
1602 }
1603
1604 r = sd_rtnl_message_close_container(m);
1605 if (r < 0) {
1606 log_error("Failed to close netlink container: %s", strerror(-r));
1607 return r;
1608 }
1609
1610 r = sd_rtnl_message_close_container(m);
1611 if (r < 0) {
1612 log_error("Failed to close netlink container: %s", strerror(-r));
1613 return r;
1614 }
1615
1616 r = sd_rtnl_call(rtnl, m, 0, NULL);
1617 if (r < 0) {
1618 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1619 return r;
1620 }
1621
1622 return 0;
1623}
1624
ab046dde
TG
1625static int setup_bridge(const char veth_name[]) {
1626 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1627 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1628 int r, bridge;
1629
1630 if (!arg_private_network)
1631 return 0;
1632
1633 if (!arg_network_veth)
1634 return 0;
1635
1636 if (!arg_network_bridge)
1637 return 0;
1638
1639 bridge = (int) if_nametoindex(arg_network_bridge);
1640 if (bridge <= 0) {
1641 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1642 return -errno;
1643 }
1644
151b9b96 1645 r = sd_rtnl_open(&rtnl, 0);
ab046dde
TG
1646 if (r < 0) {
1647 log_error("Failed to connect to netlink: %s", strerror(-r));
1648 return r;
1649 }
1650
151b9b96 1651 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
ab046dde
TG
1652 if (r < 0) {
1653 log_error("Failed to allocate netlink message: %s", strerror(-r));
1654 return r;
1655 }
1656
039dd4af
TG
1657 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1658 if (r < 0) {
1659 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1660 return r;
1661 }
1662
ab046dde
TG
1663 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1664 if (r < 0) {
1665 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1666 return r;
1667 }
1668
1669 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1670 if (r < 0) {
1671 log_error("Failed to add netlink master field: %s", strerror(-r));
1672 return r;
1673 }
1674
1675 r = sd_rtnl_call(rtnl, m, 0, NULL);
1676 if (r < 0) {
1677 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1678 return r;
1679 }
1680
1681 return 0;
1682}
1683
c74e630d
LP
1684static int parse_interface(struct udev *udev, const char *name) {
1685 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1686 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1687 int ifi;
1688
1689 ifi = (int) if_nametoindex(name);
1690 if (ifi <= 0) {
1691 log_error("Failed to resolve interface %s: %m", name);
1692 return -errno;
1693 }
1694
1695 sprintf(ifi_str, "n%i", ifi);
1696 d = udev_device_new_from_device_id(udev, ifi_str);
1697 if (!d) {
1698 log_error("Failed to get udev device for interface %s: %m", name);
1699 return -errno;
1700 }
1701
1702 if (udev_device_get_is_initialized(d) <= 0) {
1703 log_error("Network interface %s is not initialized yet.", name);
1704 return -EBUSY;
1705 }
1706
1707 return ifi;
1708}
1709
69c79d3c 1710static int move_network_interfaces(pid_t pid) {
7e227024 1711 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 1712 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1713 char **i;
1714 int r;
1715
1716 if (!arg_private_network)
1717 return 0;
1718
1719 if (strv_isempty(arg_network_interfaces))
1720 return 0;
1721
151b9b96 1722 r = sd_rtnl_open(&rtnl, 0);
aa28aefe
LP
1723 if (r < 0) {
1724 log_error("Failed to connect to netlink: %s", strerror(-r));
1725 return r;
1726 }
1727
7e227024
LP
1728 udev = udev_new();
1729 if (!udev) {
1730 log_error("Failed to connect to udev.");
1731 return -ENOMEM;
1732 }
1733
aa28aefe 1734 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1735 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 1736 int ifi;
aa28aefe 1737
c74e630d
LP
1738 ifi = parse_interface(udev, *i);
1739 if (ifi < 0)
1740 return ifi;
1741
1742 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1743 if (r < 0) {
1744 log_error("Failed to allocate netlink message: %s", strerror(-r));
1745 return r;
aa28aefe
LP
1746 }
1747
c74e630d
LP
1748 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1749 if (r < 0) {
1750 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1751 return r;
7e227024
LP
1752 }
1753
c74e630d
LP
1754 r = sd_rtnl_call(rtnl, m, 0, NULL);
1755 if (r < 0) {
1756 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1757 return r;
7e227024 1758 }
c74e630d 1759 }
7e227024 1760
c74e630d
LP
1761 return 0;
1762}
1763
1764static int setup_macvlan(pid_t pid) {
1765 _cleanup_udev_unref_ struct udev *udev = NULL;
1766 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1767 char **i;
1768 int r;
1769
1770 if (!arg_private_network)
1771 return 0;
1772
1773 if (strv_isempty(arg_network_macvlan))
1774 return 0;
1775
1776 r = sd_rtnl_open(&rtnl, 0);
1777 if (r < 0) {
1778 log_error("Failed to connect to netlink: %s", strerror(-r));
1779 return r;
1780 }
1781
1782 udev = udev_new();
1783 if (!udev) {
1784 log_error("Failed to connect to udev.");
1785 return -ENOMEM;
1786 }
1787
1788 STRV_FOREACH(i, arg_network_macvlan) {
1789 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1790 _cleanup_free_ char *n = NULL;
1791 int ifi;
1792
1793 ifi = parse_interface(udev, *i);
1794 if (ifi < 0)
1795 return ifi;
1796
1797 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
aa28aefe
LP
1798 if (r < 0) {
1799 log_error("Failed to allocate netlink message: %s", strerror(-r));
1800 return r;
1801 }
1802
c74e630d
LP
1803 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1804 if (r < 0) {
1805 log_error("Failed to add netlink interface index: %s", strerror(-r));
1806 return r;
1807 }
1808
1809 n = strappend("mv-", *i);
1810 if (!n)
1811 return log_oom();
1812
1813 strshorten(n, IFNAMSIZ-1);
1814
1815 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1816 if (r < 0) {
1817 log_error("Failed to add netlink interface name: %s", strerror(-r));
1818 return r;
1819 }
1820
aa28aefe
LP
1821 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1822 if (r < 0) {
c74e630d
LP
1823 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1824 return r;
1825 }
1826
1827 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1828 if (r < 0) {
1829 log_error("Failed to open netlink container: %s", strerror(-r));
1830 return r;
1831 }
1832
d8e538ec 1833 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
c74e630d
LP
1834 if (r < 0) {
1835 log_error("Failed to open netlink container: %s", strerror(-r));
1836 return r;
1837 }
1838
1839 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1840 if (r < 0) {
1841 log_error("Failed to append macvlan mode: %s", strerror(-r));
1842 return r;
1843 }
1844
1845 r = sd_rtnl_message_close_container(m);
1846 if (r < 0) {
1847 log_error("Failed to close netlink container: %s", strerror(-r));
1848 return r;
1849 }
1850
1851 r = sd_rtnl_message_close_container(m);
1852 if (r < 0) {
1853 log_error("Failed to close netlink container: %s", strerror(-r));
aa28aefe
LP
1854 return r;
1855 }
1856
1857 r = sd_rtnl_call(rtnl, m, 0, NULL);
1858 if (r < 0) {
c74e630d 1859 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
aa28aefe
LP
1860 return r;
1861 }
1862 }
1863
1864 return 0;
1865}
1866
/* Builds and loads a seccomp filter that allows everything by default,
 * but makes the system calls in the blacklist below fail with EPERM and
 * makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op returning 0. Otherwise returns the
 * result of the last seccomp call made, i.e. a negative value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(blacklist); idx++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[idx], 0);

                /* -EFAULT indicates an unknown syscall, which we
                 * simply skip */
                if (r < 0 && r != -EFAULT) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1946
1b9e5b12
LP
1947static int setup_image(char **device_path, int *loop_nr) {
1948 struct loop_info64 info = {
1949 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1950 };
1951 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1952 _cleanup_free_ char* loopdev = NULL;
1953 struct stat st;
1954 int r, nr;
1955
1956 assert(device_path);
1957 assert(loop_nr);
1958
1959 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1960 if (fd < 0) {
1961 log_error("Failed to open %s: %m", arg_image);
1962 return -errno;
1963 }
1964
1965 if (fstat(fd, &st) < 0) {
1966 log_error("Failed to stat %s: %m", arg_image);
1967 return -errno;
1968 }
1969
1970 if (S_ISBLK(st.st_mode)) {
1971 char *p;
1972
1973 p = strdup(arg_image);
1974 if (!p)
1975 return log_oom();
1976
1977 *device_path = p;
1978
1979 *loop_nr = -1;
1980
1981 r = fd;
1982 fd = -1;
1983
1984 return r;
1985 }
1986
1987 if (!S_ISREG(st.st_mode)) {
1988 log_error("%s is not a regular file or block device: %m", arg_image);
1989 return -EINVAL;
1990 }
1991
1992 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1993 if (control < 0) {
1994 log_error("Failed to open /dev/loop-control: %m");
1995 return -errno;
1996 }
1997
1998 nr = ioctl(control, LOOP_CTL_GET_FREE);
1999 if (nr < 0) {
2000 log_error("Failed to allocate loop device: %m");
2001 return -errno;
2002 }
2003
2004 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2005 return log_oom();
2006
2007 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2008 if (loop < 0) {
2009 log_error("Failed to open loop device %s: %m", loopdev);
2010 return -errno;
2011 }
2012
2013 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2014 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2015 return -errno;
2016 }
2017
2018 if (arg_read_only)
2019 info.lo_flags |= LO_FLAGS_READ_ONLY;
2020
2021 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2022 log_error("Failed to set loopback settings on %s: %m", loopdev);
2023 return -errno;
2024 }
2025
2026 *device_path = loopdev;
2027 loopdev = NULL;
2028
2029 *loop_nr = nr;
2030
2031 r = loop;
2032 loop = -1;
2033
2034 return r;
2035}
2036
2037static int dissect_image(
2038 int fd,
727fd4fd
LP
2039 char **root_device, bool *root_device_rw,
2040 char **home_device, bool *home_device_rw,
2041 char **srv_device, bool *srv_device_rw,
1b9e5b12
LP
2042 bool *secondary) {
2043
2044#ifdef HAVE_BLKID
2045 int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2046 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2047 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2048 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2049 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2050 _cleanup_udev_unref_ struct udev *udev = NULL;
2051 struct udev_list_entry *first, *item;
727fd4fd 2052 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
1b9e5b12
LP
2053 const char *pttype = NULL;
2054 blkid_partlist pl;
2055 struct stat st;
2056 int r;
2057
2058 assert(fd >= 0);
2059 assert(root_device);
2060 assert(home_device);
2061 assert(srv_device);
2062 assert(secondary);
2063
2064 b = blkid_new_probe();
2065 if (!b)
2066 return log_oom();
2067
2068 errno = 0;
2069 r = blkid_probe_set_device(b, fd, 0, 0);
2070 if (r != 0) {
2071 if (errno == 0)
2072 return log_oom();
2073
2074 log_error("Failed to set device on blkid probe: %m");
2075 return -errno;
2076 }
2077
2078 blkid_probe_enable_partitions(b, 1);
2079 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2080
2081 errno = 0;
2082 r = blkid_do_safeprobe(b);
2083 if (r == -2 || r == 1) {
2084 log_error("Failed to identify any partition table on %s.\n"
2085 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2086 return -EINVAL;
2087 } else if (r != 0) {
2088 if (errno == 0)
2089 errno = EIO;
2090 log_error("Failed to probe: %m");
2091 return -errno;
2092 }
2093
2094 blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2095 if (!streq_ptr(pttype, "gpt")) {
2096 log_error("Image %s does not carry a GUID Partition Table.\n"
2097 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2098 return -EINVAL;
2099 }
2100
2101 errno = 0;
2102 pl = blkid_probe_get_partitions(b);
2103 if (!pl) {
2104 if (errno == 0)
2105 return log_oom();
2106
2107 log_error("Failed to list partitions of %s", arg_image);
2108 return -errno;
2109 }
2110
2111 udev = udev_new();
2112 if (!udev)
2113 return log_oom();
2114
2115 if (fstat(fd, &st) < 0) {
2116 log_error("Failed to stat block device: %m");
2117 return -errno;
2118 }
2119
2120 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2121 if (!d)
2122 return log_oom();
2123
2124 e = udev_enumerate_new(udev);
2125 if (!e)
2126 return log_oom();
2127
2128 r = udev_enumerate_add_match_parent(e, d);
2129 if (r < 0)
2130 return log_oom();
2131
2132 r = udev_enumerate_scan_devices(e);
2133 if (r < 0) {
2134 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2135 return r;
2136 }
2137
2138 first = udev_enumerate_get_list_entry(e);
2139 udev_list_entry_foreach(item, first) {
2140 _cleanup_udev_device_unref_ struct udev_device *q;
2141 const char *stype, *node;
727fd4fd 2142 unsigned long long flags;
1b9e5b12
LP
2143 sd_id128_t type_id;
2144 blkid_partition pp;
2145 dev_t qn;
2146 int nr;
2147
2148 errno = 0;
2149 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2150 if (!q) {
2151 if (!errno)
2152 errno = ENOMEM;
2153
2154 log_error("Failed to get partition device of %s: %m", arg_image);
2155 return -errno;
2156 }
2157
2158 qn = udev_device_get_devnum(q);
2159 if (major(qn) == 0)
2160 continue;
2161
2162 if (st.st_rdev == qn)
2163 continue;
2164
2165 node = udev_device_get_devnode(q);
2166 if (!node)
2167 continue;
2168
2169 pp = blkid_partlist_devno_to_partition(pl, qn);
2170 if (!pp)
2171 continue;
2172
727fd4fd
LP
2173 flags = blkid_partition_get_flags(pp);
2174 if (flags & GPT_FLAG_NO_AUTO)
2175 continue;
2176
1b9e5b12
LP
2177 nr = blkid_partition_get_partno(pp);
2178 if (nr < 0)
2179 continue;
2180
2181 stype = blkid_partition_get_type_string(pp);
2182 if (!stype)
2183 continue;
2184
2185 if (sd_id128_from_string(stype, &type_id) < 0)
2186 continue;
2187
2188 if (sd_id128_equal(type_id, GPT_HOME)) {
2189
2190 if (home && nr >= home_nr)
2191 continue;
2192
2193 home_nr = nr;
727fd4fd
LP
2194 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2195
1b9e5b12
LP
2196 free(home);
2197 home = strdup(node);
2198 if (!home)
2199 return log_oom();
2200 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2201
2202 if (srv && nr >= srv_nr)
2203 continue;
2204
2205 srv_nr = nr;
727fd4fd
LP
2206 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2207
1b9e5b12
LP
2208 free(srv);
2209 srv = strdup(node);
2210 if (!srv)
2211 return log_oom();
2212 }
2213#ifdef GPT_ROOT_NATIVE
2214 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2215
2216 if (root && nr >= root_nr)
2217 continue;
2218
2219 root_nr = nr;
727fd4fd
LP
2220 root_rw = !(flags & GPT_FLAG_READ_ONLY);
2221
1b9e5b12
LP
2222 free(root);
2223 root = strdup(node);
2224 if (!root)
2225 return log_oom();
2226 }
2227#endif
2228#ifdef GPT_ROOT_SECONDARY
2229 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2230
2231 if (secondary_root && nr >= secondary_root_nr)
2232 continue;
2233
2234 secondary_root_nr = nr;
727fd4fd
LP
2235 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2236
2237
1b9e5b12
LP
2238 free(secondary_root);
2239 secondary_root = strdup(node);
2240 if (!secondary_root)
2241 return log_oom();
2242 }
2243#endif
2244 }
2245
2246 if (!root && !secondary_root) {
2247 log_error("Failed to identify root partition in disk image %s.\n"
2248 "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2249 return -EINVAL;
2250 }
2251
2252 if (root) {
2253 *root_device = root;
2254 root = NULL;
727fd4fd
LP
2255
2256 *root_device_rw = root_rw;
1b9e5b12
LP
2257 *secondary = false;
2258 } else if (secondary_root) {
2259 *root_device = secondary_root;
2260 secondary_root = NULL;
727fd4fd
LP
2261
2262 *root_device_rw = secondary_root_rw;
1b9e5b12
LP
2263 *secondary = true;
2264 }
2265
2266 if (home) {
2267 *home_device = home;
2268 home = NULL;
727fd4fd
LP
2269
2270 *home_device_rw = home_rw;
1b9e5b12
LP
2271 }
2272
2273 if (srv) {
2274 *srv_device = srv;
2275 srv = NULL;
727fd4fd
LP
2276
2277 *srv_device_rw = srv_rw;
1b9e5b12
LP
2278 }
2279
2280 return 0;
2281#else
2282 log_error("--image= is not supported, compiled without blkid support.");
2283 return -ENOTSUP;
2284#endif
2285}
2286
/*
 * Probe the file system type of the block device `what` with libblkid and
 * mount it on `where`, or on `where` with `directory` appended when
 * `directory` is non-NULL.
 *
 * The mount is done with MS_NODEV, and additionally MS_RDONLY when `rw` is
 * false or the global arg_read_only is set.
 *
 * Returns 0 on success and a negative errno-style value on failure:
 * -EINVAL when no (or no unambiguous) file system type could be detected,
 * -ENOTSUP for LUKS volumes or when built without blkid support.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing was detected, -2: the result is ambivalent.
                 * Neither is an I/O error, the type simply isn't known. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* -1 (or anything else unexpected): a real probing error */
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2352
727fd4fd
LP
/*
 * Mount the partitions found in a disk image below `where`:
 * the root device on `where` itself, the home device on `where`/home and
 * the server data device on `where`/srv. Devices passed as NULL are
 * skipped. Each *_rw flag is handed through to mount_device().
 *
 * Returns 0 on success, or the negative error of the first mount that
 * failed (later devices are not attempted).
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Honour the `where` argument rather than reaching for the global
         * arg_directory, so the function does what its signature says. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2388
2389static void loop_remove(int nr, int *image_fd) {
2390 _cleanup_close_ int control = -1;
2391
2392 if (nr < 0)
2393 return;
2394
2395 if (image_fd && *image_fd >= 0) {
2396 ioctl(*image_fd, LOOP_CLR_FD);
03e334a1 2397 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2398 }
2399
2400 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2401 if (control < 0)
2402 return;
2403
2404 ioctl(control, LOOP_CTL_REMOVE, nr);
2405}
2406
0cb9fbcd
LP
/*
 * Fork a child that runs "getent <database> <key>" with an empty
 * environment, its stdout connected to a pipe and stdin/stderr connected
 * to /dev/null.
 *
 * On success the read end of the pipe is returned (the caller owns it)
 * and the child's PID is stored in *rpid. On failure a negative errno is
 * returned and no descriptors are left open in the caller.
 */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
                log_error("Failed to allocate pipe: %m");
                return -errno;
        }

        pid = fork();
        if (pid < 0) {
                int saved_errno = errno;

                log_error("Failed to fork getent child: %m");

                /* No child exists, so nobody else will ever close these */
                safe_close(pipe_fds[0]);
                safe_close(pipe_fds[1]);

                return -saved_errno;
        } else if (pid == 0) {
                int nullfd;
                char *empty_env = NULL;

                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                /* Only close the originals if they are not one of the
                 * standard descriptors we are in the middle of setting up */
                if (pipe_fds[0] > 2)
                        safe_close(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        safe_close(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        safe_close(nullfd);

                reset_all_signal_handlers();
                close_all_fds(NULL, 0);

                /* Try both common locations; execle() only returns on failure */
                execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* Parent: keep only the read end */
        pipe_fds[1] = safe_close(pipe_fds[1]);

        *rpid = pid;

        return pipe_fds[0];
}
2463
2464static int change_uid_gid(char **_home) {
0cb9fbcd
LP
2465 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2466 _cleanup_free_ uid_t *uids = NULL;
2467 _cleanup_free_ char *home = NULL;
2468 _cleanup_fclose_ FILE *f = NULL;
2469 _cleanup_close_ int fd = -1;
2470 unsigned n_uids = 0;
70f539ca 2471 size_t sz = 0, l;
0cb9fbcd
LP
2472 uid_t uid;
2473 gid_t gid;
2474 pid_t pid;
2475 int r;
2476
2477 assert(_home);
2478
2479 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2480 /* Reset everything fully to 0, just in case */
2481
2482 if (setgroups(0, NULL) < 0) {
2483 log_error("setgroups() failed: %m");
2484 return -errno;
2485 }
2486
2487 if (setresgid(0, 0, 0) < 0) {
2488 log_error("setregid() failed: %m");
2489 return -errno;
2490 }
2491
2492 if (setresuid(0, 0, 0) < 0) {
2493 log_error("setreuid() failed: %m");
2494 return -errno;
2495 }
2496
2497 *_home = NULL;
2498 return 0;
2499 }
2500
2501 /* First, get user credentials */
2502 fd = spawn_getent("passwd", arg_user, &pid);
2503 if (fd < 0)
2504 return fd;
2505
2506 f = fdopen(fd, "r");
2507 if (!f)
2508 return log_oom();
2509 fd = -1;
2510
2511 if (!fgets(line, sizeof(line), f)) {
2512
2513 if (!ferror(f)) {
2514 log_error("Failed to resolve user %s.", arg_user);
2515 return -ESRCH;
2516 }
2517
2518 log_error("Failed to read from getent: %m");
2519 return -errno;
2520 }
2521
2522 truncate_nl(line);
2523
2524 wait_for_terminate_and_warn("getent passwd", pid);
2525
2526 x = strchr(line, ':');
2527 if (!x) {
2528 log_error("/etc/passwd entry has invalid user field.");
2529 return -EIO;
2530 }
2531
2532 u = strchr(x+1, ':');
2533 if (!u) {
2534 log_error("/etc/passwd entry has invalid password field.");
2535 return -EIO;
2536 }
2537
2538 u++;
2539 g = strchr(u, ':');
2540 if (!g) {
2541 log_error("/etc/passwd entry has invalid UID field.");
2542 return -EIO;
2543 }
2544
2545 *g = 0;
2546 g++;
2547 x = strchr(g, ':');
2548 if (!x) {
2549 log_error("/etc/passwd entry has invalid GID field.");
2550 return -EIO;
2551 }
2552
2553 *x = 0;
2554 h = strchr(x+1, ':');
2555 if (!h) {
2556 log_error("/etc/passwd entry has invalid GECOS field.");
2557 return -EIO;
2558 }
2559
2560 h++;
2561 x = strchr(h, ':');
2562 if (!x) {
2563 log_error("/etc/passwd entry has invalid home directory field.");
2564 return -EIO;
2565 }
2566
2567 *x = 0;
2568
2569 r = parse_uid(u, &uid);
2570 if (r < 0) {
2571 log_error("Failed to parse UID of user.");
2572 return -EIO;
2573 }
2574
2575 r = parse_gid(g, &gid);
2576 if (r < 0) {
2577 log_error("Failed to parse GID of user.");
2578 return -EIO;
2579 }
2580
2581 home = strdup(h);
2582 if (!home)
2583 return log_oom();
2584
2585 /* Second, get group memberships */
2586 fd = spawn_getent("initgroups", arg_user, &pid);
2587 if (fd < 0)
2588 return fd;
2589
2590 fclose(f);
2591 f = fdopen(fd, "r");
2592 if (!f)
2593 return log_oom();
2594 fd = -1;
2595
2596 if (!fgets(line, sizeof(line), f)) {
2597 if (!ferror(f)) {
2598 log_error("Failed to resolve user %s.", arg_user);
2599 return -ESRCH;
2600 }
2601
2602 log_error("Failed to read from getent: %m");
2603 return -errno;
2604 }
2605
2606 truncate_nl(line);
2607
2608 wait_for_terminate_and_warn("getent initgroups", pid);
2609
2610 /* Skip over the username and subsequent separator whitespace */
2611 x = line;
2612 x += strcspn(x, WHITESPACE);
2613 x += strspn(x, WHITESPACE);
2614
2615 FOREACH_WORD(w, l, x, state) {
2616 char c[l+1];
2617
2618 memcpy(c, w, l);
2619 c[l] = 0;
2620
2621 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2622 return log_oom();
2623
2624 r = parse_uid(c, &uids[n_uids++]);
2625 if (r < 0) {
2626 log_error("Failed to parse group data from getent.");
2627 return -EIO;
2628 }
2629 }
2630
2631 r = mkdir_parents(home, 0775);
2632 if (r < 0) {
2633 log_error("Failed to make home root directory: %s", strerror(-r));
2634 return r;
2635 }
2636
2637 r = mkdir_safe(home, 0755, uid, gid);
f418f31d 2638 if (r < 0 && r != -EEXIST) {
0cb9fbcd
LP
2639 log_error("Failed to make home directory: %s", strerror(-r));
2640 return r;
2641 }
2642
2643 fchown(STDIN_FILENO, uid, gid);
2644 fchown(STDOUT_FILENO, uid, gid);
2645 fchown(STDERR_FILENO, uid, gid);
2646
2647 if (setgroups(n_uids, uids) < 0) {
2648 log_error("Failed to set auxiliary groups: %m");
2649 return -errno;
2650 }
2651
2652 if (setresgid(gid, gid, gid) < 0) {
2653 log_error("setregid() failed: %m");
2654 return -errno;
2655 }
2656
2657 if (setresuid(uid, uid, uid) < 0) {
2658 log_error("setreuid() failed: %m");
2659 return -errno;
2660 }
2661
2662 if (_home) {
2663 *_home = home;
2664 home = NULL;
2665 }
2666
2667 return 0;
2668}
2669
113cea80 2670/*
6d416b9c
LS
2671 * Return values:
2672 * < 0 : wait_for_terminate() failed to get the state of the
2673 * container, the container was terminated by a signal, or
2674 * failed for an unknown reason. No change is made to the
2675 * container argument.
2676 * > 0 : The program executed in the container terminated with an
2677 * error. The exit code of the program executed in the
2678 * container is returned. No change is made to the container
2679 * argument.
2680 * 0 : The container is being rebooted, has been shut down or exited
2681 * successfully. The container argument has been set to either
2682 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 2683 *
6d416b9c
LS
2684 * That is, success is indicated by a return value of zero, and an
2685 * error is indicated by a non-zero value.
113cea80
DH
2686 */
2687static int wait_for_container(pid_t pid, ContainerStatus *container) {
2688 int r;
2689 siginfo_t status;
2690
2691 r = wait_for_terminate(pid, &status);
8baaf7a3
LS
2692 if (r < 0) {
2693 log_warning("Failed to wait for container: %s", strerror(-r));
113cea80 2694 return r;
8baaf7a3 2695 }
113cea80
DH
2696
2697 switch (status.si_code) {
2698 case CLD_EXITED:
2699 r = status.si_status;
2700 if (r == 0) {
2701 if (!arg_quiet)
2702 log_debug("Container %s exited successfully.",
2703 arg_machine);
2704
2705 *container = CONTAINER_TERMINATED;
2706 } else {
2707 log_error("Container %s failed with error code %i.",
2708 arg_machine, status.si_status);
113cea80
DH
2709 }
2710 break;
2711
2712 case CLD_KILLED:
2713 if (status.si_status == SIGINT) {
2714 if (!arg_quiet)
2715 log_info("Container %s has been shut down.",
2716 arg_machine);
2717
2718 *container = CONTAINER_TERMINATED;
2719 r = 0;
2720 break;
2721 } else if (status.si_status == SIGHUP) {
2722 if (!arg_quiet)
2723 log_info("Container %s is being rebooted.",
2724 arg_machine);
2725
2726 *container = CONTAINER_REBOOTED;
2727 r = 0;
2728 break;
2729 }
2730 /* CLD_KILLED fallthrough */
2731
2732 case CLD_DUMPED:
2733 log_error("Container %s terminated by signal %s.",
2734 arg_machine, signal_to_string(status.si_status));
2735 r = -1;
2736 break;
2737
2738 default:
2739 log_error("Container %s failed due to unknown reason.",
2740 arg_machine);
2741 r = -1;
2742 break;
2743 }
2744
2745 return r;
2746}
2747
e866af3a
DH
/* Signal handler that deliberately does nothing; main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls in the parent
 * instead of being ignored. */
static void nop_handler(int sig) {
        (void) sig;
}
2749
88213476 2750int main(int argc, char *argv[]) {
69c79d3c 2751
1b9e5b12 2752 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
727fd4fd 2753 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
d96c1ecf 2754 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
3d94f76c 2755 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
69c79d3c 2756 _cleanup_fdset_free_ FDSet *fds = NULL;
1b9e5b12 2757 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
69c79d3c 2758 const char *console = NULL;
1b9e5b12
LP
2759 char veth_name[IFNAMSIZ];
2760 bool secondary = false;
e866af3a 2761 sigset_t mask, mask_chld;
69c79d3c 2762 pid_t pid = 0;
88213476
LP
2763
2764 log_parse_environment();
2765 log_open();
2766
05947bef
LP
2767 k = parse_argv(argc, argv);
2768 if (k < 0)
88213476 2769 goto finish;
05947bef
LP
2770 else if (k == 0) {
2771 r = EXIT_SUCCESS;
2772 goto finish;
2773 }
88213476 2774
1b9e5b12
LP
2775 if (!arg_image) {
2776 if (arg_directory) {
2777 char *p;
88213476 2778
1b9e5b12
LP
2779 p = path_make_absolute_cwd(arg_directory);
2780 free(arg_directory);
2781 arg_directory = p;
2782 } else
2783 arg_directory = get_current_dir_name();
88213476 2784
1b9e5b12
LP
2785 if (!arg_directory) {
2786 log_error("Failed to determine path, please use -D.");
2787 goto finish;
2788 }
2789 path_kill_slashes(arg_directory);
88213476
LP
2790 }
2791
7027ff61 2792 if (!arg_machine) {
1b9e5b12 2793 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
7027ff61
LP
2794 if (!arg_machine) {
2795 log_oom();
2796 goto finish;
2797 }
2798
e724b063 2799 hostname_cleanup(arg_machine, false);
7027ff61
LP
2800 if (isempty(arg_machine)) {
2801 log_error("Failed to determine machine name automatically, please use -M.");
2802 goto finish;
2803 }
2804 }
2805
88213476
LP
2806 if (geteuid() != 0) {
2807 log_error("Need to be root.");
2808 goto finish;
2809 }
2810
04d391da
LP
2811 if (sd_booted() <= 0) {
2812 log_error("Not running on a systemd system.");
2813 goto finish;
2814 }
2815
1b9e5b12
LP
2816 log_close();
2817 n_fd_passed = sd_listen_fds(false);
2818 if (n_fd_passed > 0) {
2819 k = fdset_new_listen_fds(&fds, false);
2820 if (k < 0) {
2821 log_error("Failed to collect file descriptors: %s", strerror(-k));
2822 goto finish;
2823 }
88213476 2824 }
1b9e5b12
LP
2825 fdset_close_others(fds);
2826 log_open();
88213476 2827
1b9e5b12
LP
2828 if (arg_directory) {
2829 if (path_equal(arg_directory, "/")) {
2830 log_error("Spawning container on root directory not supported.");
6b9132a9
LP
2831 goto finish;
2832 }
1b9e5b12
LP
2833
2834 if (arg_boot) {
2835 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 2836 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
1b9e5b12
LP
2837 goto finish;
2838 }
2839 } else {
2840 const char *p;
2841
2842 p = strappenda(arg_directory,
2843 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2844 if (access(p, F_OK) < 0) {
2845 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2846 goto finish;
2847
2848 }
2849 }
6b9132a9 2850 } else {
1b9e5b12 2851 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 2852
1b9e5b12
LP
2853 if (!mkdtemp(template)) {
2854 log_error("Failed to create temporary directory: %m");
2855 r = -errno;
6b9132a9 2856 goto finish;
1b9e5b12 2857 }
6b9132a9 2858
1b9e5b12
LP
2859 arg_directory = strdup(template);
2860 if (!arg_directory) {
2861 r = log_oom();
2862 goto finish;
6b9132a9 2863 }
88213476 2864
1b9e5b12
LP
2865 image_fd = setup_image(&device_path, &loop_nr);
2866 if (image_fd < 0) {
2867 r = image_fd;
842f3b0f
LP
2868 goto finish;
2869 }
1b9e5b12 2870
727fd4fd 2871 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
1b9e5b12
LP
2872 if (r < 0)
2873 goto finish;
842f3b0f 2874 }
842f3b0f 2875
db7feb7e
LP
2876 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2877 if (master < 0) {
a258bf26
LP
2878 log_error("Failed to acquire pseudo tty: %m");
2879 goto finish;
2880 }
2881
db7feb7e
LP
2882 console = ptsname(master);
2883 if (!console) {
a258bf26
LP
2884 log_error("Failed to determine tty name: %m");
2885 goto finish;
2886 }
2887
284c0b91 2888 if (!arg_quiet)
45f1386c
ZJS
2889 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2890 arg_machine, arg_image ? arg_image : arg_directory);
a258bf26
LP
2891
2892 if (unlockpt(master) < 0) {
2893 log_error("Failed to unlock tty: %m");
2894 goto finish;
2895 }
2896
eb91eb18
LP
2897 if (access("/dev/kdbus/control", F_OK) >= 0) {
2898
2899 if (arg_share_system) {
2900 kdbus_domain = strdup("/dev/kdbus");
2901 if (!kdbus_domain) {
2902 log_oom();
2903 goto finish;
2904 }
2905 } else {
2906 const char *ns;
2907
2908 ns = strappenda("machine-", arg_machine);
2909 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2910 if (r < 0)
2911 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2912 else
2913 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2914 }
2915 }
9bd37b40 2916
e58a1277 2917 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
354bfd2b
LP
2918 log_error("Failed to create kmsg socket pair: %m");
2919 goto finish;
2920 }
2921
05947bef
LP
2922 sd_notify(0, "READY=1");
2923
a258bf26 2924 assert_se(sigemptyset(&mask) == 0);
e866af3a
DH
2925 assert_se(sigemptyset(&mask_chld) == 0);
2926 sigaddset(&mask_chld, SIGCHLD);
a258bf26
LP
2927 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2928 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2929
d87be9b0 2930 for (;;) {
113cea80 2931 ContainerStatus container_status;
e866af3a
DH
2932 int eventfds[2] = { -1, -1 };
2933 struct sigaction sa = {
2934 .sa_handler = nop_handler,
2935 .sa_flags = SA_NOCLDSTOP,
2936 };
2937
2938 /* Child can be killed before execv(), so handle SIGCHLD
2939 * in order to interrupt parent's blocking calls and
2940 * give it a chance to call wait() and terminate. */
2941 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2942 if (r < 0) {
2943 log_error("Failed to change the signal mask: %m");
d96c1ecf
LP
2944 goto finish;
2945 }
2946
e866af3a
DH
2947 r = sigaction(SIGCHLD, &sa, NULL);
2948 if (r < 0) {
2949 log_error("Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
2950 goto finish;
2951 }
2952
e866af3a
DH
2953 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2954 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2955 (arg_private_network ? CLONE_NEWNET : 0), eventfds);
d87be9b0
LP
2956 if (pid < 0) {
2957 if (errno == EINVAL)
2958 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2959 else
2960 log_error("clone() failed: %m");
a258bf26 2961
e866af3a 2962 r = pid;
d87be9b0
LP
2963 goto finish;
2964 }
a258bf26 2965
d87be9b0
LP
2966 if (pid == 0) {
2967 /* child */
0cb9fbcd 2968 _cleanup_free_ char *home = NULL;
5674767e 2969 unsigned n_env = 2;
d87be9b0 2970 const char *envp[] = {
e10a55fd 2971 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
2972 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2973 NULL, /* TERM */
2974 NULL, /* HOME */
2975 NULL, /* USER */
2976 NULL, /* LOGNAME */
2977 NULL, /* container_uuid */
842f3b0f
LP
2978 NULL, /* LISTEN_FDS */
2979 NULL, /* LISTEN_PID */
d87be9b0
LP
2980 NULL
2981 };
f4889f65 2982 char **env_use;
a258bf26 2983
5674767e
ZJS
2984 envp[n_env] = strv_find_prefix(environ, "TERM=");
2985 if (envp[n_env])
2986 n_env ++;
a258bf26 2987
03e334a1 2988 master = safe_close(master);
a258bf26 2989
d87be9b0
LP
2990 close_nointr(STDIN_FILENO);
2991 close_nointr(STDOUT_FILENO);
2992 close_nointr(STDERR_FILENO);
db7feb7e 2993
03e334a1 2994 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
a258bf26 2995
d87be9b0 2996 reset_all_signal_handlers();
88213476 2997
d87be9b0
LP
2998 assert_se(sigemptyset(&mask) == 0);
2999 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 3000
842f3b0f
LP
3001 k = open_terminal(console, O_RDWR);
3002 if (k != STDIN_FILENO) {
3003 if (k >= 0) {
03e334a1 3004 safe_close(k);
842f3b0f
LP
3005 k = -EINVAL;
3006 }
3007
3008 log_error("Failed to open console: %s", strerror(-k));
3009 goto child_fail;
3010 }
3011
3012 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3013 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3014 log_error("Failed to duplicate console: %m");
d87be9b0 3015 goto child_fail;
842f3b0f 3016 }
bc2f673e 3017
d87be9b0
LP
3018 if (setsid() < 0) {
3019 log_error("setsid() failed: %m");
bc2f673e
LP
3020 goto child_fail;
3021 }
3022
db999e0f
LP
3023 if (reset_audit_loginuid() < 0)
3024 goto child_fail;
3025
d87be9b0
LP
3026 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3027 log_error("PR_SET_PDEATHSIG failed: %m");
3028 goto child_fail;
3029 }
e58a1277 3030
d87be9b0
LP
3031 /* Mark everything as slave, so that we still
3032 * receive mounts from the real root, but don't
3033 * propagate mounts to the real root. */
3034 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3035 log_error("MS_SLAVE|MS_REC failed: %m");
3036 goto child_fail;
3037 }
04bc4a3f 3038
727fd4fd
LP
3039 if (mount_devices(arg_directory,
3040 root_device, root_device_rw,
3041 home_device, home_device_rw,
3042 srv_device, srv_device_rw) < 0)
1b9e5b12
LP
3043 goto child_fail;
3044
3577de7a
KS
3045 r = base_filesystem_create(arg_directory);
3046 if (r < 0) {
971ff8c7 3047 log_error("Failed to create the base filesystem: %s", strerror(-r));
3577de7a
KS
3048 goto child_fail;
3049 }
3050
d87be9b0
LP
3051 /* Turn directory into bind mount */
3052 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
d6797c92 3053 log_error("Failed to make bind mount: %m");
d87be9b0
LP
3054 goto child_fail;
3055 }
88213476 3056
d6797c92
LP
3057 if (arg_read_only) {
3058 k = bind_remount_recursive(arg_directory, true);
3059 if (k < 0) {
3060 log_error("Failed to make tree read-only: %s", strerror(-k));
d87be9b0
LP
3061 goto child_fail;
3062 }
d6797c92 3063 }
2547bb41 3064
d87be9b0
LP
3065 if (mount_all(arg_directory) < 0)
3066 goto child_fail;
57fb9fb5 3067
d87be9b0
LP
3068 if (copy_devnodes(arg_directory) < 0)
3069 goto child_fail;
a258bf26 3070
f2d88580
LP
3071 if (setup_ptmx(arg_directory) < 0)
3072 goto child_fail;
3073
d87be9b0 3074 dev_setup(arg_directory);
88213476 3075
28650077 3076 if (setup_seccomp() < 0)
24fb1112
LP
3077 goto child_fail;
3078
d87be9b0
LP
3079 if (setup_dev_console(arg_directory, console) < 0)
3080 goto child_fail;
88213476 3081
d87be9b0
LP
3082 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3083 goto child_fail;
88213476 3084
03e334a1 3085 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 3086
d87be9b0
LP
3087 if (setup_boot_id(arg_directory) < 0)
3088 goto child_fail;
a41fe3a2 3089
d87be9b0
LP
3090 if (setup_timezone(arg_directory) < 0)
3091 goto child_fail;
88213476 3092
d87be9b0
LP
3093 if (setup_resolv_conf(arg_directory) < 0)
3094 goto child_fail;
687d0825 3095
d87be9b0 3096 if (setup_journal(arg_directory) < 0)
687d0825 3097 goto child_fail;
687d0825 3098
d6797c92 3099 if (mount_binds(arg_directory, arg_bind, false) < 0)
17fe0523
LP
3100 goto child_fail;
3101
d6797c92 3102 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
17fe0523
LP
3103 goto child_fail;
3104
06c17c39
LP
3105 if (mount_tmpfs(arg_directory) < 0)
3106 goto child_fail;
3107
486e99a3 3108 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
9bd37b40
LP
3109 goto child_fail;
3110
d96c1ecf
LP
3111 /* Tell the parent that we are ready, and that
3112 * it can cgroupify us so that we lack access
3113 * to certain devices and resources. */
e866af3a
DH
3114 r = eventfd_send_state(eventfds[1],
3115 EVENTFD_CHILD_SUCCEEDED);
3116 eventfds[1] = safe_close(eventfds[1]);
3117 if (r < 0)
3118 goto child_fail;
d96c1ecf 3119
d87be9b0
LP
3120 if (chdir(arg_directory) < 0) {
3121 log_error("chdir(%s) failed: %m", arg_directory);
687d0825
MV
3122 goto child_fail;
3123 }
3124
d87be9b0
LP
3125 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3126 log_error("mount(MS_MOVE) failed: %m");
687d0825
MV
3127 goto child_fail;
3128 }
3129
d87be9b0
LP
3130 if (chroot(".") < 0) {
3131 log_error("chroot() failed: %m");
687d0825
MV
3132 goto child_fail;
3133 }
3134
d87be9b0
LP
3135 if (chdir("/") < 0) {
3136 log_error("chdir() failed: %m");
687d0825
MV
3137 goto child_fail;
3138 }
3139
d87be9b0
LP
3140 umask(0022);
3141
eb91eb18
LP
3142 if (arg_private_network)
3143 loopback_setup();
d87be9b0
LP
3144
3145 if (drop_capabilities() < 0) {
3146 log_error("drop_capabilities() failed: %m");
687d0825
MV
3147 goto child_fail;
3148 }
687d0825 3149
0cb9fbcd
LP
3150 r = change_uid_gid(&home);
3151 if (r < 0)
3152 goto child_fail;
d87be9b0 3153
842f3b0f
LP
3154 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3155 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3156 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 3157 log_oom();
144f0fc0
LP
3158 goto child_fail;
3159 }
687d0825 3160
9444b1f2 3161 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
3162 char as_uuid[37];
3163
3164 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f
LP
3165 log_oom();
3166 goto child_fail;
3167 }
3168 }
3169
3170 if (fdset_size(fds) > 0) {
3171 k = fdset_cloexec(fds, false);
3172 if (k < 0) {
3173 log_error("Failed to unset O_CLOEXEC for file descriptors.");
3174 goto child_fail;
3175 }
3176
3177 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 3178 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0
LP
3179 log_oom();
3180 goto child_fail;
3181 }
3182 }
3183
3184 setup_hostname();
3185
6afc95b7
LP
3186 if (arg_personality != 0xffffffffLU) {
3187 if (personality(arg_personality) < 0) {
3188 log_error("personality() failed: %m");
3189 goto child_fail;
3190 }
1b9e5b12
LP
3191 } else if (secondary) {
3192 if (personality(PER_LINUX32) < 0) {
3193 log_error("personality() failed: %m");
3194 goto child_fail;
3195 }
6afc95b7
LP
3196 }
3197
d96c1ecf
LP
3198#ifdef HAVE_SELINUX
3199 if (arg_selinux_context)
0cb9fbcd 3200 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
d96c1ecf 3201 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
0cb9fbcd
LP
3202 goto child_fail;
3203 }
d96c1ecf 3204#endif
354bfd2b 3205
f4889f65
LP
3206 if (!strv_isempty(arg_setenv)) {
3207 char **n;
3208
3209 n = strv_env_merge(2, envp, arg_setenv);
3210 if (!n) {
3211 log_oom();
3212 goto child_fail;
3213 }
3214
3215 env_use = n;
3216 } else
3217 env_use = (char**) envp;
3218
d96c1ecf 3219 /* Wait until the parent is ready with the setup, too... */
e866af3a
DH
3220 r = eventfd_parent_succeeded(eventfds[0]);
3221 eventfds[0] = safe_close(eventfds[0]);
3222 if (r < 0)
3223 goto child_fail;
d96c1ecf 3224
d87be9b0
LP
3225 if (arg_boot) {
3226 char **a;
3227 size_t l;
88213476 3228
d87be9b0 3229 /* Automatically search for the init system */
0f0dbc46 3230
d87be9b0
LP
3231 l = 1 + argc - optind;
3232 a = newa(char*, l + 1);
3233 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 3234
d87be9b0 3235 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 3236 execve(a[0], a, env_use);
0f0dbc46 3237
d87be9b0 3238 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 3239 execve(a[0], a, env_use);
0f0dbc46 3240
d87be9b0 3241 a[0] = (char*) "/sbin/init";
f4889f65 3242 execve(a[0], a, env_use);
d87be9b0 3243 } else if (argc > optind)
f4889f65 3244 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
3245 else {
3246 chdir(home ? home : "/root");
f4889f65 3247 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 3248 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
3249 }
3250
3251 log_error("execv() failed: %m");
0f0dbc46 3252
d87be9b0 3253 child_fail:
e866af3a
DH
3254 /* Tell the parent that the setup failed, so he
3255 * can clean up resources and terminate. */
3256 if (eventfds[1] != -1)
3257 eventfd_send_state(eventfds[1],
3258 EVENTFD_CHILD_FAILED);
d87be9b0 3259 _exit(EXIT_FAILURE);
da5b3bad 3260 }
88213476 3261
842f3b0f
LP
3262 fdset_free(fds);
3263 fds = NULL;
3264
e866af3a
DH
3265 /* Wait for the child event:
3266 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3267 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3268 * it is ready with all it needs to do with privileges.
3269 * After we got the notification we can make the process
3270 * join its cgroup which might limit what it can do */
3271 r = eventfd_child_succeeded(eventfds[1]);
3272 eventfds[1] = safe_close(eventfds[1]);
d96c1ecf 3273
840295fc
LP
3274 if (r >= 0) {
3275 r = register_machine(pid);
3276 if (r < 0)
3277 goto finish;
354bfd2b 3278
840295fc
LP
3279 r = move_network_interfaces(pid);
3280 if (r < 0)
3281 goto finish;
aa28aefe 3282
840295fc
LP
3283 r = setup_veth(pid, veth_name);
3284 if (r < 0)
3285 goto finish;
ab046dde 3286
840295fc
LP
3287 r = setup_bridge(veth_name);
3288 if (r < 0)
3289 goto finish;
ab046dde 3290
840295fc
LP
3291 r = setup_macvlan(pid);
3292 if (r < 0)
3293 goto finish;
c74e630d 3294
840295fc
LP
3295 /* Block SIGCHLD here, before notifying child.
3296 * process_pty() will handle it with the other signals. */
3297 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3298 if (r < 0)
3299 goto finish;
e866af3a 3300
840295fc
LP
3301 /* Reset signal to default */
3302 r = default_signals(SIGCHLD, -1);
3303 if (r < 0)
3304 goto finish;
e866af3a 3305
840295fc
LP
3306 /* Notify the child that the parent is ready with all
3307 * its setup, and that the child can now hand over
3308 * control to the code to run inside the container. */
3309 r = eventfd_send_state(eventfds[0], EVENTFD_PARENT_SUCCEEDED);
3310 eventfds[0] = safe_close(eventfds[0]);
3311 if (r < 0)
3312 goto finish;
354bfd2b 3313
840295fc
LP
3314 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3315 if (k < 0) {
3316 r = EXIT_FAILURE;
3317 break;
3318 }
88213476 3319
840295fc
LP
3320 if (!arg_quiet)
3321 putc('\n', stdout);
04d39279 3322
840295fc
LP
3323 /* Kill if it is not dead yet anyway */
3324 terminate_machine(pid);
3325 }
1f0cd86b 3326
840295fc 3327 /* Normally redundant, but better safe than sorry */
04d39279 3328 kill(pid, SIGKILL);
a258bf26 3329
113cea80 3330 r = wait_for_container(pid, &container_status);
04d39279
LP
3331 pid = 0;
3332
ce9f1527
LP
3333 if (r < 0) {
3334 /* We failed to wait for the container, or the
3335 * container exited abnormally */
3336 r = EXIT_FAILURE;
d87be9b0 3337 break;
ce9f1527
LP
3338 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3339 /* The container exited with a non-zero
3340 * status, or with zero status and no reboot
3341 * was requested. */
d87be9b0 3342 break;
88213476 3343
113cea80 3344 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
3345
3346 if (arg_keep_unit) {
3347 /* Special handling if we are running as a
3348 * service: instead of simply restarting the
3349 * machine we want to restart the entire
3350 * service, so let's inform systemd about this
3351 * with the special exit code 133. The service
3352 * file uses RestartForceExitStatus=133 so
3353 * that this results in a full nspawn
3354 * restart. This is necessary since we might
3355 * have cgroup parameters set we want to have
3356 * flushed out. */
3357 r = 133;
3358 break;
3359 }
d87be9b0 3360 }
88213476
LP
3361
3362finish:
1b9e5b12
LP
3363 loop_remove(loop_nr, &image_fd);
3364
9444b1f2
LP
3365 if (pid > 0)
3366 kill(pid, SIGKILL);
88213476 3367
04d391da 3368 free(arg_directory);
7027ff61 3369 free(arg_machine);
c74e630d
LP
3370 free(arg_user);
3371 strv_free(arg_setenv);
3372 strv_free(arg_network_interfaces);
3373 strv_free(arg_network_macvlan);
3374 strv_free(arg_bind);
3375 strv_free(arg_bind_ro);
06c17c39 3376 strv_free(arg_tmpfs);
88213476
LP
3377
3378 return r;
3379}