]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
update TODO
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
354bfd2b 43#include <sys/eventfd.h>
aa28aefe 44#include <net/if.h>
69c79d3c 45#include <linux/veth.h>
6afc95b7 46#include <sys/personality.h>
1b9e5b12 47#include <linux/loop.h>
aa28aefe 48
5d63309c 49#ifdef HAVE_SELINUX
a8828ed9
DW
50#include <selinux/selinux.h>
51#endif
88213476 52
24fb1112
LP
53#ifdef HAVE_SECCOMP
54#include <seccomp.h>
55#endif
56
1b9e5b12
LP
57#ifdef HAVE_BLKID
58#include <blkid/blkid.h>
59#endif
60
1f0cd86b
LP
61#include "sd-daemon.h"
62#include "sd-bus.h"
63#include "sd-id128.h"
aa28aefe 64#include "sd-rtnl.h"
88213476
LP
65#include "log.h"
66#include "util.h"
49e942b2 67#include "mkdir.h"
6b2d0e85 68#include "macro.h"
d7832d2c 69#include "audit.h"
94d82985 70#include "missing.h"
04d391da 71#include "cgroup-util.h"
a258bf26 72#include "strv.h"
9eb977db 73#include "path-util.h"
a41fe3a2 74#include "loopback-setup.h"
4fc9982c 75#include "dev-setup.h"
842f3b0f 76#include "fdset.h"
acbeb427 77#include "build.h"
a5c32cff 78#include "fileio.h"
40ca29a1 79#include "bus-util.h"
1f0cd86b 80#include "bus-error.h"
4ba93280 81#include "ptyfwd.h"
9bd37b40 82#include "bus-kernel.h"
f4889f65 83#include "env-util.h"
7f112f50 84#include "def.h"
aa28aefe 85#include "rtnl-util.h"
7e227024 86#include "udev-util.h"
1b9e5b12
LP
87#include "blkid-util.h"
88#include "gpt.h"
01dde061 89#include "siphash24.h"
f2d88580 90
e9642be2
LP
91#ifdef HAVE_SECCOMP
92#include "seccomp-util.h"
93#endif
94
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; also the compiled-in default */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, and what -j selects */
} LinkJournal;
88213476
LP
101
102static char *arg_directory = NULL;
687d0825 103static char *arg_user = NULL;
9444b1f2 104static sd_id128_t arg_uuid = {};
7027ff61 105static char *arg_machine = NULL;
c74e630d
LP
106static const char *arg_selinux_context = NULL;
107static const char *arg_selinux_apifs_context = NULL;
9444b1f2 108static const char *arg_slice = NULL;
ff01d048 109static bool arg_private_network = false;
bc2f673e 110static bool arg_read_only = false;
0f0dbc46 111static bool arg_boot = false;
57fb9fb5 112static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
113static uint64_t arg_retain =
114 (1ULL << CAP_CHOWN) |
115 (1ULL << CAP_DAC_OVERRIDE) |
116 (1ULL << CAP_DAC_READ_SEARCH) |
117 (1ULL << CAP_FOWNER) |
118 (1ULL << CAP_FSETID) |
119 (1ULL << CAP_IPC_OWNER) |
120 (1ULL << CAP_KILL) |
121 (1ULL << CAP_LEASE) |
122 (1ULL << CAP_LINUX_IMMUTABLE) |
123 (1ULL << CAP_NET_BIND_SERVICE) |
124 (1ULL << CAP_NET_BROADCAST) |
125 (1ULL << CAP_NET_RAW) |
126 (1ULL << CAP_SETGID) |
127 (1ULL << CAP_SETFCAP) |
128 (1ULL << CAP_SETPCAP) |
129 (1ULL << CAP_SETUID) |
130 (1ULL << CAP_SYS_ADMIN) |
131 (1ULL << CAP_SYS_CHROOT) |
132 (1ULL << CAP_SYS_NICE) |
133 (1ULL << CAP_SYS_PTRACE) |
134 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 135 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
136 (1ULL << CAP_SYS_BOOT) |
137 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
138 (1ULL << CAP_AUDIT_CONTROL) |
139 (1ULL << CAP_MKNOD);
17fe0523
LP
140static char **arg_bind = NULL;
141static char **arg_bind_ro = NULL;
f4889f65 142static char **arg_setenv = NULL;
284c0b91 143static bool arg_quiet = false;
8a96d94e 144static bool arg_share_system = false;
eb91eb18 145static bool arg_register = true;
89f7c846 146static bool arg_keep_unit = false;
aa28aefe 147static char **arg_network_interfaces = NULL;
c74e630d 148static char **arg_network_macvlan = NULL;
69c79d3c 149static bool arg_network_veth = false;
c74e630d 150static const char *arg_network_bridge = NULL;
6afc95b7 151static unsigned long arg_personality = 0xffffffffLU;
1b9e5b12 152static const char *arg_image = NULL;
88213476
LP
153
154static int help(void) {
155
156 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
157 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
158 " -h --help Show this help\n"
159 " --version Print version string\n"
69c79d3c 160 " -q --quiet Do not show status information\n"
1b9e5b12
LP
161 " -D --directory=PATH Root directory for the container\n"
162 " -i --image=PATH File system device or image for the container\n"
a8828ed9
DW
163 " -b --boot Boot up full system (i.e. invoke init)\n"
164 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 165 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 166 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 167 " -S --slice=SLICE Place the container in the specified slice\n"
69c79d3c
LP
168 " --private-network Disable network in container\n"
169 " --network-interface=INTERFACE\n"
170 " Assign an existing network interface to the\n"
171 " container\n"
c74e630d
LP
172 " --network-macvlan=INTERFACE\n"
173 " Create a macvlan network interface based on an\n"
174 " existing network interface to the container\n"
32457153 175 " --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 176 " and container\n"
ab046dde 177 " --network-bridge=INTERFACE\n"
32457153 178 " Add a virtual ethernet connection between host\n"
ab046dde
TG
179 " and container and add it to an existing bridge on\n"
180 " the host\n"
82adf6af
LP
181 " -Z --selinux-context=SECLABEL\n"
182 " Set the SELinux security context to be used by\n"
183 " processes in the container\n"
184 " -L --selinux-apifs-context=SECLABEL\n"
185 " Set the SELinux security context to be used by\n"
186 " API/tmpfs file systems in the container\n"
a8828ed9
DW
187 " --capability=CAP In addition to the default, retain specified\n"
188 " capability\n"
189 " --drop-capability=CAP Drop the specified capability from the default set\n"
190 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
191 " -j Equivalent to --link-journal=host\n"
69c79d3c 192 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
193 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
194 " the container\n"
195 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
284c0b91 196 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 197 " --share-system Share system namespaces with host\n"
eb91eb18 198 " --register=BOOLEAN Register container as machine\n"
89f7c846 199 " --keep-unit Do not register a scope for the machine, reuse\n"
69c79d3c 200 " the service unit nspawn is running in\n",
88213476
LP
201 program_invocation_short_name);
202
203 return 0;
204}
205
206static int parse_argv(int argc, char *argv[]) {
207
a41fe3a2 208 enum {
acbeb427
ZJS
209 ARG_VERSION = 0x100,
210 ARG_PRIVATE_NETWORK,
bc2f673e 211 ARG_UUID,
5076f0cc 212 ARG_READ_ONLY,
57fb9fb5 213 ARG_CAPABILITY,
420c7379 214 ARG_DROP_CAPABILITY,
17fe0523
LP
215 ARG_LINK_JOURNAL,
216 ARG_BIND,
f4889f65
LP
217 ARG_BIND_RO,
218 ARG_SETENV,
eb91eb18 219 ARG_SHARE_SYSTEM,
89f7c846 220 ARG_REGISTER,
aa28aefe 221 ARG_KEEP_UNIT,
69c79d3c 222 ARG_NETWORK_INTERFACE,
c74e630d 223 ARG_NETWORK_MACVLAN,
69c79d3c 224 ARG_NETWORK_VETH,
ab046dde 225 ARG_NETWORK_BRIDGE,
6afc95b7 226 ARG_PERSONALITY,
a41fe3a2
LP
227 };
228
88213476 229 static const struct option options[] = {
aa28aefe
LP
230 { "help", no_argument, NULL, 'h' },
231 { "version", no_argument, NULL, ARG_VERSION },
232 { "directory", required_argument, NULL, 'D' },
233 { "user", required_argument, NULL, 'u' },
234 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
235 { "boot", no_argument, NULL, 'b' },
236 { "uuid", required_argument, NULL, ARG_UUID },
237 { "read-only", no_argument, NULL, ARG_READ_ONLY },
238 { "capability", required_argument, NULL, ARG_CAPABILITY },
239 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
240 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
241 { "bind", required_argument, NULL, ARG_BIND },
242 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
243 { "machine", required_argument, NULL, 'M' },
244 { "slice", required_argument, NULL, 'S' },
245 { "setenv", required_argument, NULL, ARG_SETENV },
246 { "selinux-context", required_argument, NULL, 'Z' },
247 { "selinux-apifs-context", required_argument, NULL, 'L' },
248 { "quiet", no_argument, NULL, 'q' },
249 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
250 { "register", required_argument, NULL, ARG_REGISTER },
251 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
252 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 253 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
ab046dde
TG
254 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
255 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 256 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 257 { "image", required_argument, NULL, 'i' },
eb9da376 258 {}
88213476
LP
259 };
260
9444b1f2 261 int c, r;
a42c8b54 262 uint64_t plus = 0, minus = 0;
88213476
LP
263
264 assert(argc >= 0);
265 assert(argv);
266
1b9e5b12 267 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
88213476
LP
268
269 switch (c) {
270
271 case 'h':
eb9da376 272 return help();
88213476 273
acbeb427
ZJS
274 case ARG_VERSION:
275 puts(PACKAGE_STRING);
276 puts(SYSTEMD_FEATURES);
277 return 0;
278
88213476
LP
279 case 'D':
280 free(arg_directory);
3a74cea5
LP
281 arg_directory = canonicalize_file_name(optarg);
282 if (!arg_directory) {
898d5c91 283 log_error("Invalid root directory: %m");
88213476
LP
284 return -ENOMEM;
285 }
286
287 break;
288
1b9e5b12
LP
289 case 'i':
290 arg_image = optarg;
291 break;
292
687d0825
MV
293 case 'u':
294 free(arg_user);
7027ff61
LP
295 arg_user = strdup(optarg);
296 if (!arg_user)
297 return log_oom();
687d0825
MV
298
299 break;
300
ab046dde 301 case ARG_NETWORK_BRIDGE:
c74e630d 302 arg_network_bridge = optarg;
ab046dde
TG
303
304 /* fall through */
305
69c79d3c
LP
306 case ARG_NETWORK_VETH:
307 arg_network_veth = true;
308 arg_private_network = true;
309 break;
310
aa28aefe 311 case ARG_NETWORK_INTERFACE:
c74e630d
LP
312 if (strv_extend(&arg_network_interfaces, optarg) < 0)
313 return log_oom();
314
315 arg_private_network = true;
316 break;
317
318 case ARG_NETWORK_MACVLAN:
319 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
320 return log_oom();
321
322 /* fall through */
323
ff01d048
LP
324 case ARG_PRIVATE_NETWORK:
325 arg_private_network = true;
a41fe3a2
LP
326 break;
327
0f0dbc46
LP
328 case 'b':
329 arg_boot = true;
330 break;
331
144f0fc0 332 case ARG_UUID:
9444b1f2
LP
333 r = sd_id128_from_string(optarg, &arg_uuid);
334 if (r < 0) {
aa96c6cb 335 log_error("Invalid UUID: %s", optarg);
9444b1f2 336 return r;
aa96c6cb 337 }
9444b1f2 338 break;
aa96c6cb 339
9444b1f2 340 case 'S':
c74e630d 341 arg_slice = optarg;
144f0fc0
LP
342 break;
343
7027ff61 344 case 'M':
eb91eb18
LP
345 if (isempty(optarg)) {
346 free(arg_machine);
347 arg_machine = NULL;
348 } else {
7027ff61 349
eb91eb18
LP
350 if (!hostname_is_valid(optarg)) {
351 log_error("Invalid machine name: %s", optarg);
352 return -EINVAL;
353 }
7027ff61 354
eb91eb18
LP
355 free(arg_machine);
356 arg_machine = strdup(optarg);
357 if (!arg_machine)
358 return log_oom();
359
360 break;
361 }
7027ff61 362
82adf6af
LP
363 case 'Z':
364 arg_selinux_context = optarg;
a8828ed9
DW
365 break;
366
82adf6af
LP
367 case 'L':
368 arg_selinux_apifs_context = optarg;
a8828ed9
DW
369 break;
370
bc2f673e
LP
371 case ARG_READ_ONLY:
372 arg_read_only = true;
373 break;
374
420c7379
LP
375 case ARG_CAPABILITY:
376 case ARG_DROP_CAPABILITY: {
5076f0cc
LP
377 char *state, *word;
378 size_t length;
379
380 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 381 _cleanup_free_ char *t;
5076f0cc 382 cap_value_t cap;
5076f0cc
LP
383
384 t = strndup(word, length);
0d0f0c50
SL
385 if (!t)
386 return log_oom();
5076f0cc 387
39ed67d1
LP
388 if (streq(t, "all")) {
389 if (c == ARG_CAPABILITY)
a42c8b54 390 plus = (uint64_t) -1;
39ed67d1 391 else
a42c8b54 392 minus = (uint64_t) -1;
39ed67d1
LP
393 } else {
394 if (cap_from_name(t, &cap) < 0) {
395 log_error("Failed to parse capability %s.", t);
396 return -EINVAL;
397 }
398
399 if (c == ARG_CAPABILITY)
a42c8b54 400 plus |= 1ULL << (uint64_t) cap;
39ed67d1 401 else
a42c8b54 402 minus |= 1ULL << (uint64_t) cap;
5076f0cc 403 }
5076f0cc
LP
404 }
405
406 break;
407 }
408
57fb9fb5
LP
409 case 'j':
410 arg_link_journal = LINK_GUEST;
411 break;
412
413 case ARG_LINK_JOURNAL:
414 if (streq(optarg, "auto"))
415 arg_link_journal = LINK_AUTO;
416 else if (streq(optarg, "no"))
417 arg_link_journal = LINK_NO;
418 else if (streq(optarg, "guest"))
419 arg_link_journal = LINK_GUEST;
420 else if (streq(optarg, "host"))
421 arg_link_journal = LINK_HOST;
422 else {
423 log_error("Failed to parse link journal mode %s", optarg);
424 return -EINVAL;
425 }
426
427 break;
428
17fe0523
LP
429 case ARG_BIND:
430 case ARG_BIND_RO: {
431 _cleanup_free_ char *a = NULL, *b = NULL;
432 char *e;
433 char ***x;
17fe0523
LP
434
435 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
436
437 e = strchr(optarg, ':');
438 if (e) {
439 a = strndup(optarg, e - optarg);
440 b = strdup(e + 1);
441 } else {
442 a = strdup(optarg);
443 b = strdup(optarg);
444 }
445
446 if (!a || !b)
447 return log_oom();
448
449 if (!path_is_absolute(a) || !path_is_absolute(b)) {
450 log_error("Invalid bind mount specification: %s", optarg);
451 return -EINVAL;
452 }
453
454 r = strv_extend(x, a);
455 if (r < 0)
b3451bed 456 return log_oom();
17fe0523
LP
457
458 r = strv_extend(x, b);
459 if (r < 0)
b3451bed 460 return log_oom();
17fe0523
LP
461
462 break;
463 }
464
f4889f65
LP
465 case ARG_SETENV: {
466 char **n;
467
468 if (!env_assignment_is_valid(optarg)) {
469 log_error("Environment variable assignment '%s' is not valid.", optarg);
470 return -EINVAL;
471 }
472
473 n = strv_env_set(arg_setenv, optarg);
474 if (!n)
475 return log_oom();
476
477 strv_free(arg_setenv);
478 arg_setenv = n;
479 break;
480 }
481
284c0b91
LP
482 case 'q':
483 arg_quiet = true;
484 break;
485
8a96d94e
LP
486 case ARG_SHARE_SYSTEM:
487 arg_share_system = true;
488 break;
489
eb91eb18
LP
490 case ARG_REGISTER:
491 r = parse_boolean(optarg);
492 if (r < 0) {
493 log_error("Failed to parse --register= argument: %s", optarg);
494 return r;
495 }
496
497 arg_register = r;
498 break;
499
89f7c846
LP
500 case ARG_KEEP_UNIT:
501 arg_keep_unit = true;
502 break;
503
6afc95b7
LP
504 case ARG_PERSONALITY:
505
ac45f971 506 arg_personality = personality_from_string(optarg);
6afc95b7
LP
507 if (arg_personality == 0xffffffffLU) {
508 log_error("Unknown or unsupported personality '%s'.", optarg);
509 return -EINVAL;
510 }
511
512 break;
513
88213476
LP
514 case '?':
515 return -EINVAL;
516
517 default:
eb9da376 518 assert_not_reached("Unhandled option");
88213476
LP
519 }
520 }
521
eb91eb18
LP
522 if (arg_share_system)
523 arg_register = false;
524
525 if (arg_boot && arg_share_system) {
526 log_error("--boot and --share-system may not be combined.");
527 return -EINVAL;
528 }
529
89f7c846
LP
530 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
531 log_error("--keep-unit may not be used when invoked from a user session.");
532 return -EINVAL;
533 }
534
1b9e5b12
LP
535 if (arg_directory && arg_image) {
536 log_error("--directory= and --image= may not be combined.");
537 return -EINVAL;
538 }
539
a42c8b54
LP
540 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
541
88213476
LP
542 return 1;
543}
544
545static int mount_all(const char *dest) {
546
547 typedef struct MountPoint {
548 const char *what;
549 const char *where;
550 const char *type;
551 const char *options;
552 unsigned long flags;
3bd66c05 553 bool fatal;
88213476
LP
554 } MountPoint;
555
556 static const MountPoint mount_table[] = {
4b7a6af4 557 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
b4c59701
LP
558 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
559 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
e65aec12 560 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
635f7d8c 561 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 562 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
ede89845 563 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
635f7d8c 564 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 565#ifdef HAVE_SELINUX
b4c59701
LP
566 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
567 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 568#endif
88213476
LP
569 };
570
571 unsigned k;
572 int r = 0;
573
574 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 575 _cleanup_free_ char *where = NULL;
d002827b 576#ifdef HAVE_SELINUX
a8828ed9 577 _cleanup_free_ char *options = NULL;
d002827b
LP
578#endif
579 const char *o;
88213476
LP
580 int t;
581
17fe0523
LP
582 where = strjoin(dest, "/", mount_table[k].where, NULL);
583 if (!where)
584 return log_oom();
88213476 585
e65aec12 586 t = path_is_mount_point(where, true);
68fb0892 587 if (t < 0) {
88213476 588 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
589
590 if (r == 0)
591 r = t;
592
593 continue;
594 }
595
9c1c7f71
LP
596 /* Skip this entry if it is not a remount. */
597 if (mount_table[k].what && t > 0)
014a9c77
LP
598 continue;
599
17fe0523 600 mkdir_p(where, 0755);
88213476 601
a8828ed9 602#ifdef HAVE_SELINUX
82adf6af
LP
603 if (arg_selinux_apifs_context &&
604 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
605 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
606 if (!options)
607 return log_oom();
608
609 o = options;
610 } else
a8828ed9 611#endif
d002827b 612 o = mount_table[k].options;
a8828ed9 613
a8828ed9 614
88213476
LP
615 if (mount(mount_table[k].what,
616 where,
617 mount_table[k].type,
618 mount_table[k].flags,
d002827b 619 o) < 0 &&
3bd66c05 620 mount_table[k].fatal) {
88213476
LP
621
622 log_error("mount(%s) failed: %m", where);
623
624 if (r == 0)
625 r = -errno;
626 }
88213476
LP
627 }
628
e58a1277
LP
629 return r;
630}
f8440af5 631
17fe0523
LP
632static int mount_binds(const char *dest, char **l, unsigned long flags) {
633 char **x, **y;
634
635 STRV_FOREACH_PAIR(x, y, l) {
2ed4e5e0 636 char *where;
d2421337 637 struct stat source_st, dest_st;
2ed4e5e0 638 int r;
d2421337
DR
639
640 if (stat(*x, &source_st) < 0) {
1b9e5b12 641 log_error("Failed to stat %s: %m", *x);
d2421337
DR
642 return -errno;
643 }
17fe0523 644
2ed4e5e0
SL
645 where = strappenda(dest, *y);
646 r = stat(where, &dest_st);
647 if (r == 0) {
d2421337 648 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
cecf24e7 649 log_error("The file types of %s and %s do not match. Refusing bind mount",
d2421337
DR
650 *x, where);
651 return -EINVAL;
652 }
2ed4e5e0
SL
653 } else if (errno == ENOENT) {
654 r = mkdir_parents_label(where, 0755);
655 if (r < 0) {
656 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
657 return r;
d2421337 658 }
2ed4e5e0
SL
659 } else {
660 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
661 return -errno;
662 }
663 /* Create the mount point, but be conservative -- refuse to create block
664 * and char devices. */
665 if (S_ISDIR(source_st.st_mode))
666 mkdir_label(where, 0755);
667 else if (S_ISFIFO(source_st.st_mode))
668 mkfifo(where, 0644);
669 else if (S_ISSOCK(source_st.st_mode))
670 mknod(where, 0644 | S_IFSOCK, 0);
671 else if (S_ISREG(source_st.st_mode))
672 touch(where);
673 else {
674 log_error("Refusing to create mountpoint for file: %s", *x);
675 return -ENOTSUP;
d2421337 676 }
17fe0523
LP
677
678 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
679 log_error("mount(%s) failed: %m", where);
680 return -errno;
681 }
682
683 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
684 log_error("mount(%s) failed: %m", where);
685 return -errno;
686 }
687 }
688
689 return 0;
690}
691
e58a1277 692static int setup_timezone(const char *dest) {
d4036145
LP
693 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
694 char *z, *y;
695 int r;
f8440af5 696
e58a1277
LP
697 assert(dest);
698
699 /* Fix the timezone, if possible */
d4036145
LP
700 r = readlink_malloc("/etc/localtime", &p);
701 if (r < 0) {
702 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
703 return 0;
704 }
705
706 z = path_startswith(p, "../usr/share/zoneinfo/");
707 if (!z)
708 z = path_startswith(p, "/usr/share/zoneinfo/");
709 if (!z) {
710 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
711 return 0;
712 }
713
04bc4a3f
LP
714 where = strappend(dest, "/etc/localtime");
715 if (!where)
0d0f0c50 716 return log_oom();
715ac17a 717
d4036145
LP
718 r = readlink_malloc(where, &q);
719 if (r >= 0) {
720 y = path_startswith(q, "../usr/share/zoneinfo/");
721 if (!y)
722 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 723
4d1c38b8 724
d4036145
LP
725 /* Already pointing to the right place? Then do nothing .. */
726 if (y && streq(y, z))
727 return 0;
728 }
729
730 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
731 if (!check)
0d0f0c50 732 return log_oom();
4d1c38b8 733
d4036145
LP
734 if (access(check, F_OK) < 0) {
735 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
736 return 0;
737 }
68fb0892 738
d4036145
LP
739 what = strappend("../usr/share/zoneinfo/", z);
740 if (!what)
741 return log_oom();
742
743 unlink(where);
744 if (symlink(what, where) < 0) {
745 log_error("Failed to correct timezone of container: %m");
746 return 0;
747 }
e58a1277
LP
748
749 return 0;
88213476
LP
750}
751
2547bb41 752static int setup_resolv_conf(const char *dest) {
f333fbb1 753 char _cleanup_free_ *where = NULL;
2547bb41
LP
754
755 assert(dest);
756
757 if (arg_private_network)
758 return 0;
759
760 /* Fix resolv.conf, if possible */
04bc4a3f
LP
761 where = strappend(dest, "/etc/resolv.conf");
762 if (!where)
0d0f0c50 763 return log_oom();
2547bb41 764
77e63faf
LP
765 /* We don't really care for the results of this really. If it
766 * fails, it fails, but meh... */
51045322 767 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
2547bb41
LP
768
769 return 0;
770}
771
9f24adc2
LP
772static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
773
774 snprintf(s, 37,
775 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
776 SD_ID128_FORMAT_VAL(id));
777
778 return s;
779}
780
04bc4a3f 781static int setup_boot_id(const char *dest) {
7fd1b19b 782 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 783 sd_id128_t rnd = {};
04bc4a3f
LP
784 char as_uuid[37];
785 int r;
786
787 assert(dest);
788
eb91eb18
LP
789 if (arg_share_system)
790 return 0;
791
04bc4a3f
LP
792 /* Generate a new randomized boot ID, so that each boot-up of
793 * the container gets a new one */
794
795 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 796 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
797 if (!from || !to)
798 return log_oom();
04bc4a3f
LP
799
800 r = sd_id128_randomize(&rnd);
801 if (r < 0) {
802 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 803 return r;
04bc4a3f
LP
804 }
805
9f24adc2 806 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 807
574d5f2d 808 r = write_string_file(from, as_uuid);
04bc4a3f
LP
809 if (r < 0) {
810 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 811 return r;
04bc4a3f
LP
812 }
813
814 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
815 log_error("Failed to bind mount boot id: %m");
816 r = -errno;
10d18763
ZJS
817 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
818 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
819
820 unlink(from);
04bc4a3f
LP
821 return r;
822}
823
e58a1277 824static int copy_devnodes(const char *dest) {
88213476
LP
825
826 static const char devnodes[] =
827 "null\0"
828 "zero\0"
829 "full\0"
830 "random\0"
831 "urandom\0"
f2d88580 832 "tty\0";
88213476
LP
833
834 const char *d;
e58a1277 835 int r = 0;
7fd1b19b 836 _cleanup_umask_ mode_t u;
a258bf26
LP
837
838 assert(dest);
124640f1
LP
839
840 u = umask(0000);
88213476
LP
841
842 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 843 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 844 struct stat st;
88213476 845
7f112f50
LP
846 from = strappend("/dev/", d);
847 to = strjoin(dest, "/dev/", d, NULL);
848 if (!from || !to)
849 return log_oom();
88213476
LP
850
851 if (stat(from, &st) < 0) {
852
853 if (errno != ENOENT) {
854 log_error("Failed to stat %s: %m", from);
7f112f50 855 return -errno;
88213476
LP
856 }
857
a258bf26 858 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 859
ed8b7a3e 860 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 861 return -EIO;
a258bf26
LP
862
863 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
864
865 log_error("mknod(%s) failed: %m", dest);
7f112f50 866 return -errno;
88213476 867 }
88213476
LP
868 }
869
e58a1277
LP
870 return r;
871}
88213476 872
f2d88580
LP
873static int setup_ptmx(const char *dest) {
874 _cleanup_free_ char *p = NULL;
875
876 p = strappend(dest, "/dev/ptmx");
877 if (!p)
878 return log_oom();
879
880 if (symlink("pts/ptmx", p) < 0) {
881 log_error("Failed to create /dev/ptmx symlink: %m");
882 return -errno;
883 }
884
885 return 0;
886}
887
e58a1277 888static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
889 _cleanup_umask_ mode_t u;
890 const char *to;
e58a1277 891 struct stat st;
e58a1277 892 int r;
e58a1277
LP
893
894 assert(dest);
895 assert(console);
896
897 u = umask(0000);
898
eb0f0863
LP
899 if (stat("/dev/null", &st) < 0) {
900 log_error("Failed to stat /dev/null: %m");
25ea79fe 901 return -errno;
e58a1277 902 }
88213476 903
e58a1277
LP
904 r = chmod_and_chown(console, 0600, 0, 0);
905 if (r < 0) {
906 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 907 return r;
a258bf26 908 }
88213476 909
a258bf26
LP
910 /* We need to bind mount the right tty to /dev/console since
911 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
912 * to bind mount things on we create a device node first, and
913 * use /dev/null for that since we the cgroups device policy
914 * allows us to create that freely, while we cannot create
915 * /dev/console. (Note that the major minor doesn't actually
916 * matter here, since we mount it over anyway). */
a258bf26 917
eb0f0863 918 to = strappenda(dest, "/dev/console");
e58a1277
LP
919 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
920 log_error("mknod() for /dev/console failed: %m");
25ea79fe 921 return -errno;
e58a1277 922 }
a258bf26
LP
923
924 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 925 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 926 return -errno;
a258bf26
LP
927 }
928
25ea79fe 929 return 0;
e58a1277
LP
930}
931
932static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 933 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 934 int r, fd, k;
7fd1b19b 935 _cleanup_umask_ mode_t u;
e58a1277
LP
936 union {
937 struct cmsghdr cmsghdr;
938 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
939 } control = {};
940 struct msghdr mh = {
941 .msg_control = &control,
942 .msg_controllen = sizeof(control),
943 };
e58a1277
LP
944 struct cmsghdr *cmsg;
945
946 assert(dest);
947 assert(kmsg_socket >= 0);
a258bf26 948
e58a1277 949 u = umask(0000);
a258bf26 950
f1e5dfe2
LP
951 /* We create the kmsg FIFO as /dev/kmsg, but immediately
952 * delete it after bind mounting it to /proc/kmsg. While FIFOs
953 * on the reading side behave very similar to /proc/kmsg,
954 * their writing side behaves differently from /dev/kmsg in
955 * that writing blocks when nothing is reading. In order to
956 * avoid any problems with containers deadlocking due to this
957 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
958 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
959 asprintf(&to, "%s/proc/kmsg", dest) < 0)
960 return log_oom();
e58a1277
LP
961
962 if (mkfifo(from, 0600) < 0) {
963 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 964 return -errno;
e58a1277
LP
965 }
966
967 r = chmod_and_chown(from, 0600, 0, 0);
968 if (r < 0) {
969 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 970 return r;
e58a1277
LP
971 }
972
973 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
974 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 975 return -errno;
e58a1277
LP
976 }
977
978 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
979 if (fd < 0) {
980 log_error("Failed to open fifo: %m");
25ea79fe 981 return -errno;
e58a1277
LP
982 }
983
e58a1277
LP
984 cmsg = CMSG_FIRSTHDR(&mh);
985 cmsg->cmsg_level = SOL_SOCKET;
986 cmsg->cmsg_type = SCM_RIGHTS;
987 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
988 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
989
990 mh.msg_controllen = cmsg->cmsg_len;
991
992 /* Store away the fd in the socket, so that it stays open as
993 * long as we run the child */
994 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
03e334a1 995 safe_close(fd);
e58a1277
LP
996
997 if (k < 0) {
998 log_error("Failed to send FIFO fd: %m");
25ea79fe 999 return -errno;
a258bf26
LP
1000 }
1001
f1e5dfe2
LP
1002 /* And now make the FIFO unavailable as /dev/kmsg... */
1003 unlink(from);
25ea79fe 1004 return 0;
88213476
LP
1005}
1006
3a74cea5 1007static int setup_hostname(void) {
3a74cea5 1008
eb91eb18
LP
1009 if (arg_share_system)
1010 return 0;
1011
7027ff61
LP
1012 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1013 return -errno;
3a74cea5 1014
7027ff61 1015 return 0;
3a74cea5
LP
1016}
1017
57fb9fb5 1018static int setup_journal(const char *directory) {
4d680aee 1019 sd_id128_t machine_id, this_id;
7fd1b19b 1020 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1021 char *id;
57fb9fb5
LP
1022 int r;
1023
57fb9fb5 1024 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1025 if (!p)
1026 return log_oom();
57fb9fb5
LP
1027
1028 r = read_one_line_file(p, &b);
27407a01
ZJS
1029 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1030 return 0;
1031 else if (r < 0) {
1032 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
1033 return r;
1034 }
1035
27407a01
ZJS
1036 id = strstrip(b);
1037 if (isempty(id) && arg_link_journal == LINK_AUTO)
1038 return 0;
57fb9fb5 1039
27407a01
ZJS
1040 /* Verify validity */
1041 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 1042 if (r < 0) {
27407a01
ZJS
1043 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1044 return r;
57fb9fb5
LP
1045 }
1046
4d680aee
ZJS
1047 r = sd_id128_get_machine(&this_id);
1048 if (r < 0) {
1049 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1050 return r;
1051 }
1052
1053 if (sd_id128_equal(machine_id, this_id)) {
1054 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1055 "Host and machine ids are equal (%s): refusing to link journals", id);
1056 if (arg_link_journal == LINK_AUTO)
1057 return 0;
1058 return
1059 -EEXIST;
1060 }
1061
1062 if (arg_link_journal == LINK_NO)
1063 return 0;
1064
57fb9fb5 1065 free(p);
27407a01
ZJS
1066 p = strappend("/var/log/journal/", id);
1067 q = strjoin(directory, "/var/log/journal/", id, NULL);
1068 if (!p || !q)
1069 return log_oom();
1070
1071 if (path_is_mount_point(p, false) > 0) {
1072 if (arg_link_journal != LINK_AUTO) {
1073 log_error("%s: already a mount point, refusing to use for journal", p);
1074 return -EEXIST;
1075 }
1076
1077 return 0;
57fb9fb5
LP
1078 }
1079
27407a01 1080 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1081 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1082 log_error("%s: already a mount point, refusing to use for journal", q);
1083 return -EEXIST;
57fb9fb5
LP
1084 }
1085
27407a01 1086 return 0;
57fb9fb5
LP
1087 }
1088
1089 r = readlink_and_make_absolute(p, &d);
1090 if (r >= 0) {
1091 if ((arg_link_journal == LINK_GUEST ||
1092 arg_link_journal == LINK_AUTO) &&
1093 path_equal(d, q)) {
1094
27407a01
ZJS
1095 r = mkdir_p(q, 0755);
1096 if (r < 0)
1097 log_warning("failed to create directory %s: %m", q);
1098 return 0;
57fb9fb5
LP
1099 }
1100
1101 if (unlink(p) < 0) {
1102 log_error("Failed to remove symlink %s: %m", p);
27407a01 1103 return -errno;
57fb9fb5
LP
1104 }
1105 } else if (r == -EINVAL) {
1106
1107 if (arg_link_journal == LINK_GUEST &&
1108 rmdir(p) < 0) {
1109
27407a01
ZJS
1110 if (errno == ENOTDIR) {
1111 log_error("%s already exists and is neither a symlink nor a directory", p);
1112 return r;
1113 } else {
57fb9fb5 1114 log_error("Failed to remove %s: %m", p);
27407a01 1115 return -errno;
57fb9fb5 1116 }
57fb9fb5
LP
1117 }
1118 } else if (r != -ENOENT) {
1119 log_error("readlink(%s) failed: %m", p);
27407a01 1120 return r;
57fb9fb5
LP
1121 }
1122
1123 if (arg_link_journal == LINK_GUEST) {
1124
1125 if (symlink(q, p) < 0) {
1126 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 1127 return -errno;
57fb9fb5
LP
1128 }
1129
27407a01
ZJS
1130 r = mkdir_p(q, 0755);
1131 if (r < 0)
1132 log_warning("failed to create directory %s: %m", q);
1133 return 0;
57fb9fb5
LP
1134 }
1135
1136 if (arg_link_journal == LINK_HOST) {
1137 r = mkdir_p(p, 0755);
1138 if (r < 0) {
1139 log_error("Failed to create %s: %m", p);
27407a01 1140 return r;
57fb9fb5
LP
1141 }
1142
27407a01
ZJS
1143 } else if (access(p, F_OK) < 0)
1144 return 0;
57fb9fb5 1145
57fb9fb5
LP
1146 r = mkdir_p(q, 0755);
1147 if (r < 0) {
1148 log_error("Failed to create %s: %m", q);
27407a01 1149 return r;
57fb9fb5
LP
1150 }
1151
1152 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1153 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 1154 return -errno;
57fb9fb5
LP
1155 }
1156
27407a01 1157 return 0;
57fb9fb5
LP
1158}
1159
9bd37b40
LP
1160static int setup_kdbus(const char *dest, const char *path) {
1161 const char *p;
1162
1163 if (!path)
1164 return 0;
1165
1166 p = strappenda(dest, "/dev/kdbus");
1167 if (mkdir(p, 0755) < 0) {
1168 log_error("Failed to create kdbus path: %m");
1169 return -errno;
1170 }
1171
1172 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
486e99a3 1173 log_error("Failed to mount kdbus domain path: %m");
9bd37b40
LP
1174 return -errno;
1175 }
1176
1177 return 0;
1178}
1179
88213476 1180static int drop_capabilities(void) {
5076f0cc 1181 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1182}
1183
354bfd2b 1184static int register_machine(pid_t pid) {
9444b1f2
LP
1185 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1186 _cleanup_bus_unref_ sd_bus *bus = NULL;
1187 int r;
1188
eb91eb18
LP
1189 if (!arg_register)
1190 return 0;
1191
1c03020c 1192 r = sd_bus_default_system(&bus);
9444b1f2
LP
1193 if (r < 0) {
1194 log_error("Failed to open system bus: %s", strerror(-r));
1195 return r;
1196 }
1197
89f7c846
LP
1198 if (arg_keep_unit) {
1199 r = sd_bus_call_method(
1200 bus,
1201 "org.freedesktop.machine1",
1202 "/org/freedesktop/machine1",
1203 "org.freedesktop.machine1.Manager",
1204 "RegisterMachine",
1205 &error,
1206 NULL,
1207 "sayssus",
1208 arg_machine,
1209 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1210 "nspawn",
1211 "container",
1212 (uint32_t) pid,
1213 strempty(arg_directory));
1214 } else {
9457ac5b
LP
1215 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1216
1217 r = sd_bus_message_new_method_call(
89f7c846 1218 bus,
9457ac5b 1219 &m,
89f7c846
LP
1220 "org.freedesktop.machine1",
1221 "/org/freedesktop/machine1",
1222 "org.freedesktop.machine1.Manager",
9457ac5b
LP
1223 "CreateMachine");
1224 if (r < 0) {
1225 log_error("Failed to create message: %s", strerror(-r));
1226 return r;
1227 }
1228
1229 r = sd_bus_message_append(
1230 m,
1231 "sayssus",
89f7c846
LP
1232 arg_machine,
1233 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1234 "nspawn",
1235 "container",
1236 (uint32_t) pid,
9457ac5b
LP
1237 strempty(arg_directory));
1238 if (r < 0) {
1239 log_error("Failed to append message arguments: %s", strerror(-r));
1240 return r;
1241 }
1242
1243 r = sd_bus_message_open_container(m, 'a', "(sv)");
1244 if (r < 0) {
1245 log_error("Failed to open container: %s", strerror(-r));
1246 return r;
1247 }
1248
1249 if (!isempty(arg_slice)) {
1250 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1251 if (r < 0) {
1252 log_error("Failed to append slice: %s", strerror(-r));
1253 return r;
1254 }
1255 }
1256
1257 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1258 if (r < 0) {
1259 log_error("Failed to add device policy: %s", strerror(-r));
1260 return r;
1261 }
1262
a07f961e 1263 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
9457ac5b
LP
1264 /* Allow the container to
1265 * access and create the API
1266 * device nodes, so that
1267 * PrivateDevices= in the
1268 * container can work
1269 * fine */
1270 "/dev/null", "rwm",
1271 "/dev/zero", "rwm",
1272 "/dev/full", "rwm",
1273 "/dev/random", "rwm",
1274 "/dev/urandom", "rwm",
1275 "/dev/tty", "rwm",
1276 /* Allow the container
1277 * access to ptys. However,
1278 * do not permit the
1279 * container to ever create
1280 * these device nodes. */
1281 "/dev/pts/ptmx", "rw",
a07f961e
LP
1282 "char-pts", "rw",
1283 /* Allow the container
1284 * access to all kdbus
1285 * devices. Again, the
1286 * container cannot create
1287 * these nodes, only use
1288 * them. We use a pretty
1289 * open match here, so that
1290 * the kernel API can still
1291 * change. */
1292 "char-kdbus", "rw",
1293 "char-kdbus/*", "rw");
9457ac5b
LP
1294 if (r < 0) {
1295 log_error("Failed to add device whitelist: %s", strerror(-r));
1296 return r;
1297 }
1298
1299 r = sd_bus_message_close_container(m);
1300 if (r < 0) {
1301 log_error("Failed to close container: %s", strerror(-r));
1302 return r;
1303 }
1304
1305 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1306 }
1307
9444b1f2 1308 if (r < 0) {
1f0cd86b
LP
1309 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1310 return r;
1311 }
1312
1313 return 0;
1314}
1315
1316static int terminate_machine(pid_t pid) {
1317 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1318 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1319 _cleanup_bus_unref_ sd_bus *bus = NULL;
1320 const char *path;
1321 int r;
1322
eb91eb18
LP
1323 if (!arg_register)
1324 return 0;
1325
76b54375 1326 r = sd_bus_default_system(&bus);
1f0cd86b
LP
1327 if (r < 0) {
1328 log_error("Failed to open system bus: %s", strerror(-r));
1329 return r;
1330 }
1331
1332 r = sd_bus_call_method(
1333 bus,
1334 "org.freedesktop.machine1",
1335 "/org/freedesktop/machine1",
1336 "org.freedesktop.machine1.Manager",
1337 "GetMachineByPID",
1338 &error,
1339 &reply,
1340 "u",
1341 (uint32_t) pid);
1342 if (r < 0) {
1343 /* Note that the machine might already have been
1344 * cleaned up automatically, hence don't consider it a
1345 * failure if we cannot get the machine object. */
1346 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1347 return 0;
1348 }
1349
1350 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1351 if (r < 0)
1352 return bus_log_parse_error(r);
9444b1f2 1353
1f0cd86b
LP
1354 r = sd_bus_call_method(
1355 bus,
1356 "org.freedesktop.machine1",
1357 path,
1358 "org.freedesktop.machine1.Machine",
1359 "Terminate",
1360 &error,
1361 NULL,
1362 NULL);
1363 if (r < 0) {
1364 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1365 return 0;
1366 }
1367
9444b1f2
LP
1368 return 0;
1369}
1370
db999e0f
LP
1371static int reset_audit_loginuid(void) {
1372 _cleanup_free_ char *p = NULL;
1373 int r;
1374
1375 if (arg_share_system)
1376 return 0;
1377
1378 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1379 if (r == -ENOENT)
db999e0f
LP
1380 return 0;
1381 if (r < 0) {
1382 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1383 return r;
1384 }
1385
1386 /* Already reset? */
1387 if (streq(p, "4294967295"))
1388 return 0;
1389
1390 r = write_string_file("/proc/self/loginuid", "4294967295");
1391 if (r < 0) {
1392 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1393 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1394 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1395 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1396 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1397
db999e0f 1398 sleep(5);
77b6e194 1399 }
db999e0f
LP
1400
1401 return 0;
77b6e194
LP
1402}
1403
01dde061
TG
1404#define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1405
1406static int get_mac(struct ether_addr *mac) {
1407 int r;
1408
1409 uint8_t result[8];
1410 size_t l, sz;
1411 uint8_t *v;
1412
1413 l = strlen(arg_machine);
1414 sz = sizeof(sd_id128_t) + l;
1415 v = alloca(sz);
1416
1417 /* fetch some persistent data unique to the host */
1418 r = sd_id128_get_machine((sd_id128_t*) v);
1419 if (r < 0)
1420 return r;
1421
1422 /* combine with some data unique (on this host) to this
1423 * container instance */
1424 memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1425
1426 /* Let's hash the host machine ID plus the container name. We
1427 * use a fixed, but originally randomly created hash key here. */
1428 siphash24(result, v, sz, HASH_KEY.bytes);
1429
1430 assert_cc(ETH_ALEN <= sizeof(result));
1431 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1432
1433 /* see eth_random_addr in the kernel */
1434 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
1435 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
1436
1437 return 0;
1438}
1439
08af0da2 1440static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
69c79d3c 1441 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 1442 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
01dde061 1443 struct ether_addr mac;
69c79d3c
LP
1444 int r;
1445
1446 if (!arg_private_network)
1447 return 0;
1448
1449 if (!arg_network_veth)
1450 return 0;
1451
08af0da2
LP
1452 /* Use two different interface name prefixes depending whether
1453 * we are in bridge mode or not. */
1454 if (arg_network_bridge)
1455 memcpy(iface_name, "vb-", 3);
1456 else
1457 memcpy(iface_name, "ve-", 3);
ab046dde 1458 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
69c79d3c 1459
01dde061
TG
1460 r = get_mac(&mac);
1461 if (r < 0) {
1462 log_error("Failed to generate predictable MAC address for host0");
1463 return r;
1464 }
1465
151b9b96 1466 r = sd_rtnl_open(&rtnl, 0);
69c79d3c
LP
1467 if (r < 0) {
1468 log_error("Failed to connect to netlink: %s", strerror(-r));
1469 return r;
1470 }
1471
151b9b96 1472 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
69c79d3c
LP
1473 if (r < 0) {
1474 log_error("Failed to allocate netlink message: %s", strerror(-r));
1475 return r;
1476 }
1477
ab046dde 1478 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
69c79d3c 1479 if (r < 0) {
ab046dde 1480 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1481 return r;
1482 }
1483
ee3a6a51 1484 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
69c79d3c
LP
1485 if (r < 0) {
1486 log_error("Failed to open netlink container: %s", strerror(-r));
1487 return r;
1488 }
1489
d8e538ec 1490 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
69c79d3c
LP
1491 if (r < 0) {
1492 log_error("Failed to open netlink container: %s", strerror(-r));
1493 return r;
1494 }
1495
ee3a6a51 1496 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
69c79d3c 1497 if (r < 0) {
ab046dde 1498 log_error("Failed to open netlink container: %s", strerror(-r));
69c79d3c
LP
1499 return r;
1500 }
1501
ab046dde 1502 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
69c79d3c 1503 if (r < 0) {
ab046dde 1504 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1505 return r;
1506 }
01dde061
TG
1507
1508 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1509 if (r < 0) {
1510 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1511 return r;
1512 }
69c79d3c 1513
ab046dde 1514 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
69c79d3c
LP
1515 if (r < 0) {
1516 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1517 return r;
1518 }
1519
1520 r = sd_rtnl_message_close_container(m);
1521 if (r < 0) {
1522 log_error("Failed to close netlink container: %s", strerror(-r));
1523 return r;
1524 }
1525
1526 r = sd_rtnl_message_close_container(m);
1527 if (r < 0) {
1528 log_error("Failed to close netlink container: %s", strerror(-r));
1529 return r;
1530 }
1531
1532 r = sd_rtnl_message_close_container(m);
1533 if (r < 0) {
1534 log_error("Failed to close netlink container: %s", strerror(-r));
1535 return r;
1536 }
1537
1538 r = sd_rtnl_call(rtnl, m, 0, NULL);
1539 if (r < 0) {
1540 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1541 return r;
1542 }
1543
1544 return 0;
1545}
1546
ab046dde
TG
1547static int setup_bridge(const char veth_name[]) {
1548 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1549 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1550 int r, bridge;
1551
1552 if (!arg_private_network)
1553 return 0;
1554
1555 if (!arg_network_veth)
1556 return 0;
1557
1558 if (!arg_network_bridge)
1559 return 0;
1560
1561 bridge = (int) if_nametoindex(arg_network_bridge);
1562 if (bridge <= 0) {
1563 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1564 return -errno;
1565 }
1566
151b9b96 1567 r = sd_rtnl_open(&rtnl, 0);
ab046dde
TG
1568 if (r < 0) {
1569 log_error("Failed to connect to netlink: %s", strerror(-r));
1570 return r;
1571 }
1572
151b9b96 1573 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
ab046dde
TG
1574 if (r < 0) {
1575 log_error("Failed to allocate netlink message: %s", strerror(-r));
1576 return r;
1577 }
1578
039dd4af
TG
1579 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1580 if (r < 0) {
1581 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1582 return r;
1583 }
1584
ab046dde
TG
1585 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1586 if (r < 0) {
1587 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1588 return r;
1589 }
1590
1591 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1592 if (r < 0) {
1593 log_error("Failed to add netlink master field: %s", strerror(-r));
1594 return r;
1595 }
1596
1597 r = sd_rtnl_call(rtnl, m, 0, NULL);
1598 if (r < 0) {
1599 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1600 return r;
1601 }
1602
1603 return 0;
1604}
1605
c74e630d
LP
1606static int parse_interface(struct udev *udev, const char *name) {
1607 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1608 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1609 int ifi;
1610
1611 ifi = (int) if_nametoindex(name);
1612 if (ifi <= 0) {
1613 log_error("Failed to resolve interface %s: %m", name);
1614 return -errno;
1615 }
1616
1617 sprintf(ifi_str, "n%i", ifi);
1618 d = udev_device_new_from_device_id(udev, ifi_str);
1619 if (!d) {
1620 log_error("Failed to get udev device for interface %s: %m", name);
1621 return -errno;
1622 }
1623
1624 if (udev_device_get_is_initialized(d) <= 0) {
1625 log_error("Network interface %s is not initialized yet.", name);
1626 return -EBUSY;
1627 }
1628
1629 return ifi;
1630}
1631
69c79d3c 1632static int move_network_interfaces(pid_t pid) {
7e227024 1633 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 1634 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1635 char **i;
1636 int r;
1637
1638 if (!arg_private_network)
1639 return 0;
1640
1641 if (strv_isempty(arg_network_interfaces))
1642 return 0;
1643
151b9b96 1644 r = sd_rtnl_open(&rtnl, 0);
aa28aefe
LP
1645 if (r < 0) {
1646 log_error("Failed to connect to netlink: %s", strerror(-r));
1647 return r;
1648 }
1649
7e227024
LP
1650 udev = udev_new();
1651 if (!udev) {
1652 log_error("Failed to connect to udev.");
1653 return -ENOMEM;
1654 }
1655
aa28aefe 1656 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1657 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 1658 int ifi;
aa28aefe 1659
c74e630d
LP
1660 ifi = parse_interface(udev, *i);
1661 if (ifi < 0)
1662 return ifi;
1663
1664 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1665 if (r < 0) {
1666 log_error("Failed to allocate netlink message: %s", strerror(-r));
1667 return r;
aa28aefe
LP
1668 }
1669
c74e630d
LP
1670 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1671 if (r < 0) {
1672 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1673 return r;
7e227024
LP
1674 }
1675
c74e630d
LP
1676 r = sd_rtnl_call(rtnl, m, 0, NULL);
1677 if (r < 0) {
1678 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1679 return r;
7e227024 1680 }
c74e630d 1681 }
7e227024 1682
c74e630d
LP
1683 return 0;
1684}
1685
1686static int setup_macvlan(pid_t pid) {
1687 _cleanup_udev_unref_ struct udev *udev = NULL;
1688 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1689 char **i;
1690 int r;
1691
1692 if (!arg_private_network)
1693 return 0;
1694
1695 if (strv_isempty(arg_network_macvlan))
1696 return 0;
1697
1698 r = sd_rtnl_open(&rtnl, 0);
1699 if (r < 0) {
1700 log_error("Failed to connect to netlink: %s", strerror(-r));
1701 return r;
1702 }
1703
1704 udev = udev_new();
1705 if (!udev) {
1706 log_error("Failed to connect to udev.");
1707 return -ENOMEM;
1708 }
1709
1710 STRV_FOREACH(i, arg_network_macvlan) {
1711 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1712 _cleanup_free_ char *n = NULL;
1713 int ifi;
1714
1715 ifi = parse_interface(udev, *i);
1716 if (ifi < 0)
1717 return ifi;
1718
1719 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
aa28aefe
LP
1720 if (r < 0) {
1721 log_error("Failed to allocate netlink message: %s", strerror(-r));
1722 return r;
1723 }
1724
c74e630d
LP
1725 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1726 if (r < 0) {
1727 log_error("Failed to add netlink interface index: %s", strerror(-r));
1728 return r;
1729 }
1730
1731 n = strappend("mv-", *i);
1732 if (!n)
1733 return log_oom();
1734
1735 strshorten(n, IFNAMSIZ-1);
1736
1737 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1738 if (r < 0) {
1739 log_error("Failed to add netlink interface name: %s", strerror(-r));
1740 return r;
1741 }
1742
aa28aefe
LP
1743 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1744 if (r < 0) {
c74e630d
LP
1745 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1746 return r;
1747 }
1748
1749 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1750 if (r < 0) {
1751 log_error("Failed to open netlink container: %s", strerror(-r));
1752 return r;
1753 }
1754
d8e538ec 1755 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
c74e630d
LP
1756 if (r < 0) {
1757 log_error("Failed to open netlink container: %s", strerror(-r));
1758 return r;
1759 }
1760
1761 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1762 if (r < 0) {
1763 log_error("Failed to append macvlan mode: %s", strerror(-r));
1764 return r;
1765 }
1766
1767 r = sd_rtnl_message_close_container(m);
1768 if (r < 0) {
1769 log_error("Failed to close netlink container: %s", strerror(-r));
1770 return r;
1771 }
1772
1773 r = sd_rtnl_message_close_container(m);
1774 if (r < 0) {
1775 log_error("Failed to close netlink container: %s", strerror(-r));
aa28aefe
LP
1776 return r;
1777 }
1778
1779 r = sd_rtnl_call(rtnl, m, 0, NULL);
1780 if (r < 0) {
c74e630d 1781 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
aa28aefe
LP
1782 return r;
1783 }
1784 }
1785
1786 return 0;
1787}
1788
24fb1112
LP
/* Loads a seccomp filter that allows everything except
 * socket(AF_NETLINK, *, NETLINK_AUDIT), which is made to fail with
 * EAFNOSUPPORT. Built without HAVE_SECCOMP this is a no-op returning 0.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int r;

        /*
          Audit is broken in containers, much of the userspace audit
          hookup will fail if running inside a container. We don't
          care and just turn off creation of audit sockets.

          This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
          with EAFNOSUPPORT which audit userspace uses as indication
          that audit is disabled in the kernel.
         */

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        /* Set the filter's SCMP_FLTATR_CTL_NNP attribute to 0 before loading. */
        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(filter);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path once it was created. */
        seccomp_release(filter);
        return r;
#else
        return 0;
#endif

}
1845
1b9e5b12
LP
1846static int setup_image(char **device_path, int *loop_nr) {
1847 struct loop_info64 info = {
1848 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1849 };
1850 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1851 _cleanup_free_ char* loopdev = NULL;
1852 struct stat st;
1853 int r, nr;
1854
1855 assert(device_path);
1856 assert(loop_nr);
1857
1858 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1859 if (fd < 0) {
1860 log_error("Failed to open %s: %m", arg_image);
1861 return -errno;
1862 }
1863
1864 if (fstat(fd, &st) < 0) {
1865 log_error("Failed to stat %s: %m", arg_image);
1866 return -errno;
1867 }
1868
1869 if (S_ISBLK(st.st_mode)) {
1870 char *p;
1871
1872 p = strdup(arg_image);
1873 if (!p)
1874 return log_oom();
1875
1876 *device_path = p;
1877
1878 *loop_nr = -1;
1879
1880 r = fd;
1881 fd = -1;
1882
1883 return r;
1884 }
1885
1886 if (!S_ISREG(st.st_mode)) {
1887 log_error("%s is not a regular file or block device: %m", arg_image);
1888 return -EINVAL;
1889 }
1890
1891 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1892 if (control < 0) {
1893 log_error("Failed to open /dev/loop-control: %m");
1894 return -errno;
1895 }
1896
1897 nr = ioctl(control, LOOP_CTL_GET_FREE);
1898 if (nr < 0) {
1899 log_error("Failed to allocate loop device: %m");
1900 return -errno;
1901 }
1902
1903 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1904 return log_oom();
1905
1906 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1907 if (loop < 0) {
1908 log_error("Failed to open loop device %s: %m", loopdev);
1909 return -errno;
1910 }
1911
1912 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1913 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1914 return -errno;
1915 }
1916
1917 if (arg_read_only)
1918 info.lo_flags |= LO_FLAGS_READ_ONLY;
1919
1920 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1921 log_error("Failed to set loopback settings on %s: %m", loopdev);
1922 return -errno;
1923 }
1924
1925 *device_path = loopdev;
1926 loopdev = NULL;
1927
1928 *loop_nr = nr;
1929
1930 r = loop;
1931 loop = -1;
1932
1933 return r;
1934}
1935
#ifdef HAVE_BLKID
/* Remembers "node" as the device for one partition role. If a device was
 * already chosen and nr is not lower than its partition number, nothing is
 * changed. Otherwise *device_nr and *device_rw are updated and *device is
 * replaced by a copy of node. Returns 0, or the result of log_oom() if the
 * copy cannot be allocated. */
static int dissect_image_pick(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        if (*device && nr >= *device_nr)
                return 0;

        *device_nr = nr;
        *device_rw = rw;

        free(*device);
        *device = strdup(node);
        if (!*device)
                return log_oom();

        return 0;
}
#endif

/* Probes the block device open as "fd" for a GPT partition table and picks,
 * by partition type ID, the home, srv, root and secondary root partitions.
 * For each role the partition with the lowest partition number wins, and
 * partitions flagged GPT_FLAG_NO_AUTO are ignored.
 *
 * On success *root_device is set to the root partition's device node, or to
 * the secondary root's if no root was found, in which case *secondary is set
 * to true. *home_device and *srv_device are only written when such partitions
 * exist. The matching *_rw outputs are false when the chosen partition has
 * GPT_FLAG_READ_ONLY set. The strings are newly allocated and owned by the
 * caller.
 *
 * Returns 0 on success and a negative errno-style value on failure; -EINVAL
 * if there is no usable partition table or no root partition. Built without
 * HAVE_BLKID this always fails with -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all udev devices that have the whole-disk device as parent. */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;
                bool rw;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, and the disk itself. */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0 || st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_image_pick(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_image_pick(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_image_pick(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_image_pick(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        continue;

                if (r < 0)
                        return r;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Pass ownership of the selected strings to the caller; clearing the
         * locals keeps the cleanup handlers from freeing them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2185
/* Probes the file system type of the block device 'what' with libblkid and
 * mounts it on 'where', or on 'where' with 'directory' appended if 'directory'
 * is non-NULL. The mount always carries MS_NODEV and is read-only unless 'rw'
 * is true and arg_read_only is unset. LUKS volumes are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure; without
 * blkid support it always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* blkid does not reliably set errno, hence clear it first and treat
         * "no errno" as an allocation failure */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing detected, -2: ambivalent result. Either way
                 * there is no single file system type we could mount. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Genuine probing error (-1); fall back to EIO if libblkid
                 * left errno untouched */
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2251
727fd4fd
LP
/* Mounts the partitions found in a disk image below 'where': the root
 * partition on 'where' itself, the home partition on "/home" and the server
 * data partition on "/srv" beneath it. Each device argument may be NULL, in
 * which case that mount is skipped; the matching *_rw flag is passed on to
 * mount_device() to select a writable or read-only mount.
 *
 * Returns 0 on success, or the negative error of the first failing mount. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* The root file system has to go first, the others are mounted on top of it */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2287
2288static void loop_remove(int nr, int *image_fd) {
2289 _cleanup_close_ int control = -1;
2290
2291 if (nr < 0)
2292 return;
2293
2294 if (image_fd && *image_fd >= 0) {
2295 ioctl(*image_fd, LOOP_CLR_FD);
03e334a1 2296 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
2297 }
2298
2299 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2300 if (control < 0)
2301 return;
2302
2303 ioctl(control, LOOP_CTL_REMOVE, nr);
2304}
2305
0cb9fbcd
LP
2306static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2307 int pipe_fds[2];
2308 pid_t pid;
2309
2310 assert(database);
2311 assert(key);
2312 assert(rpid);
2313
2314 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2315 log_error("Failed to allocate pipe: %m");
2316 return -errno;
2317 }
2318
2319 pid = fork();
2320 if (pid < 0) {
2321 log_error("Failed to fork getent child: %m");
2322 return -errno;
2323 } else if (pid == 0) {
2324 int nullfd;
2325 char *empty_env = NULL;
2326
2327 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2328 _exit(EXIT_FAILURE);
2329
2330 if (pipe_fds[0] > 2)
03e334a1 2331 safe_close(pipe_fds[0]);
0cb9fbcd 2332 if (pipe_fds[1] > 2)
03e334a1 2333 safe_close(pipe_fds[1]);
0cb9fbcd
LP
2334
2335 nullfd = open("/dev/null", O_RDWR);
2336 if (nullfd < 0)
2337 _exit(EXIT_FAILURE);
2338
2339 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2340 _exit(EXIT_FAILURE);
2341
2342 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2343 _exit(EXIT_FAILURE);
2344
2345 if (nullfd > 2)
03e334a1 2346 safe_close(nullfd);
0cb9fbcd
LP
2347
2348 reset_all_signal_handlers();
2349 close_all_fds(NULL, 0);
2350
4de82926
MM
2351 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2352 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
2353 _exit(EXIT_FAILURE);
2354 }
2355
03e334a1 2356 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
2357
2358 *rpid = pid;
2359
2360 return pipe_fds[0];
2361}
2362
2363static int change_uid_gid(char **_home) {
0cb9fbcd
LP
2364 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2365 _cleanup_free_ uid_t *uids = NULL;
2366 _cleanup_free_ char *home = NULL;
2367 _cleanup_fclose_ FILE *f = NULL;
2368 _cleanup_close_ int fd = -1;
2369 unsigned n_uids = 0;
70f539ca 2370 size_t sz = 0, l;
0cb9fbcd
LP
2371 uid_t uid;
2372 gid_t gid;
2373 pid_t pid;
2374 int r;
2375
2376 assert(_home);
2377
2378 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2379 /* Reset everything fully to 0, just in case */
2380
2381 if (setgroups(0, NULL) < 0) {
2382 log_error("setgroups() failed: %m");
2383 return -errno;
2384 }
2385
2386 if (setresgid(0, 0, 0) < 0) {
2387 log_error("setregid() failed: %m");
2388 return -errno;
2389 }
2390
2391 if (setresuid(0, 0, 0) < 0) {
2392 log_error("setreuid() failed: %m");
2393 return -errno;
2394 }
2395
2396 *_home = NULL;
2397 return 0;
2398 }
2399
2400 /* First, get user credentials */
2401 fd = spawn_getent("passwd", arg_user, &pid);
2402 if (fd < 0)
2403 return fd;
2404
2405 f = fdopen(fd, "r");
2406 if (!f)
2407 return log_oom();
2408 fd = -1;
2409
2410 if (!fgets(line, sizeof(line), f)) {
2411
2412 if (!ferror(f)) {
2413 log_error("Failed to resolve user %s.", arg_user);
2414 return -ESRCH;
2415 }
2416
2417 log_error("Failed to read from getent: %m");
2418 return -errno;
2419 }
2420
2421 truncate_nl(line);
2422
2423 wait_for_terminate_and_warn("getent passwd", pid);
2424
2425 x = strchr(line, ':');
2426 if (!x) {
2427 log_error("/etc/passwd entry has invalid user field.");
2428 return -EIO;
2429 }
2430
2431 u = strchr(x+1, ':');
2432 if (!u) {
2433 log_error("/etc/passwd entry has invalid password field.");
2434 return -EIO;
2435 }
2436
2437 u++;
2438 g = strchr(u, ':');
2439 if (!g) {
2440 log_error("/etc/passwd entry has invalid UID field.");
2441 return -EIO;
2442 }
2443
2444 *g = 0;
2445 g++;
2446 x = strchr(g, ':');
2447 if (!x) {
2448 log_error("/etc/passwd entry has invalid GID field.");
2449 return -EIO;
2450 }
2451
2452 *x = 0;
2453 h = strchr(x+1, ':');
2454 if (!h) {
2455 log_error("/etc/passwd entry has invalid GECOS field.");
2456 return -EIO;
2457 }
2458
2459 h++;
2460 x = strchr(h, ':');
2461 if (!x) {
2462 log_error("/etc/passwd entry has invalid home directory field.");
2463 return -EIO;
2464 }
2465
2466 *x = 0;
2467
2468 r = parse_uid(u, &uid);
2469 if (r < 0) {
2470 log_error("Failed to parse UID of user.");
2471 return -EIO;
2472 }
2473
2474 r = parse_gid(g, &gid);
2475 if (r < 0) {
2476 log_error("Failed to parse GID of user.");
2477 return -EIO;
2478 }
2479
2480 home = strdup(h);
2481 if (!home)
2482 return log_oom();
2483
2484 /* Second, get group memberships */
2485 fd = spawn_getent("initgroups", arg_user, &pid);
2486 if (fd < 0)
2487 return fd;
2488
2489 fclose(f);
2490 f = fdopen(fd, "r");
2491 if (!f)
2492 return log_oom();
2493 fd = -1;
2494
2495 if (!fgets(line, sizeof(line), f)) {
2496 if (!ferror(f)) {
2497 log_error("Failed to resolve user %s.", arg_user);
2498 return -ESRCH;
2499 }
2500
2501 log_error("Failed to read from getent: %m");
2502 return -errno;
2503 }
2504
2505 truncate_nl(line);
2506
2507 wait_for_terminate_and_warn("getent initgroups", pid);
2508
2509 /* Skip over the username and subsequent separator whitespace */
2510 x = line;
2511 x += strcspn(x, WHITESPACE);
2512 x += strspn(x, WHITESPACE);
2513
2514 FOREACH_WORD(w, l, x, state) {
2515 char c[l+1];
2516
2517 memcpy(c, w, l);
2518 c[l] = 0;
2519
2520 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2521 return log_oom();
2522
2523 r = parse_uid(c, &uids[n_uids++]);
2524 if (r < 0) {
2525 log_error("Failed to parse group data from getent.");
2526 return -EIO;
2527 }
2528 }
2529
2530 r = mkdir_parents(home, 0775);
2531 if (r < 0) {
2532 log_error("Failed to make home root directory: %s", strerror(-r));
2533 return r;
2534 }
2535
2536 r = mkdir_safe(home, 0755, uid, gid);
f418f31d 2537 if (r < 0 && r != -EEXIST) {
0cb9fbcd
LP
2538 log_error("Failed to make home directory: %s", strerror(-r));
2539 return r;
2540 }
2541
2542 fchown(STDIN_FILENO, uid, gid);
2543 fchown(STDOUT_FILENO, uid, gid);
2544 fchown(STDERR_FILENO, uid, gid);
2545
2546 if (setgroups(n_uids, uids) < 0) {
2547 log_error("Failed to set auxiliary groups: %m");
2548 return -errno;
2549 }
2550
2551 if (setresgid(gid, gid, gid) < 0) {
2552 log_error("setregid() failed: %m");
2553 return -errno;
2554 }
2555
2556 if (setresuid(uid, uid, uid) < 0) {
2557 log_error("setreuid() failed: %m");
2558 return -errno;
2559 }
2560
2561 if (_home) {
2562 *_home = home;
2563 home = NULL;
2564 }
2565
2566 return 0;
2567}
2568
88213476 2569int main(int argc, char *argv[]) {
69c79d3c 2570
1b9e5b12 2571 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
727fd4fd 2572 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
d96c1ecf 2573 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
3d94f76c 2574 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
69c79d3c 2575 _cleanup_fdset_free_ FDSet *fds = NULL;
1b9e5b12 2576 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
69c79d3c 2577 const char *console = NULL;
1b9e5b12
LP
2578 char veth_name[IFNAMSIZ];
2579 bool secondary = false;
69c79d3c 2580 pid_t pid = 0;
a258bf26 2581 sigset_t mask;
88213476
LP
2582
2583 log_parse_environment();
2584 log_open();
2585
05947bef
LP
2586 k = parse_argv(argc, argv);
2587 if (k < 0)
88213476 2588 goto finish;
05947bef
LP
2589 else if (k == 0) {
2590 r = EXIT_SUCCESS;
2591 goto finish;
2592 }
88213476 2593
1b9e5b12
LP
2594 if (!arg_image) {
2595 if (arg_directory) {
2596 char *p;
88213476 2597
1b9e5b12
LP
2598 p = path_make_absolute_cwd(arg_directory);
2599 free(arg_directory);
2600 arg_directory = p;
2601 } else
2602 arg_directory = get_current_dir_name();
88213476 2603
1b9e5b12
LP
2604 if (!arg_directory) {
2605 log_error("Failed to determine path, please use -D.");
2606 goto finish;
2607 }
2608 path_kill_slashes(arg_directory);
88213476
LP
2609 }
2610
7027ff61 2611 if (!arg_machine) {
1b9e5b12 2612 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
7027ff61
LP
2613 if (!arg_machine) {
2614 log_oom();
2615 goto finish;
2616 }
2617
e724b063 2618 hostname_cleanup(arg_machine, false);
7027ff61
LP
2619 if (isempty(arg_machine)) {
2620 log_error("Failed to determine machine name automatically, please use -M.");
2621 goto finish;
2622 }
2623 }
2624
88213476
LP
2625 if (geteuid() != 0) {
2626 log_error("Need to be root.");
2627 goto finish;
2628 }
2629
04d391da
LP
2630 if (sd_booted() <= 0) {
2631 log_error("Not running on a systemd system.");
2632 goto finish;
2633 }
2634
1b9e5b12
LP
2635 log_close();
2636 n_fd_passed = sd_listen_fds(false);
2637 if (n_fd_passed > 0) {
2638 k = fdset_new_listen_fds(&fds, false);
2639 if (k < 0) {
2640 log_error("Failed to collect file descriptors: %s", strerror(-k));
2641 goto finish;
2642 }
88213476 2643 }
1b9e5b12
LP
2644 fdset_close_others(fds);
2645 log_open();
88213476 2646
1b9e5b12
LP
2647 if (arg_directory) {
2648 if (path_equal(arg_directory, "/")) {
2649 log_error("Spawning container on root directory not supported.");
6b9132a9
LP
2650 goto finish;
2651 }
1b9e5b12
LP
2652
2653 if (arg_boot) {
2654 if (path_is_os_tree(arg_directory) <= 0) {
2655 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2656 goto finish;
2657 }
2658 } else {
2659 const char *p;
2660
2661 p = strappenda(arg_directory,
2662 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2663 if (access(p, F_OK) < 0) {
2664 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2665 goto finish;
2666
2667 }
2668 }
6b9132a9 2669 } else {
1b9e5b12 2670 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 2671
1b9e5b12
LP
2672 if (!mkdtemp(template)) {
2673 log_error("Failed to create temporary directory: %m");
2674 r = -errno;
6b9132a9 2675 goto finish;
1b9e5b12 2676 }
6b9132a9 2677
1b9e5b12
LP
2678 arg_directory = strdup(template);
2679 if (!arg_directory) {
2680 r = log_oom();
2681 goto finish;
6b9132a9 2682 }
88213476 2683
1b9e5b12
LP
2684 image_fd = setup_image(&device_path, &loop_nr);
2685 if (image_fd < 0) {
2686 r = image_fd;
842f3b0f
LP
2687 goto finish;
2688 }
1b9e5b12 2689
727fd4fd 2690 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
1b9e5b12
LP
2691 if (r < 0)
2692 goto finish;
842f3b0f 2693 }
842f3b0f 2694
db7feb7e
LP
2695 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2696 if (master < 0) {
a258bf26
LP
2697 log_error("Failed to acquire pseudo tty: %m");
2698 goto finish;
2699 }
2700
db7feb7e
LP
2701 console = ptsname(master);
2702 if (!console) {
a258bf26
LP
2703 log_error("Failed to determine tty name: %m");
2704 goto finish;
2705 }
2706
284c0b91 2707 if (!arg_quiet)
1b9e5b12 2708 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
a258bf26
LP
2709
2710 if (unlockpt(master) < 0) {
2711 log_error("Failed to unlock tty: %m");
2712 goto finish;
2713 }
2714
eb91eb18
LP
2715 if (access("/dev/kdbus/control", F_OK) >= 0) {
2716
2717 if (arg_share_system) {
2718 kdbus_domain = strdup("/dev/kdbus");
2719 if (!kdbus_domain) {
2720 log_oom();
2721 goto finish;
2722 }
2723 } else {
2724 const char *ns;
2725
2726 ns = strappenda("machine-", arg_machine);
2727 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2728 if (r < 0)
2729 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2730 else
2731 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2732 }
2733 }
9bd37b40 2734
e58a1277 2735 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
354bfd2b
LP
2736 log_error("Failed to create kmsg socket pair: %m");
2737 goto finish;
2738 }
2739
05947bef
LP
2740 sd_notify(0, "READY=1");
2741
a258bf26
LP
2742 assert_se(sigemptyset(&mask) == 0);
2743 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2744 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2745
d87be9b0 2746 for (;;) {
d96c1ecf 2747 int parent_ready_fd = -1, child_ready_fd = -1;
d87be9b0 2748 siginfo_t status;
d96c1ecf 2749 eventfd_t x;
a383724e 2750
d96c1ecf
LP
2751 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2752 if (parent_ready_fd < 0) {
2753 log_error("Failed to create event fd: %m");
2754 goto finish;
2755 }
2756
2757 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2758 if (child_ready_fd < 0) {
40ddbdf8
LP
2759 log_error("Failed to create event fd: %m");
2760 goto finish;
2761 }
2762
8a96d94e
LP
2763 pid = syscall(__NR_clone,
2764 SIGCHLD|CLONE_NEWNS|
2765 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2766 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
2767 if (pid < 0) {
2768 if (errno == EINVAL)
2769 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2770 else
2771 log_error("clone() failed: %m");
a258bf26 2772
d87be9b0
LP
2773 goto finish;
2774 }
a258bf26 2775
d87be9b0
LP
2776 if (pid == 0) {
2777 /* child */
0cb9fbcd 2778 _cleanup_free_ char *home = NULL;
5674767e 2779 unsigned n_env = 2;
d87be9b0 2780 const char *envp[] = {
e10a55fd 2781 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
2782 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2783 NULL, /* TERM */
2784 NULL, /* HOME */
2785 NULL, /* USER */
2786 NULL, /* LOGNAME */
2787 NULL, /* container_uuid */
842f3b0f
LP
2788 NULL, /* LISTEN_FDS */
2789 NULL, /* LISTEN_PID */
d87be9b0
LP
2790 NULL
2791 };
f4889f65 2792 char **env_use;
a258bf26 2793
5674767e
ZJS
2794 envp[n_env] = strv_find_prefix(environ, "TERM=");
2795 if (envp[n_env])
2796 n_env ++;
a258bf26 2797
03e334a1 2798 master = safe_close(master);
a258bf26 2799
d87be9b0
LP
2800 close_nointr(STDIN_FILENO);
2801 close_nointr(STDOUT_FILENO);
2802 close_nointr(STDERR_FILENO);
db7feb7e 2803
03e334a1 2804 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
a258bf26 2805
d87be9b0 2806 reset_all_signal_handlers();
88213476 2807
d87be9b0
LP
2808 assert_se(sigemptyset(&mask) == 0);
2809 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 2810
842f3b0f
LP
2811 k = open_terminal(console, O_RDWR);
2812 if (k != STDIN_FILENO) {
2813 if (k >= 0) {
03e334a1 2814 safe_close(k);
842f3b0f
LP
2815 k = -EINVAL;
2816 }
2817
2818 log_error("Failed to open console: %s", strerror(-k));
2819 goto child_fail;
2820 }
2821
2822 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2823 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2824 log_error("Failed to duplicate console: %m");
d87be9b0 2825 goto child_fail;
842f3b0f 2826 }
bc2f673e 2827
d87be9b0
LP
2828 if (setsid() < 0) {
2829 log_error("setsid() failed: %m");
bc2f673e
LP
2830 goto child_fail;
2831 }
2832
db999e0f
LP
2833 if (reset_audit_loginuid() < 0)
2834 goto child_fail;
2835
d87be9b0
LP
2836 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2837 log_error("PR_SET_PDEATHSIG failed: %m");
2838 goto child_fail;
2839 }
e58a1277 2840
d87be9b0
LP
2841 /* Mark everything as slave, so that we still
2842 * receive mounts from the real root, but don't
2843 * propagate mounts to the real root. */
2844 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2845 log_error("MS_SLAVE|MS_REC failed: %m");
2846 goto child_fail;
2847 }
04bc4a3f 2848
727fd4fd
LP
2849 if (mount_devices(arg_directory,
2850 root_device, root_device_rw,
2851 home_device, home_device_rw,
2852 srv_device, srv_device_rw) < 0)
1b9e5b12
LP
2853 goto child_fail;
2854
d87be9b0
LP
2855 /* Turn directory into bind mount */
2856 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2857 log_error("Failed to make bind mount.");
2858 goto child_fail;
2859 }
88213476 2860
d87be9b0
LP
2861 if (arg_read_only)
2862 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2863 log_error("Failed to make read-only.");
2864 goto child_fail;
2865 }
2547bb41 2866
d87be9b0
LP
2867 if (mount_all(arg_directory) < 0)
2868 goto child_fail;
57fb9fb5 2869
d87be9b0
LP
2870 if (copy_devnodes(arg_directory) < 0)
2871 goto child_fail;
a258bf26 2872
f2d88580
LP
2873 if (setup_ptmx(arg_directory) < 0)
2874 goto child_fail;
2875
d87be9b0 2876 dev_setup(arg_directory);
88213476 2877
24fb1112
LP
2878 if (audit_still_doesnt_work_in_containers() < 0)
2879 goto child_fail;
2880
d87be9b0
LP
2881 if (setup_dev_console(arg_directory, console) < 0)
2882 goto child_fail;
88213476 2883
d87be9b0
LP
2884 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2885 goto child_fail;
88213476 2886
03e334a1 2887 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 2888
d87be9b0
LP
2889 if (setup_boot_id(arg_directory) < 0)
2890 goto child_fail;
a41fe3a2 2891
d87be9b0
LP
2892 if (setup_timezone(arg_directory) < 0)
2893 goto child_fail;
88213476 2894
d87be9b0
LP
2895 if (setup_resolv_conf(arg_directory) < 0)
2896 goto child_fail;
687d0825 2897
d87be9b0 2898 if (setup_journal(arg_directory) < 0)
687d0825 2899 goto child_fail;
687d0825 2900
17fe0523
LP
2901 if (mount_binds(arg_directory, arg_bind, 0) < 0)
2902 goto child_fail;
2903
2904 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2905 goto child_fail;
2906
486e99a3 2907 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
9bd37b40
LP
2908 goto child_fail;
2909
d96c1ecf
LP
2910 /* Tell the parent that we are ready, and that
2911 * it can cgroupify us to that we lack access
2912 * to certain devices and resources. */
2913 eventfd_write(child_ready_fd, 1);
03e334a1 2914 child_ready_fd = safe_close(child_ready_fd);
d96c1ecf 2915
d87be9b0
LP
2916 if (chdir(arg_directory) < 0) {
2917 log_error("chdir(%s) failed: %m", arg_directory);
687d0825
MV
2918 goto child_fail;
2919 }
2920
d87be9b0
LP
2921 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2922 log_error("mount(MS_MOVE) failed: %m");
687d0825
MV
2923 goto child_fail;
2924 }
2925
d87be9b0
LP
2926 if (chroot(".") < 0) {
2927 log_error("chroot() failed: %m");
687d0825
MV
2928 goto child_fail;
2929 }
2930
d87be9b0
LP
2931 if (chdir("/") < 0) {
2932 log_error("chdir() failed: %m");
687d0825
MV
2933 goto child_fail;
2934 }
2935
d87be9b0
LP
2936 umask(0022);
2937
eb91eb18
LP
2938 if (arg_private_network)
2939 loopback_setup();
d87be9b0
LP
2940
2941 if (drop_capabilities() < 0) {
2942 log_error("drop_capabilities() failed: %m");
687d0825
MV
2943 goto child_fail;
2944 }
687d0825 2945
0cb9fbcd
LP
2946 r = change_uid_gid(&home);
2947 if (r < 0)
2948 goto child_fail;
d87be9b0 2949
842f3b0f
LP
2950 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2951 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2952 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 2953 log_oom();
144f0fc0
LP
2954 goto child_fail;
2955 }
687d0825 2956
9444b1f2 2957 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
2958 char as_uuid[37];
2959
2960 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f
LP
2961 log_oom();
2962 goto child_fail;
2963 }
2964 }
2965
2966 if (fdset_size(fds) > 0) {
2967 k = fdset_cloexec(fds, false);
2968 if (k < 0) {
2969 log_error("Failed to unset O_CLOEXEC for file descriptors.");
2970 goto child_fail;
2971 }
2972
2973 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 2974 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0
LP
2975 log_oom();
2976 goto child_fail;
2977 }
2978 }
2979
2980 setup_hostname();
2981
6afc95b7
LP
2982 if (arg_personality != 0xffffffffLU) {
2983 if (personality(arg_personality) < 0) {
2984 log_error("personality() failed: %m");
2985 goto child_fail;
2986 }
1b9e5b12
LP
2987 } else if (secondary) {
2988 if (personality(PER_LINUX32) < 0) {
2989 log_error("personality() failed: %m");
2990 goto child_fail;
2991 }
6afc95b7
LP
2992 }
2993
d96c1ecf
LP
2994#ifdef HAVE_SELINUX
2995 if (arg_selinux_context)
0cb9fbcd 2996 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
d96c1ecf 2997 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
0cb9fbcd
LP
2998 goto child_fail;
2999 }
d96c1ecf 3000#endif
354bfd2b 3001
f4889f65
LP
3002 if (!strv_isempty(arg_setenv)) {
3003 char **n;
3004
3005 n = strv_env_merge(2, envp, arg_setenv);
3006 if (!n) {
3007 log_oom();
3008 goto child_fail;
3009 }
3010
3011 env_use = n;
3012 } else
3013 env_use = (char**) envp;
3014
d96c1ecf
LP
3015 /* Wait until the parent is ready with the setup, too... */
3016 eventfd_read(parent_ready_fd, &x);
03e334a1 3017 parent_ready_fd = safe_close(parent_ready_fd);
d96c1ecf 3018
d87be9b0
LP
3019 if (arg_boot) {
3020 char **a;
3021 size_t l;
88213476 3022
d87be9b0 3023 /* Automatically search for the init system */
0f0dbc46 3024
d87be9b0
LP
3025 l = 1 + argc - optind;
3026 a = newa(char*, l + 1);
3027 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 3028
d87be9b0 3029 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 3030 execve(a[0], a, env_use);
0f0dbc46 3031
d87be9b0 3032 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 3033 execve(a[0], a, env_use);
0f0dbc46 3034
d87be9b0 3035 a[0] = (char*) "/sbin/init";
f4889f65 3036 execve(a[0], a, env_use);
d87be9b0 3037 } else if (argc > optind)
f4889f65 3038 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
3039 else {
3040 chdir(home ? home : "/root");
f4889f65 3041 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 3042 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
3043 }
3044
3045 log_error("execv() failed: %m");
0f0dbc46 3046
d87be9b0
LP
3047 child_fail:
3048 _exit(EXIT_FAILURE);
da5b3bad 3049 }
88213476 3050
842f3b0f
LP
3051 fdset_free(fds);
3052 fds = NULL;
3053
d96c1ecf 3054 /* Wait until the child reported that it is ready with
f1721625 3055 * all it needs to do with privileges. After we got
d96c1ecf
LP
3056 * the notification we can make the process join its
3057 * cgroup which might limit what it can do */
3058 eventfd_read(child_ready_fd, &x);
3059
354bfd2b
LP
3060 r = register_machine(pid);
3061 if (r < 0)
3062 goto finish;
3063
aa28aefe
LP
3064 r = move_network_interfaces(pid);
3065 if (r < 0)
3066 goto finish;
3067
ab046dde
TG
3068 r = setup_veth(pid, veth_name);
3069 if (r < 0)
3070 goto finish;
3071
3072 r = setup_bridge(veth_name);
3073 if (r < 0)
3074 goto finish;
3075
c74e630d
LP
3076 r = setup_macvlan(pid);
3077 if (r < 0)
3078 goto finish;
3079
d96c1ecf
LP
3080 /* Notify the child that the parent is ready with all
3081 * its setup, and thtat the child can now hand over
3082 * control to the code to run inside the container. */
3083 eventfd_write(parent_ready_fd, 1);
354bfd2b 3084
04d39279
LP
3085 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3086 if (k < 0) {
3087 r = EXIT_FAILURE;
3088 break;
3089 }
88213476 3090
284c0b91
LP
3091 if (!arg_quiet)
3092 putc('\n', stdout);
04d39279
LP
3093
3094 /* Kill if it is not dead yet anyway */
1f0cd86b
LP
3095 terminate_machine(pid);
3096
3097 /* Redundant, but better safe than sorry */
04d39279 3098 kill(pid, SIGKILL);
a258bf26 3099
05947bef 3100 k = wait_for_terminate(pid, &status);
04d39279
LP
3101 pid = 0;
3102
05947bef 3103 if (k < 0) {
d87be9b0
LP
3104 r = EXIT_FAILURE;
3105 break;
3106 }
a258bf26 3107
d87be9b0 3108 if (status.si_code == CLD_EXITED) {
a5f5f8a0 3109 r = status.si_status;
d87be9b0 3110 if (status.si_status != 0) {
04d39279 3111 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
d87be9b0
LP
3112 break;
3113 }
3114
284c0b91
LP
3115 if (!arg_quiet)
3116 log_debug("Container %s exited successfully.", arg_machine);
d87be9b0
LP
3117 break;
3118 } else if (status.si_code == CLD_KILLED &&
3119 status.si_status == SIGINT) {
284c0b91
LP
3120
3121 if (!arg_quiet)
3122 log_info("Container %s has been shut down.", arg_machine);
d87be9b0
LP
3123 r = 0;
3124 break;
3125 } else if (status.si_code == CLD_KILLED &&
3126 status.si_status == SIGHUP) {
284c0b91
LP
3127
3128 if (!arg_quiet)
3129 log_info("Container %s is being rebooted.", arg_machine);
d87be9b0
LP
3130 continue;
3131 } else if (status.si_code == CLD_KILLED ||
3132 status.si_code == CLD_DUMPED) {
88213476 3133
eb91eb18 3134 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
d87be9b0
LP
3135 r = EXIT_FAILURE;
3136 break;
3137 } else {
04d39279 3138 log_error("Container %s failed due to unknown reason.", arg_machine);
d87be9b0
LP
3139 r = EXIT_FAILURE;
3140 break;
3141 }
3142 }
88213476
LP
3143
3144finish:
1b9e5b12
LP
3145 loop_remove(loop_nr, &image_fd);
3146
9444b1f2
LP
3147 if (pid > 0)
3148 kill(pid, SIGKILL);
88213476 3149
04d391da 3150 free(arg_directory);
7027ff61 3151 free(arg_machine);
c74e630d
LP
3152 free(arg_user);
3153 strv_free(arg_setenv);
3154 strv_free(arg_network_interfaces);
3155 strv_free(arg_network_macvlan);
3156 strv_free(arg_bind);
3157 strv_free(arg_bind_ro);
88213476
LP
3158
3159 return r;
3160}