]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
gpt-auto-generator: there's no point in looking for a superblock on raw disk, we...
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <termios.h>
37#include <sys/signalfd.h>
687d0825 38#include <grp.h>
5ed27dbd 39#include <linux/fs.h>
9537eab0
LP
40#include <sys/un.h>
41#include <sys/socket.h>
aea38d80 42#include <linux/netlink.h>
354bfd2b 43#include <sys/eventfd.h>
aa28aefe 44#include <net/if.h>
69c79d3c 45#include <linux/veth.h>
6afc95b7 46#include <sys/personality.h>
1b9e5b12 47#include <linux/loop.h>
aa28aefe 48
5d63309c 49#ifdef HAVE_SELINUX
a8828ed9
DW
50#include <selinux/selinux.h>
51#endif
88213476 52
24fb1112
LP
53#ifdef HAVE_SECCOMP
54#include <seccomp.h>
55#endif
56
1b9e5b12
LP
57#ifdef HAVE_BLKID
58#include <blkid/blkid.h>
59#endif
60
1f0cd86b
LP
61#include "sd-daemon.h"
62#include "sd-bus.h"
63#include "sd-id128.h"
aa28aefe 64#include "sd-rtnl.h"
88213476
LP
65#include "log.h"
66#include "util.h"
49e942b2 67#include "mkdir.h"
6b2d0e85 68#include "macro.h"
d7832d2c 69#include "audit.h"
94d82985 70#include "missing.h"
04d391da 71#include "cgroup-util.h"
a258bf26 72#include "strv.h"
9eb977db 73#include "path-util.h"
a41fe3a2 74#include "loopback-setup.h"
4fc9982c 75#include "dev-setup.h"
842f3b0f 76#include "fdset.h"
acbeb427 77#include "build.h"
a5c32cff 78#include "fileio.h"
40ca29a1 79#include "bus-util.h"
1f0cd86b 80#include "bus-error.h"
4ba93280 81#include "ptyfwd.h"
9bd37b40 82#include "bus-kernel.h"
f4889f65 83#include "env-util.h"
7f112f50 84#include "def.h"
aa28aefe 85#include "rtnl-util.h"
7e227024 86#include "udev-util.h"
1b9e5b12
LP
87#include "blkid-util.h"
88#include "gpt.h"
f2d88580 89
e9642be2
LP
90#ifdef HAVE_SECCOMP
91#include "seccomp-util.h"
92#endif
93
57fb9fb5
LP
/* Journal linking mode between host and container, chosen with
 * --link-journal= (or the -j shortcut) on the command line. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; this is the built-in default */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also what -j selects */
} LinkJournal;
88213476
LP
100
101static char *arg_directory = NULL;
687d0825 102static char *arg_user = NULL;
9444b1f2 103static sd_id128_t arg_uuid = {};
7027ff61 104static char *arg_machine = NULL;
c74e630d
LP
105static const char *arg_selinux_context = NULL;
106static const char *arg_selinux_apifs_context = NULL;
9444b1f2 107static const char *arg_slice = NULL;
ff01d048 108static bool arg_private_network = false;
bc2f673e 109static bool arg_read_only = false;
0f0dbc46 110static bool arg_boot = false;
57fb9fb5 111static LinkJournal arg_link_journal = LINK_AUTO;
5076f0cc
LP
112static uint64_t arg_retain =
113 (1ULL << CAP_CHOWN) |
114 (1ULL << CAP_DAC_OVERRIDE) |
115 (1ULL << CAP_DAC_READ_SEARCH) |
116 (1ULL << CAP_FOWNER) |
117 (1ULL << CAP_FSETID) |
118 (1ULL << CAP_IPC_OWNER) |
119 (1ULL << CAP_KILL) |
120 (1ULL << CAP_LEASE) |
121 (1ULL << CAP_LINUX_IMMUTABLE) |
122 (1ULL << CAP_NET_BIND_SERVICE) |
123 (1ULL << CAP_NET_BROADCAST) |
124 (1ULL << CAP_NET_RAW) |
125 (1ULL << CAP_SETGID) |
126 (1ULL << CAP_SETFCAP) |
127 (1ULL << CAP_SETPCAP) |
128 (1ULL << CAP_SETUID) |
129 (1ULL << CAP_SYS_ADMIN) |
130 (1ULL << CAP_SYS_CHROOT) |
131 (1ULL << CAP_SYS_NICE) |
132 (1ULL << CAP_SYS_PTRACE) |
133 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 134 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
135 (1ULL << CAP_SYS_BOOT) |
136 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
137 (1ULL << CAP_AUDIT_CONTROL) |
138 (1ULL << CAP_MKNOD);
17fe0523
LP
139static char **arg_bind = NULL;
140static char **arg_bind_ro = NULL;
f4889f65 141static char **arg_setenv = NULL;
284c0b91 142static bool arg_quiet = false;
8a96d94e 143static bool arg_share_system = false;
eb91eb18 144static bool arg_register = true;
89f7c846 145static bool arg_keep_unit = false;
aa28aefe 146static char **arg_network_interfaces = NULL;
c74e630d 147static char **arg_network_macvlan = NULL;
69c79d3c 148static bool arg_network_veth = false;
c74e630d 149static const char *arg_network_bridge = NULL;
6afc95b7 150static unsigned long arg_personality = 0xffffffffLU;
1b9e5b12 151static const char *arg_image = NULL;
88213476
LP
152
153static int help(void) {
154
155 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
156 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
157 " -h --help Show this help\n"
158 " --version Print version string\n"
69c79d3c 159 " -q --quiet Do not show status information\n"
1b9e5b12
LP
160 " -D --directory=PATH Root directory for the container\n"
161 " -i --image=PATH File system device or image for the container\n"
a8828ed9
DW
162 " -b --boot Boot up full system (i.e. invoke init)\n"
163 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 164 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 165 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 166 " -S --slice=SLICE Place the container in the specified slice\n"
69c79d3c
LP
167 " --private-network Disable network in container\n"
168 " --network-interface=INTERFACE\n"
169 " Assign an existing network interface to the\n"
170 " container\n"
c74e630d
LP
171 " --network-macvlan=INTERFACE\n"
172 " Create a macvlan network interface based on an\n"
173 " existing network interface to the container\n"
32457153 174 " --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 175 " and container\n"
ab046dde 176 " --network-bridge=INTERFACE\n"
32457153 177 " Add a virtual ethernet connection between host\n"
ab046dde
TG
178 " and container and add it to an existing bridge on\n"
179 " the host\n"
82adf6af
LP
180 " -Z --selinux-context=SECLABEL\n"
181 " Set the SELinux security context to be used by\n"
182 " processes in the container\n"
183 " -L --selinux-apifs-context=SECLABEL\n"
184 " Set the SELinux security context to be used by\n"
185 " API/tmpfs file systems in the container\n"
a8828ed9
DW
186 " --capability=CAP In addition to the default, retain specified\n"
187 " capability\n"
188 " --drop-capability=CAP Drop the specified capability from the default set\n"
189 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
190 " -j Equivalent to --link-journal=host\n"
69c79d3c 191 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
192 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
193 " the container\n"
194 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
284c0b91 195 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 196 " --share-system Share system namespaces with host\n"
eb91eb18 197 " --register=BOOLEAN Register container as machine\n"
89f7c846 198 " --keep-unit Do not register a scope for the machine, reuse\n"
69c79d3c 199 " the service unit nspawn is running in\n",
88213476
LP
200 program_invocation_short_name);
201
202 return 0;
203}
204
205static int parse_argv(int argc, char *argv[]) {
206
a41fe3a2 207 enum {
acbeb427
ZJS
208 ARG_VERSION = 0x100,
209 ARG_PRIVATE_NETWORK,
bc2f673e 210 ARG_UUID,
5076f0cc 211 ARG_READ_ONLY,
57fb9fb5 212 ARG_CAPABILITY,
420c7379 213 ARG_DROP_CAPABILITY,
17fe0523
LP
214 ARG_LINK_JOURNAL,
215 ARG_BIND,
f4889f65
LP
216 ARG_BIND_RO,
217 ARG_SETENV,
eb91eb18 218 ARG_SHARE_SYSTEM,
89f7c846 219 ARG_REGISTER,
aa28aefe 220 ARG_KEEP_UNIT,
69c79d3c 221 ARG_NETWORK_INTERFACE,
c74e630d 222 ARG_NETWORK_MACVLAN,
69c79d3c 223 ARG_NETWORK_VETH,
ab046dde 224 ARG_NETWORK_BRIDGE,
6afc95b7 225 ARG_PERSONALITY,
a41fe3a2
LP
226 };
227
88213476 228 static const struct option options[] = {
aa28aefe
LP
229 { "help", no_argument, NULL, 'h' },
230 { "version", no_argument, NULL, ARG_VERSION },
231 { "directory", required_argument, NULL, 'D' },
232 { "user", required_argument, NULL, 'u' },
233 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
234 { "boot", no_argument, NULL, 'b' },
235 { "uuid", required_argument, NULL, ARG_UUID },
236 { "read-only", no_argument, NULL, ARG_READ_ONLY },
237 { "capability", required_argument, NULL, ARG_CAPABILITY },
238 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
239 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
240 { "bind", required_argument, NULL, ARG_BIND },
241 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
242 { "machine", required_argument, NULL, 'M' },
243 { "slice", required_argument, NULL, 'S' },
244 { "setenv", required_argument, NULL, ARG_SETENV },
245 { "selinux-context", required_argument, NULL, 'Z' },
246 { "selinux-apifs-context", required_argument, NULL, 'L' },
247 { "quiet", no_argument, NULL, 'q' },
248 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
249 { "register", required_argument, NULL, ARG_REGISTER },
250 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
251 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 252 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
ab046dde
TG
253 { "network-veth", no_argument, NULL, ARG_NETWORK_VETH },
254 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 255 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 256 { "image", required_argument, NULL, 'i' },
eb9da376 257 {}
88213476
LP
258 };
259
9444b1f2 260 int c, r;
a42c8b54 261 uint64_t plus = 0, minus = 0;
88213476
LP
262
263 assert(argc >= 0);
264 assert(argv);
265
1b9e5b12 266 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
88213476
LP
267
268 switch (c) {
269
270 case 'h':
eb9da376 271 return help();
88213476 272
acbeb427
ZJS
273 case ARG_VERSION:
274 puts(PACKAGE_STRING);
275 puts(SYSTEMD_FEATURES);
276 return 0;
277
88213476
LP
278 case 'D':
279 free(arg_directory);
3a74cea5
LP
280 arg_directory = canonicalize_file_name(optarg);
281 if (!arg_directory) {
898d5c91 282 log_error("Invalid root directory: %m");
88213476
LP
283 return -ENOMEM;
284 }
285
286 break;
287
1b9e5b12
LP
288 case 'i':
289 arg_image = optarg;
290 break;
291
687d0825
MV
292 case 'u':
293 free(arg_user);
7027ff61
LP
294 arg_user = strdup(optarg);
295 if (!arg_user)
296 return log_oom();
687d0825
MV
297
298 break;
299
ab046dde 300 case ARG_NETWORK_BRIDGE:
c74e630d 301 arg_network_bridge = optarg;
ab046dde
TG
302
303 /* fall through */
304
69c79d3c
LP
305 case ARG_NETWORK_VETH:
306 arg_network_veth = true;
307 arg_private_network = true;
308 break;
309
aa28aefe 310 case ARG_NETWORK_INTERFACE:
c74e630d
LP
311 if (strv_extend(&arg_network_interfaces, optarg) < 0)
312 return log_oom();
313
314 arg_private_network = true;
315 break;
316
317 case ARG_NETWORK_MACVLAN:
318 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
319 return log_oom();
320
321 /* fall through */
322
ff01d048
LP
323 case ARG_PRIVATE_NETWORK:
324 arg_private_network = true;
a41fe3a2
LP
325 break;
326
0f0dbc46
LP
327 case 'b':
328 arg_boot = true;
329 break;
330
144f0fc0 331 case ARG_UUID:
9444b1f2
LP
332 r = sd_id128_from_string(optarg, &arg_uuid);
333 if (r < 0) {
aa96c6cb 334 log_error("Invalid UUID: %s", optarg);
9444b1f2 335 return r;
aa96c6cb 336 }
9444b1f2 337 break;
aa96c6cb 338
9444b1f2 339 case 'S':
c74e630d 340 arg_slice = optarg;
144f0fc0
LP
341 break;
342
7027ff61 343 case 'M':
eb91eb18
LP
344 if (isempty(optarg)) {
345 free(arg_machine);
346 arg_machine = NULL;
347 } else {
7027ff61 348
eb91eb18
LP
349 if (!hostname_is_valid(optarg)) {
350 log_error("Invalid machine name: %s", optarg);
351 return -EINVAL;
352 }
7027ff61 353
eb91eb18
LP
354 free(arg_machine);
355 arg_machine = strdup(optarg);
356 if (!arg_machine)
357 return log_oom();
358
359 break;
360 }
7027ff61 361
82adf6af
LP
362 case 'Z':
363 arg_selinux_context = optarg;
a8828ed9
DW
364 break;
365
82adf6af
LP
366 case 'L':
367 arg_selinux_apifs_context = optarg;
a8828ed9
DW
368 break;
369
bc2f673e
LP
370 case ARG_READ_ONLY:
371 arg_read_only = true;
372 break;
373
420c7379
LP
374 case ARG_CAPABILITY:
375 case ARG_DROP_CAPABILITY: {
5076f0cc
LP
376 char *state, *word;
377 size_t length;
378
379 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 380 _cleanup_free_ char *t;
5076f0cc 381 cap_value_t cap;
5076f0cc
LP
382
383 t = strndup(word, length);
0d0f0c50
SL
384 if (!t)
385 return log_oom();
5076f0cc 386
39ed67d1
LP
387 if (streq(t, "all")) {
388 if (c == ARG_CAPABILITY)
a42c8b54 389 plus = (uint64_t) -1;
39ed67d1 390 else
a42c8b54 391 minus = (uint64_t) -1;
39ed67d1
LP
392 } else {
393 if (cap_from_name(t, &cap) < 0) {
394 log_error("Failed to parse capability %s.", t);
395 return -EINVAL;
396 }
397
398 if (c == ARG_CAPABILITY)
a42c8b54 399 plus |= 1ULL << (uint64_t) cap;
39ed67d1 400 else
a42c8b54 401 minus |= 1ULL << (uint64_t) cap;
5076f0cc 402 }
5076f0cc
LP
403 }
404
405 break;
406 }
407
57fb9fb5
LP
408 case 'j':
409 arg_link_journal = LINK_GUEST;
410 break;
411
412 case ARG_LINK_JOURNAL:
413 if (streq(optarg, "auto"))
414 arg_link_journal = LINK_AUTO;
415 else if (streq(optarg, "no"))
416 arg_link_journal = LINK_NO;
417 else if (streq(optarg, "guest"))
418 arg_link_journal = LINK_GUEST;
419 else if (streq(optarg, "host"))
420 arg_link_journal = LINK_HOST;
421 else {
422 log_error("Failed to parse link journal mode %s", optarg);
423 return -EINVAL;
424 }
425
426 break;
427
17fe0523
LP
428 case ARG_BIND:
429 case ARG_BIND_RO: {
430 _cleanup_free_ char *a = NULL, *b = NULL;
431 char *e;
432 char ***x;
17fe0523
LP
433
434 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
435
436 e = strchr(optarg, ':');
437 if (e) {
438 a = strndup(optarg, e - optarg);
439 b = strdup(e + 1);
440 } else {
441 a = strdup(optarg);
442 b = strdup(optarg);
443 }
444
445 if (!a || !b)
446 return log_oom();
447
448 if (!path_is_absolute(a) || !path_is_absolute(b)) {
449 log_error("Invalid bind mount specification: %s", optarg);
450 return -EINVAL;
451 }
452
453 r = strv_extend(x, a);
454 if (r < 0)
b3451bed 455 return log_oom();
17fe0523
LP
456
457 r = strv_extend(x, b);
458 if (r < 0)
b3451bed 459 return log_oom();
17fe0523
LP
460
461 break;
462 }
463
f4889f65
LP
464 case ARG_SETENV: {
465 char **n;
466
467 if (!env_assignment_is_valid(optarg)) {
468 log_error("Environment variable assignment '%s' is not valid.", optarg);
469 return -EINVAL;
470 }
471
472 n = strv_env_set(arg_setenv, optarg);
473 if (!n)
474 return log_oom();
475
476 strv_free(arg_setenv);
477 arg_setenv = n;
478 break;
479 }
480
284c0b91
LP
481 case 'q':
482 arg_quiet = true;
483 break;
484
8a96d94e
LP
485 case ARG_SHARE_SYSTEM:
486 arg_share_system = true;
487 break;
488
eb91eb18
LP
489 case ARG_REGISTER:
490 r = parse_boolean(optarg);
491 if (r < 0) {
492 log_error("Failed to parse --register= argument: %s", optarg);
493 return r;
494 }
495
496 arg_register = r;
497 break;
498
89f7c846
LP
499 case ARG_KEEP_UNIT:
500 arg_keep_unit = true;
501 break;
502
6afc95b7
LP
503 case ARG_PERSONALITY:
504
ac45f971 505 arg_personality = personality_from_string(optarg);
6afc95b7
LP
506 if (arg_personality == 0xffffffffLU) {
507 log_error("Unknown or unsupported personality '%s'.", optarg);
508 return -EINVAL;
509 }
510
511 break;
512
88213476
LP
513 case '?':
514 return -EINVAL;
515
516 default:
eb9da376 517 assert_not_reached("Unhandled option");
88213476
LP
518 }
519 }
520
eb91eb18
LP
521 if (arg_share_system)
522 arg_register = false;
523
524 if (arg_boot && arg_share_system) {
525 log_error("--boot and --share-system may not be combined.");
526 return -EINVAL;
527 }
528
89f7c846
LP
529 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
530 log_error("--keep-unit may not be used when invoked from a user session.");
531 return -EINVAL;
532 }
533
1b9e5b12
LP
534 if (arg_directory && arg_image) {
535 log_error("--directory= and --image= may not be combined.");
536 return -EINVAL;
537 }
538
a42c8b54
LP
539 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
540
88213476
LP
541 return 1;
542}
543
544static int mount_all(const char *dest) {
545
546 typedef struct MountPoint {
547 const char *what;
548 const char *where;
549 const char *type;
550 const char *options;
551 unsigned long flags;
3bd66c05 552 bool fatal;
88213476
LP
553 } MountPoint;
554
555 static const MountPoint mount_table[] = {
4b7a6af4 556 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
b4c59701
LP
557 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
558 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
e65aec12 559 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
635f7d8c 560 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 561 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
ede89845 562 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
635f7d8c 563 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
9b634ea5 564#ifdef HAVE_SELINUX
b4c59701
LP
565 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
566 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 567#endif
88213476
LP
568 };
569
570 unsigned k;
571 int r = 0;
572
573 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 574 _cleanup_free_ char *where = NULL;
d002827b 575#ifdef HAVE_SELINUX
a8828ed9 576 _cleanup_free_ char *options = NULL;
d002827b
LP
577#endif
578 const char *o;
88213476
LP
579 int t;
580
17fe0523
LP
581 where = strjoin(dest, "/", mount_table[k].where, NULL);
582 if (!where)
583 return log_oom();
88213476 584
e65aec12 585 t = path_is_mount_point(where, true);
68fb0892 586 if (t < 0) {
88213476 587 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
88213476
LP
588
589 if (r == 0)
590 r = t;
591
592 continue;
593 }
594
9c1c7f71
LP
595 /* Skip this entry if it is not a remount. */
596 if (mount_table[k].what && t > 0)
014a9c77
LP
597 continue;
598
17fe0523 599 mkdir_p(where, 0755);
88213476 600
a8828ed9 601#ifdef HAVE_SELINUX
82adf6af
LP
602 if (arg_selinux_apifs_context &&
603 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
604 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
605 if (!options)
606 return log_oom();
607
608 o = options;
609 } else
a8828ed9 610#endif
d002827b 611 o = mount_table[k].options;
a8828ed9 612
a8828ed9 613
88213476
LP
614 if (mount(mount_table[k].what,
615 where,
616 mount_table[k].type,
617 mount_table[k].flags,
d002827b 618 o) < 0 &&
3bd66c05 619 mount_table[k].fatal) {
88213476
LP
620
621 log_error("mount(%s) failed: %m", where);
622
623 if (r == 0)
624 r = -errno;
625 }
88213476
LP
626 }
627
e58a1277
LP
628 return r;
629}
f8440af5 630
17fe0523
LP
631static int mount_binds(const char *dest, char **l, unsigned long flags) {
632 char **x, **y;
633
634 STRV_FOREACH_PAIR(x, y, l) {
2ed4e5e0 635 char *where;
d2421337 636 struct stat source_st, dest_st;
2ed4e5e0 637 int r;
d2421337
DR
638
639 if (stat(*x, &source_st) < 0) {
1b9e5b12 640 log_error("Failed to stat %s: %m", *x);
d2421337
DR
641 return -errno;
642 }
17fe0523 643
2ed4e5e0
SL
644 where = strappenda(dest, *y);
645 r = stat(where, &dest_st);
646 if (r == 0) {
d2421337 647 if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
cecf24e7 648 log_error("The file types of %s and %s do not match. Refusing bind mount",
d2421337
DR
649 *x, where);
650 return -EINVAL;
651 }
2ed4e5e0
SL
652 } else if (errno == ENOENT) {
653 r = mkdir_parents_label(where, 0755);
654 if (r < 0) {
655 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
656 return r;
d2421337 657 }
2ed4e5e0
SL
658 } else {
659 log_error("Failed to bind mount %s: %s", *x, strerror(errno));
660 return -errno;
661 }
662 /* Create the mount point, but be conservative -- refuse to create block
663 * and char devices. */
664 if (S_ISDIR(source_st.st_mode))
665 mkdir_label(where, 0755);
666 else if (S_ISFIFO(source_st.st_mode))
667 mkfifo(where, 0644);
668 else if (S_ISSOCK(source_st.st_mode))
669 mknod(where, 0644 | S_IFSOCK, 0);
670 else if (S_ISREG(source_st.st_mode))
671 touch(where);
672 else {
673 log_error("Refusing to create mountpoint for file: %s", *x);
674 return -ENOTSUP;
d2421337 675 }
17fe0523
LP
676
677 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
678 log_error("mount(%s) failed: %m", where);
679 return -errno;
680 }
681
682 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
683 log_error("mount(%s) failed: %m", where);
684 return -errno;
685 }
686 }
687
688 return 0;
689}
690
e58a1277 691static int setup_timezone(const char *dest) {
d4036145
LP
692 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
693 char *z, *y;
694 int r;
f8440af5 695
e58a1277
LP
696 assert(dest);
697
698 /* Fix the timezone, if possible */
d4036145
LP
699 r = readlink_malloc("/etc/localtime", &p);
700 if (r < 0) {
701 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
702 return 0;
703 }
704
705 z = path_startswith(p, "../usr/share/zoneinfo/");
706 if (!z)
707 z = path_startswith(p, "/usr/share/zoneinfo/");
708 if (!z) {
709 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
710 return 0;
711 }
712
04bc4a3f
LP
713 where = strappend(dest, "/etc/localtime");
714 if (!where)
0d0f0c50 715 return log_oom();
715ac17a 716
d4036145
LP
717 r = readlink_malloc(where, &q);
718 if (r >= 0) {
719 y = path_startswith(q, "../usr/share/zoneinfo/");
720 if (!y)
721 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 722
4d1c38b8 723
d4036145
LP
724 /* Already pointing to the right place? Then do nothing .. */
725 if (y && streq(y, z))
726 return 0;
727 }
728
729 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
730 if (!check)
0d0f0c50 731 return log_oom();
4d1c38b8 732
d4036145
LP
733 if (access(check, F_OK) < 0) {
734 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
735 return 0;
736 }
68fb0892 737
d4036145
LP
738 what = strappend("../usr/share/zoneinfo/", z);
739 if (!what)
740 return log_oom();
741
742 unlink(where);
743 if (symlink(what, where) < 0) {
744 log_error("Failed to correct timezone of container: %m");
745 return 0;
746 }
e58a1277
LP
747
748 return 0;
88213476
LP
749}
750
2547bb41 751static int setup_resolv_conf(const char *dest) {
f333fbb1 752 char _cleanup_free_ *where = NULL;
2547bb41
LP
753
754 assert(dest);
755
756 if (arg_private_network)
757 return 0;
758
759 /* Fix resolv.conf, if possible */
04bc4a3f
LP
760 where = strappend(dest, "/etc/resolv.conf");
761 if (!where)
0d0f0c50 762 return log_oom();
2547bb41 763
77e63faf
LP
764 /* We don't really care for the results of this really. If it
765 * fails, it fails, but meh... */
51045322 766 copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
2547bb41
LP
767
768 return 0;
769}
770
04bc4a3f 771static int setup_boot_id(const char *dest) {
7fd1b19b 772 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 773 sd_id128_t rnd = {};
04bc4a3f
LP
774 char as_uuid[37];
775 int r;
776
777 assert(dest);
778
eb91eb18
LP
779 if (arg_share_system)
780 return 0;
781
04bc4a3f
LP
782 /* Generate a new randomized boot ID, so that each boot-up of
783 * the container gets a new one */
784
785 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 786 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
787 if (!from || !to)
788 return log_oom();
04bc4a3f
LP
789
790 r = sd_id128_randomize(&rnd);
791 if (r < 0) {
792 log_error("Failed to generate random boot id: %s", strerror(-r));
ed8b7a3e 793 return r;
04bc4a3f
LP
794 }
795
796 snprintf(as_uuid, sizeof(as_uuid),
797 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
798 SD_ID128_FORMAT_VAL(rnd));
799 char_array_0(as_uuid);
800
574d5f2d 801 r = write_string_file(from, as_uuid);
04bc4a3f
LP
802 if (r < 0) {
803 log_error("Failed to write boot id: %s", strerror(-r));
ed8b7a3e 804 return r;
04bc4a3f
LP
805 }
806
807 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
808 log_error("Failed to bind mount boot id: %m");
809 r = -errno;
10d18763
ZJS
810 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
811 log_warning("Failed to make boot id read-only: %m");
04bc4a3f
LP
812
813 unlink(from);
04bc4a3f
LP
814 return r;
815}
816
e58a1277 817static int copy_devnodes(const char *dest) {
88213476
LP
818
819 static const char devnodes[] =
820 "null\0"
821 "zero\0"
822 "full\0"
823 "random\0"
824 "urandom\0"
f2d88580 825 "tty\0";
88213476
LP
826
827 const char *d;
e58a1277 828 int r = 0;
7fd1b19b 829 _cleanup_umask_ mode_t u;
a258bf26
LP
830
831 assert(dest);
124640f1
LP
832
833 u = umask(0000);
88213476
LP
834
835 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 836 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 837 struct stat st;
88213476 838
7f112f50
LP
839 from = strappend("/dev/", d);
840 to = strjoin(dest, "/dev/", d, NULL);
841 if (!from || !to)
842 return log_oom();
88213476
LP
843
844 if (stat(from, &st) < 0) {
845
846 if (errno != ENOENT) {
847 log_error("Failed to stat %s: %m", from);
7f112f50 848 return -errno;
88213476
LP
849 }
850
a258bf26 851 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 852
ed8b7a3e 853 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 854 return -EIO;
a258bf26
LP
855
856 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
857
858 log_error("mknod(%s) failed: %m", dest);
7f112f50 859 return -errno;
88213476 860 }
88213476
LP
861 }
862
e58a1277
LP
863 return r;
864}
88213476 865
f2d88580
LP
866static int setup_ptmx(const char *dest) {
867 _cleanup_free_ char *p = NULL;
868
869 p = strappend(dest, "/dev/ptmx");
870 if (!p)
871 return log_oom();
872
873 if (symlink("pts/ptmx", p) < 0) {
874 log_error("Failed to create /dev/ptmx symlink: %m");
875 return -errno;
876 }
877
878 return 0;
879}
880
e58a1277 881static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
882 _cleanup_umask_ mode_t u;
883 const char *to;
e58a1277 884 struct stat st;
e58a1277 885 int r;
e58a1277
LP
886
887 assert(dest);
888 assert(console);
889
890 u = umask(0000);
891
eb0f0863
LP
892 if (stat("/dev/null", &st) < 0) {
893 log_error("Failed to stat /dev/null: %m");
25ea79fe 894 return -errno;
e58a1277 895 }
88213476 896
e58a1277
LP
897 r = chmod_and_chown(console, 0600, 0, 0);
898 if (r < 0) {
899 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
25ea79fe 900 return r;
a258bf26 901 }
88213476 902
a258bf26
LP
903 /* We need to bind mount the right tty to /dev/console since
904 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
905 * to bind mount things on we create a device node first, and
906 * use /dev/null for that since we the cgroups device policy
907 * allows us to create that freely, while we cannot create
908 * /dev/console. (Note that the major minor doesn't actually
909 * matter here, since we mount it over anyway). */
a258bf26 910
eb0f0863 911 to = strappenda(dest, "/dev/console");
e58a1277
LP
912 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
913 log_error("mknod() for /dev/console failed: %m");
25ea79fe 914 return -errno;
e58a1277 915 }
a258bf26
LP
916
917 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
e58a1277 918 log_error("Bind mount for /dev/console failed: %m");
25ea79fe 919 return -errno;
a258bf26
LP
920 }
921
25ea79fe 922 return 0;
e58a1277
LP
923}
924
925static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 926 _cleanup_free_ char *from = NULL, *to = NULL;
e58a1277 927 int r, fd, k;
7fd1b19b 928 _cleanup_umask_ mode_t u;
e58a1277
LP
929 union {
930 struct cmsghdr cmsghdr;
931 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
932 } control = {};
933 struct msghdr mh = {
934 .msg_control = &control,
935 .msg_controllen = sizeof(control),
936 };
e58a1277
LP
937 struct cmsghdr *cmsg;
938
939 assert(dest);
940 assert(kmsg_socket >= 0);
a258bf26 941
e58a1277 942 u = umask(0000);
a258bf26 943
f1e5dfe2
LP
944 /* We create the kmsg FIFO as /dev/kmsg, but immediately
945 * delete it after bind mounting it to /proc/kmsg. While FIFOs
946 * on the reading side behave very similar to /proc/kmsg,
947 * their writing side behaves differently from /dev/kmsg in
948 * that writing blocks when nothing is reading. In order to
949 * avoid any problems with containers deadlocking due to this
950 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
951 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
952 asprintf(&to, "%s/proc/kmsg", dest) < 0)
953 return log_oom();
e58a1277
LP
954
955 if (mkfifo(from, 0600) < 0) {
956 log_error("mkfifo() for /dev/kmsg failed: %m");
25ea79fe 957 return -errno;
e58a1277
LP
958 }
959
960 r = chmod_and_chown(from, 0600, 0, 0);
961 if (r < 0) {
962 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
25ea79fe 963 return r;
e58a1277
LP
964 }
965
966 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
967 log_error("Bind mount for /proc/kmsg failed: %m");
25ea79fe 968 return -errno;
e58a1277
LP
969 }
970
971 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
972 if (fd < 0) {
973 log_error("Failed to open fifo: %m");
25ea79fe 974 return -errno;
e58a1277
LP
975 }
976
e58a1277
LP
977 cmsg = CMSG_FIRSTHDR(&mh);
978 cmsg->cmsg_level = SOL_SOCKET;
979 cmsg->cmsg_type = SCM_RIGHTS;
980 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
981 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
982
983 mh.msg_controllen = cmsg->cmsg_len;
984
985 /* Store away the fd in the socket, so that it stays open as
986 * long as we run the child */
987 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
988 close_nointr_nofail(fd);
989
990 if (k < 0) {
991 log_error("Failed to send FIFO fd: %m");
25ea79fe 992 return -errno;
a258bf26
LP
993 }
994
f1e5dfe2
LP
995 /* And now make the FIFO unavailable as /dev/kmsg... */
996 unlink(from);
25ea79fe 997 return 0;
88213476
LP
998}
999
3a74cea5 1000static int setup_hostname(void) {
3a74cea5 1001
eb91eb18
LP
1002 if (arg_share_system)
1003 return 0;
1004
7027ff61
LP
1005 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1006 return -errno;
3a74cea5 1007
7027ff61 1008 return 0;
3a74cea5
LP
1009}
1010
57fb9fb5 1011static int setup_journal(const char *directory) {
4d680aee 1012 sd_id128_t machine_id, this_id;
7fd1b19b 1013 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1014 char *id;
57fb9fb5
LP
1015 int r;
1016
57fb9fb5 1017 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1018 if (!p)
1019 return log_oom();
57fb9fb5
LP
1020
1021 r = read_one_line_file(p, &b);
27407a01
ZJS
1022 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1023 return 0;
1024 else if (r < 0) {
1025 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
57fb9fb5
LP
1026 return r;
1027 }
1028
27407a01
ZJS
1029 id = strstrip(b);
1030 if (isempty(id) && arg_link_journal == LINK_AUTO)
1031 return 0;
57fb9fb5 1032
27407a01
ZJS
1033 /* Verify validity */
1034 r = sd_id128_from_string(id, &machine_id);
57fb9fb5 1035 if (r < 0) {
27407a01
ZJS
1036 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1037 return r;
57fb9fb5
LP
1038 }
1039
4d680aee
ZJS
1040 r = sd_id128_get_machine(&this_id);
1041 if (r < 0) {
1042 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1043 return r;
1044 }
1045
1046 if (sd_id128_equal(machine_id, this_id)) {
1047 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1048 "Host and machine ids are equal (%s): refusing to link journals", id);
1049 if (arg_link_journal == LINK_AUTO)
1050 return 0;
1051 return
1052 -EEXIST;
1053 }
1054
1055 if (arg_link_journal == LINK_NO)
1056 return 0;
1057
57fb9fb5 1058 free(p);
27407a01
ZJS
1059 p = strappend("/var/log/journal/", id);
1060 q = strjoin(directory, "/var/log/journal/", id, NULL);
1061 if (!p || !q)
1062 return log_oom();
1063
1064 if (path_is_mount_point(p, false) > 0) {
1065 if (arg_link_journal != LINK_AUTO) {
1066 log_error("%s: already a mount point, refusing to use for journal", p);
1067 return -EEXIST;
1068 }
1069
1070 return 0;
57fb9fb5
LP
1071 }
1072
27407a01 1073 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1074 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1075 log_error("%s: already a mount point, refusing to use for journal", q);
1076 return -EEXIST;
57fb9fb5
LP
1077 }
1078
27407a01 1079 return 0;
57fb9fb5
LP
1080 }
1081
1082 r = readlink_and_make_absolute(p, &d);
1083 if (r >= 0) {
1084 if ((arg_link_journal == LINK_GUEST ||
1085 arg_link_journal == LINK_AUTO) &&
1086 path_equal(d, q)) {
1087
27407a01
ZJS
1088 r = mkdir_p(q, 0755);
1089 if (r < 0)
1090 log_warning("failed to create directory %s: %m", q);
1091 return 0;
57fb9fb5
LP
1092 }
1093
1094 if (unlink(p) < 0) {
1095 log_error("Failed to remove symlink %s: %m", p);
27407a01 1096 return -errno;
57fb9fb5
LP
1097 }
1098 } else if (r == -EINVAL) {
1099
1100 if (arg_link_journal == LINK_GUEST &&
1101 rmdir(p) < 0) {
1102
27407a01
ZJS
1103 if (errno == ENOTDIR) {
1104 log_error("%s already exists and is neither a symlink nor a directory", p);
1105 return r;
1106 } else {
57fb9fb5 1107 log_error("Failed to remove %s: %m", p);
27407a01 1108 return -errno;
57fb9fb5 1109 }
57fb9fb5
LP
1110 }
1111 } else if (r != -ENOENT) {
1112 log_error("readlink(%s) failed: %m", p);
27407a01 1113 return r;
57fb9fb5
LP
1114 }
1115
1116 if (arg_link_journal == LINK_GUEST) {
1117
1118 if (symlink(q, p) < 0) {
1119 log_error("Failed to symlink %s to %s: %m", q, p);
27407a01 1120 return -errno;
57fb9fb5
LP
1121 }
1122
27407a01
ZJS
1123 r = mkdir_p(q, 0755);
1124 if (r < 0)
1125 log_warning("failed to create directory %s: %m", q);
1126 return 0;
57fb9fb5
LP
1127 }
1128
1129 if (arg_link_journal == LINK_HOST) {
1130 r = mkdir_p(p, 0755);
1131 if (r < 0) {
1132 log_error("Failed to create %s: %m", p);
27407a01 1133 return r;
57fb9fb5
LP
1134 }
1135
27407a01
ZJS
1136 } else if (access(p, F_OK) < 0)
1137 return 0;
57fb9fb5
LP
1138
1139 if (dir_is_empty(q) == 0) {
1140 log_error("%s not empty.", q);
27407a01 1141 return -ENOTEMPTY;
57fb9fb5
LP
1142 }
1143
1144 r = mkdir_p(q, 0755);
1145 if (r < 0) {
1146 log_error("Failed to create %s: %m", q);
27407a01 1147 return r;
57fb9fb5
LP
1148 }
1149
1150 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1151 log_error("Failed to bind mount journal from host into guest: %m");
27407a01 1152 return -errno;
57fb9fb5
LP
1153 }
1154
27407a01 1155 return 0;
57fb9fb5
LP
1156}
1157
9bd37b40
LP
1158static int setup_kdbus(const char *dest, const char *path) {
1159 const char *p;
1160
1161 if (!path)
1162 return 0;
1163
1164 p = strappenda(dest, "/dev/kdbus");
1165 if (mkdir(p, 0755) < 0) {
1166 log_error("Failed to create kdbus path: %m");
1167 return -errno;
1168 }
1169
1170 if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
486e99a3 1171 log_error("Failed to mount kdbus domain path: %m");
9bd37b40
LP
1172 return -errno;
1173 }
1174
1175 return 0;
1176}
1177
88213476 1178static int drop_capabilities(void) {
5076f0cc 1179 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1180}
1181
354bfd2b 1182static int register_machine(pid_t pid) {
9444b1f2
LP
1183 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1184 _cleanup_bus_unref_ sd_bus *bus = NULL;
1185 int r;
1186
eb91eb18
LP
1187 if (!arg_register)
1188 return 0;
1189
1c03020c 1190 r = sd_bus_default_system(&bus);
9444b1f2
LP
1191 if (r < 0) {
1192 log_error("Failed to open system bus: %s", strerror(-r));
1193 return r;
1194 }
1195
89f7c846
LP
1196 if (arg_keep_unit) {
1197 r = sd_bus_call_method(
1198 bus,
1199 "org.freedesktop.machine1",
1200 "/org/freedesktop/machine1",
1201 "org.freedesktop.machine1.Manager",
1202 "RegisterMachine",
1203 &error,
1204 NULL,
1205 "sayssus",
1206 arg_machine,
1207 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1208 "nspawn",
1209 "container",
1210 (uint32_t) pid,
1211 strempty(arg_directory));
1212 } else {
9457ac5b
LP
1213 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1214
1215 r = sd_bus_message_new_method_call(
89f7c846 1216 bus,
9457ac5b 1217 &m,
89f7c846
LP
1218 "org.freedesktop.machine1",
1219 "/org/freedesktop/machine1",
1220 "org.freedesktop.machine1.Manager",
9457ac5b
LP
1221 "CreateMachine");
1222 if (r < 0) {
1223 log_error("Failed to create message: %s", strerror(-r));
1224 return r;
1225 }
1226
1227 r = sd_bus_message_append(
1228 m,
1229 "sayssus",
89f7c846
LP
1230 arg_machine,
1231 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1232 "nspawn",
1233 "container",
1234 (uint32_t) pid,
9457ac5b
LP
1235 strempty(arg_directory));
1236 if (r < 0) {
1237 log_error("Failed to append message arguments: %s", strerror(-r));
1238 return r;
1239 }
1240
1241 r = sd_bus_message_open_container(m, 'a', "(sv)");
1242 if (r < 0) {
1243 log_error("Failed to open container: %s", strerror(-r));
1244 return r;
1245 }
1246
1247 if (!isempty(arg_slice)) {
1248 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1249 if (r < 0) {
1250 log_error("Failed to append slice: %s", strerror(-r));
1251 return r;
1252 }
1253 }
1254
1255 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1256 if (r < 0) {
1257 log_error("Failed to add device policy: %s", strerror(-r));
1258 return r;
1259 }
1260
a07f961e 1261 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
9457ac5b
LP
1262 /* Allow the container to
1263 * access and create the API
1264 * device nodes, so that
1265 * PrivateDevices= in the
1266 * container can work
1267 * fine */
1268 "/dev/null", "rwm",
1269 "/dev/zero", "rwm",
1270 "/dev/full", "rwm",
1271 "/dev/random", "rwm",
1272 "/dev/urandom", "rwm",
1273 "/dev/tty", "rwm",
1274 /* Allow the container
1275 * access to ptys. However,
1276 * do not permit the
1277 * container to ever create
1278 * these device nodes. */
1279 "/dev/pts/ptmx", "rw",
a07f961e
LP
1280 "char-pts", "rw",
1281 /* Allow the container
1282 * access to all kdbus
1283 * devices. Again, the
1284 * container cannot create
1285 * these nodes, only use
1286 * them. We use a pretty
1287 * open match here, so that
1288 * the kernel API can still
1289 * change. */
1290 "char-kdbus", "rw",
1291 "char-kdbus/*", "rw");
9457ac5b
LP
1292 if (r < 0) {
1293 log_error("Failed to add device whitelist: %s", strerror(-r));
1294 return r;
1295 }
1296
1297 r = sd_bus_message_close_container(m);
1298 if (r < 0) {
1299 log_error("Failed to close container: %s", strerror(-r));
1300 return r;
1301 }
1302
1303 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1304 }
1305
9444b1f2 1306 if (r < 0) {
1f0cd86b
LP
1307 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1308 return r;
1309 }
1310
1311 return 0;
1312}
1313
1314static int terminate_machine(pid_t pid) {
1315 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1316 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1317 _cleanup_bus_unref_ sd_bus *bus = NULL;
1318 const char *path;
1319 int r;
1320
eb91eb18
LP
1321 if (!arg_register)
1322 return 0;
1323
76b54375 1324 r = sd_bus_default_system(&bus);
1f0cd86b
LP
1325 if (r < 0) {
1326 log_error("Failed to open system bus: %s", strerror(-r));
1327 return r;
1328 }
1329
1330 r = sd_bus_call_method(
1331 bus,
1332 "org.freedesktop.machine1",
1333 "/org/freedesktop/machine1",
1334 "org.freedesktop.machine1.Manager",
1335 "GetMachineByPID",
1336 &error,
1337 &reply,
1338 "u",
1339 (uint32_t) pid);
1340 if (r < 0) {
1341 /* Note that the machine might already have been
1342 * cleaned up automatically, hence don't consider it a
1343 * failure if we cannot get the machine object. */
1344 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1345 return 0;
1346 }
1347
1348 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1349 if (r < 0)
1350 return bus_log_parse_error(r);
9444b1f2 1351
1f0cd86b
LP
1352 r = sd_bus_call_method(
1353 bus,
1354 "org.freedesktop.machine1",
1355 path,
1356 "org.freedesktop.machine1.Machine",
1357 "Terminate",
1358 &error,
1359 NULL,
1360 NULL);
1361 if (r < 0) {
1362 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1363 return 0;
1364 }
1365
9444b1f2
LP
1366 return 0;
1367}
1368
db999e0f
LP
1369static int reset_audit_loginuid(void) {
1370 _cleanup_free_ char *p = NULL;
1371 int r;
1372
1373 if (arg_share_system)
1374 return 0;
1375
1376 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1377 if (r == -ENOENT)
db999e0f
LP
1378 return 0;
1379 if (r < 0) {
1380 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1381 return r;
1382 }
1383
1384 /* Already reset? */
1385 if (streq(p, "4294967295"))
1386 return 0;
1387
1388 r = write_string_file("/proc/self/loginuid", "4294967295");
1389 if (r < 0) {
1390 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1391 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1392 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1393 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1394 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 1395
db999e0f 1396 sleep(5);
77b6e194 1397 }
db999e0f
LP
1398
1399 return 0;
77b6e194
LP
1400}
1401
08af0da2 1402static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
69c79d3c 1403 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 1404 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
69c79d3c
LP
1405 int r;
1406
1407 if (!arg_private_network)
1408 return 0;
1409
1410 if (!arg_network_veth)
1411 return 0;
1412
08af0da2
LP
1413 /* Use two different interface name prefixes depending whether
1414 * we are in bridge mode or not. */
1415 if (arg_network_bridge)
1416 memcpy(iface_name, "vb-", 3);
1417 else
1418 memcpy(iface_name, "ve-", 3);
1419
ab046dde 1420 strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
69c79d3c 1421
151b9b96 1422 r = sd_rtnl_open(&rtnl, 0);
69c79d3c
LP
1423 if (r < 0) {
1424 log_error("Failed to connect to netlink: %s", strerror(-r));
1425 return r;
1426 }
1427
151b9b96 1428 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
69c79d3c
LP
1429 if (r < 0) {
1430 log_error("Failed to allocate netlink message: %s", strerror(-r));
1431 return r;
1432 }
1433
ab046dde 1434 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
69c79d3c 1435 if (r < 0) {
ab046dde 1436 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1437 return r;
1438 }
1439
ee3a6a51 1440 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
69c79d3c
LP
1441 if (r < 0) {
1442 log_error("Failed to open netlink container: %s", strerror(-r));
1443 return r;
1444 }
1445
1446 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1447 if (r < 0) {
1448 log_error("Failed to append netlink kind: %s", strerror(-r));
1449 return r;
1450 }
1451
ee3a6a51 1452 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
69c79d3c
LP
1453 if (r < 0) {
1454 log_error("Failed to open netlink container: %s", strerror(-r));
1455 return r;
1456 }
1457
ee3a6a51 1458 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
69c79d3c 1459 if (r < 0) {
ab046dde 1460 log_error("Failed to open netlink container: %s", strerror(-r));
69c79d3c
LP
1461 return r;
1462 }
1463
ab046dde 1464 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
69c79d3c 1465 if (r < 0) {
ab046dde 1466 log_error("Failed to add netlink interface name: %s", strerror(-r));
69c79d3c
LP
1467 return r;
1468 }
1469
ab046dde 1470 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
69c79d3c
LP
1471 if (r < 0) {
1472 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1473 return r;
1474 }
1475
1476 r = sd_rtnl_message_close_container(m);
1477 if (r < 0) {
1478 log_error("Failed to close netlink container: %s", strerror(-r));
1479 return r;
1480 }
1481
1482 r = sd_rtnl_message_close_container(m);
1483 if (r < 0) {
1484 log_error("Failed to close netlink container: %s", strerror(-r));
1485 return r;
1486 }
1487
1488 r = sd_rtnl_message_close_container(m);
1489 if (r < 0) {
1490 log_error("Failed to close netlink container: %s", strerror(-r));
1491 return r;
1492 }
1493
1494 r = sd_rtnl_call(rtnl, m, 0, NULL);
1495 if (r < 0) {
1496 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1497 return r;
1498 }
1499
1500 return 0;
1501}
1502
ab046dde
TG
1503static int setup_bridge(const char veth_name[]) {
1504 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1505 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1506 int r, bridge;
1507
1508 if (!arg_private_network)
1509 return 0;
1510
1511 if (!arg_network_veth)
1512 return 0;
1513
1514 if (!arg_network_bridge)
1515 return 0;
1516
1517 bridge = (int) if_nametoindex(arg_network_bridge);
1518 if (bridge <= 0) {
1519 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1520 return -errno;
1521 }
1522
151b9b96 1523 r = sd_rtnl_open(&rtnl, 0);
ab046dde
TG
1524 if (r < 0) {
1525 log_error("Failed to connect to netlink: %s", strerror(-r));
1526 return r;
1527 }
1528
151b9b96 1529 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
ab046dde
TG
1530 if (r < 0) {
1531 log_error("Failed to allocate netlink message: %s", strerror(-r));
1532 return r;
1533 }
1534
1535 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1536 if (r < 0) {
1537 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1538 return r;
1539 }
1540
1541 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1542 if (r < 0) {
1543 log_error("Failed to add netlink master field: %s", strerror(-r));
1544 return r;
1545 }
1546
1547 r = sd_rtnl_call(rtnl, m, 0, NULL);
1548 if (r < 0) {
1549 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1550 return r;
1551 }
1552
1553 return 0;
1554}
1555
c74e630d
LP
1556static int parse_interface(struct udev *udev, const char *name) {
1557 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1558 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1559 int ifi;
1560
1561 ifi = (int) if_nametoindex(name);
1562 if (ifi <= 0) {
1563 log_error("Failed to resolve interface %s: %m", name);
1564 return -errno;
1565 }
1566
1567 sprintf(ifi_str, "n%i", ifi);
1568 d = udev_device_new_from_device_id(udev, ifi_str);
1569 if (!d) {
1570 log_error("Failed to get udev device for interface %s: %m", name);
1571 return -errno;
1572 }
1573
1574 if (udev_device_get_is_initialized(d) <= 0) {
1575 log_error("Network interface %s is not initialized yet.", name);
1576 return -EBUSY;
1577 }
1578
1579 return ifi;
1580}
1581
69c79d3c 1582static int move_network_interfaces(pid_t pid) {
7e227024 1583 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 1584 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
1585 char **i;
1586 int r;
1587
1588 if (!arg_private_network)
1589 return 0;
1590
1591 if (strv_isempty(arg_network_interfaces))
1592 return 0;
1593
151b9b96 1594 r = sd_rtnl_open(&rtnl, 0);
aa28aefe
LP
1595 if (r < 0) {
1596 log_error("Failed to connect to netlink: %s", strerror(-r));
1597 return r;
1598 }
1599
7e227024
LP
1600 udev = udev_new();
1601 if (!udev) {
1602 log_error("Failed to connect to udev.");
1603 return -ENOMEM;
1604 }
1605
aa28aefe 1606 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 1607 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 1608 int ifi;
aa28aefe 1609
c74e630d
LP
1610 ifi = parse_interface(udev, *i);
1611 if (ifi < 0)
1612 return ifi;
1613
1614 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1615 if (r < 0) {
1616 log_error("Failed to allocate netlink message: %s", strerror(-r));
1617 return r;
aa28aefe
LP
1618 }
1619
c74e630d
LP
1620 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1621 if (r < 0) {
1622 log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1623 return r;
7e227024
LP
1624 }
1625
c74e630d
LP
1626 r = sd_rtnl_call(rtnl, m, 0, NULL);
1627 if (r < 0) {
1628 log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1629 return r;
7e227024 1630 }
c74e630d 1631 }
7e227024 1632
c74e630d
LP
1633 return 0;
1634}
1635
1636static int setup_macvlan(pid_t pid) {
1637 _cleanup_udev_unref_ struct udev *udev = NULL;
1638 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1639 char **i;
1640 int r;
1641
1642 if (!arg_private_network)
1643 return 0;
1644
1645 if (strv_isempty(arg_network_macvlan))
1646 return 0;
1647
1648 r = sd_rtnl_open(&rtnl, 0);
1649 if (r < 0) {
1650 log_error("Failed to connect to netlink: %s", strerror(-r));
1651 return r;
1652 }
1653
1654 udev = udev_new();
1655 if (!udev) {
1656 log_error("Failed to connect to udev.");
1657 return -ENOMEM;
1658 }
1659
1660 STRV_FOREACH(i, arg_network_macvlan) {
1661 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1662 _cleanup_free_ char *n = NULL;
1663 int ifi;
1664
1665 ifi = parse_interface(udev, *i);
1666 if (ifi < 0)
1667 return ifi;
1668
1669 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
aa28aefe
LP
1670 if (r < 0) {
1671 log_error("Failed to allocate netlink message: %s", strerror(-r));
1672 return r;
1673 }
1674
c74e630d
LP
1675 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1676 if (r < 0) {
1677 log_error("Failed to add netlink interface index: %s", strerror(-r));
1678 return r;
1679 }
1680
1681 n = strappend("mv-", *i);
1682 if (!n)
1683 return log_oom();
1684
1685 strshorten(n, IFNAMSIZ-1);
1686
1687 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1688 if (r < 0) {
1689 log_error("Failed to add netlink interface name: %s", strerror(-r));
1690 return r;
1691 }
1692
aa28aefe
LP
1693 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1694 if (r < 0) {
c74e630d
LP
1695 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1696 return r;
1697 }
1698
1699 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1700 if (r < 0) {
1701 log_error("Failed to open netlink container: %s", strerror(-r));
1702 return r;
1703 }
1704
1705 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "macvlan");
1706 if (r < 0) {
1707 log_error("Failed to append netlink kind: %s", strerror(-r));
1708 return r;
1709 }
1710
1711 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1712 if (r < 0) {
1713 log_error("Failed to open netlink container: %s", strerror(-r));
1714 return r;
1715 }
1716
1717 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1718 if (r < 0) {
1719 log_error("Failed to append macvlan mode: %s", strerror(-r));
1720 return r;
1721 }
1722
1723 r = sd_rtnl_message_close_container(m);
1724 if (r < 0) {
1725 log_error("Failed to close netlink container: %s", strerror(-r));
1726 return r;
1727 }
1728
1729 r = sd_rtnl_message_close_container(m);
1730 if (r < 0) {
1731 log_error("Failed to close netlink container: %s", strerror(-r));
aa28aefe
LP
1732 return r;
1733 }
1734
1735 r = sd_rtnl_call(rtnl, m, 0, NULL);
1736 if (r < 0) {
c74e630d 1737 log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
aa28aefe
LP
1738 return r;
1739 }
1740 }
1741
1742 return 0;
1743}
1744
24fb1112
LP
/* Installs a seccomp filter that allows everything except
 * socket(AF_NETLINK, *, NETLINK_AUDIT), which is made to fail with
 * EAFNOSUPPORT. Without HAVE_SECCOMP this is a no-op returning 0.
 * Returns the result of the last seccomp operation, negative on
 * failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx ctx;
        int r;

        /*
          Audit is broken in containers: much of the userspace audit
          hookup fails when running inside of one. We do not care and
          simply turn off creation of audit sockets.

          socket(AF_NETLINK, *, NETLINK_AUDIT) will hence fail with
          EAFNOSUPPORT, which audit userspace takes as indication that
          audit is disabled in the kernel.
        */

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on success and failure alike. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1801
1b9e5b12
LP
1802static int setup_image(char **device_path, int *loop_nr) {
1803 struct loop_info64 info = {
1804 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1805 };
1806 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1807 _cleanup_free_ char* loopdev = NULL;
1808 struct stat st;
1809 int r, nr;
1810
1811 assert(device_path);
1812 assert(loop_nr);
1813
1814 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1815 if (fd < 0) {
1816 log_error("Failed to open %s: %m", arg_image);
1817 return -errno;
1818 }
1819
1820 if (fstat(fd, &st) < 0) {
1821 log_error("Failed to stat %s: %m", arg_image);
1822 return -errno;
1823 }
1824
1825 if (S_ISBLK(st.st_mode)) {
1826 char *p;
1827
1828 p = strdup(arg_image);
1829 if (!p)
1830 return log_oom();
1831
1832 *device_path = p;
1833
1834 *loop_nr = -1;
1835
1836 r = fd;
1837 fd = -1;
1838
1839 return r;
1840 }
1841
1842 if (!S_ISREG(st.st_mode)) {
1843 log_error("%s is not a regular file or block device: %m", arg_image);
1844 return -EINVAL;
1845 }
1846
1847 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1848 if (control < 0) {
1849 log_error("Failed to open /dev/loop-control: %m");
1850 return -errno;
1851 }
1852
1853 nr = ioctl(control, LOOP_CTL_GET_FREE);
1854 if (nr < 0) {
1855 log_error("Failed to allocate loop device: %m");
1856 return -errno;
1857 }
1858
1859 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1860 return log_oom();
1861
1862 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1863 if (loop < 0) {
1864 log_error("Failed to open loop device %s: %m", loopdev);
1865 return -errno;
1866 }
1867
1868 if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1869 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1870 return -errno;
1871 }
1872
1873 if (arg_read_only)
1874 info.lo_flags |= LO_FLAGS_READ_ONLY;
1875
1876 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1877 log_error("Failed to set loopback settings on %s: %m", loopdev);
1878 return -errno;
1879 }
1880
1881 *device_path = loopdev;
1882 loopdev = NULL;
1883
1884 *loop_nr = nr;
1885
1886 r = loop;
1887 loop = -1;
1888
1889 return r;
1890}
1891
/* Probes the block device open as 'fd' for a GPT partition table and
 * picks the partitions to use by their GPT type UUID: GPT_HOME,
 * GPT_SRV, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY (the latter two only
 * if defined). If a type occurs more than once, the partition with the
 * lowest partition number wins.
 *
 * On success returns 0 and stores newly allocated device node paths in
 * *root_device and, if found, *home_device and *srv_device. *secondary
 * tells whether the root partition is of the secondary type. Fails
 * with -EINVAL if there is no GPT or no root partition, and with
 * -ENOTSUP when compiled without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device,
                char **home_device,
                char **srv_device,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        /* errno is cleared before each blkid call, so that "failed
         * without setting errno" can be told apart afterwards. */
        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        /* Enumerate all udev devices that have the disk as parent. */
        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, disk);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *type_string, *node;
                sd_id128_t type_id;
                blkid_partition partition;
                dev_t devnum;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                devnum = udev_device_get_devnum(q);
                if (major(devnum) == 0)
                        continue;

                /* Skip the whole disk device itself. */
                if (st.st_rdev == devnum)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                partition = blkid_partlist_devno_to_partition(partitions, devnum);
                if (!partition)
                        continue;

                nr = blkid_partition_get_partno(partition);
                if (nr < 0)
                        continue;

                type_string = blkid_partition_get_type_string(partition);
                if (!type_string)
                        continue;

                if (sd_id128_from_string(type_string, &type_id) < 0)
                        continue;

                /* In each branch below an already recorded partition
                 * is only replaced by one with a lower number. */
                if (sd_id128_equal(type_id, GPT_HOME)) {

                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Pass ownership of the strings to the caller; the local
         * pointers are reset so that the cleanup handlers do not free
         * them. A native root partition takes precedence. */
        if (root) {
                *root_device = root;
                root = NULL;
                *secondary = false;
        } else {
                /* The check above guarantees secondary_root is set. */
                *root_device = secondary_root;
                secondary_root = NULL;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2118
/* Probes the file system type of the block device 'what' with blkid
 * and mounts it on 'where', or on 'where' with 'directory' appended if
 * 'directory' is not NULL.
 *
 * The mount is always made with MS_NODEV, plus MS_RDONLY if
 * arg_read_only is set. Images carrying "crypto_LUKS" are refused with
 * -ENOTSUP, an undeterminable file system type yields -EINVAL, and
 * when compiled without HAVE_BLKID the function always fails with
 * -ENOTSUP. Returns 0 on success. */
static int mount_device(const char *what, const char *where, const char *directory) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* errno is cleared before each blkid call, so that "failed
         * without setting errno" can be told apart afterwards. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -1 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        /* Device nodes stored in the image are never honoured, no
         * matter whether the image is mounted writable or not. */
        if (mount(what, p, fstype, MS_NODEV|(arg_read_only ? MS_RDONLY : 0), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2181
/* Mounts the given partitions below 'where': the root device on
 * 'where' itself, the home device on <where>/home and the srv device
 * on <where>/srv. Each device may be NULL, in which case it is
 * skipped. Returns 0 on success or the negative errno-style value of
 * the first mount that fails. */
static int mount_devices(const char *where, const char *root_device, const char *home_device, const char *srv_device) {
        int r;

        assert(where);

        if (root_device) {
                r = mount_device(root_device, where, NULL);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home");
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv");
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2213
2214static void loop_remove(int nr, int *image_fd) {
2215 _cleanup_close_ int control = -1;
2216
2217 if (nr < 0)
2218 return;
2219
2220 if (image_fd && *image_fd >= 0) {
2221 ioctl(*image_fd, LOOP_CLR_FD);
2222 close_nointr_nofail(*image_fd);
2223 *image_fd = -1;
2224 }
2225
2226 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2227 if (control < 0)
2228 return;
2229
2230 ioctl(control, LOOP_CTL_REMOVE, nr);
2231}
2232
0cb9fbcd
LP
2233static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2234 int pipe_fds[2];
2235 pid_t pid;
2236
2237 assert(database);
2238 assert(key);
2239 assert(rpid);
2240
2241 if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2242 log_error("Failed to allocate pipe: %m");
2243 return -errno;
2244 }
2245
2246 pid = fork();
2247 if (pid < 0) {
2248 log_error("Failed to fork getent child: %m");
2249 return -errno;
2250 } else if (pid == 0) {
2251 int nullfd;
2252 char *empty_env = NULL;
2253
2254 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2255 _exit(EXIT_FAILURE);
2256
2257 if (pipe_fds[0] > 2)
2258 close_nointr_nofail(pipe_fds[0]);
2259 if (pipe_fds[1] > 2)
2260 close_nointr_nofail(pipe_fds[1]);
2261
2262 nullfd = open("/dev/null", O_RDWR);
2263 if (nullfd < 0)
2264 _exit(EXIT_FAILURE);
2265
2266 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2267 _exit(EXIT_FAILURE);
2268
2269 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2270 _exit(EXIT_FAILURE);
2271
2272 if (nullfd > 2)
2273 close_nointr_nofail(nullfd);
2274
2275 reset_all_signal_handlers();
2276 close_all_fds(NULL, 0);
2277
4de82926
MM
2278 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2279 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
2280 _exit(EXIT_FAILURE);
2281 }
2282
2283 close_nointr_nofail(pipe_fds[1]);
2284 pipe_fds[1] = -1;
2285
2286 *rpid = pid;
2287
2288 return pipe_fds[0];
2289}
2290
2291static int change_uid_gid(char **_home) {
2292
2293 _cleanup_strv_free_ char **passwd = NULL;
2294 char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2295 _cleanup_free_ uid_t *uids = NULL;
2296 _cleanup_free_ char *home = NULL;
2297 _cleanup_fclose_ FILE *f = NULL;
2298 _cleanup_close_ int fd = -1;
2299 unsigned n_uids = 0;
2300 size_t sz, l;
2301 uid_t uid;
2302 gid_t gid;
2303 pid_t pid;
2304 int r;
2305
2306 assert(_home);
2307
2308 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2309 /* Reset everything fully to 0, just in case */
2310
2311 if (setgroups(0, NULL) < 0) {
2312 log_error("setgroups() failed: %m");
2313 return -errno;
2314 }
2315
2316 if (setresgid(0, 0, 0) < 0) {
2317 log_error("setregid() failed: %m");
2318 return -errno;
2319 }
2320
2321 if (setresuid(0, 0, 0) < 0) {
2322 log_error("setreuid() failed: %m");
2323 return -errno;
2324 }
2325
2326 *_home = NULL;
2327 return 0;
2328 }
2329
2330 /* First, get user credentials */
2331 fd = spawn_getent("passwd", arg_user, &pid);
2332 if (fd < 0)
2333 return fd;
2334
2335 f = fdopen(fd, "r");
2336 if (!f)
2337 return log_oom();
2338 fd = -1;
2339
2340 if (!fgets(line, sizeof(line), f)) {
2341
2342 if (!ferror(f)) {
2343 log_error("Failed to resolve user %s.", arg_user);
2344 return -ESRCH;
2345 }
2346
2347 log_error("Failed to read from getent: %m");
2348 return -errno;
2349 }
2350
2351 truncate_nl(line);
2352
2353 wait_for_terminate_and_warn("getent passwd", pid);
2354
2355 x = strchr(line, ':');
2356 if (!x) {
2357 log_error("/etc/passwd entry has invalid user field.");
2358 return -EIO;
2359 }
2360
2361 u = strchr(x+1, ':');
2362 if (!u) {
2363 log_error("/etc/passwd entry has invalid password field.");
2364 return -EIO;
2365 }
2366
2367 u++;
2368 g = strchr(u, ':');
2369 if (!g) {
2370 log_error("/etc/passwd entry has invalid UID field.");
2371 return -EIO;
2372 }
2373
2374 *g = 0;
2375 g++;
2376 x = strchr(g, ':');
2377 if (!x) {
2378 log_error("/etc/passwd entry has invalid GID field.");
2379 return -EIO;
2380 }
2381
2382 *x = 0;
2383 h = strchr(x+1, ':');
2384 if (!h) {
2385 log_error("/etc/passwd entry has invalid GECOS field.");
2386 return -EIO;
2387 }
2388
2389 h++;
2390 x = strchr(h, ':');
2391 if (!x) {
2392 log_error("/etc/passwd entry has invalid home directory field.");
2393 return -EIO;
2394 }
2395
2396 *x = 0;
2397
2398 r = parse_uid(u, &uid);
2399 if (r < 0) {
2400 log_error("Failed to parse UID of user.");
2401 return -EIO;
2402 }
2403
2404 r = parse_gid(g, &gid);
2405 if (r < 0) {
2406 log_error("Failed to parse GID of user.");
2407 return -EIO;
2408 }
2409
2410 home = strdup(h);
2411 if (!home)
2412 return log_oom();
2413
2414 /* Second, get group memberships */
2415 fd = spawn_getent("initgroups", arg_user, &pid);
2416 if (fd < 0)
2417 return fd;
2418
2419 fclose(f);
2420 f = fdopen(fd, "r");
2421 if (!f)
2422 return log_oom();
2423 fd = -1;
2424
2425 if (!fgets(line, sizeof(line), f)) {
2426 if (!ferror(f)) {
2427 log_error("Failed to resolve user %s.", arg_user);
2428 return -ESRCH;
2429 }
2430
2431 log_error("Failed to read from getent: %m");
2432 return -errno;
2433 }
2434
2435 truncate_nl(line);
2436
2437 wait_for_terminate_and_warn("getent initgroups", pid);
2438
2439 /* Skip over the username and subsequent separator whitespace */
2440 x = line;
2441 x += strcspn(x, WHITESPACE);
2442 x += strspn(x, WHITESPACE);
2443
2444 FOREACH_WORD(w, l, x, state) {
2445 char c[l+1];
2446
2447 memcpy(c, w, l);
2448 c[l] = 0;
2449
2450 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2451 return log_oom();
2452
2453 r = parse_uid(c, &uids[n_uids++]);
2454 if (r < 0) {
2455 log_error("Failed to parse group data from getent.");
2456 return -EIO;
2457 }
2458 }
2459
2460 r = mkdir_parents(home, 0775);
2461 if (r < 0) {
2462 log_error("Failed to make home root directory: %s", strerror(-r));
2463 return r;
2464 }
2465
2466 r = mkdir_safe(home, 0755, uid, gid);
2467 if (r < 0) {
2468 log_error("Failed to make home directory: %s", strerror(-r));
2469 return r;
2470 }
2471
2472 fchown(STDIN_FILENO, uid, gid);
2473 fchown(STDOUT_FILENO, uid, gid);
2474 fchown(STDERR_FILENO, uid, gid);
2475
2476 if (setgroups(n_uids, uids) < 0) {
2477 log_error("Failed to set auxiliary groups: %m");
2478 return -errno;
2479 }
2480
2481 if (setresgid(gid, gid, gid) < 0) {
2482 log_error("setregid() failed: %m");
2483 return -errno;
2484 }
2485
2486 if (setresuid(uid, uid, uid) < 0) {
2487 log_error("setreuid() failed: %m");
2488 return -errno;
2489 }
2490
2491 if (_home) {
2492 *_home = home;
2493 home = NULL;
2494 }
2495
2496 return 0;
2497}
2498
88213476 2499int main(int argc, char *argv[]) {
69c79d3c 2500
1b9e5b12 2501 _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
d96c1ecf 2502 _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
69c79d3c 2503 _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
69c79d3c 2504 _cleanup_fdset_free_ FDSet *fds = NULL;
1b9e5b12 2505 int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
69c79d3c 2506 const char *console = NULL;
1b9e5b12
LP
2507 char veth_name[IFNAMSIZ];
2508 bool secondary = false;
69c79d3c 2509 pid_t pid = 0;
a258bf26 2510 sigset_t mask;
88213476
LP
2511
2512 log_parse_environment();
2513 log_open();
2514
05947bef
LP
2515 k = parse_argv(argc, argv);
2516 if (k < 0)
88213476 2517 goto finish;
05947bef
LP
2518 else if (k == 0) {
2519 r = EXIT_SUCCESS;
2520 goto finish;
2521 }
88213476 2522
1b9e5b12
LP
2523 if (!arg_image) {
2524 if (arg_directory) {
2525 char *p;
88213476 2526
1b9e5b12
LP
2527 p = path_make_absolute_cwd(arg_directory);
2528 free(arg_directory);
2529 arg_directory = p;
2530 } else
2531 arg_directory = get_current_dir_name();
88213476 2532
1b9e5b12
LP
2533 if (!arg_directory) {
2534 log_error("Failed to determine path, please use -D.");
2535 goto finish;
2536 }
2537 path_kill_slashes(arg_directory);
88213476
LP
2538 }
2539
7027ff61 2540 if (!arg_machine) {
1b9e5b12 2541 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
7027ff61
LP
2542 if (!arg_machine) {
2543 log_oom();
2544 goto finish;
2545 }
2546
e724b063 2547 hostname_cleanup(arg_machine, false);
7027ff61
LP
2548 if (isempty(arg_machine)) {
2549 log_error("Failed to determine machine name automatically, please use -M.");
2550 goto finish;
2551 }
2552 }
2553
88213476
LP
2554 if (geteuid() != 0) {
2555 log_error("Need to be root.");
2556 goto finish;
2557 }
2558
04d391da
LP
2559 if (sd_booted() <= 0) {
2560 log_error("Not running on a systemd system.");
2561 goto finish;
2562 }
2563
1b9e5b12
LP
2564 log_close();
2565 n_fd_passed = sd_listen_fds(false);
2566 if (n_fd_passed > 0) {
2567 k = fdset_new_listen_fds(&fds, false);
2568 if (k < 0) {
2569 log_error("Failed to collect file descriptors: %s", strerror(-k));
2570 goto finish;
2571 }
88213476 2572 }
1b9e5b12
LP
2573 fdset_close_others(fds);
2574 log_open();
88213476 2575
1b9e5b12
LP
2576 if (arg_directory) {
2577 if (path_equal(arg_directory, "/")) {
2578 log_error("Spawning container on root directory not supported.");
6b9132a9
LP
2579 goto finish;
2580 }
1b9e5b12
LP
2581
2582 if (arg_boot) {
2583 if (path_is_os_tree(arg_directory) <= 0) {
2584 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2585 goto finish;
2586 }
2587 } else {
2588 const char *p;
2589
2590 p = strappenda(arg_directory,
2591 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2592 if (access(p, F_OK) < 0) {
2593 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2594 goto finish;
2595
2596 }
2597 }
6b9132a9 2598 } else {
1b9e5b12 2599 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 2600
1b9e5b12
LP
2601 if (!mkdtemp(template)) {
2602 log_error("Failed to create temporary directory: %m");
2603 r = -errno;
6b9132a9 2604 goto finish;
1b9e5b12 2605 }
6b9132a9 2606
1b9e5b12
LP
2607 arg_directory = strdup(template);
2608 if (!arg_directory) {
2609 r = log_oom();
2610 goto finish;
6b9132a9 2611 }
88213476 2612
1b9e5b12
LP
2613 image_fd = setup_image(&device_path, &loop_nr);
2614 if (image_fd < 0) {
2615 r = image_fd;
842f3b0f
LP
2616 goto finish;
2617 }
1b9e5b12
LP
2618
2619 r = dissect_image(image_fd, &root_device, &home_device, &srv_device, &secondary);
2620 if (r < 0)
2621 goto finish;
842f3b0f 2622 }
842f3b0f 2623
db7feb7e
LP
2624 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2625 if (master < 0) {
a258bf26
LP
2626 log_error("Failed to acquire pseudo tty: %m");
2627 goto finish;
2628 }
2629
db7feb7e
LP
2630 console = ptsname(master);
2631 if (!console) {
a258bf26
LP
2632 log_error("Failed to determine tty name: %m");
2633 goto finish;
2634 }
2635
284c0b91 2636 if (!arg_quiet)
1b9e5b12 2637 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
a258bf26
LP
2638
2639 if (unlockpt(master) < 0) {
2640 log_error("Failed to unlock tty: %m");
2641 goto finish;
2642 }
2643
eb91eb18
LP
2644 if (access("/dev/kdbus/control", F_OK) >= 0) {
2645
2646 if (arg_share_system) {
2647 kdbus_domain = strdup("/dev/kdbus");
2648 if (!kdbus_domain) {
2649 log_oom();
2650 goto finish;
2651 }
2652 } else {
2653 const char *ns;
2654
2655 ns = strappenda("machine-", arg_machine);
2656 kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2657 if (r < 0)
2658 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2659 else
2660 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2661 }
2662 }
9bd37b40 2663
e58a1277 2664 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
354bfd2b
LP
2665 log_error("Failed to create kmsg socket pair: %m");
2666 goto finish;
2667 }
2668
05947bef
LP
2669 sd_notify(0, "READY=1");
2670
a258bf26
LP
2671 assert_se(sigemptyset(&mask) == 0);
2672 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2673 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2674
d87be9b0 2675 for (;;) {
d96c1ecf 2676 int parent_ready_fd = -1, child_ready_fd = -1;
d87be9b0 2677 siginfo_t status;
d96c1ecf 2678 eventfd_t x;
a383724e 2679
d96c1ecf
LP
2680 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2681 if (parent_ready_fd < 0) {
2682 log_error("Failed to create event fd: %m");
2683 goto finish;
2684 }
2685
2686 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2687 if (child_ready_fd < 0) {
40ddbdf8
LP
2688 log_error("Failed to create event fd: %m");
2689 goto finish;
2690 }
2691
8a96d94e
LP
2692 pid = syscall(__NR_clone,
2693 SIGCHLD|CLONE_NEWNS|
2694 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2695 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
2696 if (pid < 0) {
2697 if (errno == EINVAL)
2698 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2699 else
2700 log_error("clone() failed: %m");
a258bf26 2701
d87be9b0
LP
2702 goto finish;
2703 }
a258bf26 2704
d87be9b0
LP
2705 if (pid == 0) {
2706 /* child */
0cb9fbcd 2707 _cleanup_free_ char *home = NULL;
5674767e 2708 unsigned n_env = 2;
d87be9b0 2709 const char *envp[] = {
e10a55fd 2710 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
2711 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2712 NULL, /* TERM */
2713 NULL, /* HOME */
2714 NULL, /* USER */
2715 NULL, /* LOGNAME */
2716 NULL, /* container_uuid */
842f3b0f
LP
2717 NULL, /* LISTEN_FDS */
2718 NULL, /* LISTEN_PID */
d87be9b0
LP
2719 NULL
2720 };
f4889f65 2721 char **env_use;
a258bf26 2722
5674767e
ZJS
2723 envp[n_env] = strv_find_prefix(environ, "TERM=");
2724 if (envp[n_env])
2725 n_env ++;
a258bf26 2726
d87be9b0 2727 close_nointr_nofail(master);
842f3b0f 2728 master = -1;
a258bf26 2729
d87be9b0
LP
2730 close_nointr(STDIN_FILENO);
2731 close_nointr(STDOUT_FILENO);
2732 close_nointr(STDERR_FILENO);
db7feb7e 2733
842f3b0f
LP
2734 close_nointr_nofail(kmsg_socket_pair[0]);
2735 kmsg_socket_pair[0] = -1;
a258bf26 2736
d87be9b0 2737 reset_all_signal_handlers();
88213476 2738
d87be9b0
LP
2739 assert_se(sigemptyset(&mask) == 0);
2740 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
f5c1b9ee 2741
842f3b0f
LP
2742 k = open_terminal(console, O_RDWR);
2743 if (k != STDIN_FILENO) {
2744 if (k >= 0) {
2745 close_nointr_nofail(k);
2746 k = -EINVAL;
2747 }
2748
2749 log_error("Failed to open console: %s", strerror(-k));
2750 goto child_fail;
2751 }
2752
2753 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2754 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2755 log_error("Failed to duplicate console: %m");
d87be9b0 2756 goto child_fail;
842f3b0f 2757 }
bc2f673e 2758
d87be9b0
LP
2759 if (setsid() < 0) {
2760 log_error("setsid() failed: %m");
bc2f673e
LP
2761 goto child_fail;
2762 }
2763
db999e0f
LP
2764 if (reset_audit_loginuid() < 0)
2765 goto child_fail;
2766
d87be9b0
LP
2767 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2768 log_error("PR_SET_PDEATHSIG failed: %m");
2769 goto child_fail;
2770 }
e58a1277 2771
d87be9b0
LP
2772 /* Mark everything as slave, so that we still
2773 * receive mounts from the real root, but don't
2774 * propagate mounts to the real root. */
2775 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2776 log_error("MS_SLAVE|MS_REC failed: %m");
2777 goto child_fail;
2778 }
04bc4a3f 2779
1b9e5b12
LP
2780 if (mount_devices(arg_directory, root_device, home_device, srv_device) < 0)
2781 goto child_fail;
2782
d87be9b0
LP
2783 /* Turn directory into bind mount */
2784 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2785 log_error("Failed to make bind mount.");
2786 goto child_fail;
2787 }
88213476 2788
d87be9b0
LP
2789 if (arg_read_only)
2790 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2791 log_error("Failed to make read-only.");
2792 goto child_fail;
2793 }
2547bb41 2794
d87be9b0
LP
2795 if (mount_all(arg_directory) < 0)
2796 goto child_fail;
57fb9fb5 2797
d87be9b0
LP
2798 if (copy_devnodes(arg_directory) < 0)
2799 goto child_fail;
a258bf26 2800
f2d88580
LP
2801 if (setup_ptmx(arg_directory) < 0)
2802 goto child_fail;
2803
d87be9b0 2804 dev_setup(arg_directory);
88213476 2805
24fb1112
LP
2806 if (audit_still_doesnt_work_in_containers() < 0)
2807 goto child_fail;
2808
d87be9b0
LP
2809 if (setup_dev_console(arg_directory, console) < 0)
2810 goto child_fail;
88213476 2811
d87be9b0
LP
2812 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2813 goto child_fail;
88213476 2814
d87be9b0 2815 close_nointr_nofail(kmsg_socket_pair[1]);
842f3b0f 2816 kmsg_socket_pair[1] = -1;
a258bf26 2817
d87be9b0
LP
2818 if (setup_boot_id(arg_directory) < 0)
2819 goto child_fail;
a41fe3a2 2820
d87be9b0
LP
2821 if (setup_timezone(arg_directory) < 0)
2822 goto child_fail;
88213476 2823
d87be9b0
LP
2824 if (setup_resolv_conf(arg_directory) < 0)
2825 goto child_fail;
687d0825 2826
d87be9b0 2827 if (setup_journal(arg_directory) < 0)
687d0825 2828 goto child_fail;
687d0825 2829
17fe0523
LP
2830 if (mount_binds(arg_directory, arg_bind, 0) < 0)
2831 goto child_fail;
2832
2833 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2834 goto child_fail;
2835
486e99a3 2836 if (setup_kdbus(arg_directory, kdbus_domain) < 0)
9bd37b40
LP
2837 goto child_fail;
2838
d96c1ecf
LP
2839 /* Tell the parent that we are ready, and that
2840 * it can cgroupify us to that we lack access
2841 * to certain devices and resources. */
2842 eventfd_write(child_ready_fd, 1);
2843 close_nointr_nofail(child_ready_fd);
2844 child_ready_fd = -1;
2845
d87be9b0
LP
2846 if (chdir(arg_directory) < 0) {
2847 log_error("chdir(%s) failed: %m", arg_directory);
687d0825
MV
2848 goto child_fail;
2849 }
2850
d87be9b0
LP
2851 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2852 log_error("mount(MS_MOVE) failed: %m");
687d0825
MV
2853 goto child_fail;
2854 }
2855
d87be9b0
LP
2856 if (chroot(".") < 0) {
2857 log_error("chroot() failed: %m");
687d0825
MV
2858 goto child_fail;
2859 }
2860
d87be9b0
LP
2861 if (chdir("/") < 0) {
2862 log_error("chdir() failed: %m");
687d0825
MV
2863 goto child_fail;
2864 }
2865
d87be9b0
LP
2866 umask(0022);
2867
eb91eb18
LP
2868 if (arg_private_network)
2869 loopback_setup();
d87be9b0
LP
2870
2871 if (drop_capabilities() < 0) {
2872 log_error("drop_capabilities() failed: %m");
687d0825
MV
2873 goto child_fail;
2874 }
687d0825 2875
0cb9fbcd
LP
2876 r = change_uid_gid(&home);
2877 if (r < 0)
2878 goto child_fail;
d87be9b0 2879
842f3b0f
LP
2880 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2881 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2882 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 2883 log_oom();
144f0fc0
LP
2884 goto child_fail;
2885 }
687d0825 2886
9444b1f2
LP
2887 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2888 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
842f3b0f
LP
2889 log_oom();
2890 goto child_fail;
2891 }
2892 }
2893
2894 if (fdset_size(fds) > 0) {
2895 k = fdset_cloexec(fds, false);
2896 if (k < 0) {
2897 log_error("Failed to unset O_CLOEXEC for file descriptors.");
2898 goto child_fail;
2899 }
2900
2901 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 2902 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0
LP
2903 log_oom();
2904 goto child_fail;
2905 }
2906 }
2907
2908 setup_hostname();
2909
6afc95b7
LP
2910 if (arg_personality != 0xffffffffLU) {
2911 if (personality(arg_personality) < 0) {
2912 log_error("personality() failed: %m");
2913 goto child_fail;
2914 }
1b9e5b12
LP
2915 } else if (secondary) {
2916 if (personality(PER_LINUX32) < 0) {
2917 log_error("personality() failed: %m");
2918 goto child_fail;
2919 }
6afc95b7
LP
2920 }
2921
d96c1ecf
LP
2922#ifdef HAVE_SELINUX
2923 if (arg_selinux_context)
0cb9fbcd 2924 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
d96c1ecf 2925 log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
0cb9fbcd
LP
2926 goto child_fail;
2927 }
d96c1ecf 2928#endif
354bfd2b 2929
f4889f65
LP
2930 if (!strv_isempty(arg_setenv)) {
2931 char **n;
2932
2933 n = strv_env_merge(2, envp, arg_setenv);
2934 if (!n) {
2935 log_oom();
2936 goto child_fail;
2937 }
2938
2939 env_use = n;
2940 } else
2941 env_use = (char**) envp;
2942
d96c1ecf
LP
2943 /* Wait until the parent is ready with the setup, too... */
2944 eventfd_read(parent_ready_fd, &x);
2945 close_nointr_nofail(parent_ready_fd);
2946 parent_ready_fd = -1;
2947
d87be9b0
LP
2948 if (arg_boot) {
2949 char **a;
2950 size_t l;
88213476 2951
d87be9b0 2952 /* Automatically search for the init system */
0f0dbc46 2953
d87be9b0
LP
2954 l = 1 + argc - optind;
2955 a = newa(char*, l + 1);
2956 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 2957
d87be9b0 2958 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 2959 execve(a[0], a, env_use);
0f0dbc46 2960
d87be9b0 2961 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 2962 execve(a[0], a, env_use);
0f0dbc46 2963
d87be9b0 2964 a[0] = (char*) "/sbin/init";
f4889f65 2965 execve(a[0], a, env_use);
d87be9b0 2966 } else if (argc > optind)
f4889f65 2967 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
2968 else {
2969 chdir(home ? home : "/root");
f4889f65 2970 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 2971 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
2972 }
2973
2974 log_error("execv() failed: %m");
0f0dbc46 2975
d87be9b0
LP
2976 child_fail:
2977 _exit(EXIT_FAILURE);
da5b3bad 2978 }
88213476 2979
842f3b0f
LP
2980 fdset_free(fds);
2981 fds = NULL;
2982
d96c1ecf
LP
2983 /* Wait until the child reported that it is ready with
2984 * all it needs to do with priviliges. After we got
2985 * the notification we can make the process join its
2986 * cgroup which might limit what it can do */
2987 eventfd_read(child_ready_fd, &x);
2988
354bfd2b
LP
2989 r = register_machine(pid);
2990 if (r < 0)
2991 goto finish;
2992
aa28aefe
LP
2993 r = move_network_interfaces(pid);
2994 if (r < 0)
2995 goto finish;
2996
ab046dde
TG
2997 r = setup_veth(pid, veth_name);
2998 if (r < 0)
2999 goto finish;
3000
3001 r = setup_bridge(veth_name);
3002 if (r < 0)
3003 goto finish;
3004
c74e630d
LP
3005 r = setup_macvlan(pid);
3006 if (r < 0)
3007 goto finish;
3008
d96c1ecf
LP
3009 /* Notify the child that the parent is ready with all
3010 * its setup, and thtat the child can now hand over
3011 * control to the code to run inside the container. */
3012 eventfd_write(parent_ready_fd, 1);
354bfd2b 3013
04d39279
LP
3014 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3015 if (k < 0) {
3016 r = EXIT_FAILURE;
3017 break;
3018 }
88213476 3019
284c0b91
LP
3020 if (!arg_quiet)
3021 putc('\n', stdout);
04d39279
LP
3022
3023 /* Kill if it is not dead yet anyway */
1f0cd86b
LP
3024 terminate_machine(pid);
3025
3026 /* Redundant, but better safe than sorry */
04d39279 3027 kill(pid, SIGKILL);
a258bf26 3028
05947bef 3029 k = wait_for_terminate(pid, &status);
04d39279
LP
3030 pid = 0;
3031
05947bef 3032 if (k < 0) {
d87be9b0
LP
3033 r = EXIT_FAILURE;
3034 break;
3035 }
a258bf26 3036
d87be9b0 3037 if (status.si_code == CLD_EXITED) {
a5f5f8a0 3038 r = status.si_status;
d87be9b0 3039 if (status.si_status != 0) {
04d39279 3040 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
d87be9b0
LP
3041 break;
3042 }
3043
284c0b91
LP
3044 if (!arg_quiet)
3045 log_debug("Container %s exited successfully.", arg_machine);
d87be9b0
LP
3046 break;
3047 } else if (status.si_code == CLD_KILLED &&
3048 status.si_status == SIGINT) {
284c0b91
LP
3049
3050 if (!arg_quiet)
3051 log_info("Container %s has been shut down.", arg_machine);
d87be9b0
LP
3052 r = 0;
3053 break;
3054 } else if (status.si_code == CLD_KILLED &&
3055 status.si_status == SIGHUP) {
284c0b91
LP
3056
3057 if (!arg_quiet)
3058 log_info("Container %s is being rebooted.", arg_machine);
d87be9b0
LP
3059 continue;
3060 } else if (status.si_code == CLD_KILLED ||
3061 status.si_code == CLD_DUMPED) {
88213476 3062
eb91eb18 3063 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
d87be9b0
LP
3064 r = EXIT_FAILURE;
3065 break;
3066 } else {
04d39279 3067 log_error("Container %s failed due to unknown reason.", arg_machine);
d87be9b0
LP
3068 r = EXIT_FAILURE;
3069 break;
3070 }
3071 }
88213476
LP
3072
3073finish:
1b9e5b12
LP
3074 loop_remove(loop_nr, &image_fd);
3075
9444b1f2
LP
3076 if (pid > 0)
3077 kill(pid, SIGKILL);
88213476 3078
04d391da 3079 free(arg_directory);
7027ff61 3080 free(arg_machine);
c74e630d
LP
3081 free(arg_user);
3082 strv_free(arg_setenv);
3083 strv_free(arg_network_interfaces);
3084 strv_free(arg_network_macvlan);
3085 strv_free(arg_bind);
3086 strv_free(arg_bind_ro);
88213476
LP
3087
3088 return r;
3089}