]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: mount /tmp in the container, don't leave this to the container's init
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
88213476 34#include <getopt.h>
a258bf26
LP
35#include <termios.h>
36#include <sys/signalfd.h>
687d0825 37#include <grp.h>
5ed27dbd 38#include <linux/fs.h>
9537eab0
LP
39#include <sys/un.h>
40#include <sys/socket.h>
aea38d80 41#include <linux/netlink.h>
aa28aefe 42#include <net/if.h>
69c79d3c 43#include <linux/veth.h>
6afc95b7 44#include <sys/personality.h>
1b9e5b12 45#include <linux/loop.h>
2fbe4296
LP
46#include <poll.h>
47#include <sys/file.h>
aa28aefe 48
5d63309c 49#ifdef HAVE_SELINUX
a8828ed9
DW
50#include <selinux/selinux.h>
51#endif
88213476 52
24fb1112
LP
53#ifdef HAVE_SECCOMP
54#include <seccomp.h>
55#endif
56
1b9e5b12
LP
57#ifdef HAVE_BLKID
58#include <blkid/blkid.h>
59#endif
60
1f0cd86b
LP
61#include "sd-daemon.h"
62#include "sd-bus.h"
63#include "sd-id128.h"
aa28aefe 64#include "sd-rtnl.h"
88213476
LP
65#include "log.h"
66#include "util.h"
49e942b2 67#include "mkdir.h"
6b2d0e85 68#include "macro.h"
d7832d2c 69#include "audit.h"
94d82985 70#include "missing.h"
04d391da 71#include "cgroup-util.h"
a258bf26 72#include "strv.h"
9eb977db 73#include "path-util.h"
a41fe3a2 74#include "loopback-setup.h"
4fc9982c 75#include "dev-setup.h"
842f3b0f 76#include "fdset.h"
acbeb427 77#include "build.h"
a5c32cff 78#include "fileio.h"
40ca29a1 79#include "bus-util.h"
1f0cd86b 80#include "bus-error.h"
4ba93280 81#include "ptyfwd.h"
9bd37b40 82#include "bus-kernel.h"
f4889f65 83#include "env-util.h"
7f112f50 84#include "def.h"
aa28aefe 85#include "rtnl-util.h"
7e227024 86#include "udev-util.h"
1b9e5b12
LP
87#include "blkid-util.h"
88#include "gpt.h"
01dde061 89#include "siphash24.h"
849958d1 90#include "copy.h"
3577de7a 91#include "base-filesystem.h"
a2da110b 92#include "barrier.h"
023fb90b 93#include "event-util.h"
f01ae826 94#include "capability.h"
2822da4f 95#include "cap-list.h"
ec16945e 96#include "btrfs-util.h"
1b9cebf6 97#include "machine-image.h"
6d0b55c2
LP
98#include "list.h"
99#include "in-addr-util.h"
100#include "fw-util.h"
101#include "local-addresses.h"
f2d88580 102
e9642be2
LP
103#ifdef HAVE_SECCOMP
104#include "seccomp-util.h"
105#endif
106
6d0b55c2
LP
107typedef struct ExposePort {
108 int protocol;
109 uint16_t host_port;
110 uint16_t container_port;
111 LIST_FIELDS(struct ExposePort, ports);
112} ExposePort;
113
113cea80
DH
/* Final state of a container run.
 * NOTE(review): the consumer of this enum is outside the visible part of
 * the file; presumably it distinguishes a plain exit from a reboot request. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
118
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, also the built-in default */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
88213476 125
4d9f07b4
LP
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,            /* default, or --volatile=<false boolean> */
        VOLATILE_YES,           /* bare --volatile, or --volatile=<true boolean> */
        VOLATILE_STATE,         /* --volatile=state */
} Volatile;
131
88213476 132static char *arg_directory = NULL;
ec16945e 133static char *arg_template = NULL;
687d0825 134static char *arg_user = NULL;
9444b1f2 135static sd_id128_t arg_uuid = {};
7027ff61 136static char *arg_machine = NULL;
c74e630d
LP
137static const char *arg_selinux_context = NULL;
138static const char *arg_selinux_apifs_context = NULL;
9444b1f2 139static const char *arg_slice = NULL;
ff01d048 140static bool arg_private_network = false;
bc2f673e 141static bool arg_read_only = false;
0f0dbc46 142static bool arg_boot = false;
ec16945e 143static bool arg_ephemeral = false;
57fb9fb5 144static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 145static bool arg_link_journal_try = false;
5076f0cc
LP
146static uint64_t arg_retain =
147 (1ULL << CAP_CHOWN) |
148 (1ULL << CAP_DAC_OVERRIDE) |
149 (1ULL << CAP_DAC_READ_SEARCH) |
150 (1ULL << CAP_FOWNER) |
151 (1ULL << CAP_FSETID) |
152 (1ULL << CAP_IPC_OWNER) |
153 (1ULL << CAP_KILL) |
154 (1ULL << CAP_LEASE) |
155 (1ULL << CAP_LINUX_IMMUTABLE) |
156 (1ULL << CAP_NET_BIND_SERVICE) |
157 (1ULL << CAP_NET_BROADCAST) |
158 (1ULL << CAP_NET_RAW) |
159 (1ULL << CAP_SETGID) |
160 (1ULL << CAP_SETFCAP) |
161 (1ULL << CAP_SETPCAP) |
162 (1ULL << CAP_SETUID) |
163 (1ULL << CAP_SYS_ADMIN) |
164 (1ULL << CAP_SYS_CHROOT) |
165 (1ULL << CAP_SYS_NICE) |
166 (1ULL << CAP_SYS_PTRACE) |
167 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 168 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
169 (1ULL << CAP_SYS_BOOT) |
170 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
171 (1ULL << CAP_AUDIT_CONTROL) |
172 (1ULL << CAP_MKNOD);
17fe0523
LP
173static char **arg_bind = NULL;
174static char **arg_bind_ro = NULL;
06c17c39 175static char **arg_tmpfs = NULL;
f4889f65 176static char **arg_setenv = NULL;
284c0b91 177static bool arg_quiet = false;
8a96d94e 178static bool arg_share_system = false;
eb91eb18 179static bool arg_register = true;
89f7c846 180static bool arg_keep_unit = false;
aa28aefe 181static char **arg_network_interfaces = NULL;
c74e630d 182static char **arg_network_macvlan = NULL;
4bbfe7ad 183static char **arg_network_ipvlan = NULL;
69c79d3c 184static bool arg_network_veth = false;
c74e630d 185static const char *arg_network_bridge = NULL;
6afc95b7 186static unsigned long arg_personality = 0xffffffffLU;
ec16945e 187static char *arg_image = NULL;
4d9f07b4 188static Volatile arg_volatile = VOLATILE_NO;
6d0b55c2 189static ExposePort *arg_expose_ports = NULL;
88213476 190
601185b4 191static void help(void) {
88213476
LP
192 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
193 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
194 " -h --help Show this help\n"
195 " --version Print version string\n"
69c79d3c 196 " -q --quiet Do not show status information\n"
1b9e5b12 197 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
198 " --template=PATH Initialize root directory from template directory,\n"
199 " if missing\n"
200 " -x --ephemeral Run container with snapshot of root directory, and\n"
201 " remove it after exit\n"
202 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
203 " -b --boot Boot up full system (i.e. invoke init)\n"
204 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 205 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 206 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 207 " -S --slice=SLICE Place the container in the specified slice\n"
69c79d3c
LP
208 " --private-network Disable network in container\n"
209 " --network-interface=INTERFACE\n"
210 " Assign an existing network interface to the\n"
211 " container\n"
c74e630d
LP
212 " --network-macvlan=INTERFACE\n"
213 " Create a macvlan network interface based on an\n"
214 " existing network interface to the container\n"
4bbfe7ad
TG
215 " --network-ipvlan=INTERFACE\n"
216 " Create a ipvlan network interface based on an\n"
217 " existing network interface to the container\n"
0dfaa006 218 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 219 " and container\n"
ab046dde 220 " --network-bridge=INTERFACE\n"
32457153 221 " Add a virtual ethernet connection between host\n"
ab046dde
TG
222 " and container and add it to an existing bridge on\n"
223 " the host\n"
6d0b55c2 224 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 225 " Expose a container IP port on the host\n"
82adf6af
LP
226 " -Z --selinux-context=SECLABEL\n"
227 " Set the SELinux security context to be used by\n"
228 " processes in the container\n"
229 " -L --selinux-apifs-context=SECLABEL\n"
230 " Set the SELinux security context to be used by\n"
231 " API/tmpfs file systems in the container\n"
a8828ed9
DW
232 " --capability=CAP In addition to the default, retain specified\n"
233 " capability\n"
234 " --drop-capability=CAP Drop the specified capability from the default set\n"
574edc90
MP
235 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
236 " try-guest, try-host\n"
237 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 238 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
239 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
240 " the container\n"
241 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 242 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
284c0b91 243 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 244 " --share-system Share system namespaces with host\n"
eb91eb18 245 " --register=BOOLEAN Register container as machine\n"
89f7c846 246 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 247 " the service unit nspawn is running in\n"
6d0b55c2
LP
248 " --volatile[=MODE] Run the system in volatile mode\n"
249 , program_invocation_short_name);
88213476
LP
250}
251
ec16945e
LP
/* Stores a cleaned-up, newly allocated version of "path" in *b, freeing
 * whatever *b held before.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * only because the path does not exist (ENOENT), the result of
 * path_make_absolute_cwd() is used instead. Either way the string is passed
 * through path_kill_slashes() before it is stored.
 *
 * Returns 0 on success, -errno if resolution fails for any reason other
 * than ENOENT, and -ENOMEM if the fallback allocation fails. *b is left
 * untouched on failure. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Path does not exist (yet): settle for an absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
88213476
LP
273static int parse_argv(int argc, char *argv[]) {
274
a41fe3a2 275 enum {
acbeb427
ZJS
276 ARG_VERSION = 0x100,
277 ARG_PRIVATE_NETWORK,
bc2f673e 278 ARG_UUID,
5076f0cc 279 ARG_READ_ONLY,
57fb9fb5 280 ARG_CAPABILITY,
420c7379 281 ARG_DROP_CAPABILITY,
17fe0523
LP
282 ARG_LINK_JOURNAL,
283 ARG_BIND,
f4889f65 284 ARG_BIND_RO,
06c17c39 285 ARG_TMPFS,
f4889f65 286 ARG_SETENV,
eb91eb18 287 ARG_SHARE_SYSTEM,
89f7c846 288 ARG_REGISTER,
aa28aefe 289 ARG_KEEP_UNIT,
69c79d3c 290 ARG_NETWORK_INTERFACE,
c74e630d 291 ARG_NETWORK_MACVLAN,
4bbfe7ad 292 ARG_NETWORK_IPVLAN,
ab046dde 293 ARG_NETWORK_BRIDGE,
6afc95b7 294 ARG_PERSONALITY,
4d9f07b4 295 ARG_VOLATILE,
ec16945e 296 ARG_TEMPLATE,
a41fe3a2
LP
297 };
298
88213476 299 static const struct option options[] = {
aa28aefe
LP
300 { "help", no_argument, NULL, 'h' },
301 { "version", no_argument, NULL, ARG_VERSION },
302 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
303 { "template", required_argument, NULL, ARG_TEMPLATE },
304 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
305 { "user", required_argument, NULL, 'u' },
306 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
307 { "boot", no_argument, NULL, 'b' },
308 { "uuid", required_argument, NULL, ARG_UUID },
309 { "read-only", no_argument, NULL, ARG_READ_ONLY },
310 { "capability", required_argument, NULL, ARG_CAPABILITY },
311 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
312 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
313 { "bind", required_argument, NULL, ARG_BIND },
314 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 315 { "tmpfs", required_argument, NULL, ARG_TMPFS },
aa28aefe
LP
316 { "machine", required_argument, NULL, 'M' },
317 { "slice", required_argument, NULL, 'S' },
318 { "setenv", required_argument, NULL, ARG_SETENV },
319 { "selinux-context", required_argument, NULL, 'Z' },
320 { "selinux-apifs-context", required_argument, NULL, 'L' },
321 { "quiet", no_argument, NULL, 'q' },
322 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
323 { "register", required_argument, NULL, ARG_REGISTER },
324 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
325 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 326 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 327 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 328 { "network-veth", no_argument, NULL, 'n' },
ab046dde 329 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 330 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 331 { "image", required_argument, NULL, 'i' },
4d9f07b4 332 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 333 { "port", required_argument, NULL, 'p' },
eb9da376 334 {}
88213476
LP
335 };
336
9444b1f2 337 int c, r;
a42c8b54 338 uint64_t plus = 0, minus = 0;
88213476
LP
339
340 assert(argc >= 0);
341 assert(argv);
342
0dfaa006 343 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
344
345 switch (c) {
346
347 case 'h':
601185b4
ZJS
348 help();
349 return 0;
88213476 350
acbeb427
ZJS
351 case ARG_VERSION:
352 puts(PACKAGE_STRING);
353 puts(SYSTEMD_FEATURES);
354 return 0;
355
88213476 356 case 'D':
ec16945e
LP
357 r = set_sanitized_path(&arg_directory, optarg);
358 if (r < 0)
359 return log_error_errno(r, "Invalid root directory: %m");
360
361 break;
362
363 case ARG_TEMPLATE:
364 r = set_sanitized_path(&arg_template, optarg);
365 if (r < 0)
366 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
367
368 break;
369
1b9e5b12 370 case 'i':
ec16945e
LP
371 r = set_sanitized_path(&arg_image, optarg);
372 if (r < 0)
373 return log_error_errno(r, "Invalid image path: %m");
374
375 break;
376
377 case 'x':
378 arg_ephemeral = true;
1b9e5b12
LP
379 break;
380
687d0825
MV
381 case 'u':
382 free(arg_user);
7027ff61
LP
383 arg_user = strdup(optarg);
384 if (!arg_user)
385 return log_oom();
687d0825
MV
386
387 break;
388
ab046dde 389 case ARG_NETWORK_BRIDGE:
c74e630d 390 arg_network_bridge = optarg;
ab046dde
TG
391
392 /* fall through */
393
0dfaa006 394 case 'n':
69c79d3c
LP
395 arg_network_veth = true;
396 arg_private_network = true;
397 break;
398
aa28aefe 399 case ARG_NETWORK_INTERFACE:
c74e630d
LP
400 if (strv_extend(&arg_network_interfaces, optarg) < 0)
401 return log_oom();
402
403 arg_private_network = true;
404 break;
405
406 case ARG_NETWORK_MACVLAN:
407 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
408 return log_oom();
409
4bbfe7ad
TG
410 arg_private_network = true;
411 break;
412
413 case ARG_NETWORK_IPVLAN:
414 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
415 return log_oom();
416
aa28aefe
LP
417 /* fall through */
418
ff01d048
LP
419 case ARG_PRIVATE_NETWORK:
420 arg_private_network = true;
a41fe3a2
LP
421 break;
422
0f0dbc46
LP
423 case 'b':
424 arg_boot = true;
425 break;
426
144f0fc0 427 case ARG_UUID:
9444b1f2
LP
428 r = sd_id128_from_string(optarg, &arg_uuid);
429 if (r < 0) {
aa96c6cb 430 log_error("Invalid UUID: %s", optarg);
9444b1f2 431 return r;
aa96c6cb 432 }
9444b1f2 433 break;
aa96c6cb 434
9444b1f2 435 case 'S':
c74e630d 436 arg_slice = optarg;
144f0fc0
LP
437 break;
438
7027ff61 439 case 'M':
eb91eb18
LP
440 if (isempty(optarg)) {
441 free(arg_machine);
442 arg_machine = NULL;
443 } else {
0c3c4284 444 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
445 log_error("Invalid machine name: %s", optarg);
446 return -EINVAL;
447 }
7027ff61 448
0c3c4284
LP
449 r = free_and_strdup(&arg_machine, optarg);
450 if (r < 0)
eb91eb18
LP
451 return log_oom();
452
453 break;
454 }
7027ff61 455
82adf6af
LP
456 case 'Z':
457 arg_selinux_context = optarg;
a8828ed9
DW
458 break;
459
82adf6af
LP
460 case 'L':
461 arg_selinux_apifs_context = optarg;
a8828ed9
DW
462 break;
463
bc2f673e
LP
464 case ARG_READ_ONLY:
465 arg_read_only = true;
466 break;
467
420c7379
LP
468 case ARG_CAPABILITY:
469 case ARG_DROP_CAPABILITY: {
a2a5291b 470 const char *state, *word;
5076f0cc
LP
471 size_t length;
472
473 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 474 _cleanup_free_ char *t;
5076f0cc
LP
475
476 t = strndup(word, length);
0d0f0c50
SL
477 if (!t)
478 return log_oom();
5076f0cc 479
39ed67d1
LP
480 if (streq(t, "all")) {
481 if (c == ARG_CAPABILITY)
a42c8b54 482 plus = (uint64_t) -1;
39ed67d1 483 else
a42c8b54 484 minus = (uint64_t) -1;
39ed67d1 485 } else {
2822da4f
LP
486 int cap;
487
488 cap = capability_from_name(t);
489 if (cap < 0) {
39ed67d1
LP
490 log_error("Failed to parse capability %s.", t);
491 return -EINVAL;
492 }
493
494 if (c == ARG_CAPABILITY)
a42c8b54 495 plus |= 1ULL << (uint64_t) cap;
39ed67d1 496 else
a42c8b54 497 minus |= 1ULL << (uint64_t) cap;
5076f0cc 498 }
5076f0cc
LP
499 }
500
501 break;
502 }
503
57fb9fb5
LP
504 case 'j':
505 arg_link_journal = LINK_GUEST;
574edc90 506 arg_link_journal_try = true;
57fb9fb5
LP
507 break;
508
509 case ARG_LINK_JOURNAL:
53e438e3 510 if (streq(optarg, "auto")) {
57fb9fb5 511 arg_link_journal = LINK_AUTO;
53e438e3
LP
512 arg_link_journal_try = false;
513 } else if (streq(optarg, "no")) {
57fb9fb5 514 arg_link_journal = LINK_NO;
53e438e3
LP
515 arg_link_journal_try = false;
516 } else if (streq(optarg, "guest")) {
57fb9fb5 517 arg_link_journal = LINK_GUEST;
53e438e3
LP
518 arg_link_journal_try = false;
519 } else if (streq(optarg, "host")) {
57fb9fb5 520 arg_link_journal = LINK_HOST;
53e438e3
LP
521 arg_link_journal_try = false;
522 } else if (streq(optarg, "try-guest")) {
574edc90
MP
523 arg_link_journal = LINK_GUEST;
524 arg_link_journal_try = true;
525 } else if (streq(optarg, "try-host")) {
526 arg_link_journal = LINK_HOST;
527 arg_link_journal_try = true;
528 } else {
57fb9fb5
LP
529 log_error("Failed to parse link journal mode %s", optarg);
530 return -EINVAL;
531 }
532
533 break;
534
17fe0523
LP
535 case ARG_BIND:
536 case ARG_BIND_RO: {
537 _cleanup_free_ char *a = NULL, *b = NULL;
538 char *e;
539 char ***x;
17fe0523
LP
540
541 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
542
543 e = strchr(optarg, ':');
544 if (e) {
545 a = strndup(optarg, e - optarg);
546 b = strdup(e + 1);
547 } else {
548 a = strdup(optarg);
549 b = strdup(optarg);
550 }
551
552 if (!a || !b)
553 return log_oom();
554
555 if (!path_is_absolute(a) || !path_is_absolute(b)) {
556 log_error("Invalid bind mount specification: %s", optarg);
557 return -EINVAL;
558 }
559
560 r = strv_extend(x, a);
561 if (r < 0)
b3451bed 562 return log_oom();
17fe0523
LP
563
564 r = strv_extend(x, b);
565 if (r < 0)
b3451bed 566 return log_oom();
17fe0523
LP
567
568 break;
569 }
570
06c17c39
LP
571 case ARG_TMPFS: {
572 _cleanup_free_ char *a = NULL, *b = NULL;
573 char *e;
574
575 e = strchr(optarg, ':');
576 if (e) {
577 a = strndup(optarg, e - optarg);
578 b = strdup(e + 1);
579 } else {
580 a = strdup(optarg);
581 b = strdup("mode=0755");
582 }
583
584 if (!a || !b)
585 return log_oom();
586
587 if (!path_is_absolute(a)) {
588 log_error("Invalid tmpfs specification: %s", optarg);
589 return -EINVAL;
590 }
591
592 r = strv_push(&arg_tmpfs, a);
593 if (r < 0)
594 return log_oom();
595
596 a = NULL;
597
598 r = strv_push(&arg_tmpfs, b);
599 if (r < 0)
600 return log_oom();
601
602 b = NULL;
603
604 break;
605 }
606
f4889f65
LP
607 case ARG_SETENV: {
608 char **n;
609
610 if (!env_assignment_is_valid(optarg)) {
611 log_error("Environment variable assignment '%s' is not valid.", optarg);
612 return -EINVAL;
613 }
614
615 n = strv_env_set(arg_setenv, optarg);
616 if (!n)
617 return log_oom();
618
619 strv_free(arg_setenv);
620 arg_setenv = n;
621 break;
622 }
623
284c0b91
LP
624 case 'q':
625 arg_quiet = true;
626 break;
627
8a96d94e
LP
628 case ARG_SHARE_SYSTEM:
629 arg_share_system = true;
630 break;
631
eb91eb18
LP
632 case ARG_REGISTER:
633 r = parse_boolean(optarg);
634 if (r < 0) {
635 log_error("Failed to parse --register= argument: %s", optarg);
636 return r;
637 }
638
639 arg_register = r;
640 break;
641
89f7c846
LP
642 case ARG_KEEP_UNIT:
643 arg_keep_unit = true;
644 break;
645
6afc95b7
LP
646 case ARG_PERSONALITY:
647
ac45f971 648 arg_personality = personality_from_string(optarg);
6afc95b7
LP
649 if (arg_personality == 0xffffffffLU) {
650 log_error("Unknown or unsupported personality '%s'.", optarg);
651 return -EINVAL;
652 }
653
654 break;
655
4d9f07b4
LP
656 case ARG_VOLATILE:
657
658 if (!optarg)
659 arg_volatile = VOLATILE_YES;
660 else {
661 r = parse_boolean(optarg);
662 if (r < 0) {
663 if (streq(optarg, "state"))
664 arg_volatile = VOLATILE_STATE;
665 else {
666 log_error("Failed to parse --volatile= argument: %s", optarg);
667 return r;
668 }
669 } else
670 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
671 }
672
673 break;
674
6d0b55c2
LP
675 case 'p': {
676 const char *split, *e;
677 uint16_t container_port, host_port;
678 int protocol;
679 ExposePort *p;
680
681 if ((e = startswith(optarg, "tcp:")))
682 protocol = IPPROTO_TCP;
683 else if ((e = startswith(optarg, "udp:")))
684 protocol = IPPROTO_UDP;
685 else {
686 e = optarg;
687 protocol = IPPROTO_TCP;
688 }
689
690 split = strchr(e, ':');
691 if (split) {
692 char v[split - e + 1];
693
694 memcpy(v, e, split - e);
695 v[split - e] = 0;
696
697 r = safe_atou16(v, &host_port);
698 if (r < 0 || host_port <= 0) {
699 log_error("Failed to parse host port: %s", optarg);
700 return -EINVAL;
701 }
702
703 r = safe_atou16(split + 1, &container_port);
704 } else {
705 r = safe_atou16(e, &container_port);
706 host_port = container_port;
707 }
708
709 if (r < 0 || container_port <= 0) {
710 log_error("Failed to parse host port: %s", optarg);
711 return -EINVAL;
712 }
713
714 LIST_FOREACH(ports, p, arg_expose_ports) {
715 if (p->protocol == protocol && p->host_port == host_port) {
716 log_error("Duplicate port specification: %s", optarg);
717 return -EINVAL;
718 }
719 }
720
721 p = new(ExposePort, 1);
722 if (!p)
723 return log_oom();
724
725 p->protocol = protocol;
726 p->host_port = host_port;
727 p->container_port = container_port;
728
729 LIST_PREPEND(ports, arg_expose_ports, p);
730
731 break;
732 }
733
88213476
LP
734 case '?':
735 return -EINVAL;
736
737 default:
eb9da376 738 assert_not_reached("Unhandled option");
88213476 739 }
88213476 740
eb91eb18
LP
741 if (arg_share_system)
742 arg_register = false;
743
744 if (arg_boot && arg_share_system) {
745 log_error("--boot and --share-system may not be combined.");
746 return -EINVAL;
747 }
748
89f7c846
LP
749 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750 log_error("--keep-unit may not be used when invoked from a user session.");
751 return -EINVAL;
752 }
753
1b9e5b12
LP
754 if (arg_directory && arg_image) {
755 log_error("--directory= and --image= may not be combined.");
756 return -EINVAL;
757 }
758
ec16945e
LP
759 if (arg_template && arg_image) {
760 log_error("--template= and --image= may not be combined.");
761 return -EINVAL;
762 }
763
764 if (arg_template && !(arg_directory || arg_machine)) {
765 log_error("--template= needs --directory= or --machine=.");
766 return -EINVAL;
767 }
768
769 if (arg_ephemeral && arg_template) {
770 log_error("--ephemeral and --template= may not be combined.");
771 return -EINVAL;
772 }
773
774 if (arg_ephemeral && arg_image) {
775 log_error("--ephemeral and --image= may not be combined.");
776 return -EINVAL;
777 }
778
df9a75e4
LP
779 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780 log_error("--ephemeral and --link-journal= may not be combined.");
781 return -EINVAL;
782 }
783
4d9f07b4
LP
784 if (arg_volatile != VOLATILE_NO && arg_read_only) {
785 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
786 return -EINVAL;
787 }
788
6d0b55c2
LP
789 if (arg_expose_ports && !arg_private_network) {
790 log_error("Cannot use --port= without private networking.");
791 return -EINVAL;
792 }
793
a42c8b54
LP
794 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
795
88213476
LP
796 return 1;
797}
798
799static int mount_all(const char *dest) {
800
801 typedef struct MountPoint {
802 const char *what;
803 const char *where;
804 const char *type;
805 const char *options;
806 unsigned long flags;
3bd66c05 807 bool fatal;
88213476
LP
808 } MountPoint;
809
810 static const MountPoint mount_table[] = {
06c17c39
LP
811 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
812 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
813 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
814 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
815 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
f2d88580 816 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
06c17c39
LP
817 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
818 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
bbb99c30 819 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
9b634ea5 820#ifdef HAVE_SELINUX
06c17c39
LP
821 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
822 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 823#endif
88213476
LP
824 };
825
826 unsigned k;
827 int r = 0;
828
829 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
7fd1b19b 830 _cleanup_free_ char *where = NULL;
d002827b 831#ifdef HAVE_SELINUX
a8828ed9 832 _cleanup_free_ char *options = NULL;
d002827b
LP
833#endif
834 const char *o;
88213476
LP
835 int t;
836
17fe0523
LP
837 where = strjoin(dest, "/", mount_table[k].where, NULL);
838 if (!where)
839 return log_oom();
88213476 840
e65aec12 841 t = path_is_mount_point(where, true);
68fb0892 842 if (t < 0) {
da927ba9 843 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
844
845 if (r == 0)
846 r = t;
847
848 continue;
849 }
850
9c1c7f71
LP
851 /* Skip this entry if it is not a remount. */
852 if (mount_table[k].what && t > 0)
014a9c77
LP
853 continue;
854
79d80fc1
TG
855 t = mkdir_p(where, 0755);
856 if (t < 0) {
857 if (mount_table[k].fatal) {
da927ba9 858 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
859
860 if (r == 0)
861 r = t;
862 } else
da927ba9 863 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
864
865 continue;
866 }
88213476 867
a8828ed9 868#ifdef HAVE_SELINUX
82adf6af
LP
869 if (arg_selinux_apifs_context &&
870 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
871 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
872 if (!options)
873 return log_oom();
874
875 o = options;
876 } else
a8828ed9 877#endif
d002827b 878 o = mount_table[k].options;
a8828ed9 879
a8828ed9 880
88213476
LP
881 if (mount(mount_table[k].what,
882 where,
883 mount_table[k].type,
884 mount_table[k].flags,
79d80fc1 885 o) < 0) {
88213476 886
79d80fc1 887 if (mount_table[k].fatal) {
56f64d95 888 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 889
79d80fc1
TG
890 if (r == 0)
891 r = -errno;
892 } else
56f64d95 893 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 894 }
88213476
LP
895 }
896
e58a1277
LP
897 return r;
898}
f8440af5 899
d6797c92 900static int mount_binds(const char *dest, char **l, bool ro) {
17fe0523
LP
901 char **x, **y;
902
903 STRV_FOREACH_PAIR(x, y, l) {
06c17c39 904 _cleanup_free_ char *where = NULL;
d2421337 905 struct stat source_st, dest_st;
2ed4e5e0 906 int r;
d2421337 907
4a62c710
MS
908 if (stat(*x, &source_st) < 0)
909 return log_error_errno(errno, "Failed to stat %s: %m", *x);
17fe0523 910
06c17c39
LP
911 where = strappend(dest, *y);
912 if (!where)
913 return log_oom();
914
2ed4e5e0
SL
915 r = stat(where, &dest_st);
916 if (r == 0) {
05e7da5a
AC
917 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
918 log_error("Cannot bind mount directory %s on file %s.", *x, where);
919 return -EINVAL;
920 }
921 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
922 log_error("Cannot bind mount file %s on directory %s.", *x, where);
d2421337
DR
923 return -EINVAL;
924 }
2ed4e5e0
SL
925 } else if (errno == ENOENT) {
926 r = mkdir_parents_label(where, 0755);
f647962d
MS
927 if (r < 0)
928 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
2ed4e5e0 929 } else {
56f64d95 930 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
2ed4e5e0
SL
931 return -errno;
932 }
06c17c39 933
05e7da5a
AC
934 /* Create the mount point. Any non-directory file can be
935 * mounted on any non-directory file (regular, fifo, socket,
936 * char, block).
937 */
79d80fc1
TG
938 if (S_ISDIR(source_st.st_mode)) {
939 r = mkdir_label(where, 0755);
f647962d
MS
940 if (r < 0 && errno != EEXIST)
941 return log_error_errno(r, "Failed to create mount point %s: %m", where);
05e7da5a 942 } else {
79d80fc1 943 r = touch(where);
f647962d
MS
944 if (r < 0)
945 return log_error_errno(r, "Failed to create mount point %s: %m", where);
d2421337 946 }
17fe0523 947
4a62c710
MS
948 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
949 return log_error_errno(errno, "mount(%s) failed: %m", where);
17fe0523 950
d6797c92
LP
951 if (ro) {
952 r = bind_remount_recursive(where, true);
f647962d
MS
953 if (r < 0)
954 return log_error_errno(r, "Read-Only bind mount failed: %m");
17fe0523
LP
955 }
956 }
957
958 return 0;
959}
960
b12afc8c
LP
/* Mounts one cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>.
 *
 * controller: passed to mount() as the data/options string.
 * hierarchy:  directory name below /sys/fs/cgroup/.
 * read_only:  adds MS_RDONLY to the mount flags.
 *
 * Returns 1 if a new mount was made, 0 if the path already was a mount
 * point, and a negative errno-style value (after logging) on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *path;
        int mounted;

        path = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        mounted = path_is_mount_point(path, false);
        if (mounted < 0)
                return log_error_errno(mounted, "Failed to determine if %s is mounted already: %m", path);
        if (mounted > 0)
                return 0;

        /* Result deliberately not checked here; a missing directory shows
         * up as a mount() failure right below. */
        mkdir_p(path, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", path, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", path);

        return 1;
}
980
981static int mount_cgroup(const char *dest) {
982 _cleanup_set_free_free_ Set *controllers = NULL;
983 _cleanup_free_ char *own_cgroup_path = NULL;
984 const char *cgroup_root, *systemd_root, *systemd_own;
985 int r;
986
987 controllers = set_new(&string_hash_ops);
988 if (!controllers)
989 return log_oom();
990
991 r = cg_kernel_controllers(controllers);
992 if (r < 0)
993 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
994
995 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
996 if (r < 0)
997 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
998
999 cgroup_root = strappenda(dest, "/sys/fs/cgroup");
1000 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1001 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1002
1003 for (;;) {
1004 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1005
1006 controller = set_steal_first(controllers);
1007 if (!controller)
1008 break;
1009
1010 origin = strappend("/sys/fs/cgroup/", controller);
1011 if (!origin)
1012 return log_oom();
1013
1014 r = readlink_malloc(origin, &combined);
1015 if (r == -EINVAL) {
1016 /* Not a symbolic link, but directly a single cgroup hierarchy */
1017
1018 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1019 if (r < 0)
1020 return r;
1021
1022 } else if (r < 0)
1023 return log_error_errno(r, "Failed to read link %s: %m", origin);
1024 else {
1025 _cleanup_free_ char *target = NULL;
1026
1027 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1028 if (!target)
1029 return log_oom();
1030
1031 /* A symbolic link, a combination of controllers in one hierarchy */
1032
1033 if (!filename_is_valid(combined)) {
1034 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1035 continue;
1036 }
1037
1038 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1039 if (r < 0)
1040 return r;
1041
1042 if (symlink(combined, target) < 0)
83521414 1043 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1044 }
1045 }
1046
1047 r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1048 if (r < 0)
1049 return r;
1050
1051 /* Make our own cgroup a (writable) bind mount */
1052 systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1053 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1054 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1055
1056 /* And then remount the systemd cgroup root read-only */
1057 systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1058 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1059 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1060
1061 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1062 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1063
1064 return 0;
1065}
1066
06c17c39
LP
1067static int mount_tmpfs(const char *dest) {
1068 char **i, **o;
1069
1070 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1071 _cleanup_free_ char *where = NULL;
79d80fc1 1072 int r;
06c17c39
LP
1073
1074 where = strappend(dest, *i);
1075 if (!where)
1076 return log_oom();
1077
79d80fc1 1078 r = mkdir_label(where, 0755);
04a91939
LP
1079 if (r < 0 && r != -EEXIST)
1080 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
06c17c39 1081
4a62c710
MS
1082 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1083 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
06c17c39
LP
1084 }
1085
1086 return 0;
1087}
1088
e58a1277 1089static int setup_timezone(const char *dest) {
d4036145
LP
1090 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1091 char *z, *y;
1092 int r;
f8440af5 1093
e58a1277
LP
1094 assert(dest);
1095
1096 /* Fix the timezone, if possible */
d4036145
LP
1097 r = readlink_malloc("/etc/localtime", &p);
1098 if (r < 0) {
1099 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1100 return 0;
1101 }
1102
1103 z = path_startswith(p, "../usr/share/zoneinfo/");
1104 if (!z)
1105 z = path_startswith(p, "/usr/share/zoneinfo/");
1106 if (!z) {
1107 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1108 return 0;
1109 }
1110
04bc4a3f
LP
1111 where = strappend(dest, "/etc/localtime");
1112 if (!where)
0d0f0c50 1113 return log_oom();
715ac17a 1114
d4036145
LP
1115 r = readlink_malloc(where, &q);
1116 if (r >= 0) {
1117 y = path_startswith(q, "../usr/share/zoneinfo/");
1118 if (!y)
1119 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1120
d4036145
LP
1121 /* Already pointing to the right place? Then do nothing .. */
1122 if (y && streq(y, z))
1123 return 0;
1124 }
1125
1126 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1127 if (!check)
0d0f0c50 1128 return log_oom();
4d1c38b8 1129
d4036145
LP
1130 if (access(check, F_OK) < 0) {
1131 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1132 return 0;
1133 }
68fb0892 1134
d4036145
LP
1135 what = strappend("../usr/share/zoneinfo/", z);
1136 if (!what)
1137 return log_oom();
1138
79d80fc1
TG
1139 r = mkdir_parents(where, 0755);
1140 if (r < 0) {
da927ba9 1141 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
1142
1143 return 0;
1144 }
1145
1146 r = unlink(where);
1147 if (r < 0 && errno != ENOENT) {
56f64d95 1148 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1149
1150 return 0;
1151 }
4d9f07b4 1152
d4036145 1153 if (symlink(what, where) < 0) {
56f64d95 1154 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1155 return 0;
1156 }
e58a1277
LP
1157
1158 return 0;
88213476
LP
1159}
1160
2547bb41 1161static int setup_resolv_conf(const char *dest) {
c8b32e11 1162 _cleanup_free_ char *where = NULL;
79d80fc1 1163 int r;
2547bb41
LP
1164
1165 assert(dest);
1166
1167 if (arg_private_network)
1168 return 0;
1169
1170 /* Fix resolv.conf, if possible */
04bc4a3f
LP
1171 where = strappend(dest, "/etc/resolv.conf");
1172 if (!where)
0d0f0c50 1173 return log_oom();
2547bb41 1174
77e63faf
LP
1175 /* We don't really care for the results of this really. If it
1176 * fails, it fails, but meh... */
79d80fc1
TG
1177 r = mkdir_parents(where, 0755);
1178 if (r < 0) {
da927ba9 1179 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
1180
1181 return 0;
1182 }
1183
f2068bcc 1184 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1185 if (r < 0) {
da927ba9 1186 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1187
1188 return 0;
1189 }
2547bb41
LP
1190
1191 return 0;
1192}
1193
4d9f07b4
LP
1194static int setup_volatile_state(const char *directory) {
1195 const char *p;
1196 int r;
1197
1198 assert(directory);
1199
1200 if (arg_volatile != VOLATILE_STATE)
1201 return 0;
1202
1203 /* --volatile=state means we simply overmount /var
1204 with a tmpfs, and the rest read-only. */
1205
1206 r = bind_remount_recursive(directory, true);
f647962d
MS
1207 if (r < 0)
1208 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4
LP
1209
1210 p = strappenda(directory, "/var");
79d80fc1 1211 r = mkdir(p, 0755);
4a62c710
MS
1212 if (r < 0 && errno != EEXIST)
1213 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1214
4a62c710
MS
1215 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1216 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1217
1218 return 0;
1219}
1220
1221static int setup_volatile(const char *directory) {
1222 bool tmpfs_mounted = false, bind_mounted = false;
1223 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1224 const char *f, *t;
1225 int r;
1226
1227 assert(directory);
1228
1229 if (arg_volatile != VOLATILE_YES)
1230 return 0;
1231
1232 /* --volatile=yes means we mount a tmpfs to the root dir, and
1233 the original /usr to use inside it, and that read-only. */
1234
4a62c710
MS
1235 if (!mkdtemp(template))
1236 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
1237
1238 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 1239 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1240 r = -errno;
1241 goto fail;
1242 }
1243
1244 tmpfs_mounted = true;
1245
1246 f = strappenda(directory, "/usr");
1247 t = strappenda(template, "/usr");
1248
79d80fc1
TG
1249 r = mkdir(t, 0755);
1250 if (r < 0 && errno != EEXIST) {
56f64d95 1251 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1252 r = -errno;
1253 goto fail;
1254 }
1255
4d9f07b4 1256 if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 1257 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1258 r = -errno;
1259 goto fail;
1260 }
1261
1262 bind_mounted = true;
1263
1264 r = bind_remount_recursive(t, true);
1265 if (r < 0) {
da927ba9 1266 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1267 goto fail;
1268 }
1269
1270 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1271 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1272 r = -errno;
1273 goto fail;
1274 }
1275
1276 rmdir(template);
1277
1278 return 0;
1279
1280fail:
1281 if (bind_mounted)
1282 umount(t);
1283 if (tmpfs_mounted)
1284 umount(template);
1285 rmdir(template);
1286 return r;
1287}
1288
9f24adc2
LP
1289static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1290
1291 snprintf(s, 37,
1292 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1293 SD_ID128_FORMAT_VAL(id));
1294
1295 return s;
1296}
1297
04bc4a3f 1298static int setup_boot_id(const char *dest) {
7fd1b19b 1299 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1300 sd_id128_t rnd = {};
04bc4a3f
LP
1301 char as_uuid[37];
1302 int r;
1303
1304 assert(dest);
1305
eb91eb18
LP
1306 if (arg_share_system)
1307 return 0;
1308
04bc4a3f
LP
1309 /* Generate a new randomized boot ID, so that each boot-up of
1310 * the container gets a new one */
1311
1312 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1313 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1314 if (!from || !to)
1315 return log_oom();
04bc4a3f
LP
1316
1317 r = sd_id128_randomize(&rnd);
f647962d
MS
1318 if (r < 0)
1319 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1320
9f24adc2 1321 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1322
574d5f2d 1323 r = write_string_file(from, as_uuid);
f647962d
MS
1324 if (r < 0)
1325 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f
LP
1326
1327 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
56f64d95 1328 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1329 r = -errno;
10d18763 1330 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1331 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1332
1333 unlink(from);
04bc4a3f
LP
1334 return r;
1335}
1336
e58a1277 1337static int copy_devnodes(const char *dest) {
88213476
LP
1338
1339 static const char devnodes[] =
1340 "null\0"
1341 "zero\0"
1342 "full\0"
1343 "random\0"
1344 "urandom\0"
85614d66
TG
1345 "tty\0"
1346 "net/tun\0";
88213476
LP
1347
1348 const char *d;
e58a1277 1349 int r = 0;
7fd1b19b 1350 _cleanup_umask_ mode_t u;
a258bf26
LP
1351
1352 assert(dest);
124640f1
LP
1353
1354 u = umask(0000);
88213476
LP
1355
1356 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1357 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1358 struct stat st;
88213476 1359
7f112f50
LP
1360 from = strappend("/dev/", d);
1361 to = strjoin(dest, "/dev/", d, NULL);
1362 if (!from || !to)
1363 return log_oom();
88213476
LP
1364
1365 if (stat(from, &st) < 0) {
1366
4a62c710
MS
1367 if (errno != ENOENT)
1368 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1369
a258bf26 1370 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1371
ed8b7a3e 1372 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1373 return -EIO;
a258bf26 1374
85614d66
TG
1375 } else {
1376 r = mkdir_parents(to, 0775);
1377 if (r < 0) {
da927ba9 1378 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1379 return -r;
1380 }
a258bf26 1381
4a62c710 1382 if (mknod(to, st.st_mode, st.st_rdev) < 0)
080e7832 1383 return log_error_errno(errno, "mknod(%s) failed: %m", to);
88213476 1384 }
88213476
LP
1385 }
1386
e58a1277
LP
1387 return r;
1388}
88213476 1389
f2d88580
LP
1390static int setup_ptmx(const char *dest) {
1391 _cleanup_free_ char *p = NULL;
1392
1393 p = strappend(dest, "/dev/ptmx");
1394 if (!p)
1395 return log_oom();
1396
4a62c710
MS
1397 if (symlink("pts/ptmx", p) < 0)
1398 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580
LP
1399
1400 return 0;
1401}
1402
e58a1277 1403static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1404 _cleanup_umask_ mode_t u;
1405 const char *to;
e58a1277 1406 struct stat st;
e58a1277 1407 int r;
e58a1277
LP
1408
1409 assert(dest);
1410 assert(console);
1411
1412 u = umask(0000);
1413
4a62c710
MS
1414 if (stat("/dev/null", &st) < 0)
1415 return log_error_errno(errno, "Failed to stat /dev/null: %m");
88213476 1416
e58a1277 1417 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1418 if (r < 0)
1419 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1420
a258bf26
LP
1421 /* We need to bind mount the right tty to /dev/console since
1422 * ptys can only exist on pts file systems. To have something
eb0f0863
LP
1423 * to bind mount things on we create a device node first, and
1424 * use /dev/null for that since we the cgroups device policy
1425 * allows us to create that freely, while we cannot create
1426 * /dev/console. (Note that the major minor doesn't actually
1427 * matter here, since we mount it over anyway). */
a258bf26 1428
eb0f0863 1429 to = strappenda(dest, "/dev/console");
4a62c710
MS
1430 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1431 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
a258bf26 1432
4a62c710
MS
1433 if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1434 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1435
25ea79fe 1436 return 0;
e58a1277
LP
1437}
1438
1439static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1440 _cleanup_free_ char *from = NULL, *to = NULL;
7fd1b19b 1441 _cleanup_umask_ mode_t u;
6d0b55c2 1442 int r, fd, k;
e58a1277
LP
1443 union {
1444 struct cmsghdr cmsghdr;
1445 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1446 } control = {};
1447 struct msghdr mh = {
1448 .msg_control = &control,
1449 .msg_controllen = sizeof(control),
1450 };
e58a1277
LP
1451 struct cmsghdr *cmsg;
1452
1453 assert(dest);
1454 assert(kmsg_socket >= 0);
a258bf26 1455
e58a1277 1456 u = umask(0000);
a258bf26 1457
f1e5dfe2
LP
1458 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1459 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1460 * on the reading side behave very similar to /proc/kmsg,
1461 * their writing side behaves differently from /dev/kmsg in
1462 * that writing blocks when nothing is reading. In order to
1463 * avoid any problems with containers deadlocking due to this
1464 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1465 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1466 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1467 return log_oom();
e58a1277 1468
4a62c710
MS
1469 if (mkfifo(from, 0600) < 0)
1470 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
e58a1277
LP
1471
1472 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1473 if (r < 0)
1474 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277 1475
4a62c710
MS
1476 if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1477 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1478
1479 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1480 if (fd < 0)
1481 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1482
e58a1277
LP
1483 cmsg = CMSG_FIRSTHDR(&mh);
1484 cmsg->cmsg_level = SOL_SOCKET;
1485 cmsg->cmsg_type = SCM_RIGHTS;
1486 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1487 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1488
1489 mh.msg_controllen = cmsg->cmsg_len;
1490
1491 /* Store away the fd in the socket, so that it stays open as
1492 * long as we run the child */
6d0b55c2 1493 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1494 safe_close(fd);
e58a1277 1495
4a62c710
MS
1496 if (k < 0)
1497 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1498
f1e5dfe2
LP
1499 /* And now make the FIFO unavailable as /dev/kmsg... */
1500 unlink(from);
25ea79fe 1501 return 0;
88213476
LP
1502}
1503
6d0b55c2
LP
1504static int send_rtnl(int send_fd) {
1505 union {
1506 struct cmsghdr cmsghdr;
1507 uint8_t buf[CMSG_SPACE(sizeof(int))];
1508 } control = {};
1509 struct msghdr mh = {
1510 .msg_control = &control,
1511 .msg_controllen = sizeof(control),
1512 };
1513 struct cmsghdr *cmsg;
1514 _cleanup_close_ int fd = -1;
1515 ssize_t k;
1516
1517 assert(send_fd >= 0);
1518
1519 if (!arg_expose_ports)
1520 return 0;
1521
1522 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1523 if (fd < 0)
1524 return log_error_errno(errno, "failed to allocate container netlink: %m");
1525
1526 cmsg = CMSG_FIRSTHDR(&mh);
1527 cmsg->cmsg_level = SOL_SOCKET;
1528 cmsg->cmsg_type = SCM_RIGHTS;
1529 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1530 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1531
1532 mh.msg_controllen = cmsg->cmsg_len;
1533
1534 /* Store away the fd in the socket, so that it stays open as
1535 * long as we run the child */
1536 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1537 if (k < 0)
1538 return log_error_errno(errno, "Failed to send netlink fd: %m");
1539
1540 return 0;
1541}
1542
1543static int flush_ports(union in_addr_union *exposed) {
1544 ExposePort *p;
1545 int r, af = AF_INET;
1546
1547 assert(exposed);
1548
1549 if (!arg_expose_ports)
1550 return 0;
1551
1552 if (in_addr_is_null(af, exposed))
1553 return 0;
1554
1555 log_debug("Lost IP address.");
1556
1557 LIST_FOREACH(ports, p, arg_expose_ports) {
1558 r = fw_add_local_dnat(false,
1559 af,
1560 p->protocol,
1561 NULL,
1562 NULL, 0,
1563 NULL, 0,
1564 p->host_port,
1565 exposed,
1566 p->container_port,
1567 NULL);
1568 if (r < 0)
1569 log_warning_errno(r, "Failed to modify firewall: %m");
1570 }
1571
1572 *exposed = IN_ADDR_NULL;
1573 return 0;
1574}
1575
1576static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1577 _cleanup_free_ struct local_address *addresses = NULL;
1578 _cleanup_free_ char *pretty = NULL;
1579 union in_addr_union new_exposed;
1580 ExposePort *p;
1581 bool add;
1582 int af = AF_INET, r;
1583
1584 assert(exposed);
1585
1586 /* Invoked each time an address is added or removed inside the
1587 * container */
1588
1589 if (!arg_expose_ports)
1590 return 0;
1591
1592 r = local_addresses(rtnl, 0, af, &addresses);
1593 if (r < 0)
1594 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1595
1596 add = r > 0 &&
1597 addresses[0].family == af &&
1598 addresses[0].scope < RT_SCOPE_LINK;
1599
1600 if (!add)
1601 return flush_ports(exposed);
1602
1603 new_exposed = addresses[0].address;
1604 if (in_addr_equal(af, exposed, &new_exposed))
1605 return 0;
1606
1607 in_addr_to_string(af, &new_exposed, &pretty);
1608 log_debug("New container IP is %s.", strna(pretty));
1609
1610 LIST_FOREACH(ports, p, arg_expose_ports) {
1611
1612 r = fw_add_local_dnat(true,
1613 af,
1614 p->protocol,
1615 NULL,
1616 NULL, 0,
1617 NULL, 0,
1618 p->host_port,
1619 &new_exposed,
1620 p->container_port,
1621 in_addr_is_null(af, exposed) ? NULL : exposed);
1622 if (r < 0)
1623 log_warning_errno(r, "Failed to modify firewall: %m");
1624 }
1625
1626 *exposed = new_exposed;
1627 return 0;
1628}
1629
1630static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1631 union in_addr_union *exposed = userdata;
1632
1633 assert(rtnl);
1634 assert(m);
1635 assert(exposed);
1636
1637 expose_ports(rtnl, exposed);
1638 return 0;
1639}
1640
1641static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1642 union {
1643 struct cmsghdr cmsghdr;
1644 uint8_t buf[CMSG_SPACE(sizeof(int))];
1645 } control = {};
1646 struct msghdr mh = {
1647 .msg_control = &control,
1648 .msg_controllen = sizeof(control),
1649 };
1650 struct cmsghdr *cmsg;
1651 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1652 int fd, r;
1653 ssize_t k;
1654
1655 assert(event);
1656 assert(recv_fd >= 0);
1657 assert(ret);
1658
1659 if (!arg_expose_ports)
1660 return 0;
1661
1662 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1663 if (k < 0)
1664 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1665
1666 cmsg = CMSG_FIRSTHDR(&mh);
1667 assert(cmsg->cmsg_level == SOL_SOCKET);
1668 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 1669 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
1670 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1671
1672 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1673 if (r < 0) {
1674 safe_close(fd);
1675 return log_error_errno(r, "Failed to create rtnl object: %m");
1676 }
1677
1678 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1679 if (r < 0)
1680 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1681
1682 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1683 if (r < 0)
1684 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1685
1686 r = sd_rtnl_attach_event(rtnl, event, 0);
1687 if (r < 0)
1688 return log_error_errno(r, "Failed to add to even loop: %m");
1689
1690 *ret = rtnl;
1691 rtnl = NULL;
1692
1693 return 0;
1694}
1695
3a74cea5 1696static int setup_hostname(void) {
3a74cea5 1697
eb91eb18
LP
1698 if (arg_share_system)
1699 return 0;
1700
605f81a8 1701 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1702 return -errno;
3a74cea5 1703
7027ff61 1704 return 0;
3a74cea5
LP
1705}
1706
57fb9fb5 1707static int setup_journal(const char *directory) {
4d680aee 1708 sd_id128_t machine_id, this_id;
7fd1b19b 1709 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 1710 char *id;
57fb9fb5
LP
1711 int r;
1712
df9a75e4
LP
1713 /* Don't link journals in ephemeral mode */
1714 if (arg_ephemeral)
1715 return 0;
1716
57fb9fb5 1717 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
1718 if (!p)
1719 return log_oom();
57fb9fb5
LP
1720
1721 r = read_one_line_file(p, &b);
27407a01
ZJS
1722 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1723 return 0;
f647962d
MS
1724 else if (r < 0)
1725 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 1726
27407a01
ZJS
1727 id = strstrip(b);
1728 if (isempty(id) && arg_link_journal == LINK_AUTO)
1729 return 0;
57fb9fb5 1730
27407a01
ZJS
1731 /* Verify validity */
1732 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
1733 if (r < 0)
1734 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 1735
4d680aee 1736 r = sd_id128_get_machine(&this_id);
f647962d
MS
1737 if (r < 0)
1738 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1739
1740 if (sd_id128_equal(machine_id, this_id)) {
1741 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1742 "Host and machine ids are equal (%s): refusing to link journals", id);
1743 if (arg_link_journal == LINK_AUTO)
1744 return 0;
df9a75e4 1745 return -EEXIST;
4d680aee
ZJS
1746 }
1747
1748 if (arg_link_journal == LINK_NO)
1749 return 0;
1750
57fb9fb5 1751 free(p);
27407a01
ZJS
1752 p = strappend("/var/log/journal/", id);
1753 q = strjoin(directory, "/var/log/journal/", id, NULL);
1754 if (!p || !q)
1755 return log_oom();
1756
1757 if (path_is_mount_point(p, false) > 0) {
1758 if (arg_link_journal != LINK_AUTO) {
1759 log_error("%s: already a mount point, refusing to use for journal", p);
1760 return -EEXIST;
1761 }
1762
1763 return 0;
57fb9fb5
LP
1764 }
1765
27407a01 1766 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 1767 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1768 log_error("%s: already a mount point, refusing to use for journal", q);
1769 return -EEXIST;
57fb9fb5
LP
1770 }
1771
27407a01 1772 return 0;
57fb9fb5
LP
1773 }
1774
1775 r = readlink_and_make_absolute(p, &d);
1776 if (r >= 0) {
1777 if ((arg_link_journal == LINK_GUEST ||
1778 arg_link_journal == LINK_AUTO) &&
1779 path_equal(d, q)) {
1780
27407a01
ZJS
1781 r = mkdir_p(q, 0755);
1782 if (r < 0)
56f64d95 1783 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1784 return 0;
57fb9fb5
LP
1785 }
1786
4a62c710
MS
1787 if (unlink(p) < 0)
1788 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1789 } else if (r == -EINVAL) {
1790
1791 if (arg_link_journal == LINK_GUEST &&
1792 rmdir(p) < 0) {
1793
27407a01
ZJS
1794 if (errno == ENOTDIR) {
1795 log_error("%s already exists and is neither a symlink nor a directory", p);
1796 return r;
1797 } else {
56f64d95 1798 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1799 return -errno;
57fb9fb5 1800 }
57fb9fb5
LP
1801 }
1802 } else if (r != -ENOENT) {
56f64d95 1803 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1804 return r;
57fb9fb5
LP
1805 }
1806
1807 if (arg_link_journal == LINK_GUEST) {
1808
1809 if (symlink(q, p) < 0) {
574edc90 1810 if (arg_link_journal_try) {
56f64d95 1811 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1812 return 0;
1813 } else {
56f64d95 1814 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1815 return -errno;
1816 }
57fb9fb5
LP
1817 }
1818
27407a01
ZJS
1819 r = mkdir_p(q, 0755);
1820 if (r < 0)
56f64d95 1821 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1822 return 0;
57fb9fb5
LP
1823 }
1824
1825 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1826 /* don't create parents here -- if the host doesn't have
1827 * permanent journal set up, don't force it here */
1828 r = mkdir(p, 0755);
57fb9fb5 1829 if (r < 0) {
574edc90 1830 if (arg_link_journal_try) {
56f64d95 1831 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1832 return 0;
1833 } else {
56f64d95 1834 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1835 return r;
1836 }
57fb9fb5
LP
1837 }
1838
27407a01
ZJS
1839 } else if (access(p, F_OK) < 0)
1840 return 0;
57fb9fb5 1841
cdb2b9d0
LP
1842 if (dir_is_empty(q) == 0)
1843 log_warning("%s is not empty, proceeding anyway.", q);
1844
57fb9fb5
LP
1845 r = mkdir_p(q, 0755);
1846 if (r < 0) {
56f64d95 1847 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1848 return r;
57fb9fb5
LP
1849 }
1850
4a62c710
MS
1851 if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1852 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1853
27407a01 1854 return 0;
57fb9fb5
LP
1855}
1856
88213476 1857static int drop_capabilities(void) {
5076f0cc 1858 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1859}
1860
5aa4bb6b 1861static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1862 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 1863 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1864 int r;
1865
eb91eb18
LP
1866 if (!arg_register)
1867 return 0;
1868
1c03020c 1869 r = sd_bus_default_system(&bus);
f647962d
MS
1870 if (r < 0)
1871 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1872
89f7c846
LP
1873 if (arg_keep_unit) {
1874 r = sd_bus_call_method(
1875 bus,
1876 "org.freedesktop.machine1",
1877 "/org/freedesktop/machine1",
1878 "org.freedesktop.machine1.Manager",
5aa4bb6b 1879 "RegisterMachineWithNetwork",
89f7c846
LP
1880 &error,
1881 NULL,
5aa4bb6b 1882 "sayssusai",
89f7c846
LP
1883 arg_machine,
1884 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1885 "nspawn",
1886 "container",
1887 (uint32_t) pid,
5aa4bb6b
LP
1888 strempty(arg_directory),
1889 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1890 } else {
9457ac5b
LP
1891 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1892
1893 r = sd_bus_message_new_method_call(
89f7c846 1894 bus,
9457ac5b 1895 &m,
89f7c846
LP
1896 "org.freedesktop.machine1",
1897 "/org/freedesktop/machine1",
1898 "org.freedesktop.machine1.Manager",
5aa4bb6b 1899 "CreateMachineWithNetwork");
f647962d
MS
1900 if (r < 0)
1901 return log_error_errno(r, "Failed to create message: %m");
9457ac5b
LP
1902
1903 r = sd_bus_message_append(
1904 m,
5aa4bb6b 1905 "sayssusai",
89f7c846
LP
1906 arg_machine,
1907 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1908 "nspawn",
1909 "container",
1910 (uint32_t) pid,
5aa4bb6b
LP
1911 strempty(arg_directory),
1912 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d
MS
1913 if (r < 0)
1914 return log_error_errno(r, "Failed to append message arguments: %m");
9457ac5b
LP
1915
1916 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d
MS
1917 if (r < 0)
1918 return log_error_errno(r, "Failed to open container: %m");
9457ac5b
LP
1919
1920 if (!isempty(arg_slice)) {
1921 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d
MS
1922 if (r < 0)
1923 return log_error_errno(r, "Failed to append slice: %m");
9457ac5b
LP
1924 }
1925
1926 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d
MS
1927 if (r < 0)
1928 return log_error_errno(r, "Failed to add device policy: %m");
9457ac5b 1929
63cc4c31 1930 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
1931 /* Allow the container to
1932 * access and create the API
1933 * device nodes, so that
1934 * PrivateDevices= in the
1935 * container can work
1936 * fine */
1937 "/dev/null", "rwm",
1938 "/dev/zero", "rwm",
1939 "/dev/full", "rwm",
1940 "/dev/random", "rwm",
1941 "/dev/urandom", "rwm",
1942 "/dev/tty", "rwm",
864e1706 1943 "/dev/net/tun", "rwm",
9457ac5b
LP
1944 /* Allow the container
1945 * access to ptys. However,
1946 * do not permit the
1947 * container to ever create
1948 * these device nodes. */
1949 "/dev/pts/ptmx", "rw",
63cc4c31 1950 "char-pts", "rw");
f647962d
MS
1951 if (r < 0)
1952 return log_error_errno(r, "Failed to add device whitelist: %m");
9457ac5b
LP
1953
1954 r = sd_bus_message_close_container(m);
f647962d
MS
1955 if (r < 0)
1956 return log_error_errno(r, "Failed to close container: %m");
9457ac5b
LP
1957
1958 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1959 }
1960
9444b1f2 1961 if (r < 0) {
1f0cd86b
LP
1962 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1963 return r;
1964 }
1965
1966 return 0;
1967}
1968
1969static int terminate_machine(pid_t pid) {
1970 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1971 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 1972 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
1973 const char *path;
1974 int r;
1975
eb91eb18
LP
1976 if (!arg_register)
1977 return 0;
1978
76b54375 1979 r = sd_bus_default_system(&bus);
f647962d
MS
1980 if (r < 0)
1981 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
1982
1983 r = sd_bus_call_method(
1984 bus,
1985 "org.freedesktop.machine1",
1986 "/org/freedesktop/machine1",
1987 "org.freedesktop.machine1.Manager",
1988 "GetMachineByPID",
1989 &error,
1990 &reply,
1991 "u",
1992 (uint32_t) pid);
1993 if (r < 0) {
1994 /* Note that the machine might already have been
1995 * cleaned up automatically, hence don't consider it a
1996 * failure if we cannot get the machine object. */
1997 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1998 return 0;
1999 }
2000
2001 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2002 if (r < 0)
2003 return bus_log_parse_error(r);
9444b1f2 2004
1f0cd86b
LP
2005 r = sd_bus_call_method(
2006 bus,
2007 "org.freedesktop.machine1",
2008 path,
2009 "org.freedesktop.machine1.Machine",
2010 "Terminate",
2011 &error,
2012 NULL,
2013 NULL);
2014 if (r < 0) {
2015 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2016 return 0;
2017 }
2018
9444b1f2
LP
2019 return 0;
2020}
2021
db999e0f
LP
2022static int reset_audit_loginuid(void) {
2023 _cleanup_free_ char *p = NULL;
2024 int r;
2025
2026 if (arg_share_system)
2027 return 0;
2028
2029 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2030 if (r == -ENOENT)
db999e0f 2031 return 0;
f647962d
MS
2032 if (r < 0)
2033 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2034
2035 /* Already reset? */
2036 if (streq(p, "4294967295"))
2037 return 0;
2038
2039 r = write_string_file("/proc/self/loginuid", "4294967295");
2040 if (r < 0) {
2041 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2042 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2043 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2044 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2045 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
77b6e194 2046
db999e0f 2047 sleep(5);
77b6e194 2048 }
db999e0f
LP
2049
2050 return 0;
77b6e194
LP
2051}
2052
4f758c23
LP
/* Fixed (but originally randomly created) 128bit keys that are passed
 * as hash_key to generate_mac(). A separate key is used for the host
 * side of the veth link, for the container side of the veth link and
 * for MACVLAN interfaces. */
2053#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2054#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 2055#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2056
a90e2305 2057static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2058 uint8_t result[8];
2059 size_t l, sz;
a90e2305
LP
2060 uint8_t *v, *i;
2061 int r;
01dde061
TG
2062
2063 l = strlen(arg_machine);
2064 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2065 if (idx > 0)
2066 sz += sizeof(idx);
a90e2305 2067
01dde061
TG
2068 v = alloca(sz);
2069
2070 /* fetch some persistent data unique to the host */
2071 r = sd_id128_get_machine((sd_id128_t*) v);
2072 if (r < 0)
2073 return r;
2074
2075 /* combine with some data unique (on this host) to this
2076 * container instance */
a90e2305
LP
2077 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2078 if (idx > 0) {
2079 idx = htole64(idx);
2080 memcpy(i, &idx, sizeof(idx));
2081 }
01dde061
TG
2082
2083 /* Let's hash the host machine ID plus the container name. We
2084 * use a fixed, but originally randomly created hash key here. */
4f758c23 2085 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2086
2087 assert_cc(ETH_ALEN <= sizeof(result));
2088 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2089
2090 /* see eth_random_addr in the kernel */
2091 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2092 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2093
2094 return 0;
2095}
2096
5aa4bb6b 2097static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 2098 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 2099 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 2100 struct ether_addr mac_host, mac_container;
5aa4bb6b 2101 int r, i;
69c79d3c
LP
2102
2103 if (!arg_private_network)
2104 return 0;
2105
2106 if (!arg_network_veth)
2107 return 0;
2108
08af0da2
LP
2109 /* Use two different interface name prefixes depending whether
2110 * we are in bridge mode or not. */
c00524c9 2111 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2112 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2113
e867ceb6
LP
2114 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2115 if (r < 0)
2116 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2117
e867ceb6
LP
2118 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2119 if (r < 0)
2120 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2121
151b9b96 2122 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2123 if (r < 0)
2124 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2125
151b9b96 2126 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2127 if (r < 0)
2128 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2129
ab046dde 2130 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2131 if (r < 0)
2132 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2133
4f758c23 2134 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2135 if (r < 0)
2136 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2137
ee3a6a51 2138 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2139 if (r < 0)
2140 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2141
d8e538ec 2142 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2143 if (r < 0)
2144 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2145
ee3a6a51 2146 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2147 if (r < 0)
2148 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2149
ab046dde 2150 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2151 if (r < 0)
2152 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2153
4f758c23 2154 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2155 if (r < 0)
2156 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2157
ab046dde 2158 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2159 if (r < 0)
2160 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
2161
2162 r = sd_rtnl_message_close_container(m);
f647962d
MS
2163 if (r < 0)
2164 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2165
2166 r = sd_rtnl_message_close_container(m);
f647962d
MS
2167 if (r < 0)
2168 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2169
2170 r = sd_rtnl_message_close_container(m);
f647962d
MS
2171 if (r < 0)
2172 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2173
2174 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2175 if (r < 0)
2176 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 2177
5aa4bb6b 2178 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2179 if (i <= 0)
2180 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2181
2182 *ifi = i;
2183
69c79d3c
LP
2184 return 0;
2185}
2186
5aa4bb6b 2187static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
2188 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2189 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2190 int r, bridge;
2191
2192 if (!arg_private_network)
2193 return 0;
2194
2195 if (!arg_network_veth)
2196 return 0;
2197
2198 if (!arg_network_bridge)
2199 return 0;
2200
2201 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2202 if (bridge <= 0)
2203 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2204
5aa4bb6b
LP
2205 *ifi = bridge;
2206
151b9b96 2207 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2208 if (r < 0)
2209 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2210
151b9b96 2211 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2212 if (r < 0)
2213 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2214
039dd4af 2215 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2216 if (r < 0)
2217 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2218
ab046dde 2219 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2220 if (r < 0)
2221 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
2222
2223 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
2226
2227 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2228 if (r < 0)
2229 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2230
2231 return 0;
2232}
2233
c74e630d
LP
2234static int parse_interface(struct udev *udev, const char *name) {
2235 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2236 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2237 int ifi;
2238
2239 ifi = (int) if_nametoindex(name);
4a62c710
MS
2240 if (ifi <= 0)
2241 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2242
2243 sprintf(ifi_str, "n%i", ifi);
2244 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2245 if (!d)
2246 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2247
2248 if (udev_device_get_is_initialized(d) <= 0) {
2249 log_error("Network interface %s is not initialized yet.", name);
2250 return -EBUSY;
2251 }
2252
2253 return ifi;
2254}
2255
69c79d3c 2256static int move_network_interfaces(pid_t pid) {
7e227024 2257 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 2258 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
2259 char **i;
2260 int r;
2261
2262 if (!arg_private_network)
2263 return 0;
2264
2265 if (strv_isempty(arg_network_interfaces))
2266 return 0;
2267
151b9b96 2268 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2269 if (r < 0)
2270 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2271
7e227024
LP
2272 udev = udev_new();
2273 if (!udev) {
2274 log_error("Failed to connect to udev.");
2275 return -ENOMEM;
2276 }
2277
aa28aefe 2278 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 2279 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 2280 int ifi;
aa28aefe 2281
c74e630d
LP
2282 ifi = parse_interface(udev, *i);
2283 if (ifi < 0)
2284 return ifi;
2285
3125b3ef 2286 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2287 if (r < 0)
2288 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2289
c74e630d 2290 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2291 if (r < 0)
2292 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2293
c74e630d 2294 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2295 if (r < 0)
2296 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2297 }
7e227024 2298
c74e630d
LP
2299 return 0;
2300}
2301
2302static int setup_macvlan(pid_t pid) {
2303 _cleanup_udev_unref_ struct udev *udev = NULL;
2304 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
e867ceb6 2305 unsigned idx = 0;
c74e630d
LP
2306 char **i;
2307 int r;
2308
2309 if (!arg_private_network)
2310 return 0;
2311
2312 if (strv_isempty(arg_network_macvlan))
2313 return 0;
2314
2315 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2316 if (r < 0)
2317 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2318
2319 udev = udev_new();
2320 if (!udev) {
2321 log_error("Failed to connect to udev.");
2322 return -ENOMEM;
2323 }
2324
2325 STRV_FOREACH(i, arg_network_macvlan) {
2326 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2327 _cleanup_free_ char *n = NULL;
e867ceb6 2328 struct ether_addr mac;
c74e630d
LP
2329 int ifi;
2330
2331 ifi = parse_interface(udev, *i);
2332 if (ifi < 0)
2333 return ifi;
2334
e867ceb6
LP
2335 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2336 if (r < 0)
2337 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2338
c74e630d 2339 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2340 if (r < 0)
2341 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2342
c74e630d 2343 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2344 if (r < 0)
2345 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2346
2347 n = strappend("mv-", *i);
2348 if (!n)
2349 return log_oom();
2350
2351 strshorten(n, IFNAMSIZ-1);
2352
2353 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2354 if (r < 0)
2355 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2356
e867ceb6
LP
2357 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2358 if (r < 0)
2359 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2360
aa28aefe 2361 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2362 if (r < 0)
2363 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
2364
2365 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2366 if (r < 0)
2367 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2368
d8e538ec 2369 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2370 if (r < 0)
2371 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
2372
2373 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2374 if (r < 0)
2375 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
2376
2377 r = sd_rtnl_message_close_container(m);
f647962d
MS
2378 if (r < 0)
2379 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
2380
2381 r = sd_rtnl_message_close_container(m);
f647962d
MS
2382 if (r < 0)
2383 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
2384
2385 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2386 if (r < 0)
2387 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2388 }
2389
2390 return 0;
2391}
2392
4bbfe7ad
TG
2393static int setup_ipvlan(pid_t pid) {
2394 _cleanup_udev_unref_ struct udev *udev = NULL;
2395 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2396 char **i;
2397 int r;
2398
2399 if (!arg_private_network)
2400 return 0;
2401
2402 if (strv_isempty(arg_network_ipvlan))
2403 return 0;
2404
2405 r = sd_rtnl_open(&rtnl, 0);
2406 if (r < 0)
2407 return log_error_errno(r, "Failed to connect to netlink: %m");
2408
2409 udev = udev_new();
2410 if (!udev) {
2411 log_error("Failed to connect to udev.");
2412 return -ENOMEM;
2413 }
2414
2415 STRV_FOREACH(i, arg_network_ipvlan) {
2416 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2417 _cleanup_free_ char *n = NULL;
2418 int ifi;
2419
2420 ifi = parse_interface(udev, *i);
2421 if (ifi < 0)
2422 return ifi;
2423
2424 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2425 if (r < 0)
2426 return log_error_errno(r, "Failed to allocate netlink message: %m");
2427
2428 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2429 if (r < 0)
2430 return log_error_errno(r, "Failed to add netlink interface index: %m");
2431
2432 n = strappend("iv-", *i);
2433 if (!n)
2434 return log_oom();
2435
2436 strshorten(n, IFNAMSIZ-1);
2437
2438 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2439 if (r < 0)
2440 return log_error_errno(r, "Failed to add netlink interface name: %m");
2441
2442 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2443 if (r < 0)
2444 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2445
2446 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2447 if (r < 0)
2448 return log_error_errno(r, "Failed to open netlink container: %m");
2449
2450 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2451 if (r < 0)
2452 return log_error_errno(r, "Failed to open netlink container: %m");
2453
2454 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2455 if (r < 0)
2456 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2457
2458 r = sd_rtnl_message_close_container(m);
2459 if (r < 0)
2460 return log_error_errno(r, "Failed to close netlink container: %m");
2461
2462 r = sd_rtnl_message_close_container(m);
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to close netlink container: %m");
2465
2466 r = sd_rtnl_call(rtnl, m, 0, NULL);
2467 if (r < 0)
2468 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2469 }
2470
2471 return 0;
2472}
2473
/* Installs a seccomp filter for the calling process that allows
 * everything by default, makes a fixed list of system calls fail with
 * EPERM, and makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
 * EAFNOSUPPORT.
 *
 * Returns 0 on success (and always when built without HAVE_SECCOMP),
 * a negative error otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int denied[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(denied); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied[n], 0);

                /* -EFAULT is treated as "unknown syscall" and skipped */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /* Audit is broken in containers, and much of the userspace
         * audit hookup fails when run inside one. We don't care and
         * simply turn off creation of audit sockets.
         *
         * The rule below makes socket(AF_NETLINK, *, NETLINK_AUDIT)
         * fail with EAFNOSUPPORT, which audit userspace takes as
         * indication that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2553
785890ac
LP
2554static int setup_propagate(const char *root) {
2555 const char *p, *q;
2556
2557 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2558 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2559 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2560 (void) mkdir_p(p, 0600);
2561
2562 q = strappenda(root, "/run/systemd/nspawn/incoming");
2563 mkdir_parents(q, 0755);
2564 mkdir_p(q, 0600);
2565
2566 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2567 return log_error_errno(errno, "Failed to install propagation bind mount.");
2568
2569 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2570 return log_error_errno(errno, "Failed to make propagation mount read-only");
2571
2572 return 0;
2573}
2574
1b9e5b12
LP
2575static int setup_image(char **device_path, int *loop_nr) {
2576 struct loop_info64 info = {
2577 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2578 };
2579 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2580 _cleanup_free_ char* loopdev = NULL;
2581 struct stat st;
2582 int r, nr;
2583
2584 assert(device_path);
2585 assert(loop_nr);
ec16945e 2586 assert(arg_image);
1b9e5b12
LP
2587
2588 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2589 if (fd < 0)
2590 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2591
4a62c710
MS
2592 if (fstat(fd, &st) < 0)
2593 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2594
2595 if (S_ISBLK(st.st_mode)) {
2596 char *p;
2597
2598 p = strdup(arg_image);
2599 if (!p)
2600 return log_oom();
2601
2602 *device_path = p;
2603
2604 *loop_nr = -1;
2605
2606 r = fd;
2607 fd = -1;
2608
2609 return r;
2610 }
2611
2612 if (!S_ISREG(st.st_mode)) {
56f64d95 2613 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2614 return -EINVAL;
2615 }
2616
2617 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
2618 if (control < 0)
2619 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2620
2621 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
2622 if (nr < 0)
2623 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2624
2625 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2626 return log_oom();
2627
2628 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2629 if (loop < 0)
2630 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 2631
4a62c710
MS
2632 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2633 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2634
2635 if (arg_read_only)
2636 info.lo_flags |= LO_FLAGS_READ_ONLY;
2637
4a62c710
MS
2638 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2639 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2640
2641 *device_path = loopdev;
2642 loopdev = NULL;
2643
2644 *loop_nr = nr;
2645
2646 r = loop;
2647 loop = -1;
2648
2649 return r;
2650}
2651
ada4799a
LP
/* Explanatory text appended to the error messages that are logged when
 * a disk image cannot be dissected. The adjacent string literals are
 * concatenated, hence each piece has to carry its own separating
 * space or newline. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2658
1b9e5b12
LP
/* Inspects the partition table of the block device open as fd and
 * determines which partitions to use for the root, /home and /srv
 * file systems.
 *
 * On GPT disks partitions are picked by type UUID (GPT_HOME, GPT_SRV
 * and, where defined, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY);
 * partitions flagged GPT_FLAG_NO_AUTO are ignored, and if several
 * partitions have the same type the one with the lowest partition
 * number is used. If no root partition of a specific type is found, a
 * single GPT_LINUX_GENERIC partition is used instead. On MBR disks a
 * single partition of type 0x83 whose flags equal 0x80 (bootable) is
 * used as root.
 *
 * On success 0 is returned, *root_device (always), *home_device and
 * *srv_device (only if found) are set to newly allocated device node
 * paths owned by the caller, the matching *_rw booleans tell whether
 * the partition may be mounted writable, and *secondary tells whether
 * the root partition is the secondary architecture one. On failure a
 * negative errno is returned; without HAVE_BLKID that is always
 * -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has created a device for each
         * partition blkid found, retrying up to 10 times. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev
                 * enumeration is expected to contain one entry more
                 * than that (the loop below skips the entry whose
                 * device number is that of the whole device). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Remember the node as the generic
                                 * candidate (not as root), so that a
                                 * second bootable Linux partition is
                                 * noticed by the check above and
                                 * refused below, instead of silently
                                 * replacing the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3031
/* Mounts the block device what on the directory where, or on the
 * sub-directory directory below where if directory is non-NULL. The
 * file system type is detected with libblkid. The mount is read-only
 * if rw is false or arg_read_only is set, and always uses MS_NODEV.
 *
 * Returns 0 on success and a negative errno on failure: -EINVAL if no
 * file system type can be determined, -ENOTSUP for LUKS volumes and
 * when built without HAVE_BLKID. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1 means nothing was detected, -2 means the result
                 * was ambivalent. Any other non-zero value (i.e. -1)
                 * is a real probing error and handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3095
727fd4fd
LP
/* Mounts the root, /home and /srv partitions below 'where'.
 *
 * Each device argument may be NULL, in which case that partition is skipped.
 * The matching *_rw flag is passed on to mount_device().
 *
 * Returns 0 on success, or the negative errno-style value of the first mount
 * that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount below the directory the caller asked for, not the global
         * arg_directory, so that 'where' is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3125
3126static void loop_remove(int nr, int *image_fd) {
3127 _cleanup_close_ int control = -1;
e8c8ddcc 3128 int r;
1b9e5b12
LP
3129
3130 if (nr < 0)
3131 return;
3132
3133 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3134 r = ioctl(*image_fd, LOOP_CLR_FD);
3135 if (r < 0)
5e4074aa 3136 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3137 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3138 }
3139
3140 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3141 if (control < 0) {
56f64d95 3142 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3143 return;
e8c8ddcc 3144 }
1b9e5b12 3145
e8c8ddcc
TG
3146 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3147 if (r < 0)
5e4074aa 3148 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3149}
3150
0cb9fbcd
LP
3151static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3152 int pipe_fds[2];
3153 pid_t pid;
3154
3155 assert(database);
3156 assert(key);
3157 assert(rpid);
3158
4a62c710
MS
3159 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3160 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3161
3162 pid = fork();
4a62c710
MS
3163 if (pid < 0)
3164 return log_error_errno(errno, "Failed to fork getent child: %m");
3165 else if (pid == 0) {
0cb9fbcd
LP
3166 int nullfd;
3167 char *empty_env = NULL;
3168
3169 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3170 _exit(EXIT_FAILURE);
3171
3172 if (pipe_fds[0] > 2)
03e334a1 3173 safe_close(pipe_fds[0]);
0cb9fbcd 3174 if (pipe_fds[1] > 2)
03e334a1 3175 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3176
3177 nullfd = open("/dev/null", O_RDWR);
3178 if (nullfd < 0)
3179 _exit(EXIT_FAILURE);
3180
3181 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3182 _exit(EXIT_FAILURE);
3183
3184 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3185 _exit(EXIT_FAILURE);
3186
3187 if (nullfd > 2)
03e334a1 3188 safe_close(nullfd);
0cb9fbcd
LP
3189
3190 reset_all_signal_handlers();
3191 close_all_fds(NULL, 0);
3192
4de82926
MM
3193 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3194 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3195 _exit(EXIT_FAILURE);
3196 }
3197
03e334a1 3198 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3199
3200 *rpid = pid;
3201
3202 return pipe_fds[0];
3203}
3204
3205static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3206 char line[LINE_MAX], *x, *u, *g, *h;
3207 const char *word, *state;
0cb9fbcd
LP
3208 _cleanup_free_ uid_t *uids = NULL;
3209 _cleanup_free_ char *home = NULL;
3210 _cleanup_fclose_ FILE *f = NULL;
3211 _cleanup_close_ int fd = -1;
3212 unsigned n_uids = 0;
70f539ca 3213 size_t sz = 0, l;
0cb9fbcd
LP
3214 uid_t uid;
3215 gid_t gid;
3216 pid_t pid;
3217 int r;
3218
3219 assert(_home);
3220
3221 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3222 /* Reset everything fully to 0, just in case */
3223
4a62c710
MS
3224 if (setgroups(0, NULL) < 0)
3225 return log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd 3226
4a62c710
MS
3227 if (setresgid(0, 0, 0) < 0)
3228 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3229
4a62c710
MS
3230 if (setresuid(0, 0, 0) < 0)
3231 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3232
3233 *_home = NULL;
3234 return 0;
3235 }
3236
3237 /* First, get user credentials */
3238 fd = spawn_getent("passwd", arg_user, &pid);
3239 if (fd < 0)
3240 return fd;
3241
3242 f = fdopen(fd, "r");
3243 if (!f)
3244 return log_oom();
3245 fd = -1;
3246
3247 if (!fgets(line, sizeof(line), f)) {
3248
3249 if (!ferror(f)) {
3250 log_error("Failed to resolve user %s.", arg_user);
3251 return -ESRCH;
3252 }
3253
56f64d95 3254 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3255 return -errno;
3256 }
3257
3258 truncate_nl(line);
3259
820d3acf 3260 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3261
3262 x = strchr(line, ':');
3263 if (!x) {
3264 log_error("/etc/passwd entry has invalid user field.");
3265 return -EIO;
3266 }
3267
3268 u = strchr(x+1, ':');
3269 if (!u) {
3270 log_error("/etc/passwd entry has invalid password field.");
3271 return -EIO;
3272 }
3273
3274 u++;
3275 g = strchr(u, ':');
3276 if (!g) {
3277 log_error("/etc/passwd entry has invalid UID field.");
3278 return -EIO;
3279 }
3280
3281 *g = 0;
3282 g++;
3283 x = strchr(g, ':');
3284 if (!x) {
3285 log_error("/etc/passwd entry has invalid GID field.");
3286 return -EIO;
3287 }
3288
3289 *x = 0;
3290 h = strchr(x+1, ':');
3291 if (!h) {
3292 log_error("/etc/passwd entry has invalid GECOS field.");
3293 return -EIO;
3294 }
3295
3296 h++;
3297 x = strchr(h, ':');
3298 if (!x) {
3299 log_error("/etc/passwd entry has invalid home directory field.");
3300 return -EIO;
3301 }
3302
3303 *x = 0;
3304
3305 r = parse_uid(u, &uid);
3306 if (r < 0) {
3307 log_error("Failed to parse UID of user.");
3308 return -EIO;
3309 }
3310
3311 r = parse_gid(g, &gid);
3312 if (r < 0) {
3313 log_error("Failed to parse GID of user.");
3314 return -EIO;
3315 }
3316
3317 home = strdup(h);
3318 if (!home)
3319 return log_oom();
3320
3321 /* Second, get group memberships */
3322 fd = spawn_getent("initgroups", arg_user, &pid);
3323 if (fd < 0)
3324 return fd;
3325
3326 fclose(f);
3327 f = fdopen(fd, "r");
3328 if (!f)
3329 return log_oom();
3330 fd = -1;
3331
3332 if (!fgets(line, sizeof(line), f)) {
3333 if (!ferror(f)) {
3334 log_error("Failed to resolve user %s.", arg_user);
3335 return -ESRCH;
3336 }
3337
56f64d95 3338 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3339 return -errno;
3340 }
3341
3342 truncate_nl(line);
3343
820d3acf 3344 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3345
3346 /* Skip over the username and subsequent separator whitespace */
3347 x = line;
3348 x += strcspn(x, WHITESPACE);
3349 x += strspn(x, WHITESPACE);
3350
a2a5291b 3351 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3352 char c[l+1];
3353
a2a5291b 3354 memcpy(c, word, l);
0cb9fbcd
LP
3355 c[l] = 0;
3356
3357 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3358 return log_oom();
3359
3360 r = parse_uid(c, &uids[n_uids++]);
3361 if (r < 0) {
3362 log_error("Failed to parse group data from getent.");
3363 return -EIO;
3364 }
3365 }
3366
3367 r = mkdir_parents(home, 0775);
f647962d
MS
3368 if (r < 0)
3369 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3370
3371 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3372 if (r < 0 && r != -EEXIST)
3373 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
3374
3375 fchown(STDIN_FILENO, uid, gid);
3376 fchown(STDOUT_FILENO, uid, gid);
3377 fchown(STDERR_FILENO, uid, gid);
3378
4a62c710
MS
3379 if (setgroups(n_uids, uids) < 0)
3380 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3381
4a62c710
MS
3382 if (setresgid(gid, gid, gid) < 0)
3383 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3384
4a62c710
MS
3385 if (setresuid(uid, uid, uid) < 0)
3386 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3387
3388 if (_home) {
3389 *_home = home;
3390 home = NULL;
3391 }
3392
3393 return 0;
3394}
3395
113cea80 3396/*
6d416b9c
LS
3397 * Return values:
3398 * < 0 : wait_for_terminate() failed to get the state of the
3399 * container, the container was terminated by a signal, or
3400 * failed for an unknown reason. No change is made to the
3401 * container argument.
3402 * > 0 : The program executed in the container terminated with an
3403 * error. The exit code of the program executed in the
919699ec
LP
3404 * container is returned. The container argument has been set
3405 * to CONTAINER_TERMINATED.
6d416b9c
LS
3406 * 0 : The container is being rebooted, has been shut down or exited
3407 * successfully. The container argument has been set to either
3408 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3409 *
6d416b9c
LS
3410 * That is, success is indicated by a return value of zero, and an
3411 * error is indicated by a non-zero value.
113cea80
DH
3412 */
3413static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3414 siginfo_t status;
919699ec 3415 int r;
113cea80
DH
3416
3417 r = wait_for_terminate(pid, &status);
f647962d
MS
3418 if (r < 0)
3419 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3420
3421 switch (status.si_code) {
fddbb89c 3422
113cea80 3423 case CLD_EXITED:
919699ec
LP
3424 if (status.si_status == 0) {
3425 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3426
fddbb89c 3427 } else
919699ec 3428 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3429
919699ec
LP
3430 *container = CONTAINER_TERMINATED;
3431 return status.si_status;
113cea80
DH
3432
3433 case CLD_KILLED:
3434 if (status.si_status == SIGINT) {
113cea80 3435
919699ec 3436 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3437 *container = CONTAINER_TERMINATED;
919699ec
LP
3438 return 0;
3439
113cea80 3440 } else if (status.si_status == SIGHUP) {
113cea80 3441
919699ec 3442 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3443 *container = CONTAINER_REBOOTED;
919699ec 3444 return 0;
113cea80 3445 }
919699ec 3446
113cea80
DH
3447 /* CLD_KILLED fallthrough */
3448
3449 case CLD_DUMPED:
fddbb89c 3450 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3451 return -EIO;
113cea80
DH
3452
3453 default:
fddbb89c 3454 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3455 return -EIO;
113cea80
DH
3456 }
3457
3458 return r;
3459}
3460
e866af3a
DH
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
}
3462
023fb90b
LP
3463static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3464 pid_t pid;
3465
3466 pid = PTR_TO_UINT32(userdata);
3467 if (pid > 0) {
3468 if (kill(pid, SIGRTMIN+3) >= 0) {
3469 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3470 sd_event_source_set_userdata(s, NULL);
3471 return 0;
3472 }
3473 }
3474
3475 sd_event_exit(sd_event_source_get_event(s), 0);
3476 return 0;
3477}
3478
ec16945e 3479static int determine_names(void) {
1b9cebf6 3480 int r;
ec16945e
LP
3481
3482 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3483 if (arg_machine) {
3484 _cleanup_(image_unrefp) Image *i = NULL;
3485
3486 r = image_find(arg_machine, &i);
3487 if (r < 0)
3488 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3489 else if (r == 0) {
3490 log_error("No image for machine '%s': %m", arg_machine);
3491 return -ENOENT;
3492 }
3493
aceac2f0 3494 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3495 r = set_sanitized_path(&arg_image, i->path);
3496 else
3497 r = set_sanitized_path(&arg_directory, i->path);
3498 if (r < 0)
3499 return log_error_errno(r, "Invalid image directory: %m");
3500
3501 arg_read_only = arg_read_only || i->read_only;
3502 } else
ec16945e
LP
3503 arg_directory = get_current_dir_name();
3504
1b9cebf6
LP
3505 if (!arg_directory && !arg_machine) {
3506 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3507 return -EINVAL;
3508 }
3509 }
3510
3511 if (!arg_machine) {
b9ba4dab
LP
3512 if (arg_directory && path_equal(arg_directory, "/"))
3513 arg_machine = gethostname_malloc();
3514 else
3515 arg_machine = strdup(basename(arg_image ?: arg_directory));
3516
ec16945e
LP
3517 if (!arg_machine)
3518 return log_oom();
3519
3520 hostname_cleanup(arg_machine, false);
3521 if (!machine_name_is_valid(arg_machine)) {
3522 log_error("Failed to determine machine name automatically, please use -M.");
3523 return -EINVAL;
3524 }
b9ba4dab
LP
3525
3526 if (arg_ephemeral) {
3527 char *b;
3528
3529 /* Add a random suffix when this is an
3530 * ephemeral machine, so that we can run many
3531 * instances at once without manually having
3532 * to specify -M each time. */
3533
3534 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3535 return log_oom();
3536
3537 free(arg_machine);
3538 arg_machine = b;
3539 }
ec16945e
LP
3540 }
3541
3542 return 0;
3543}
3544
88213476 3545int main(int argc, char *argv[]) {
69c79d3c 3546
611b312b 3547 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
727fd4fd 3548 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 3549 _cleanup_close_ int master = -1, image_fd = -1;
69c79d3c 3550 _cleanup_fdset_free_ FDSet *fds = NULL;
ec16945e 3551 int r, n_fd_passed, loop_nr = -1;
1b9e5b12 3552 char veth_name[IFNAMSIZ];
ec16945e 3553 bool secondary = false, remove_subvol = false;
e866af3a 3554 sigset_t mask, mask_chld;
69c79d3c 3555 pid_t pid = 0;
ec16945e 3556 int ret = EXIT_SUCCESS;
6d0b55c2 3557 union in_addr_union exposed = {};
30535c16 3558 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
88213476
LP
3559
3560 log_parse_environment();
3561 log_open();
3562
ec16945e
LP
3563 r = parse_argv(argc, argv);
3564 if (r <= 0)
88213476 3565 goto finish;
88213476 3566
ec16945e
LP
3567 r = determine_names();
3568 if (r < 0)
3569 goto finish;
7027ff61 3570
88213476
LP
3571 if (geteuid() != 0) {
3572 log_error("Need to be root.");
ec16945e 3573 r = -EPERM;
88213476
LP
3574 goto finish;
3575 }
3576
04d391da
LP
3577 if (sd_booted() <= 0) {
3578 log_error("Not running on a systemd system.");
ec16945e 3579 r = -EINVAL;
04d391da
LP
3580 goto finish;
3581 }
3582
1b9e5b12
LP
3583 log_close();
3584 n_fd_passed = sd_listen_fds(false);
3585 if (n_fd_passed > 0) {
ec16945e
LP
3586 r = fdset_new_listen_fds(&fds, false);
3587 if (r < 0) {
3588 log_error_errno(r, "Failed to collect file descriptors: %m");
1b9e5b12
LP
3589 goto finish;
3590 }
88213476 3591 }
1b9e5b12
LP
3592 fdset_close_others(fds);
3593 log_open();
88213476 3594
1b9e5b12 3595 if (arg_directory) {
ec16945e
LP
3596 assert(!arg_image);
3597
c4e34a61
LP
3598 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3599 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
ec16945e 3600 r = -EINVAL;
6b9132a9
LP
3601 goto finish;
3602 }
1b9e5b12 3603
30535c16
LP
3604 if (arg_ephemeral) {
3605 _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
ec16945e
LP
3606 char *np;
3607
c4e34a61
LP
3608 /* If the specified path is a mount point we
3609 * generate the new snapshot immediately
3610 * inside it under a random name. However if
3611 * the specified is not a mount point we
3612 * create the new snapshot in the parent
3613 * directory, just next to it. */
3614 r = path_is_mount_point(arg_directory, false);
3615 if (r < 0) {
3616 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3617 goto finish;
3618 }
3619 if (r > 0)
3620 r = tempfn_random_child(arg_directory, &np);
3621 else
3622 r = tempfn_random(arg_directory, &np);
ec16945e
LP
3623 if (r < 0) {
3624 log_error_errno(r, "Failed to generate name for snapshot: %m");
3625 goto finish;
3626 }
3627
30535c16
LP
3628 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3629 if (r < 0) {
3630 log_error_errno(r, "Failed to lock %s: %m", np);
3631 goto finish;
3632 }
3633
ec16945e
LP
3634 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3635 if (r < 0) {
3636 free(np);
3637 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3638 goto finish;
3639 }
3640
3641 free(arg_directory);
3642 arg_directory = np;
3643
3644 remove_subvol = true;
30535c16
LP
3645
3646 } else {
3647 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3648 if (r == -EBUSY) {
3649 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3650 goto finish;
3651 }
3652 if (r < 0) {
3653 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3654 return r;
3655 }
3656
3657 if (arg_template) {
3658 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3659 if (r == -EEXIST) {
3660 if (!arg_quiet)
3661 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3662 } else if (r < 0) {
83521414 3663 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
3664 goto finish;
3665 } else {
3666 if (!arg_quiet)
3667 log_info("Populated %s from template %s.", arg_directory, arg_template);
3668 }
3669 }
ec16945e
LP
3670 }
3671
1b9e5b12
LP
3672 if (arg_boot) {
3673 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 3674 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 3675 r = -EINVAL;
1b9e5b12
LP
3676 goto finish;
3677 }
3678 } else {
3679 const char *p;
3680
3681 p = strappenda(arg_directory,
3682 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3683 if (access(p, F_OK) < 0) {
3684 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 3685 r = -EINVAL;
1b9e5b12 3686 goto finish;
1b9e5b12
LP
3687 }
3688 }
ec16945e 3689
6b9132a9 3690 } else {
1b9e5b12 3691 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 3692
ec16945e
LP
3693 assert(arg_image);
3694 assert(!arg_template);
3695
30535c16
LP
3696 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3697 if (r == -EBUSY) {
3698 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3699 goto finish;
3700 }
3701 if (r < 0) {
3702 r = log_error_errno(r, "Failed to create image lock: %m");
3703 goto finish;
3704 }
3705
1b9e5b12 3706 if (!mkdtemp(template)) {
56f64d95 3707 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 3708 r = -errno;
6b9132a9 3709 goto finish;
1b9e5b12 3710 }
6b9132a9 3711
1b9e5b12
LP
3712 arg_directory = strdup(template);
3713 if (!arg_directory) {
3714 r = log_oom();
3715 goto finish;
6b9132a9 3716 }
88213476 3717
1b9e5b12
LP
3718 image_fd = setup_image(&device_path, &loop_nr);
3719 if (image_fd < 0) {
3720 r = image_fd;
842f3b0f
LP
3721 goto finish;
3722 }
1b9e5b12 3723
4d9f07b4
LP
3724 r = dissect_image(image_fd,
3725 &root_device, &root_device_rw,
3726 &home_device, &home_device_rw,
3727 &srv_device, &srv_device_rw,
3728 &secondary);
1b9e5b12
LP
3729 if (r < 0)
3730 goto finish;
842f3b0f 3731 }
842f3b0f 3732
db7feb7e
LP
3733 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3734 if (master < 0) {
ec16945e 3735 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
3736 goto finish;
3737 }
3738
611b312b
LP
3739 r = ptsname_malloc(master, &console);
3740 if (r < 0) {
3741 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
3742 goto finish;
3743 }
3744
284c0b91 3745 if (!arg_quiet)
45f1386c 3746 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
ec16945e 3747 arg_machine, arg_image ?: arg_directory);
a258bf26
LP
3748
3749 if (unlockpt(master) < 0) {
ec16945e 3750 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
3751 goto finish;
3752 }
3753
a258bf26
LP
3754 assert_se(sigemptyset(&mask) == 0);
3755 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3756 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3757
023fb90b
LP
3758 assert_se(sigemptyset(&mask_chld) == 0);
3759 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3760
d87be9b0 3761 for (;;) {
6d0b55c2 3762 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
113cea80 3763 ContainerStatus container_status;
7566e267 3764 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
3765 struct sigaction sa = {
3766 .sa_handler = nop_handler,
3767 .sa_flags = SA_NOCLDSTOP,
3768 };
3769
7566e267 3770 r = barrier_create(&barrier);
a2da110b 3771 if (r < 0) {
da927ba9 3772 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
3773 goto finish;
3774 }
3775
6d0b55c2
LP
3776 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3777 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3778 goto finish;
3779 }
3780
3781 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3782 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3783 goto finish;
3784 }
3785
e866af3a
DH
3786 /* Child can be killed before execv(), so handle SIGCHLD
3787 * in order to interrupt parent's blocking calls and
3788 * give it a chance to call wait() and terminate. */
3789 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3790 if (r < 0) {
ec16945e 3791 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
3792 goto finish;
3793 }
3794
e866af3a
DH
3795 r = sigaction(SIGCHLD, &sa, NULL);
3796 if (r < 0) {
ec16945e 3797 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
3798 goto finish;
3799 }
3800
60e1651a
KW
3801 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3802 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3803 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
3804 if (pid < 0) {
3805 if (errno == EINVAL)
ec16945e 3806 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 3807 else
ec16945e 3808 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 3809
d87be9b0
LP
3810 goto finish;
3811 }
a258bf26 3812
d87be9b0
LP
3813 if (pid == 0) {
3814 /* child */
0cb9fbcd 3815 _cleanup_free_ char *home = NULL;
5674767e 3816 unsigned n_env = 2;
d87be9b0 3817 const char *envp[] = {
e10a55fd 3818 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
3819 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3820 NULL, /* TERM */
3821 NULL, /* HOME */
3822 NULL, /* USER */
3823 NULL, /* LOGNAME */
3824 NULL, /* container_uuid */
842f3b0f
LP
3825 NULL, /* LISTEN_FDS */
3826 NULL, /* LISTEN_PID */
d87be9b0
LP
3827 NULL
3828 };
f4889f65 3829 char **env_use;
a258bf26 3830
a2da110b
DH
3831 barrier_set_role(&barrier, BARRIER_CHILD);
3832
5674767e
ZJS
3833 envp[n_env] = strv_find_prefix(environ, "TERM=");
3834 if (envp[n_env])
3835 n_env ++;
a258bf26 3836
03e334a1 3837 master = safe_close(master);
a258bf26 3838
d87be9b0
LP
3839 close_nointr(STDIN_FILENO);
3840 close_nointr(STDOUT_FILENO);
3841 close_nointr(STDERR_FILENO);
db7feb7e 3842
03e334a1 3843 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 3844 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
a258bf26 3845
d87be9b0 3846 reset_all_signal_handlers();
1b6d7fa7 3847 reset_signal_mask();
f5c1b9ee 3848
ec16945e
LP
3849 r = open_terminal(console, O_RDWR);
3850 if (r != STDIN_FILENO) {
3851 if (r >= 0) {
3852 safe_close(r);
3853 r = -EINVAL;
842f3b0f
LP
3854 }
3855
ec16945e 3856 log_error_errno(r, "Failed to open console: %m");
a2da110b 3857 _exit(EXIT_FAILURE);
842f3b0f
LP
3858 }
3859
3860 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3861 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
56f64d95 3862 log_error_errno(errno, "Failed to duplicate console: %m");
a2da110b 3863 _exit(EXIT_FAILURE);
842f3b0f 3864 }
bc2f673e 3865
d87be9b0 3866 if (setsid() < 0) {
56f64d95 3867 log_error_errno(errno, "setsid() failed: %m");
a2da110b 3868 _exit(EXIT_FAILURE);
bc2f673e
LP
3869 }
3870
db999e0f 3871 if (reset_audit_loginuid() < 0)
a2da110b 3872 _exit(EXIT_FAILURE);
db999e0f 3873
d87be9b0 3874 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 3875 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 3876 _exit(EXIT_FAILURE);
d87be9b0 3877 }
e58a1277 3878
d87be9b0
LP
3879 /* Mark everything as slave, so that we still
3880 * receive mounts from the real root, but don't
3881 * propagate mounts to the real root. */
3882 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 3883 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 3884 _exit(EXIT_FAILURE);
d87be9b0 3885 }
04bc4a3f 3886
727fd4fd
LP
3887 if (mount_devices(arg_directory,
3888 root_device, root_device_rw,
3889 home_device, home_device_rw,
3890 srv_device, srv_device_rw) < 0)
a2da110b 3891 _exit(EXIT_FAILURE);
1b9e5b12 3892
d87be9b0
LP
3893 /* Turn directory into bind mount */
3894 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
56f64d95 3895 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 3896 _exit(EXIT_FAILURE);
d87be9b0 3897 }
88213476 3898
4d9f07b4
LP
3899 r = setup_volatile(arg_directory);
3900 if (r < 0)
a2da110b 3901 _exit(EXIT_FAILURE);
4d9f07b4
LP
3902
3903 if (setup_volatile_state(arg_directory) < 0)
a2da110b 3904 _exit(EXIT_FAILURE);
4d9f07b4
LP
3905
3906 r = base_filesystem_create(arg_directory);
3907 if (r < 0)
a2da110b 3908 _exit(EXIT_FAILURE);
4d9f07b4 3909
d6797c92 3910 if (arg_read_only) {
ec16945e
LP
3911 r = bind_remount_recursive(arg_directory, true);
3912 if (r < 0) {
3913 log_error_errno(r, "Failed to make tree read-only: %m");
a2da110b 3914 _exit(EXIT_FAILURE);
d87be9b0 3915 }
d6797c92 3916 }
2547bb41 3917
d87be9b0 3918 if (mount_all(arg_directory) < 0)
a2da110b 3919 _exit(EXIT_FAILURE);
57fb9fb5 3920
d87be9b0 3921 if (copy_devnodes(arg_directory) < 0)
a2da110b 3922 _exit(EXIT_FAILURE);
a258bf26 3923
f2d88580 3924 if (setup_ptmx(arg_directory) < 0)
a2da110b 3925 _exit(EXIT_FAILURE);
f2d88580 3926
d87be9b0 3927 dev_setup(arg_directory);
88213476 3928
785890ac
LP
3929 if (setup_propagate(arg_directory) < 0)
3930 _exit(EXIT_FAILURE);
3931
28650077 3932 if (setup_seccomp() < 0)
a2da110b 3933 _exit(EXIT_FAILURE);
24fb1112 3934
d87be9b0 3935 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 3936 _exit(EXIT_FAILURE);
88213476 3937
d87be9b0 3938 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 3939 _exit(EXIT_FAILURE);
03e334a1 3940 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 3941
6d0b55c2
LP
3942 if (send_rtnl(rtnl_socket_pair[1]) < 0)
3943 _exit(EXIT_FAILURE);
3944 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3945
b12afc8c
LP
3946 /* Tell the parent that we are ready, and that
3947 * it can cgroupify us to that we lack access
3948 * to certain devices and resources. */
3949 (void) barrier_place(&barrier);
3950
d87be9b0 3951 if (setup_boot_id(arg_directory) < 0)
a2da110b 3952 _exit(EXIT_FAILURE);
a41fe3a2 3953
d87be9b0 3954 if (setup_timezone(arg_directory) < 0)
a2da110b 3955 _exit(EXIT_FAILURE);
88213476 3956
d87be9b0 3957 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 3958 _exit(EXIT_FAILURE);
687d0825 3959
d87be9b0 3960 if (setup_journal(arg_directory) < 0)
a2da110b 3961 _exit(EXIT_FAILURE);
687d0825 3962
d6797c92 3963 if (mount_binds(arg_directory, arg_bind, false) < 0)
a2da110b 3964 _exit(EXIT_FAILURE);
17fe0523 3965
d6797c92 3966 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
a2da110b 3967 _exit(EXIT_FAILURE);
17fe0523 3968
06c17c39 3969 if (mount_tmpfs(arg_directory) < 0)
a2da110b 3970 _exit(EXIT_FAILURE);
06c17c39 3971
b12afc8c
LP
3972 /* Wait until we are cgroup-ified, so that we
3973 * can mount the right cgroup path writable */
3974 (void) barrier_sync_next(&barrier);
3975
3976 if (mount_cgroup(arg_directory) < 0)
3977 _exit(EXIT_FAILURE);
d96c1ecf 3978
d87be9b0 3979 if (chdir(arg_directory) < 0) {
56f64d95 3980 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 3981 _exit(EXIT_FAILURE);
687d0825
MV
3982 }
3983
d87be9b0 3984 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 3985 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 3986 _exit(EXIT_FAILURE);
687d0825
MV
3987 }
3988
d87be9b0 3989 if (chroot(".") < 0) {
56f64d95 3990 log_error_errno(errno, "chroot() failed: %m");
a2da110b 3991 _exit(EXIT_FAILURE);
687d0825
MV
3992 }
3993
d87be9b0 3994 if (chdir("/") < 0) {
56f64d95 3995 log_error_errno(errno, "chdir() failed: %m");
a2da110b 3996 _exit(EXIT_FAILURE);
687d0825
MV
3997 }
3998
d87be9b0
LP
3999 umask(0022);
4000
eb91eb18
LP
4001 if (arg_private_network)
4002 loopback_setup();
d87be9b0
LP
4003
4004 if (drop_capabilities() < 0) {
56f64d95 4005 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 4006 _exit(EXIT_FAILURE);
687d0825 4007 }
687d0825 4008
0cb9fbcd
LP
4009 r = change_uid_gid(&home);
4010 if (r < 0)
a2da110b 4011 _exit(EXIT_FAILURE);
d87be9b0 4012
842f3b0f
LP
4013 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4014 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4015 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 4016 log_oom();
a2da110b 4017 _exit(EXIT_FAILURE);
144f0fc0 4018 }
687d0825 4019
9444b1f2 4020 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
4021 char as_uuid[37];
4022
4023 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 4024 log_oom();
a2da110b 4025 _exit(EXIT_FAILURE);
842f3b0f
LP
4026 }
4027 }
4028
4029 if (fdset_size(fds) > 0) {
ec16945e
LP
4030 r = fdset_cloexec(fds, false);
4031 if (r < 0) {
4032 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 4033 _exit(EXIT_FAILURE);
842f3b0f
LP
4034 }
4035
4036 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 4037 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 4038 log_oom();
a2da110b 4039 _exit(EXIT_FAILURE);
d87be9b0
LP
4040 }
4041 }
4042
4043 setup_hostname();
4044
6afc95b7
LP
4045 if (arg_personality != 0xffffffffLU) {
4046 if (personality(arg_personality) < 0) {
56f64d95 4047 log_error_errno(errno, "personality() failed: %m");
a2da110b 4048 _exit(EXIT_FAILURE);
6afc95b7 4049 }
1b9e5b12
LP
4050 } else if (secondary) {
4051 if (personality(PER_LINUX32) < 0) {
56f64d95 4052 log_error_errno(errno, "personality() failed: %m");
a2da110b 4053 _exit(EXIT_FAILURE);
1b9e5b12 4054 }
6afc95b7
LP
4055 }
4056
d96c1ecf
LP
4057#ifdef HAVE_SELINUX
4058 if (arg_selinux_context)
0cb9fbcd 4059 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
56f64d95 4060 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
a2da110b 4061 _exit(EXIT_FAILURE);
0cb9fbcd 4062 }
d96c1ecf 4063#endif
354bfd2b 4064
f4889f65
LP
4065 if (!strv_isempty(arg_setenv)) {
4066 char **n;
4067
4068 n = strv_env_merge(2, envp, arg_setenv);
4069 if (!n) {
4070 log_oom();
a2da110b 4071 _exit(EXIT_FAILURE);
f4889f65
LP
4072 }
4073
4074 env_use = n;
4075 } else
4076 env_use = (char**) envp;
4077
d96c1ecf 4078 /* Wait until the parent is ready with the setup, too... */
a2da110b
DH
4079 if (!barrier_place_and_sync(&barrier))
4080 _exit(EXIT_FAILURE);
d96c1ecf 4081
d87be9b0
LP
4082 if (arg_boot) {
4083 char **a;
4084 size_t l;
88213476 4085
d87be9b0 4086 /* Automatically search for the init system */
0f0dbc46 4087
d87be9b0
LP
4088 l = 1 + argc - optind;
4089 a = newa(char*, l + 1);
4090 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 4091
d87be9b0 4092 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 4093 execve(a[0], a, env_use);
0f0dbc46 4094
d87be9b0 4095 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 4096 execve(a[0], a, env_use);
0f0dbc46 4097
d87be9b0 4098 a[0] = (char*) "/sbin/init";
f4889f65 4099 execve(a[0], a, env_use);
d87be9b0 4100 } else if (argc > optind)
f4889f65 4101 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
4102 else {
4103 chdir(home ? home : "/root");
f4889f65 4104 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 4105 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
4106 }
4107
56f64d95 4108 log_error_errno(errno, "execv() failed: %m");
d87be9b0 4109 _exit(EXIT_FAILURE);
da5b3bad 4110 }
88213476 4111
a2da110b 4112 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
4113 fdset_free(fds);
4114 fds = NULL;
4115
6d0b55c2
LP
4116 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4117 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4118
b12afc8c
LP
4119 /* Wait for the most basic Child-setup to be done,
4120 * before we add hardware to it, and place it in a
4121 * cgroup. */
4122 if (barrier_sync_next(&barrier)) {
5aa4bb6b 4123 int ifi = 0;
354bfd2b 4124
840295fc
LP
4125 r = move_network_interfaces(pid);
4126 if (r < 0)
4127 goto finish;
aa28aefe 4128
5aa4bb6b 4129 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
4130 if (r < 0)
4131 goto finish;
ab046dde 4132
5aa4bb6b 4133 r = setup_bridge(veth_name, &ifi);
840295fc
LP
4134 if (r < 0)
4135 goto finish;
ab046dde 4136
840295fc
LP
4137 r = setup_macvlan(pid);
4138 if (r < 0)
4139 goto finish;
c74e630d 4140
4bbfe7ad
TG
4141 r = setup_ipvlan(pid);
4142 if (r < 0)
4143 goto finish;
4144
5aa4bb6b
LP
4145 r = register_machine(pid, ifi);
4146 if (r < 0)
4147 goto finish;
4148
840295fc
LP
4149 /* Block SIGCHLD here, before notifying child.
4150 * process_pty() will handle it with the other signals. */
4151 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4152 if (r < 0)
4153 goto finish;
e866af3a 4154
840295fc
LP
4155 /* Reset signal to default */
4156 r = default_signals(SIGCHLD, -1);
4157 if (r < 0)
4158 goto finish;
e866af3a 4159
840295fc
LP
4160 /* Notify the child that the parent is ready with all
4161 * its setup, and that the child can now hand over
4162 * control to the code to run inside the container. */
814a3fdf
LP
4163 (void) barrier_place(&barrier);
4164
b12afc8c 4165 /* And wait until the child is completely ready now. */
6d0b55c2
LP
4166 if (barrier_place_and_sync(&barrier)) {
4167 _cleanup_event_unref_ sd_event *event = NULL;
4168 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4169 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4170 char last_char = 0;
b12afc8c 4171
733d15ac
LP
4172 sd_notifyf(false,
4173 "READY=1\n"
4174 "STATUS=Container running.\n"
4175 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4176
6d0b55c2
LP
4177 r = sd_event_new(&event);
4178 if (r < 0) {
4179 log_error_errno(r, "Failed to get default event source: %m");
4180 goto finish;
4181 }
88213476 4182
6d0b55c2
LP
4183 if (arg_boot) {
4184 /* Try to kill the init system on SIGINT or SIGTERM */
4185 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4186 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4187 } else {
4188 /* Immediately exit */
4189 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4190 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4191 }
023fb90b 4192
6d0b55c2
LP
4193 /* simply exit on sigchld */
4194 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4195
6d0b55c2
LP
4196 if (arg_expose_ports) {
4197 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4198 if (r < 0)
4199 goto finish;
023fb90b 4200
6d0b55c2
LP
4201 (void) expose_ports(rtnl, &exposed);
4202 }
023fb90b 4203
6d0b55c2 4204 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4205
6d0b55c2
LP
4206 r = pty_forward_new(event, master, true, &forward);
4207 if (r < 0) {
4208 log_error_errno(r, "Failed to create PTY forwarder: %m");
4209 goto finish;
4210 }
023fb90b 4211
6d0b55c2
LP
4212 r = sd_event_loop(event);
4213 if (r < 0) {
4214 log_error_errno(r, "Failed to run event loop: %m");
4215 goto finish;
4216 }
4217
4218 pty_forward_get_last_char(forward, &last_char);
4219
4220 forward = pty_forward_free(forward);
4221
4222 if (!arg_quiet && last_char != '\n')
4223 putc('\n', stdout);
04d39279 4224
6d0b55c2
LP
4225 /* Kill if it is not dead yet anyway */
4226 terminate_machine(pid);
4227 }
840295fc 4228 }
1f0cd86b 4229
840295fc 4230 /* Normally redundant, but better safe than sorry */
04d39279 4231 kill(pid, SIGKILL);
a258bf26 4232
113cea80 4233 r = wait_for_container(pid, &container_status);
04d39279
LP
4234 pid = 0;
4235
ec16945e 4236 if (r < 0)
ce9f1527
LP
4237 /* We failed to wait for the container, or the
4238 * container exited abnormally */
ec16945e
LP
4239 goto finish;
4240 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4241 /* The container exited with a non-zero
4242 * status, or with zero status and no reboot
4243 * was requested. */
ec16945e 4244 ret = r;
d87be9b0 4245 break;
ec16945e 4246 }
88213476 4247
113cea80 4248 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4249
4250 if (arg_keep_unit) {
4251 /* Special handling if we are running as a
4252 * service: instead of simply restarting the
4253 * machine we want to restart the entire
4254 * service, so let's inform systemd about this
4255 * with the special exit code 133. The service
4256 * file uses RestartForceExitStatus=133 so
4257 * that this results in a full nspawn
4258 * restart. This is necessary since we might
4259 * have cgroup parameters set we want to have
4260 * flushed out. */
ec16945e
LP
4261 ret = 133;
4262 r = 0;
ce38dbc8
LP
4263 break;
4264 }
6d0b55c2
LP
4265
4266 flush_ports(&exposed);
d87be9b0 4267 }
88213476
LP
4268
4269finish:
af4ec430
LP
4270 sd_notify(false,
4271 "STOPPING=1\n"
4272 "STATUS=Terminating...");
4273
1b9e5b12
LP
4274 loop_remove(loop_nr, &image_fd);
4275
9444b1f2
LP
4276 if (pid > 0)
4277 kill(pid, SIGKILL);
88213476 4278
ec16945e
LP
4279 if (remove_subvol && arg_directory) {
4280 int k;
4281
4282 k = btrfs_subvol_remove(arg_directory);
4283 if (k < 0)
4284 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4285 }
4286
785890ac
LP
4287 if (arg_machine) {
4288 const char *p;
4289
8937422f 4290 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
4291 (void) rm_rf(p, false, true, false);
4292 }
4293
04d391da 4294 free(arg_directory);
ec16945e
LP
4295 free(arg_template);
4296 free(arg_image);
7027ff61 4297 free(arg_machine);
c74e630d
LP
4298 free(arg_user);
4299 strv_free(arg_setenv);
4300 strv_free(arg_network_interfaces);
4301 strv_free(arg_network_macvlan);
4bbfe7ad 4302 strv_free(arg_network_ipvlan);
c74e630d
LP
4303 strv_free(arg_bind);
4304 strv_free(arg_bind_ro);
06c17c39 4305 strv_free(arg_tmpfs);
88213476 4306
6d0b55c2
LP
4307 flush_ports(&exposed);
4308
4309 while (arg_expose_ports) {
4310 ExposePort *p = arg_expose_ports;
4311 LIST_REMOVE(ports, arg_expose_ports, p);
4312 free(p);
4313 }
4314
ec16945e 4315 return r < 0 ? EXIT_FAILURE : ret;
88213476 4316}