]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
util: split all hostname related calls into hostname-util.c
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
1f0cd86b
LP
55#include "sd-daemon.h"
56#include "sd-bus.h"
57#include "sd-id128.h"
aa28aefe 58#include "sd-rtnl.h"
958b66ea 59#include "random-util.h"
88213476
LP
60#include "log.h"
61#include "util.h"
49e942b2 62#include "mkdir.h"
c6878637 63#include "rm-rf.h"
6b2d0e85 64#include "macro.h"
94d82985 65#include "missing.h"
04d391da 66#include "cgroup-util.h"
a258bf26 67#include "strv.h"
9eb977db 68#include "path-util.h"
a41fe3a2 69#include "loopback-setup.h"
4fc9982c 70#include "dev-setup.h"
842f3b0f 71#include "fdset.h"
acbeb427 72#include "build.h"
a5c32cff 73#include "fileio.h"
40ca29a1 74#include "bus-util.h"
1f0cd86b 75#include "bus-error.h"
4ba93280 76#include "ptyfwd.h"
f4889f65 77#include "env-util.h"
aa28aefe 78#include "rtnl-util.h"
7e227024 79#include "udev-util.h"
1b9e5b12
LP
80#include "blkid-util.h"
81#include "gpt.h"
01dde061 82#include "siphash24.h"
849958d1 83#include "copy.h"
3577de7a 84#include "base-filesystem.h"
a2da110b 85#include "barrier.h"
023fb90b 86#include "event-util.h"
f01ae826 87#include "capability.h"
2822da4f 88#include "cap-list.h"
ec16945e 89#include "btrfs-util.h"
1b9cebf6 90#include "machine-image.h"
6d0b55c2
LP
91#include "list.h"
92#include "in-addr-util.h"
93#include "fw-util.h"
94#include "local-addresses.h"
6482f626 95#include "formats-util.h"
0b452006 96#include "process-util.h"
288a74cc 97#include "terminal-util.h"
958b66ea 98#include "hostname-util.h"
f2d88580 99
e9642be2
LP
100#ifdef HAVE_SECCOMP
101#include "seccomp-util.h"
102#endif
103
6d0b55c2
LP
104typedef struct ExposePort {
105 int protocol;
106 uint16_t host_port;
107 uint16_t container_port;
108 LIST_FIELDS(struct ExposePort, ports);
109} ExposePort;
110
113cea80
DH
/* How a container run ended. NOTE(review): the meaning is inferred from the
 * enumerator names only; the code consuming these values is not visible
 * here. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
115
57fb9fb5
LP
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants accepted on the command line map to LINK_GUEST/LINK_HOST with
 * arg_link_journal_try set. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest, also -j */
} LinkJournal;
88213476 122
4d9f07b4
LP
/* Mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,    /* default, or --volatile=<false> */
        VOLATILE_YES,   /* --volatile without argument, or --volatile=<true> */
        VOLATILE_STATE, /* --volatile=state */
} Volatile;
128
5a8af538
LP
/* Kind of user-requested mount. The numeric order matters:
 * custom_mount_compare() uses it as tie-breaker for mounts that share the
 * same destination. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND,      /* --bind= / --bind-ro= */
        CUSTOM_MOUNT_TMPFS,     /* --tmpfs= */
        CUSTOM_MOUNT_OVERLAY,   /* --overlay= / --overlay-ro= */
} CustomMountType;
134
135typedef struct CustomMount {
136 CustomMountType type;
137 bool read_only;
138 char *source; /* for overlayfs this is the upper directory */
139 char *destination;
140 char *options;
141 char *work_dir;
142 char **lower;
143} CustomMount;
144
88213476 145static char *arg_directory = NULL;
ec16945e 146static char *arg_template = NULL;
687d0825 147static char *arg_user = NULL;
9444b1f2 148static sd_id128_t arg_uuid = {};
7027ff61 149static char *arg_machine = NULL;
c74e630d
LP
150static const char *arg_selinux_context = NULL;
151static const char *arg_selinux_apifs_context = NULL;
9444b1f2 152static const char *arg_slice = NULL;
ff01d048 153static bool arg_private_network = false;
bc2f673e 154static bool arg_read_only = false;
0f0dbc46 155static bool arg_boot = false;
ec16945e 156static bool arg_ephemeral = false;
57fb9fb5 157static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 158static bool arg_link_journal_try = false;
5076f0cc
LP
159static uint64_t arg_retain =
160 (1ULL << CAP_CHOWN) |
161 (1ULL << CAP_DAC_OVERRIDE) |
162 (1ULL << CAP_DAC_READ_SEARCH) |
163 (1ULL << CAP_FOWNER) |
164 (1ULL << CAP_FSETID) |
165 (1ULL << CAP_IPC_OWNER) |
166 (1ULL << CAP_KILL) |
167 (1ULL << CAP_LEASE) |
168 (1ULL << CAP_LINUX_IMMUTABLE) |
169 (1ULL << CAP_NET_BIND_SERVICE) |
170 (1ULL << CAP_NET_BROADCAST) |
171 (1ULL << CAP_NET_RAW) |
172 (1ULL << CAP_SETGID) |
173 (1ULL << CAP_SETFCAP) |
174 (1ULL << CAP_SETPCAP) |
175 (1ULL << CAP_SETUID) |
176 (1ULL << CAP_SYS_ADMIN) |
177 (1ULL << CAP_SYS_CHROOT) |
178 (1ULL << CAP_SYS_NICE) |
179 (1ULL << CAP_SYS_PTRACE) |
180 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 181 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
182 (1ULL << CAP_SYS_BOOT) |
183 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
184 (1ULL << CAP_AUDIT_CONTROL) |
185 (1ULL << CAP_MKNOD);
5a8af538
LP
186static CustomMount *arg_custom_mounts = NULL;
187static unsigned arg_n_custom_mounts = 0;
f4889f65 188static char **arg_setenv = NULL;
284c0b91 189static bool arg_quiet = false;
8a96d94e 190static bool arg_share_system = false;
eb91eb18 191static bool arg_register = true;
89f7c846 192static bool arg_keep_unit = false;
aa28aefe 193static char **arg_network_interfaces = NULL;
c74e630d 194static char **arg_network_macvlan = NULL;
4bbfe7ad 195static char **arg_network_ipvlan = NULL;
69c79d3c 196static bool arg_network_veth = false;
c74e630d 197static const char *arg_network_bridge = NULL;
6afc95b7 198static unsigned long arg_personality = 0xffffffffLU;
ec16945e 199static char *arg_image = NULL;
4d9f07b4 200static Volatile arg_volatile = VOLATILE_NO;
6d0b55c2 201static ExposePort *arg_expose_ports = NULL;
f36933fe 202static char **arg_property = NULL;
6dac160c
LP
203static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
204static bool arg_userns = false;
c6c8f6e2 205static int arg_kill_signal = 0;
88213476 206
601185b4 207static void help(void) {
88213476
LP
208 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
209 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
210 " -h --help Show this help\n"
211 " --version Print version string\n"
69c79d3c 212 " -q --quiet Do not show status information\n"
1b9e5b12 213 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
214 " --template=PATH Initialize root directory from template directory,\n"
215 " if missing\n"
216 " -x --ephemeral Run container with snapshot of root directory, and\n"
217 " remove it after exit\n"
218 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
219 " -b --boot Boot up full system (i.e. invoke init)\n"
220 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 221 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 222 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 223 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 224 " --property=NAME=VALUE Set scope unit property\n"
69c79d3c
LP
225 " --private-network Disable network in container\n"
226 " --network-interface=INTERFACE\n"
227 " Assign an existing network interface to the\n"
228 " container\n"
c74e630d
LP
229 " --network-macvlan=INTERFACE\n"
230 " Create a macvlan network interface based on an\n"
231 " existing network interface to the container\n"
4bbfe7ad
TG
232 " --network-ipvlan=INTERFACE\n"
233 " Create a ipvlan network interface based on an\n"
234 " existing network interface to the container\n"
0dfaa006 235 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 236 " and container\n"
ab046dde 237 " --network-bridge=INTERFACE\n"
32457153 238 " Add a virtual ethernet connection between host\n"
ab046dde
TG
239 " and container and add it to an existing bridge on\n"
240 " the host\n"
6dac160c
LP
241 " --private-users[=UIDBASE[:NUIDS]]\n"
242 " Run within user namespace\n"
6d0b55c2 243 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 244 " Expose a container IP port on the host\n"
82adf6af
LP
245 " -Z --selinux-context=SECLABEL\n"
246 " Set the SELinux security context to be used by\n"
247 " processes in the container\n"
248 " -L --selinux-apifs-context=SECLABEL\n"
249 " Set the SELinux security context to be used by\n"
250 " API/tmpfs file systems in the container\n"
a8828ed9
DW
251 " --capability=CAP In addition to the default, retain specified\n"
252 " capability\n"
253 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 254 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
255 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
256 " try-guest, try-host\n"
257 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 258 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
259 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
260 " the container\n"
261 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 262 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
263 " --overlay=PATH[:PATH...]:PATH\n"
264 " Create an overlay mount from the host to \n"
265 " the container\n"
266 " --overlay-ro=PATH[:PATH...]:PATH\n"
267 " Similar, but creates a read-only overlay mount\n"
284c0b91 268 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 269 " --share-system Share system namespaces with host\n"
eb91eb18 270 " --register=BOOLEAN Register container as machine\n"
89f7c846 271 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 272 " the service unit nspawn is running in\n"
6d0b55c2
LP
273 " --volatile[=MODE] Run the system in volatile mode\n"
274 , program_invocation_short_name);
88213476
LP
275}
276
5a8af538
LP
277static CustomMount* custom_mount_add(CustomMountType t) {
278 CustomMount *c, *ret;
279
280 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
281 if (!c)
282 return NULL;
283
284 arg_custom_mounts = c;
285 ret = arg_custom_mounts + arg_n_custom_mounts;
286 arg_n_custom_mounts++;
287
288 *ret = (CustomMount) { .type = t };
289
290 return ret;
291}
292
293static void custom_mount_free_all(void) {
294 unsigned i;
295
296 for (i = 0; i < arg_n_custom_mounts; i++) {
297 CustomMount *m = &arg_custom_mounts[i];
298
299 free(m->source);
300 free(m->destination);
301 free(m->options);
302
303 if (m->work_dir) {
304 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
305 free(m->work_dir);
306 }
307
308 strv_free(m->lower);
309 }
310
311 free(arg_custom_mounts);
312 arg_custom_mounts = NULL;
313 arg_n_custom_mounts = 0;
314}
315
316static int custom_mount_compare(const void *a, const void *b) {
317 const CustomMount *x = a, *y = b;
318 int r;
319
320 r = path_compare(x->destination, y->destination);
321 if (r != 0)
322 return r;
323
324 if (x->type < y->type)
325 return -1;
326 if (x->type > y->type)
327 return 1;
328
329 return 0;
330}
331
332static int custom_mounts_prepare(void) {
333 unsigned i;
334 int r;
335
336 /* Ensure the mounts are applied prefix first. */
337 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
338
339 /* Allocate working directories for the overlay file systems that need it */
340 for (i = 0; i < arg_n_custom_mounts; i++) {
341 CustomMount *m = &arg_custom_mounts[i];
342
343 if (m->type != CUSTOM_MOUNT_OVERLAY)
344 continue;
345
346 if (m->work_dir)
347 continue;
348
349 if (m->read_only)
350 continue;
351
352 r = tempfn_random(m->source, &m->work_dir);
353 if (r < 0)
354 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
355 }
356
357 return 0;
358}
359
ec16945e
LP
/* Replaces *b with a cleaned-up, absolute version of path.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), it is merely made absolute
 * relative to the current working directory instead. The previous value of
 * *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason
 * other than ENOENT, or -ENOMEM if the fallback allocation fails. On failure
 * *b is left unchanged. */
static int set_sanitized_path(char **b, const char *path) {
        char *p;

        assert(b);
        assert(path);

        p = canonicalize_file_name(path);
        if (!p) {
                if (errno != ENOENT)
                        return -errno;

                p = path_make_absolute_cwd(path);
                if (!p)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(p);
        return 0;
}
380
88213476
LP
381static int parse_argv(int argc, char *argv[]) {
382
a41fe3a2 383 enum {
acbeb427
ZJS
384 ARG_VERSION = 0x100,
385 ARG_PRIVATE_NETWORK,
bc2f673e 386 ARG_UUID,
5076f0cc 387 ARG_READ_ONLY,
57fb9fb5 388 ARG_CAPABILITY,
420c7379 389 ARG_DROP_CAPABILITY,
17fe0523
LP
390 ARG_LINK_JOURNAL,
391 ARG_BIND,
f4889f65 392 ARG_BIND_RO,
06c17c39 393 ARG_TMPFS,
5a8af538
LP
394 ARG_OVERLAY,
395 ARG_OVERLAY_RO,
f4889f65 396 ARG_SETENV,
eb91eb18 397 ARG_SHARE_SYSTEM,
89f7c846 398 ARG_REGISTER,
aa28aefe 399 ARG_KEEP_UNIT,
69c79d3c 400 ARG_NETWORK_INTERFACE,
c74e630d 401 ARG_NETWORK_MACVLAN,
4bbfe7ad 402 ARG_NETWORK_IPVLAN,
ab046dde 403 ARG_NETWORK_BRIDGE,
6afc95b7 404 ARG_PERSONALITY,
4d9f07b4 405 ARG_VOLATILE,
ec16945e 406 ARG_TEMPLATE,
f36933fe 407 ARG_PROPERTY,
6dac160c 408 ARG_PRIVATE_USERS,
c6c8f6e2 409 ARG_KILL_SIGNAL,
a41fe3a2
LP
410 };
411
88213476 412 static const struct option options[] = {
aa28aefe
LP
413 { "help", no_argument, NULL, 'h' },
414 { "version", no_argument, NULL, ARG_VERSION },
415 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
416 { "template", required_argument, NULL, ARG_TEMPLATE },
417 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
418 { "user", required_argument, NULL, 'u' },
419 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
420 { "boot", no_argument, NULL, 'b' },
421 { "uuid", required_argument, NULL, ARG_UUID },
422 { "read-only", no_argument, NULL, ARG_READ_ONLY },
423 { "capability", required_argument, NULL, ARG_CAPABILITY },
424 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
425 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
426 { "bind", required_argument, NULL, ARG_BIND },
427 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 428 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
429 { "overlay", required_argument, NULL, ARG_OVERLAY },
430 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
431 { "machine", required_argument, NULL, 'M' },
432 { "slice", required_argument, NULL, 'S' },
433 { "setenv", required_argument, NULL, ARG_SETENV },
434 { "selinux-context", required_argument, NULL, 'Z' },
435 { "selinux-apifs-context", required_argument, NULL, 'L' },
436 { "quiet", no_argument, NULL, 'q' },
437 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
438 { "register", required_argument, NULL, ARG_REGISTER },
439 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
440 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 441 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 442 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 443 { "network-veth", no_argument, NULL, 'n' },
ab046dde 444 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 445 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 446 { "image", required_argument, NULL, 'i' },
4d9f07b4 447 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 448 { "port", required_argument, NULL, 'p' },
f36933fe 449 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 450 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 451 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
eb9da376 452 {}
88213476
LP
453 };
454
9444b1f2 455 int c, r;
a42c8b54 456 uint64_t plus = 0, minus = 0;
88213476
LP
457
458 assert(argc >= 0);
459 assert(argv);
460
0dfaa006 461 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
462
463 switch (c) {
464
465 case 'h':
601185b4
ZJS
466 help();
467 return 0;
88213476 468
acbeb427
ZJS
469 case ARG_VERSION:
470 puts(PACKAGE_STRING);
471 puts(SYSTEMD_FEATURES);
472 return 0;
473
88213476 474 case 'D':
ec16945e
LP
475 r = set_sanitized_path(&arg_directory, optarg);
476 if (r < 0)
477 return log_error_errno(r, "Invalid root directory: %m");
478
479 break;
480
481 case ARG_TEMPLATE:
482 r = set_sanitized_path(&arg_template, optarg);
483 if (r < 0)
484 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
485
486 break;
487
1b9e5b12 488 case 'i':
ec16945e
LP
489 r = set_sanitized_path(&arg_image, optarg);
490 if (r < 0)
491 return log_error_errno(r, "Invalid image path: %m");
492
493 break;
494
495 case 'x':
496 arg_ephemeral = true;
1b9e5b12
LP
497 break;
498
687d0825
MV
499 case 'u':
500 free(arg_user);
7027ff61
LP
501 arg_user = strdup(optarg);
502 if (!arg_user)
503 return log_oom();
687d0825
MV
504
505 break;
506
ab046dde 507 case ARG_NETWORK_BRIDGE:
c74e630d 508 arg_network_bridge = optarg;
ab046dde
TG
509
510 /* fall through */
511
0dfaa006 512 case 'n':
69c79d3c
LP
513 arg_network_veth = true;
514 arg_private_network = true;
515 break;
516
aa28aefe 517 case ARG_NETWORK_INTERFACE:
c74e630d
LP
518 if (strv_extend(&arg_network_interfaces, optarg) < 0)
519 return log_oom();
520
521 arg_private_network = true;
522 break;
523
524 case ARG_NETWORK_MACVLAN:
525 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
526 return log_oom();
527
4bbfe7ad
TG
528 arg_private_network = true;
529 break;
530
531 case ARG_NETWORK_IPVLAN:
532 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
533 return log_oom();
534
aa28aefe
LP
535 /* fall through */
536
ff01d048
LP
537 case ARG_PRIVATE_NETWORK:
538 arg_private_network = true;
a41fe3a2
LP
539 break;
540
0f0dbc46
LP
541 case 'b':
542 arg_boot = true;
543 break;
544
144f0fc0 545 case ARG_UUID:
9444b1f2
LP
546 r = sd_id128_from_string(optarg, &arg_uuid);
547 if (r < 0) {
aa96c6cb 548 log_error("Invalid UUID: %s", optarg);
9444b1f2 549 return r;
aa96c6cb 550 }
9444b1f2 551 break;
aa96c6cb 552
9444b1f2 553 case 'S':
c74e630d 554 arg_slice = optarg;
144f0fc0
LP
555 break;
556
7027ff61 557 case 'M':
eb91eb18
LP
558 if (isempty(optarg)) {
559 free(arg_machine);
560 arg_machine = NULL;
561 } else {
0c3c4284 562 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
563 log_error("Invalid machine name: %s", optarg);
564 return -EINVAL;
565 }
7027ff61 566
0c3c4284
LP
567 r = free_and_strdup(&arg_machine, optarg);
568 if (r < 0)
eb91eb18
LP
569 return log_oom();
570
571 break;
572 }
7027ff61 573
82adf6af
LP
574 case 'Z':
575 arg_selinux_context = optarg;
a8828ed9
DW
576 break;
577
82adf6af
LP
578 case 'L':
579 arg_selinux_apifs_context = optarg;
a8828ed9
DW
580 break;
581
bc2f673e
LP
582 case ARG_READ_ONLY:
583 arg_read_only = true;
584 break;
585
420c7379
LP
586 case ARG_CAPABILITY:
587 case ARG_DROP_CAPABILITY: {
a2a5291b 588 const char *state, *word;
5076f0cc
LP
589 size_t length;
590
591 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 592 _cleanup_free_ char *t;
5076f0cc
LP
593
594 t = strndup(word, length);
0d0f0c50
SL
595 if (!t)
596 return log_oom();
5076f0cc 597
39ed67d1
LP
598 if (streq(t, "all")) {
599 if (c == ARG_CAPABILITY)
a42c8b54 600 plus = (uint64_t) -1;
39ed67d1 601 else
a42c8b54 602 minus = (uint64_t) -1;
39ed67d1 603 } else {
2822da4f
LP
604 int cap;
605
606 cap = capability_from_name(t);
607 if (cap < 0) {
39ed67d1
LP
608 log_error("Failed to parse capability %s.", t);
609 return -EINVAL;
610 }
611
612 if (c == ARG_CAPABILITY)
a42c8b54 613 plus |= 1ULL << (uint64_t) cap;
39ed67d1 614 else
a42c8b54 615 minus |= 1ULL << (uint64_t) cap;
5076f0cc 616 }
5076f0cc
LP
617 }
618
619 break;
620 }
621
57fb9fb5
LP
622 case 'j':
623 arg_link_journal = LINK_GUEST;
574edc90 624 arg_link_journal_try = true;
57fb9fb5
LP
625 break;
626
627 case ARG_LINK_JOURNAL:
53e438e3 628 if (streq(optarg, "auto")) {
57fb9fb5 629 arg_link_journal = LINK_AUTO;
53e438e3
LP
630 arg_link_journal_try = false;
631 } else if (streq(optarg, "no")) {
57fb9fb5 632 arg_link_journal = LINK_NO;
53e438e3
LP
633 arg_link_journal_try = false;
634 } else if (streq(optarg, "guest")) {
57fb9fb5 635 arg_link_journal = LINK_GUEST;
53e438e3
LP
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "host")) {
57fb9fb5 638 arg_link_journal = LINK_HOST;
53e438e3
LP
639 arg_link_journal_try = false;
640 } else if (streq(optarg, "try-guest")) {
574edc90
MP
641 arg_link_journal = LINK_GUEST;
642 arg_link_journal_try = true;
643 } else if (streq(optarg, "try-host")) {
644 arg_link_journal = LINK_HOST;
645 arg_link_journal_try = true;
646 } else {
57fb9fb5
LP
647 log_error("Failed to parse link journal mode %s", optarg);
648 return -EINVAL;
649 }
650
651 break;
652
17fe0523
LP
653 case ARG_BIND:
654 case ARG_BIND_RO: {
5a8af538
LP
655 _cleanup_free_ char *source = NULL, *destination = NULL;
656 CustomMount *m;
17fe0523 657 char *e;
17fe0523
LP
658
659 e = strchr(optarg, ':');
660 if (e) {
5a8af538
LP
661 source = strndup(optarg, e - optarg);
662 destination = strdup(e + 1);
17fe0523 663 } else {
5a8af538
LP
664 source = strdup(optarg);
665 destination = strdup(optarg);
17fe0523
LP
666 }
667
5a8af538 668 if (!source || !destination)
17fe0523
LP
669 return log_oom();
670
5a8af538 671 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
17fe0523
LP
672 log_error("Invalid bind mount specification: %s", optarg);
673 return -EINVAL;
674 }
675
5a8af538
LP
676 m = custom_mount_add(CUSTOM_MOUNT_BIND);
677 if (!m)
b3451bed 678 return log_oom();
17fe0523 679
5a8af538
LP
680 m->source = source;
681 m->destination = destination;
682 m->read_only = c == ARG_BIND_RO;
683
684 source = destination = NULL;
17fe0523
LP
685
686 break;
687 }
688
06c17c39 689 case ARG_TMPFS: {
5a8af538
LP
690 _cleanup_free_ char *path = NULL, *opts = NULL;
691 CustomMount *m;
06c17c39
LP
692 char *e;
693
694 e = strchr(optarg, ':');
695 if (e) {
5a8af538
LP
696 path = strndup(optarg, e - optarg);
697 opts = strdup(e + 1);
06c17c39 698 } else {
5a8af538
LP
699 path = strdup(optarg);
700 opts = strdup("mode=0755");
06c17c39
LP
701 }
702
5a8af538 703 if (!path || !opts)
06c17c39
LP
704 return log_oom();
705
5a8af538 706 if (!path_is_absolute(path)) {
06c17c39
LP
707 log_error("Invalid tmpfs specification: %s", optarg);
708 return -EINVAL;
709 }
710
5a8af538
LP
711 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
712 if (!m)
06c17c39
LP
713 return log_oom();
714
5a8af538
LP
715 m->destination = path;
716 m->options = opts;
06c17c39 717
5a8af538
LP
718 path = opts = NULL;
719
720 break;
721 }
722
723 case ARG_OVERLAY:
724 case ARG_OVERLAY_RO: {
725 _cleanup_free_ char *upper = NULL, *destination = NULL;
726 _cleanup_strv_free_ char **lower = NULL;
727 CustomMount *m;
728 unsigned n = 0;
729 char **i;
730
731 lower = strv_split(optarg, ":");
732 if (!lower)
06c17c39
LP
733 return log_oom();
734
5a8af538
LP
735 STRV_FOREACH(i, lower) {
736 if (!path_is_absolute(*i)) {
737 log_error("Overlay path %s is not absolute.", *i);
738 return -EINVAL;
739 }
740
741 n++;
742 }
743
744 if (n < 2) {
745 log_error("--overlay= needs at least two colon-separated directories specified.");
746 return -EINVAL;
747 }
748
749 if (n == 2) {
750 /* If two parameters are specified,
751 * the first one is the lower, the
752 * second one the upper directory. And
753 * we'll also define the the
754 * destination mount point the same as
755 * the upper. */
756 upper = lower[1];
757 lower[1] = NULL;
758
759 destination = strdup(upper);
760 if (!destination)
761 return log_oom();
762
763 } else {
764 upper = lower[n - 2];
765 destination = lower[n - 1];
766 lower[n - 2] = NULL;
767 }
768
769 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
770 if (!m)
771 return log_oom();
772
773 m->destination = destination;
774 m->source = upper;
775 m->lower = lower;
776 m->read_only = c == ARG_OVERLAY_RO;
777
778 upper = destination = NULL;
779 lower = NULL;
06c17c39
LP
780
781 break;
782 }
783
f4889f65
LP
784 case ARG_SETENV: {
785 char **n;
786
787 if (!env_assignment_is_valid(optarg)) {
788 log_error("Environment variable assignment '%s' is not valid.", optarg);
789 return -EINVAL;
790 }
791
792 n = strv_env_set(arg_setenv, optarg);
793 if (!n)
794 return log_oom();
795
796 strv_free(arg_setenv);
797 arg_setenv = n;
798 break;
799 }
800
284c0b91
LP
801 case 'q':
802 arg_quiet = true;
803 break;
804
8a96d94e
LP
805 case ARG_SHARE_SYSTEM:
806 arg_share_system = true;
807 break;
808
eb91eb18
LP
809 case ARG_REGISTER:
810 r = parse_boolean(optarg);
811 if (r < 0) {
812 log_error("Failed to parse --register= argument: %s", optarg);
813 return r;
814 }
815
816 arg_register = r;
817 break;
818
89f7c846
LP
819 case ARG_KEEP_UNIT:
820 arg_keep_unit = true;
821 break;
822
6afc95b7
LP
823 case ARG_PERSONALITY:
824
ac45f971 825 arg_personality = personality_from_string(optarg);
6afc95b7
LP
826 if (arg_personality == 0xffffffffLU) {
827 log_error("Unknown or unsupported personality '%s'.", optarg);
828 return -EINVAL;
829 }
830
831 break;
832
4d9f07b4
LP
833 case ARG_VOLATILE:
834
835 if (!optarg)
836 arg_volatile = VOLATILE_YES;
837 else {
838 r = parse_boolean(optarg);
839 if (r < 0) {
840 if (streq(optarg, "state"))
841 arg_volatile = VOLATILE_STATE;
842 else {
843 log_error("Failed to parse --volatile= argument: %s", optarg);
844 return r;
845 }
846 } else
847 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
848 }
849
850 break;
851
6d0b55c2
LP
852 case 'p': {
853 const char *split, *e;
854 uint16_t container_port, host_port;
855 int protocol;
856 ExposePort *p;
857
858 if ((e = startswith(optarg, "tcp:")))
859 protocol = IPPROTO_TCP;
860 else if ((e = startswith(optarg, "udp:")))
861 protocol = IPPROTO_UDP;
862 else {
863 e = optarg;
864 protocol = IPPROTO_TCP;
865 }
866
867 split = strchr(e, ':');
868 if (split) {
869 char v[split - e + 1];
870
871 memcpy(v, e, split - e);
872 v[split - e] = 0;
873
874 r = safe_atou16(v, &host_port);
875 if (r < 0 || host_port <= 0) {
876 log_error("Failed to parse host port: %s", optarg);
877 return -EINVAL;
878 }
879
880 r = safe_atou16(split + 1, &container_port);
881 } else {
882 r = safe_atou16(e, &container_port);
883 host_port = container_port;
884 }
885
886 if (r < 0 || container_port <= 0) {
887 log_error("Failed to parse host port: %s", optarg);
888 return -EINVAL;
889 }
890
891 LIST_FOREACH(ports, p, arg_expose_ports) {
892 if (p->protocol == protocol && p->host_port == host_port) {
893 log_error("Duplicate port specification: %s", optarg);
894 return -EINVAL;
895 }
896 }
897
898 p = new(ExposePort, 1);
899 if (!p)
900 return log_oom();
901
902 p->protocol = protocol;
903 p->host_port = host_port;
904 p->container_port = container_port;
905
906 LIST_PREPEND(ports, arg_expose_ports, p);
907
908 break;
909 }
910
f36933fe
LP
911 case ARG_PROPERTY:
912 if (strv_extend(&arg_property, optarg) < 0)
913 return log_oom();
914
915 break;
916
6dac160c
LP
917 case ARG_PRIVATE_USERS:
918 if (optarg) {
919 _cleanup_free_ char *buffer = NULL;
920 const char *range, *shift;
921
922 range = strchr(optarg, ':');
923 if (range) {
924 buffer = strndup(optarg, range - optarg);
925 if (!buffer)
926 return log_oom();
927 shift = buffer;
928
929 range++;
930 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
931 log_error("Failed to parse UID range: %s", range);
932 return -EINVAL;
933 }
934 } else
935 shift = optarg;
936
937 if (parse_uid(shift, &arg_uid_shift) < 0) {
938 log_error("Failed to parse UID: %s", optarg);
939 return -EINVAL;
940 }
941 }
942
943 arg_userns = true;
944 break;
945
c6c8f6e2
LP
946 case ARG_KILL_SIGNAL:
947 arg_kill_signal = signal_from_string_try_harder(optarg);
948 if (arg_kill_signal < 0) {
949 log_error("Cannot parse signal: %s", optarg);
950 return -EINVAL;
951 }
952
953 break;
954
88213476
LP
955 case '?':
956 return -EINVAL;
957
958 default:
eb9da376 959 assert_not_reached("Unhandled option");
88213476 960 }
88213476 961
eb91eb18
LP
962 if (arg_share_system)
963 arg_register = false;
964
965 if (arg_boot && arg_share_system) {
966 log_error("--boot and --share-system may not be combined.");
967 return -EINVAL;
968 }
969
89f7c846
LP
970 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
971 log_error("--keep-unit may not be used when invoked from a user session.");
972 return -EINVAL;
973 }
974
1b9e5b12
LP
975 if (arg_directory && arg_image) {
976 log_error("--directory= and --image= may not be combined.");
977 return -EINVAL;
978 }
979
ec16945e
LP
980 if (arg_template && arg_image) {
981 log_error("--template= and --image= may not be combined.");
982 return -EINVAL;
983 }
984
985 if (arg_template && !(arg_directory || arg_machine)) {
986 log_error("--template= needs --directory= or --machine=.");
987 return -EINVAL;
988 }
989
990 if (arg_ephemeral && arg_template) {
991 log_error("--ephemeral and --template= may not be combined.");
992 return -EINVAL;
993 }
994
995 if (arg_ephemeral && arg_image) {
996 log_error("--ephemeral and --image= may not be combined.");
997 return -EINVAL;
998 }
999
df9a75e4
LP
1000 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1001 log_error("--ephemeral and --link-journal= may not be combined.");
1002 return -EINVAL;
1003 }
1004
4d9f07b4
LP
1005 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1006 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1007 return -EINVAL;
1008 }
1009
6d0b55c2
LP
1010 if (arg_expose_ports && !arg_private_network) {
1011 log_error("Cannot use --port= without private networking.");
1012 return -EINVAL;
1013 }
1014
a42c8b54
LP
1015 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1016
c6c8f6e2
LP
1017 if (arg_boot && arg_kill_signal <= 0)
1018 arg_kill_signal = SIGRTMIN+3;
1019
88213476
LP
1020 return 1;
1021}
1022
1023static int mount_all(const char *dest) {
1024
1025 typedef struct MountPoint {
1026 const char *what;
1027 const char *where;
1028 const char *type;
1029 const char *options;
1030 unsigned long flags;
3bd66c05 1031 bool fatal;
88213476
LP
1032 } MountPoint;
1033
1034 static const MountPoint mount_table[] = {
54b4755f
ILG
1035 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
1036 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
1037 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
1038 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
1039 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true },
1040 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
1041 { "devpts", "/dev/pts", "devpts", "newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
1042 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
1043 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
1044 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
9b634ea5 1045#ifdef HAVE_SELINUX
54b4755f
ILG
1046 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
1047 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
9b634ea5 1048#endif
88213476
LP
1049 };
1050
1051 unsigned k;
1052 int r = 0;
1053
1054 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
d15d65a0 1055 _cleanup_free_ char *where = NULL, *options = NULL;
d002827b 1056 const char *o;
88213476
LP
1057 int t;
1058
17fe0523
LP
1059 where = strjoin(dest, "/", mount_table[k].where, NULL);
1060 if (!where)
1061 return log_oom();
88213476 1062
e65aec12 1063 t = path_is_mount_point(where, true);
da00518b 1064 if (t < 0 && t != -ENOENT) {
da927ba9 1065 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
88213476
LP
1066
1067 if (r == 0)
1068 r = t;
1069
1070 continue;
1071 }
1072
9c1c7f71
LP
1073 /* Skip this entry if it is not a remount. */
1074 if (mount_table[k].what && t > 0)
014a9c77
LP
1075 continue;
1076
79d80fc1
TG
1077 t = mkdir_p(where, 0755);
1078 if (t < 0) {
1079 if (mount_table[k].fatal) {
da927ba9 1080 log_error_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
1081
1082 if (r == 0)
1083 r = t;
1084 } else
da927ba9 1085 log_warning_errno(t, "Failed to create directory %s: %m", where);
79d80fc1
TG
1086
1087 continue;
1088 }
88213476 1089
a8828ed9 1090#ifdef HAVE_SELINUX
82adf6af
LP
1091 if (arg_selinux_apifs_context &&
1092 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
1093 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
d002827b
LP
1094 if (!options)
1095 return log_oom();
1096
1097 o = options;
1098 } else
a8828ed9 1099#endif
d002827b 1100 o = mount_table[k].options;
a8828ed9 1101
6dac160c
LP
1102 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
1103 char *uid_options = NULL;
1104
1105 if (o)
1106 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
1107 else
1108 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1109 if (!uid_options)
1110 return log_oom();
1111
1112 free(options);
1113 o = options = uid_options;
1114 }
a8828ed9 1115
88213476
LP
1116 if (mount(mount_table[k].what,
1117 where,
1118 mount_table[k].type,
1119 mount_table[k].flags,
79d80fc1 1120 o) < 0) {
88213476 1121
79d80fc1 1122 if (mount_table[k].fatal) {
56f64d95 1123 log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 1124
79d80fc1
TG
1125 if (r == 0)
1126 r = -errno;
1127 } else
56f64d95 1128 log_warning_errno(errno, "mount(%s) failed: %m", where);
88213476 1129 }
88213476
LP
1130 }
1131
e58a1277
LP
1132 return r;
1133}
f8440af5 1134
5a8af538
LP
1135static int mount_bind(const char *dest, CustomMount *m) {
1136 struct stat source_st, dest_st;
1137 char *where;
1138 int r;
17fe0523 1139
5a8af538
LP
1140 assert(dest);
1141 assert(m);
d2421337 1142
5a8af538
LP
1143 if (stat(m->source, &source_st) < 0)
1144 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
17fe0523 1145
5a8af538 1146 where = strjoina(dest, m->destination);
06c17c39 1147
5a8af538
LP
1148 r = stat(where, &dest_st);
1149 if (r >= 0) {
1150 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1151 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1152 return -EINVAL;
2ed4e5e0 1153 }
06c17c39 1154
5a8af538
LP
1155 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1156 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1157 return -EINVAL;
d2421337 1158 }
17fe0523 1159
5a8af538
LP
1160 } else if (errno == ENOENT) {
1161 r = mkdir_parents_label(where, 0755);
1162 if (r < 0)
1163 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1164 } else {
1165 log_error_errno(errno, "Failed to stat %s: %m", where);
1166 return -errno;
1167 }
17fe0523 1168
5a8af538
LP
1169 /* Create the mount point. Any non-directory file can be
1170 * mounted on any non-directory file (regular, fifo, socket,
1171 * char, block).
1172 */
1173 if (S_ISDIR(source_st.st_mode))
1174 r = mkdir_label(where, 0755);
1175 else
1176 r = touch(where);
1177 if (r < 0 && r != -EEXIST)
1178 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1179
1180 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1181 return log_error_errno(errno, "mount(%s) failed: %m", where);
1182
1183 if (m->read_only) {
1184 r = bind_remount_recursive(where, true);
1185 if (r < 0)
1186 return log_error_errno(r, "Read-only bind mount failed: %m");
1187 }
1188
1189 return 0;
1190}
1191
1192static int mount_tmpfs(const char *dest, CustomMount *m) {
1193 char *where;
1194 int r;
1195
1196 assert(dest);
1197 assert(m);
1198
1199 where = strjoina(dest, m->destination);
1200
1201 r = mkdir_label(where, 0755);
1202 if (r < 0 && r != -EEXIST)
1203 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1204
1205 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, m->options) < 0)
1206 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1207
1208 return 0;
1209}
1210
1211static int mount_overlay(const char *dest, CustomMount *m) {
1212 _cleanup_free_ char *lower = NULL;
1213 char *where, *options;
1214 int r;
1215
1216 assert(dest);
1217 assert(m);
1218
1219 where = strjoina(dest, m->destination);
1220
1221 r = mkdir_label(where, 0755);
1222 if (r < 0 && r != -EEXIST)
1223 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1224
1225 (void) mkdir_p_label(m->source, 0755);
1226
1227 strv_reverse(m->lower);
1228 lower = strv_join(m->lower, ":");
1229 strv_reverse(m->lower);
1230
1231 if (!lower)
1232 return log_oom();
1233
1234 if (m->read_only)
1235 options = strjoina("lowerdir=", m->source, ":", lower);
1236 else {
1237 assert(m->work_dir);
1238 (void) mkdir_label(m->work_dir, 0700);
1239
1240 options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
1241 }
1242
1243 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1244 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1245
1246 return 0;
1247}
1248
1249static int mount_custom(const char *dest) {
1250 unsigned i;
1251 int r;
1252
1253 assert(dest);
1254
1255 for (i = 0; i < arg_n_custom_mounts; i++) {
1256 CustomMount *m = &arg_custom_mounts[i];
1257
1258 switch (m->type) {
1259
1260 case CUSTOM_MOUNT_BIND:
1261 r = mount_bind(dest, m);
1262 break;
1263
1264 case CUSTOM_MOUNT_TMPFS:
1265 r = mount_tmpfs(dest, m);
1266 break;
1267
1268 case CUSTOM_MOUNT_OVERLAY:
1269 r = mount_overlay(dest, m);
1270 break;
1271
1272 default:
1273 assert_not_reached("Unknown custom mount type");
17fe0523 1274 }
5a8af538
LP
1275
1276 if (r < 0)
1277 return r;
17fe0523
LP
1278 }
1279
1280 return 0;
1281}
1282
b12afc8c
LP
1283static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1284 char *to;
1285 int r;
1286
63c372cb 1287 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
b12afc8c
LP
1288
1289 r = path_is_mount_point(to, false);
da00518b 1290 if (r < 0 && r != -ENOENT)
b12afc8c
LP
1291 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1292 if (r > 0)
1293 return 0;
1294
1295 mkdir_p(to, 0755);
1296
c0534580
LP
1297 /* The superblock mount options of the mount point need to be
1298 * identical to the hosts', and hence writable... */
1299 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
b12afc8c
LP
1300 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1301
c0534580
LP
1302 /* ... hence let's only make the bind mount read-only, not the
1303 * superblock. */
1304 if (read_only) {
1305 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1306 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1307 }
b12afc8c
LP
1308 return 1;
1309}
1310
1311static int mount_cgroup(const char *dest) {
1312 _cleanup_set_free_free_ Set *controllers = NULL;
1313 _cleanup_free_ char *own_cgroup_path = NULL;
1314 const char *cgroup_root, *systemd_root, *systemd_own;
1315 int r;
1316
1317 controllers = set_new(&string_hash_ops);
1318 if (!controllers)
1319 return log_oom();
1320
1321 r = cg_kernel_controllers(controllers);
1322 if (r < 0)
1323 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1324
1325 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1326 if (r < 0)
1327 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1328
b12afc8c
LP
1329 for (;;) {
1330 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1331
1332 controller = set_steal_first(controllers);
1333 if (!controller)
1334 break;
1335
1336 origin = strappend("/sys/fs/cgroup/", controller);
1337 if (!origin)
1338 return log_oom();
1339
1340 r = readlink_malloc(origin, &combined);
1341 if (r == -EINVAL) {
1342 /* Not a symbolic link, but directly a single cgroup hierarchy */
1343
1344 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1345 if (r < 0)
1346 return r;
1347
1348 } else if (r < 0)
1349 return log_error_errno(r, "Failed to read link %s: %m", origin);
1350 else {
1351 _cleanup_free_ char *target = NULL;
1352
1353 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1354 if (!target)
1355 return log_oom();
1356
1357 /* A symbolic link, a combination of controllers in one hierarchy */
1358
1359 if (!filename_is_valid(combined)) {
1360 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1361 continue;
1362 }
1363
1364 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1365 if (r < 0)
1366 return r;
1367
875e1014
ILG
1368 r = symlink_idempotent(combined, target);
1369 if (r == -EINVAL) {
1370 log_error("Invalid existing symlink for combined hierarchy");
1371 return r;
1372 }
1373 if (r < 0)
1374 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1375 }
1376 }
1377
c0534580 1378 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
b12afc8c
LP
1379 if (r < 0)
1380 return r;
1381
1382 /* Make our own cgroup a (writable) bind mount */
63c372cb 1383 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
b12afc8c
LP
1384 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1385 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1386
1387 /* And then remount the systemd cgroup root read-only */
63c372cb 1388 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
b12afc8c
LP
1389 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1390 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1391
54b4755f 1392 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
b12afc8c
LP
1393 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1394 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1395
1396 return 0;
1397}
1398
e58a1277 1399static int setup_timezone(const char *dest) {
d4036145
LP
1400 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1401 char *z, *y;
1402 int r;
f8440af5 1403
e58a1277
LP
1404 assert(dest);
1405
1406 /* Fix the timezone, if possible */
d4036145
LP
1407 r = readlink_malloc("/etc/localtime", &p);
1408 if (r < 0) {
1409 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1410 return 0;
1411 }
1412
1413 z = path_startswith(p, "../usr/share/zoneinfo/");
1414 if (!z)
1415 z = path_startswith(p, "/usr/share/zoneinfo/");
1416 if (!z) {
1417 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1418 return 0;
1419 }
1420
04bc4a3f
LP
1421 where = strappend(dest, "/etc/localtime");
1422 if (!where)
0d0f0c50 1423 return log_oom();
715ac17a 1424
d4036145
LP
1425 r = readlink_malloc(where, &q);
1426 if (r >= 0) {
1427 y = path_startswith(q, "../usr/share/zoneinfo/");
1428 if (!y)
1429 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1430
d4036145
LP
1431 /* Already pointing to the right place? Then do nothing .. */
1432 if (y && streq(y, z))
1433 return 0;
1434 }
1435
1436 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1437 if (!check)
0d0f0c50 1438 return log_oom();
4d1c38b8 1439
d4036145
LP
1440 if (access(check, F_OK) < 0) {
1441 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1442 return 0;
1443 }
68fb0892 1444
d4036145
LP
1445 what = strappend("../usr/share/zoneinfo/", z);
1446 if (!what)
1447 return log_oom();
1448
79d80fc1
TG
1449 r = mkdir_parents(where, 0755);
1450 if (r < 0) {
da927ba9 1451 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
79d80fc1
TG
1452
1453 return 0;
1454 }
1455
1456 r = unlink(where);
1457 if (r < 0 && errno != ENOENT) {
56f64d95 1458 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1459
1460 return 0;
1461 }
4d9f07b4 1462
d4036145 1463 if (symlink(what, where) < 0) {
56f64d95 1464 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1465 return 0;
1466 }
e58a1277
LP
1467
1468 return 0;
88213476
LP
1469}
1470
2547bb41 1471static int setup_resolv_conf(const char *dest) {
c8b32e11 1472 _cleanup_free_ char *where = NULL;
79d80fc1 1473 int r;
2547bb41
LP
1474
1475 assert(dest);
1476
1477 if (arg_private_network)
1478 return 0;
1479
1480 /* Fix resolv.conf, if possible */
04bc4a3f
LP
1481 where = strappend(dest, "/etc/resolv.conf");
1482 if (!where)
0d0f0c50 1483 return log_oom();
2547bb41 1484
77e63faf
LP
1485 /* We don't really care for the results of this really. If it
1486 * fails, it fails, but meh... */
79d80fc1
TG
1487 r = mkdir_parents(where, 0755);
1488 if (r < 0) {
da927ba9 1489 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
79d80fc1
TG
1490
1491 return 0;
1492 }
1493
f2068bcc 1494 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1495 if (r < 0) {
da927ba9 1496 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1497
1498 return 0;
1499 }
2547bb41
LP
1500
1501 return 0;
1502}
1503
4d9f07b4
LP
1504static int setup_volatile_state(const char *directory) {
1505 const char *p;
1506 int r;
1507
1508 assert(directory);
1509
1510 if (arg_volatile != VOLATILE_STATE)
1511 return 0;
1512
1513 /* --volatile=state means we simply overmount /var
1514 with a tmpfs, and the rest read-only. */
1515
1516 r = bind_remount_recursive(directory, true);
f647962d
MS
1517 if (r < 0)
1518 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4 1519
63c372cb 1520 p = strjoina(directory, "/var");
79d80fc1 1521 r = mkdir(p, 0755);
4a62c710
MS
1522 if (r < 0 && errno != EEXIST)
1523 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1524
4a62c710
MS
1525 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1526 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1527
1528 return 0;
1529}
1530
1531static int setup_volatile(const char *directory) {
1532 bool tmpfs_mounted = false, bind_mounted = false;
1533 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1534 const char *f, *t;
1535 int r;
1536
1537 assert(directory);
1538
1539 if (arg_volatile != VOLATILE_YES)
1540 return 0;
1541
1542 /* --volatile=yes means we mount a tmpfs to the root dir, and
1543 the original /usr to use inside it, and that read-only. */
1544
4a62c710
MS
1545 if (!mkdtemp(template))
1546 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4
LP
1547
1548 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
56f64d95 1549 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1550 r = -errno;
1551 goto fail;
1552 }
1553
1554 tmpfs_mounted = true;
1555
63c372cb
LP
1556 f = strjoina(directory, "/usr");
1557 t = strjoina(template, "/usr");
4d9f07b4 1558
79d80fc1
TG
1559 r = mkdir(t, 0755);
1560 if (r < 0 && errno != EEXIST) {
56f64d95 1561 log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1562 r = -errno;
1563 goto fail;
1564 }
1565
4543768d 1566 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
56f64d95 1567 log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1568 r = -errno;
1569 goto fail;
1570 }
1571
1572 bind_mounted = true;
1573
1574 r = bind_remount_recursive(t, true);
1575 if (r < 0) {
da927ba9 1576 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1577 goto fail;
1578 }
1579
1580 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
56f64d95 1581 log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1582 r = -errno;
1583 goto fail;
1584 }
1585
1586 rmdir(template);
1587
1588 return 0;
1589
1590fail:
1591 if (bind_mounted)
1592 umount(t);
1593 if (tmpfs_mounted)
1594 umount(template);
1595 rmdir(template);
1596 return r;
1597}
1598
9f24adc2
LP
1599static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1600
1601 snprintf(s, 37,
1602 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1603 SD_ID128_FORMAT_VAL(id));
1604
1605 return s;
1606}
1607
04bc4a3f 1608static int setup_boot_id(const char *dest) {
7fd1b19b 1609 _cleanup_free_ char *from = NULL, *to = NULL;
39883f62 1610 sd_id128_t rnd = {};
04bc4a3f
LP
1611 char as_uuid[37];
1612 int r;
1613
1614 assert(dest);
1615
eb91eb18
LP
1616 if (arg_share_system)
1617 return 0;
1618
04bc4a3f
LP
1619 /* Generate a new randomized boot ID, so that each boot-up of
1620 * the container gets a new one */
1621
1622 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
04bc4a3f 1623 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
ed8b7a3e
ZJS
1624 if (!from || !to)
1625 return log_oom();
04bc4a3f
LP
1626
1627 r = sd_id128_randomize(&rnd);
f647962d
MS
1628 if (r < 0)
1629 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1630
9f24adc2 1631 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1632
574d5f2d 1633 r = write_string_file(from, as_uuid);
f647962d
MS
1634 if (r < 0)
1635 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1636
4543768d 1637 if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
56f64d95 1638 log_error_errno(errno, "Failed to bind mount boot id: %m");
04bc4a3f 1639 r = -errno;
4543768d 1640 } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
56f64d95 1641 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1642
1643 unlink(from);
04bc4a3f
LP
1644 return r;
1645}
1646
e58a1277 1647static int copy_devnodes(const char *dest) {
88213476
LP
1648
1649 static const char devnodes[] =
1650 "null\0"
1651 "zero\0"
1652 "full\0"
1653 "random\0"
1654 "urandom\0"
85614d66
TG
1655 "tty\0"
1656 "net/tun\0";
88213476
LP
1657
1658 const char *d;
e58a1277 1659 int r = 0;
7fd1b19b 1660 _cleanup_umask_ mode_t u;
a258bf26
LP
1661
1662 assert(dest);
124640f1
LP
1663
1664 u = umask(0000);
88213476
LP
1665
1666 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1667 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1668 struct stat st;
88213476 1669
7f112f50
LP
1670 from = strappend("/dev/", d);
1671 to = strjoin(dest, "/dev/", d, NULL);
1672 if (!from || !to)
1673 return log_oom();
88213476
LP
1674
1675 if (stat(from, &st) < 0) {
1676
4a62c710
MS
1677 if (errno != ENOENT)
1678 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1679
a258bf26 1680 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1681
ed8b7a3e 1682 log_error("%s is not a char or block device, cannot copy", from);
7f112f50 1683 return -EIO;
a258bf26 1684
85614d66
TG
1685 } else {
1686 r = mkdir_parents(to, 0775);
1687 if (r < 0) {
da927ba9 1688 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
85614d66
TG
1689 return -r;
1690 }
a258bf26 1691
81f5049b
AC
1692 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1693 if (errno != EPERM)
1694 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1695
1696 /* Some systems abusively restrict mknod but
1697 * allow bind mounts. */
1698 r = touch(to);
1699 if (r < 0)
1700 return log_error_errno(r, "touch (%s) failed: %m", to);
1701 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1702 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1703 }
6278cf60
LP
1704
1705 if (arg_userns && arg_uid_shift != UID_INVALID)
1706 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1707 return log_error_errno(errno, "chown() of device node %s failed: %m", to);
88213476 1708 }
88213476
LP
1709 }
1710
e58a1277
LP
1711 return r;
1712}
88213476 1713
f2d88580
LP
1714static int setup_ptmx(const char *dest) {
1715 _cleanup_free_ char *p = NULL;
1716
1717 p = strappend(dest, "/dev/ptmx");
1718 if (!p)
1719 return log_oom();
1720
4a62c710
MS
1721 if (symlink("pts/ptmx", p) < 0)
1722 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
f2d88580 1723
6278cf60
LP
1724 if (arg_userns && arg_uid_shift != UID_INVALID)
1725 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1726 return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1727
f2d88580
LP
1728 return 0;
1729}
1730
e58a1277 1731static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1732 _cleanup_umask_ mode_t u;
1733 const char *to;
e58a1277 1734 int r;
e58a1277
LP
1735
1736 assert(dest);
1737 assert(console);
1738
1739 u = umask(0000);
1740
e58a1277 1741 r = chmod_and_chown(console, 0600, 0, 0);
f647962d
MS
1742 if (r < 0)
1743 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1744
a258bf26
LP
1745 /* We need to bind mount the right tty to /dev/console since
1746 * ptys can only exist on pts file systems. To have something
81f5049b 1747 * to bind mount things on we create a empty regular file. */
a258bf26 1748
63c372cb 1749 to = strjoina(dest, "/dev/console");
81f5049b
AC
1750 r = touch(to);
1751 if (r < 0)
1752 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1753
4543768d 1754 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1755 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1756
25ea79fe 1757 return 0;
e58a1277
LP
1758}
1759
1760static int setup_kmsg(const char *dest, int kmsg_socket) {
7fd1b19b 1761 _cleanup_free_ char *from = NULL, *to = NULL;
7fd1b19b 1762 _cleanup_umask_ mode_t u;
6d0b55c2 1763 int r, fd, k;
e58a1277
LP
1764 union {
1765 struct cmsghdr cmsghdr;
1766 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1767 } control = {};
1768 struct msghdr mh = {
1769 .msg_control = &control,
1770 .msg_controllen = sizeof(control),
1771 };
e58a1277
LP
1772 struct cmsghdr *cmsg;
1773
1774 assert(dest);
1775 assert(kmsg_socket >= 0);
a258bf26 1776
e58a1277 1777 u = umask(0000);
a258bf26 1778
f1e5dfe2
LP
1779 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1780 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1781 * on the reading side behave very similar to /proc/kmsg,
1782 * their writing side behaves differently from /dev/kmsg in
1783 * that writing blocks when nothing is reading. In order to
1784 * avoid any problems with containers deadlocking due to this
1785 * we simply make /dev/kmsg unavailable to the container. */
25ea79fe
ZJS
1786 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1787 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1788 return log_oom();
e58a1277 1789
4a62c710
MS
1790 if (mkfifo(from, 0600) < 0)
1791 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
e58a1277
LP
1792
1793 r = chmod_and_chown(from, 0600, 0, 0);
f647962d
MS
1794 if (r < 0)
1795 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
e58a1277 1796
4543768d 1797 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1798 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1799
1800 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1801 if (fd < 0)
1802 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1803
e58a1277
LP
1804 cmsg = CMSG_FIRSTHDR(&mh);
1805 cmsg->cmsg_level = SOL_SOCKET;
1806 cmsg->cmsg_type = SCM_RIGHTS;
1807 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1808 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1809
1810 mh.msg_controllen = cmsg->cmsg_len;
1811
1812 /* Store away the fd in the socket, so that it stays open as
1813 * long as we run the child */
6d0b55c2 1814 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1815 safe_close(fd);
e58a1277 1816
4a62c710
MS
1817 if (k < 0)
1818 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1819
f1e5dfe2
LP
1820 /* And now make the FIFO unavailable as /dev/kmsg... */
1821 unlink(from);
25ea79fe 1822 return 0;
88213476
LP
1823}
1824
6d0b55c2
LP
1825static int send_rtnl(int send_fd) {
1826 union {
1827 struct cmsghdr cmsghdr;
1828 uint8_t buf[CMSG_SPACE(sizeof(int))];
1829 } control = {};
1830 struct msghdr mh = {
1831 .msg_control = &control,
1832 .msg_controllen = sizeof(control),
1833 };
1834 struct cmsghdr *cmsg;
1835 _cleanup_close_ int fd = -1;
1836 ssize_t k;
1837
1838 assert(send_fd >= 0);
1839
1840 if (!arg_expose_ports)
1841 return 0;
1842
1843 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1844 if (fd < 0)
1845 return log_error_errno(errno, "failed to allocate container netlink: %m");
1846
1847 cmsg = CMSG_FIRSTHDR(&mh);
1848 cmsg->cmsg_level = SOL_SOCKET;
1849 cmsg->cmsg_type = SCM_RIGHTS;
1850 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1851 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1852
1853 mh.msg_controllen = cmsg->cmsg_len;
1854
1855 /* Store away the fd in the socket, so that it stays open as
1856 * long as we run the child */
1857 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1858 if (k < 0)
1859 return log_error_errno(errno, "Failed to send netlink fd: %m");
1860
1861 return 0;
1862}
1863
1864static int flush_ports(union in_addr_union *exposed) {
1865 ExposePort *p;
1866 int r, af = AF_INET;
1867
1868 assert(exposed);
1869
1870 if (!arg_expose_ports)
1871 return 0;
1872
1873 if (in_addr_is_null(af, exposed))
1874 return 0;
1875
1876 log_debug("Lost IP address.");
1877
1878 LIST_FOREACH(ports, p, arg_expose_ports) {
1879 r = fw_add_local_dnat(false,
1880 af,
1881 p->protocol,
1882 NULL,
1883 NULL, 0,
1884 NULL, 0,
1885 p->host_port,
1886 exposed,
1887 p->container_port,
1888 NULL);
1889 if (r < 0)
1890 log_warning_errno(r, "Failed to modify firewall: %m");
1891 }
1892
1893 *exposed = IN_ADDR_NULL;
1894 return 0;
1895}
1896
1897static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1898 _cleanup_free_ struct local_address *addresses = NULL;
1899 _cleanup_free_ char *pretty = NULL;
1900 union in_addr_union new_exposed;
1901 ExposePort *p;
1902 bool add;
1903 int af = AF_INET, r;
1904
1905 assert(exposed);
1906
1907 /* Invoked each time an address is added or removed inside the
1908 * container */
1909
1910 if (!arg_expose_ports)
1911 return 0;
1912
1913 r = local_addresses(rtnl, 0, af, &addresses);
1914 if (r < 0)
1915 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1916
1917 add = r > 0 &&
1918 addresses[0].family == af &&
1919 addresses[0].scope < RT_SCOPE_LINK;
1920
1921 if (!add)
1922 return flush_ports(exposed);
1923
1924 new_exposed = addresses[0].address;
1925 if (in_addr_equal(af, exposed, &new_exposed))
1926 return 0;
1927
1928 in_addr_to_string(af, &new_exposed, &pretty);
1929 log_debug("New container IP is %s.", strna(pretty));
1930
1931 LIST_FOREACH(ports, p, arg_expose_ports) {
1932
1933 r = fw_add_local_dnat(true,
1934 af,
1935 p->protocol,
1936 NULL,
1937 NULL, 0,
1938 NULL, 0,
1939 p->host_port,
1940 &new_exposed,
1941 p->container_port,
1942 in_addr_is_null(af, exposed) ? NULL : exposed);
1943 if (r < 0)
1944 log_warning_errno(r, "Failed to modify firewall: %m");
1945 }
1946
1947 *exposed = new_exposed;
1948 return 0;
1949}
1950
1951static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1952 union in_addr_union *exposed = userdata;
1953
1954 assert(rtnl);
1955 assert(m);
1956 assert(exposed);
1957
1958 expose_ports(rtnl, exposed);
1959 return 0;
1960}
1961
1962static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1963 union {
1964 struct cmsghdr cmsghdr;
1965 uint8_t buf[CMSG_SPACE(sizeof(int))];
1966 } control = {};
1967 struct msghdr mh = {
1968 .msg_control = &control,
1969 .msg_controllen = sizeof(control),
1970 };
1971 struct cmsghdr *cmsg;
1972 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1973 int fd, r;
1974 ssize_t k;
1975
1976 assert(event);
1977 assert(recv_fd >= 0);
1978 assert(ret);
1979
1980 if (!arg_expose_ports)
1981 return 0;
1982
1983 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1984 if (k < 0)
1985 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1986
1987 cmsg = CMSG_FIRSTHDR(&mh);
1988 assert(cmsg->cmsg_level == SOL_SOCKET);
1989 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 1990 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
1991 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1992
1993 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1994 if (r < 0) {
1995 safe_close(fd);
1996 return log_error_errno(r, "Failed to create rtnl object: %m");
1997 }
1998
1999 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2000 if (r < 0)
2001 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2002
2003 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2004 if (r < 0)
2005 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2006
2007 r = sd_rtnl_attach_event(rtnl, event, 0);
2008 if (r < 0)
2009 return log_error_errno(r, "Failed to add to even loop: %m");
2010
2011 *ret = rtnl;
2012 rtnl = NULL;
2013
2014 return 0;
2015}
2016
3a74cea5 2017static int setup_hostname(void) {
3a74cea5 2018
eb91eb18
LP
2019 if (arg_share_system)
2020 return 0;
2021
605f81a8 2022 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 2023 return -errno;
3a74cea5 2024
7027ff61 2025 return 0;
3a74cea5
LP
2026}
2027
57fb9fb5 2028static int setup_journal(const char *directory) {
4d680aee 2029 sd_id128_t machine_id, this_id;
7fd1b19b 2030 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
27407a01 2031 char *id;
57fb9fb5
LP
2032 int r;
2033
df9a75e4
LP
2034 /* Don't link journals in ephemeral mode */
2035 if (arg_ephemeral)
2036 return 0;
2037
57fb9fb5 2038 p = strappend(directory, "/etc/machine-id");
27407a01
ZJS
2039 if (!p)
2040 return log_oom();
57fb9fb5
LP
2041
2042 r = read_one_line_file(p, &b);
27407a01
ZJS
2043 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2044 return 0;
f647962d
MS
2045 else if (r < 0)
2046 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
57fb9fb5 2047
27407a01
ZJS
2048 id = strstrip(b);
2049 if (isempty(id) && arg_link_journal == LINK_AUTO)
2050 return 0;
57fb9fb5 2051
27407a01
ZJS
2052 /* Verify validity */
2053 r = sd_id128_from_string(id, &machine_id);
f647962d
MS
2054 if (r < 0)
2055 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
57fb9fb5 2056
4d680aee 2057 r = sd_id128_get_machine(&this_id);
f647962d
MS
2058 if (r < 0)
2059 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
2060
2061 if (sd_id128_equal(machine_id, this_id)) {
2062 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2063 "Host and machine ids are equal (%s): refusing to link journals", id);
2064 if (arg_link_journal == LINK_AUTO)
2065 return 0;
df9a75e4 2066 return -EEXIST;
4d680aee
ZJS
2067 }
2068
2069 if (arg_link_journal == LINK_NO)
2070 return 0;
2071
57fb9fb5 2072 free(p);
27407a01
ZJS
2073 p = strappend("/var/log/journal/", id);
2074 q = strjoin(directory, "/var/log/journal/", id, NULL);
2075 if (!p || !q)
2076 return log_oom();
2077
2078 if (path_is_mount_point(p, false) > 0) {
2079 if (arg_link_journal != LINK_AUTO) {
2080 log_error("%s: already a mount point, refusing to use for journal", p);
2081 return -EEXIST;
2082 }
2083
2084 return 0;
57fb9fb5
LP
2085 }
2086
27407a01 2087 if (path_is_mount_point(q, false) > 0) {
57fb9fb5 2088 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
2089 log_error("%s: already a mount point, refusing to use for journal", q);
2090 return -EEXIST;
57fb9fb5
LP
2091 }
2092
27407a01 2093 return 0;
57fb9fb5
LP
2094 }
2095
2096 r = readlink_and_make_absolute(p, &d);
2097 if (r >= 0) {
2098 if ((arg_link_journal == LINK_GUEST ||
2099 arg_link_journal == LINK_AUTO) &&
2100 path_equal(d, q)) {
2101
27407a01
ZJS
2102 r = mkdir_p(q, 0755);
2103 if (r < 0)
56f64d95 2104 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2105 return 0;
57fb9fb5
LP
2106 }
2107
4a62c710
MS
2108 if (unlink(p) < 0)
2109 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2110 } else if (r == -EINVAL) {
2111
2112 if (arg_link_journal == LINK_GUEST &&
2113 rmdir(p) < 0) {
2114
27407a01
ZJS
2115 if (errno == ENOTDIR) {
2116 log_error("%s already exists and is neither a symlink nor a directory", p);
2117 return r;
2118 } else {
56f64d95 2119 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 2120 return -errno;
57fb9fb5 2121 }
57fb9fb5
LP
2122 }
2123 } else if (r != -ENOENT) {
56f64d95 2124 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 2125 return r;
57fb9fb5
LP
2126 }
2127
2128 if (arg_link_journal == LINK_GUEST) {
2129
2130 if (symlink(q, p) < 0) {
574edc90 2131 if (arg_link_journal_try) {
56f64d95 2132 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
2133 return 0;
2134 } else {
56f64d95 2135 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
2136 return -errno;
2137 }
57fb9fb5
LP
2138 }
2139
27407a01
ZJS
2140 r = mkdir_p(q, 0755);
2141 if (r < 0)
56f64d95 2142 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2143 return 0;
57fb9fb5
LP
2144 }
2145
2146 if (arg_link_journal == LINK_HOST) {
574edc90
MP
2147 /* don't create parents here -- if the host doesn't have
2148 * permanent journal set up, don't force it here */
2149 r = mkdir(p, 0755);
57fb9fb5 2150 if (r < 0) {
574edc90 2151 if (arg_link_journal_try) {
56f64d95 2152 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
2153 return 0;
2154 } else {
56f64d95 2155 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
2156 return r;
2157 }
57fb9fb5
LP
2158 }
2159
27407a01
ZJS
2160 } else if (access(p, F_OK) < 0)
2161 return 0;
57fb9fb5 2162
cdb2b9d0
LP
2163 if (dir_is_empty(q) == 0)
2164 log_warning("%s is not empty, proceeding anyway.", q);
2165
57fb9fb5
LP
2166 r = mkdir_p(q, 0755);
2167 if (r < 0) {
56f64d95 2168 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 2169 return r;
57fb9fb5
LP
2170 }
2171
4543768d 2172 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 2173 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2174
27407a01 2175 return 0;
57fb9fb5
LP
2176}
2177
88213476 2178static int drop_capabilities(void) {
5076f0cc 2179 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
2180}
2181
5aa4bb6b 2182static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 2183 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
24996861 2184 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
2185 int r;
2186
eb91eb18
LP
2187 if (!arg_register)
2188 return 0;
2189
1c03020c 2190 r = sd_bus_default_system(&bus);
f647962d
MS
2191 if (r < 0)
2192 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 2193
89f7c846
LP
2194 if (arg_keep_unit) {
2195 r = sd_bus_call_method(
2196 bus,
2197 "org.freedesktop.machine1",
2198 "/org/freedesktop/machine1",
2199 "org.freedesktop.machine1.Manager",
5aa4bb6b 2200 "RegisterMachineWithNetwork",
89f7c846
LP
2201 &error,
2202 NULL,
5aa4bb6b 2203 "sayssusai",
89f7c846
LP
2204 arg_machine,
2205 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2206 "nspawn",
2207 "container",
2208 (uint32_t) pid,
5aa4bb6b
LP
2209 strempty(arg_directory),
2210 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 2211 } else {
9457ac5b 2212 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 2213 char **i;
ce5b3ad4 2214 unsigned j;
9457ac5b
LP
2215
2216 r = sd_bus_message_new_method_call(
89f7c846 2217 bus,
9457ac5b 2218 &m,
89f7c846
LP
2219 "org.freedesktop.machine1",
2220 "/org/freedesktop/machine1",
2221 "org.freedesktop.machine1.Manager",
5aa4bb6b 2222 "CreateMachineWithNetwork");
f647962d 2223 if (r < 0)
f36933fe 2224 return bus_log_create_error(r);
9457ac5b
LP
2225
2226 r = sd_bus_message_append(
2227 m,
5aa4bb6b 2228 "sayssusai",
89f7c846
LP
2229 arg_machine,
2230 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2231 "nspawn",
2232 "container",
2233 (uint32_t) pid,
5aa4bb6b
LP
2234 strempty(arg_directory),
2235 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 2236 if (r < 0)
f36933fe 2237 return bus_log_create_error(r);
9457ac5b
LP
2238
2239 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 2240 if (r < 0)
f36933fe 2241 return bus_log_create_error(r);
9457ac5b
LP
2242
2243 if (!isempty(arg_slice)) {
2244 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 2245 if (r < 0)
f36933fe 2246 return bus_log_create_error(r);
9457ac5b
LP
2247 }
2248
2249 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 2250 if (r < 0)
f36933fe 2251 return bus_log_create_error(r);
9457ac5b 2252
773ce3d8
LP
2253 /* If you make changes here, also make sure to update
2254 * systemd-nspawn@.service, to keep the device
2255 * policies in sync regardless if we are run with or
2256 * without the --keep-unit switch. */
63cc4c31 2257 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
2258 /* Allow the container to
2259 * access and create the API
2260 * device nodes, so that
2261 * PrivateDevices= in the
2262 * container can work
2263 * fine */
2264 "/dev/null", "rwm",
2265 "/dev/zero", "rwm",
2266 "/dev/full", "rwm",
2267 "/dev/random", "rwm",
2268 "/dev/urandom", "rwm",
2269 "/dev/tty", "rwm",
864e1706 2270 "/dev/net/tun", "rwm",
9457ac5b
LP
2271 /* Allow the container
2272 * access to ptys. However,
2273 * do not permit the
2274 * container to ever create
2275 * these device nodes. */
2276 "/dev/pts/ptmx", "rw",
63cc4c31 2277 "char-pts", "rw");
f647962d 2278 if (r < 0)
27023c0e
LP
2279 return bus_log_create_error(r);
2280
ce5b3ad4
SJ
2281 for (j = 0; j < arg_n_custom_mounts; j++) {
2282 CustomMount *cm = &arg_custom_mounts[j];
2283
2284 if (cm->type != CUSTOM_MOUNT_BIND)
2285 continue;
2286
2287 r = is_device_node(cm->source);
2288 if (r < 0)
2289 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2290
2291 if (r) {
2292 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2293 cm->source, cm->read_only ? "r" : "rw");
2294 if (r < 0)
2295 return log_error_errno(r, "Failed to append message arguments: %m");
2296 }
2297 }
2298
27023c0e
LP
2299 if (arg_kill_signal != 0) {
2300 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2301 if (r < 0)
2302 return bus_log_create_error(r);
2303
2304 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2305 if (r < 0)
2306 return bus_log_create_error(r);
2307 }
9457ac5b 2308
f36933fe
LP
2309 STRV_FOREACH(i, arg_property) {
2310 r = sd_bus_message_open_container(m, 'r', "sv");
2311 if (r < 0)
2312 return bus_log_create_error(r);
2313
2314 r = bus_append_unit_property_assignment(m, *i);
2315 if (r < 0)
2316 return r;
2317
2318 r = sd_bus_message_close_container(m);
2319 if (r < 0)
2320 return bus_log_create_error(r);
2321 }
2322
9457ac5b 2323 r = sd_bus_message_close_container(m);
f647962d 2324 if (r < 0)
f36933fe 2325 return bus_log_create_error(r);
9457ac5b
LP
2326
2327 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
2328 }
2329
9444b1f2 2330 if (r < 0) {
1f0cd86b
LP
2331 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2332 return r;
2333 }
2334
2335 return 0;
2336}
2337
2338static int terminate_machine(pid_t pid) {
2339 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2340 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
24996861 2341 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
2342 const char *path;
2343 int r;
2344
eb91eb18
LP
2345 if (!arg_register)
2346 return 0;
2347
1a2399e5
LP
2348 /* If we are reusing the unit, then just exit, systemd will do
2349 * the right thing when we exit. */
2350 if (arg_keep_unit)
2351 return 0;
2352
76b54375 2353 r = sd_bus_default_system(&bus);
f647962d
MS
2354 if (r < 0)
2355 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
2356
2357 r = sd_bus_call_method(
2358 bus,
2359 "org.freedesktop.machine1",
2360 "/org/freedesktop/machine1",
2361 "org.freedesktop.machine1.Manager",
2362 "GetMachineByPID",
2363 &error,
2364 &reply,
2365 "u",
2366 (uint32_t) pid);
2367 if (r < 0) {
2368 /* Note that the machine might already have been
2369 * cleaned up automatically, hence don't consider it a
2370 * failure if we cannot get the machine object. */
2371 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2372 return 0;
2373 }
2374
2375 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2376 if (r < 0)
2377 return bus_log_parse_error(r);
9444b1f2 2378
1f0cd86b
LP
2379 r = sd_bus_call_method(
2380 bus,
2381 "org.freedesktop.machine1",
2382 path,
2383 "org.freedesktop.machine1.Machine",
2384 "Terminate",
2385 &error,
2386 NULL,
2387 NULL);
2388 if (r < 0) {
2389 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2390 return 0;
2391 }
2392
9444b1f2
LP
2393 return 0;
2394}
2395
db999e0f
LP
2396static int reset_audit_loginuid(void) {
2397 _cleanup_free_ char *p = NULL;
2398 int r;
2399
2400 if (arg_share_system)
2401 return 0;
2402
2403 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2404 if (r == -ENOENT)
db999e0f 2405 return 0;
f647962d
MS
2406 if (r < 0)
2407 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2408
2409 /* Already reset? */
2410 if (streq(p, "4294967295"))
2411 return 0;
2412
2413 r = write_string_file("/proc/self/loginuid", "4294967295");
2414 if (r < 0) {
10a87006
LP
2415 log_error_errno(r,
2416 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2417 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2418 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2419 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2420 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2421
db999e0f 2422 sleep(5);
77b6e194 2423 }
db999e0f
LP
2424
2425 return 0;
77b6e194
LP
2426}
2427
4f758c23
LP
/* Fixed siphash keys handed to generate_mac(): HOST_HASH_KEY and
 * CONTAINER_HASH_KEY are used for the host and container side of the
 * veth link, MACVLAN_HASH_KEY for MACVLAN interfaces. */
2428#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2429#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 2430#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2431
a90e2305 2432static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2433 uint8_t result[8];
2434 size_t l, sz;
a90e2305
LP
2435 uint8_t *v, *i;
2436 int r;
01dde061
TG
2437
2438 l = strlen(arg_machine);
2439 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2440 if (idx > 0)
2441 sz += sizeof(idx);
a90e2305 2442
01dde061
TG
2443 v = alloca(sz);
2444
2445 /* fetch some persistent data unique to the host */
2446 r = sd_id128_get_machine((sd_id128_t*) v);
2447 if (r < 0)
2448 return r;
2449
2450 /* combine with some data unique (on this host) to this
2451 * container instance */
a90e2305
LP
2452 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2453 if (idx > 0) {
2454 idx = htole64(idx);
2455 memcpy(i, &idx, sizeof(idx));
2456 }
01dde061
TG
2457
2458 /* Let's hash the host machine ID plus the container name. We
2459 * use a fixed, but originally randomly created hash key here. */
4f758c23 2460 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2461
2462 assert_cc(ETH_ALEN <= sizeof(result));
2463 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2464
2465 /* see eth_random_addr in the kernel */
2466 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2467 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2468
2469 return 0;
2470}
2471
5aa4bb6b 2472static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
69c79d3c 2473 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
cf6a8911 2474 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4f758c23 2475 struct ether_addr mac_host, mac_container;
5aa4bb6b 2476 int r, i;
69c79d3c
LP
2477
2478 if (!arg_private_network)
2479 return 0;
2480
2481 if (!arg_network_veth)
2482 return 0;
2483
08af0da2
LP
2484 /* Use two different interface name prefixes depending whether
2485 * we are in bridge mode or not. */
c00524c9 2486 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2487 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2488
e867ceb6
LP
2489 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2490 if (r < 0)
2491 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2492
e867ceb6
LP
2493 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2494 if (r < 0)
2495 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2496
151b9b96 2497 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2498 if (r < 0)
2499 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2500
151b9b96 2501 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2502 if (r < 0)
2503 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2504
ab046dde 2505 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2506 if (r < 0)
2507 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2508
4f758c23 2509 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2510 if (r < 0)
2511 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2512
ee3a6a51 2513 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2514 if (r < 0)
2515 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2516
d8e538ec 2517 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2518 if (r < 0)
2519 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2520
ee3a6a51 2521 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2522 if (r < 0)
2523 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2524
ab046dde 2525 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2526 if (r < 0)
2527 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2528
4f758c23 2529 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2530 if (r < 0)
2531 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2532
ab046dde 2533 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2534 if (r < 0)
2535 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c
LP
2536
2537 r = sd_rtnl_message_close_container(m);
f647962d
MS
2538 if (r < 0)
2539 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2540
2541 r = sd_rtnl_message_close_container(m);
f647962d
MS
2542 if (r < 0)
2543 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2544
2545 r = sd_rtnl_message_close_container(m);
f647962d
MS
2546 if (r < 0)
2547 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c
LP
2548
2549 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2550 if (r < 0)
2551 return log_error_errno(r, "Failed to add new veth interfaces: %m");
69c79d3c 2552
5aa4bb6b 2553 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2554 if (i <= 0)
2555 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2556
2557 *ifi = i;
2558
69c79d3c
LP
2559 return 0;
2560}
2561
5aa4bb6b 2562static int setup_bridge(const char veth_name[], int *ifi) {
ab046dde
TG
2563 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2564 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2565 int r, bridge;
2566
2567 if (!arg_private_network)
2568 return 0;
2569
2570 if (!arg_network_veth)
2571 return 0;
2572
2573 if (!arg_network_bridge)
2574 return 0;
2575
2576 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2577 if (bridge <= 0)
2578 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2579
5aa4bb6b
LP
2580 *ifi = bridge;
2581
151b9b96 2582 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2583 if (r < 0)
2584 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2585
151b9b96 2586 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2587 if (r < 0)
2588 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2589
039dd4af 2590 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2591 if (r < 0)
2592 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2593
ab046dde 2594 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2595 if (r < 0)
2596 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde
TG
2597
2598 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2599 if (r < 0)
2600 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde
TG
2601
2602 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2603 if (r < 0)
2604 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2605
2606 return 0;
2607}
2608
c74e630d
LP
2609static int parse_interface(struct udev *udev, const char *name) {
2610 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2611 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2612 int ifi;
2613
2614 ifi = (int) if_nametoindex(name);
4a62c710
MS
2615 if (ifi <= 0)
2616 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2617
2618 sprintf(ifi_str, "n%i", ifi);
2619 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2620 if (!d)
2621 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2622
2623 if (udev_device_get_is_initialized(d) <= 0) {
2624 log_error("Network interface %s is not initialized yet.", name);
2625 return -EBUSY;
2626 }
2627
2628 return ifi;
2629}
2630
69c79d3c 2631static int move_network_interfaces(pid_t pid) {
7e227024 2632 _cleanup_udev_unref_ struct udev *udev = NULL;
69c79d3c 2633 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
aa28aefe
LP
2634 char **i;
2635 int r;
2636
2637 if (!arg_private_network)
2638 return 0;
2639
2640 if (strv_isempty(arg_network_interfaces))
2641 return 0;
2642
151b9b96 2643 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2644 if (r < 0)
2645 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2646
7e227024
LP
2647 udev = udev_new();
2648 if (!udev) {
2649 log_error("Failed to connect to udev.");
2650 return -ENOMEM;
2651 }
2652
aa28aefe 2653 STRV_FOREACH(i, arg_network_interfaces) {
cf6a8911 2654 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
b88eb17a 2655 int ifi;
aa28aefe 2656
c74e630d
LP
2657 ifi = parse_interface(udev, *i);
2658 if (ifi < 0)
2659 return ifi;
2660
3125b3ef 2661 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2662 if (r < 0)
2663 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2664
c74e630d 2665 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2666 if (r < 0)
2667 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2668
c74e630d 2669 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2670 if (r < 0)
2671 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2672 }
7e227024 2673
c74e630d
LP
2674 return 0;
2675}
2676
2677static int setup_macvlan(pid_t pid) {
2678 _cleanup_udev_unref_ struct udev *udev = NULL;
2679 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
e867ceb6 2680 unsigned idx = 0;
c74e630d
LP
2681 char **i;
2682 int r;
2683
2684 if (!arg_private_network)
2685 return 0;
2686
2687 if (strv_isempty(arg_network_macvlan))
2688 return 0;
2689
2690 r = sd_rtnl_open(&rtnl, 0);
f647962d
MS
2691 if (r < 0)
2692 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2693
2694 udev = udev_new();
2695 if (!udev) {
2696 log_error("Failed to connect to udev.");
2697 return -ENOMEM;
2698 }
2699
2700 STRV_FOREACH(i, arg_network_macvlan) {
2701 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2702 _cleanup_free_ char *n = NULL;
e867ceb6 2703 struct ether_addr mac;
c74e630d
LP
2704 int ifi;
2705
2706 ifi = parse_interface(udev, *i);
2707 if (ifi < 0)
2708 return ifi;
2709
e867ceb6
LP
2710 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2711 if (r < 0)
2712 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2713
c74e630d 2714 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2715 if (r < 0)
2716 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2717
c74e630d 2718 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2719 if (r < 0)
2720 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2721
2722 n = strappend("mv-", *i);
2723 if (!n)
2724 return log_oom();
2725
2726 strshorten(n, IFNAMSIZ-1);
2727
2728 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2729 if (r < 0)
2730 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2731
e867ceb6
LP
2732 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2733 if (r < 0)
2734 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2735
aa28aefe 2736 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2737 if (r < 0)
2738 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d
LP
2739
2740 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2741 if (r < 0)
2742 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2743
d8e538ec 2744 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2745 if (r < 0)
2746 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d
LP
2747
2748 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2749 if (r < 0)
2750 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d
LP
2751
2752 r = sd_rtnl_message_close_container(m);
f647962d
MS
2753 if (r < 0)
2754 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d
LP
2755
2756 r = sd_rtnl_message_close_container(m);
f647962d
MS
2757 if (r < 0)
2758 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe
LP
2759
2760 r = sd_rtnl_call(rtnl, m, 0, NULL);
f647962d
MS
2761 if (r < 0)
2762 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2763 }
2764
2765 return 0;
2766}
2767
4bbfe7ad
TG
2768static int setup_ipvlan(pid_t pid) {
2769 _cleanup_udev_unref_ struct udev *udev = NULL;
2770 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2771 char **i;
2772 int r;
2773
2774 if (!arg_private_network)
2775 return 0;
2776
2777 if (strv_isempty(arg_network_ipvlan))
2778 return 0;
2779
2780 r = sd_rtnl_open(&rtnl, 0);
2781 if (r < 0)
2782 return log_error_errno(r, "Failed to connect to netlink: %m");
2783
2784 udev = udev_new();
2785 if (!udev) {
2786 log_error("Failed to connect to udev.");
2787 return -ENOMEM;
2788 }
2789
2790 STRV_FOREACH(i, arg_network_ipvlan) {
2791 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2792 _cleanup_free_ char *n = NULL;
2793 int ifi;
2794
2795 ifi = parse_interface(udev, *i);
2796 if (ifi < 0)
2797 return ifi;
2798
2799 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2800 if (r < 0)
2801 return log_error_errno(r, "Failed to allocate netlink message: %m");
2802
2803 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2804 if (r < 0)
2805 return log_error_errno(r, "Failed to add netlink interface index: %m");
2806
2807 n = strappend("iv-", *i);
2808 if (!n)
2809 return log_oom();
2810
2811 strshorten(n, IFNAMSIZ-1);
2812
2813 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2814 if (r < 0)
2815 return log_error_errno(r, "Failed to add netlink interface name: %m");
2816
2817 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2818 if (r < 0)
2819 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2820
2821 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2822 if (r < 0)
2823 return log_error_errno(r, "Failed to open netlink container: %m");
2824
2825 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2826 if (r < 0)
2827 return log_error_errno(r, "Failed to open netlink container: %m");
2828
2829 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2830 if (r < 0)
2831 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2832
2833 r = sd_rtnl_message_close_container(m);
2834 if (r < 0)
2835 return log_error_errno(r, "Failed to close netlink container: %m");
2836
2837 r = sd_rtnl_message_close_container(m);
2838 if (r < 0)
2839 return log_error_errno(r, "Failed to close netlink container: %m");
2840
2841 r = sd_rtnl_call(rtnl, m, 0, NULL);
2842 if (r < 0)
2843 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2844 }
2845
2846 return 0;
2847}
2848
/* Installs a seccomp filter that
 *
 *  - makes the system calls in the table below fail with EPERM, unless
 *    the capability listed next to them is part of arg_retain, and
 *  - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Compiles to a no-op returning 0 without HAVE_SECCOMP. Otherwise
 * returns the result of the last libseccomp call, negative on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        /* Everything not matched by a rule stays allowed */
        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto out;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {

                /* The capability is retained, hence leave the call alone */
                if ((arg_retain & (1ULL << blacklist[n].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[n].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto out;
                }
        }

        /*
           Audit is broken in containers: much of the userspace audit
           hookup fails when run inside of one. We don't care and
           simply turn off creation of audit sockets.

           socket(AF_NETLINK, *, NETLINK_AUDIT) then fails with
           EAFNOSUPPORT, which audit userspace takes as indication
           that audit is disabled in the kernel.
        */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto out;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto out;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

out:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2935
785890ac
LP
2936static int setup_propagate(const char *root) {
2937 const char *p, *q;
2938
2939 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2940 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2941 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2942 (void) mkdir_p(p, 0600);
2943
63c372cb 2944 q = strjoina(root, "/run/systemd/nspawn/incoming");
785890ac
LP
2945 mkdir_parents(q, 0755);
2946 mkdir_p(q, 0600);
2947
2948 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2949 return log_error_errno(errno, "Failed to install propagation bind mount.");
2950
2951 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2952 return log_error_errno(errno, "Failed to make propagation mount read-only");
2953
2954 return 0;
2955}
2956
1b9e5b12
LP
2957static int setup_image(char **device_path, int *loop_nr) {
2958 struct loop_info64 info = {
2959 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2960 };
2961 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2962 _cleanup_free_ char* loopdev = NULL;
2963 struct stat st;
2964 int r, nr;
2965
2966 assert(device_path);
2967 assert(loop_nr);
ec16945e 2968 assert(arg_image);
1b9e5b12
LP
2969
2970 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2971 if (fd < 0)
2972 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2973
4a62c710
MS
2974 if (fstat(fd, &st) < 0)
2975 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2976
2977 if (S_ISBLK(st.st_mode)) {
2978 char *p;
2979
2980 p = strdup(arg_image);
2981 if (!p)
2982 return log_oom();
2983
2984 *device_path = p;
2985
2986 *loop_nr = -1;
2987
2988 r = fd;
2989 fd = -1;
2990
2991 return r;
2992 }
2993
2994 if (!S_ISREG(st.st_mode)) {
56f64d95 2995 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2996 return -EINVAL;
2997 }
2998
2999 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
3000 if (control < 0)
3001 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
3002
3003 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
3004 if (nr < 0)
3005 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
3006
3007 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3008 return log_oom();
3009
3010 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
3011 if (loop < 0)
3012 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 3013
4a62c710
MS
3014 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3015 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
3016
3017 if (arg_read_only)
3018 info.lo_flags |= LO_FLAGS_READ_ONLY;
3019
4a62c710
MS
3020 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3021 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
3022
3023 *device_path = loopdev;
3024 loopdev = NULL;
3025
3026 *loop_nr = nr;
3027
3028 r = loop;
3029 loop = -1;
3030
3031 return r;
3032}
3033
ada4799a
LP
/* Explanatory text appended to the partition-table related error
 * messages below: it spells out which disk image layouts are accepted. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
3040
1b9e5b12
LP
/* Inspects the partition table of the block device open on 'fd' and
 * picks the partitions to use for the container.
 *
 * GPT images: partitions flagged GPT_FLAG_NO_AUTO are skipped. For the
 * home, srv, native root and secondary root types the partition with
 * the lowest partition number wins. Partitions of type
 * GPT_LINUX_GENERIC are remembered as a fallback root.
 *
 * MBR images: only partitions of type 0x83 whose flags equal 0x80
 * (bootable) are considered, and they are treated as generic fallback
 * root partitions.
 *
 * Root selection order: native root, then secondary root, then the
 * generic partition. The generic fallback is only accepted when exactly
 * one such partition exists.
 *
 * On success returns 0 and stores newly allocated device node paths in
 * *root_device (always) and *home_device / *srv_device (only when such a
 * partition was found), along with the matching *_rw flags. *secondary
 * is set to true only when the secondary root was chosen. On failure a
 * negative errno-style value is returned after logging. When built
 * without HAVE_BLKID this always fails with -EOPNOTSUPP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails, which the checks
         * below treat as "neither GPT nor MBR". */
        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel exposes as many partition devices as
         * blkid found in the partition table, giving up after ten
         * enumeration attempts. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as the generic
                                 * candidate. Storing it anywhere else
                                 * would leave 'generic' unset, so the
                                 * check for multiple bootable
                                 * partitions could never trigger. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3413
/* Probes the file system type of block device 'what' with libblkid and
 * mounts it on 'where', or on 'where' with 'directory' appended when
 * 'directory' is non-NULL.
 *
 * The mount is read-only when 'rw' is false or when arg_read_only is
 * set. LUKS volumes are refused. Returns 0 on success and a negative
 * errno-style value (after logging) otherwise. When built without
 * HAVE_BLKID this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* Per the libblkid documentation, 1 means nothing was detected
         * and -2 an ambivalent result; both mean the type is unknown.
         * -1 is a genuine probing error and is handled below so that
         * errno gets reported. This matches dissect_image(). */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3477
727fd4fd
LP
/* Mounts the dissected partitions below 'where': the root device on
 * 'where' itself, the home device on its "/home" and the srv device on
 * its "/srv". Devices passed as NULL are skipped. Each *_rw flag is
 * forwarded to mount_device().
 *
 * Returns 0 on success, or the negative error of the first mount that
 * failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the directory the caller asked for rather
         * than reaching for the global arg_directory. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3507
3508static void loop_remove(int nr, int *image_fd) {
3509 _cleanup_close_ int control = -1;
e8c8ddcc 3510 int r;
1b9e5b12
LP
3511
3512 if (nr < 0)
3513 return;
3514
3515 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3516 r = ioctl(*image_fd, LOOP_CLR_FD);
3517 if (r < 0)
5e4074aa 3518 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3519 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3520 }
3521
3522 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3523 if (control < 0) {
56f64d95 3524 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3525 return;
e8c8ddcc 3526 }
1b9e5b12 3527
e8c8ddcc
TG
3528 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3529 if (r < 0)
5e4074aa 3530 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3531}
3532
0cb9fbcd
LP
3533static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3534 int pipe_fds[2];
3535 pid_t pid;
3536
3537 assert(database);
3538 assert(key);
3539 assert(rpid);
3540
4a62c710
MS
3541 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3542 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3543
3544 pid = fork();
4a62c710
MS
3545 if (pid < 0)
3546 return log_error_errno(errno, "Failed to fork getent child: %m");
3547 else if (pid == 0) {
0cb9fbcd
LP
3548 int nullfd;
3549 char *empty_env = NULL;
3550
3551 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3552 _exit(EXIT_FAILURE);
3553
3554 if (pipe_fds[0] > 2)
03e334a1 3555 safe_close(pipe_fds[0]);
0cb9fbcd 3556 if (pipe_fds[1] > 2)
03e334a1 3557 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3558
3559 nullfd = open("/dev/null", O_RDWR);
3560 if (nullfd < 0)
3561 _exit(EXIT_FAILURE);
3562
3563 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3564 _exit(EXIT_FAILURE);
3565
3566 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3567 _exit(EXIT_FAILURE);
3568
3569 if (nullfd > 2)
03e334a1 3570 safe_close(nullfd);
0cb9fbcd
LP
3571
3572 reset_all_signal_handlers();
3573 close_all_fds(NULL, 0);
3574
4de82926
MM
3575 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3576 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3577 _exit(EXIT_FAILURE);
3578 }
3579
03e334a1 3580 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3581
3582 *rpid = pid;
3583
3584 return pipe_fds[0];
3585}
3586
3587static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3588 char line[LINE_MAX], *x, *u, *g, *h;
3589 const char *word, *state;
0cb9fbcd
LP
3590 _cleanup_free_ uid_t *uids = NULL;
3591 _cleanup_free_ char *home = NULL;
3592 _cleanup_fclose_ FILE *f = NULL;
3593 _cleanup_close_ int fd = -1;
3594 unsigned n_uids = 0;
70f539ca 3595 size_t sz = 0, l;
0cb9fbcd
LP
3596 uid_t uid;
3597 gid_t gid;
3598 pid_t pid;
3599 int r;
3600
3601 assert(_home);
3602
3603 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3604 /* Reset everything fully to 0, just in case */
3605
4a62c710
MS
3606 if (setgroups(0, NULL) < 0)
3607 return log_error_errno(errno, "setgroups() failed: %m");
0cb9fbcd 3608
4a62c710
MS
3609 if (setresgid(0, 0, 0) < 0)
3610 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3611
4a62c710
MS
3612 if (setresuid(0, 0, 0) < 0)
3613 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3614
3615 *_home = NULL;
3616 return 0;
3617 }
3618
3619 /* First, get user credentials */
3620 fd = spawn_getent("passwd", arg_user, &pid);
3621 if (fd < 0)
3622 return fd;
3623
3624 f = fdopen(fd, "r");
3625 if (!f)
3626 return log_oom();
3627 fd = -1;
3628
3629 if (!fgets(line, sizeof(line), f)) {
3630
3631 if (!ferror(f)) {
3632 log_error("Failed to resolve user %s.", arg_user);
3633 return -ESRCH;
3634 }
3635
56f64d95 3636 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3637 return -errno;
3638 }
3639
3640 truncate_nl(line);
3641
820d3acf 3642 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3643
3644 x = strchr(line, ':');
3645 if (!x) {
3646 log_error("/etc/passwd entry has invalid user field.");
3647 return -EIO;
3648 }
3649
3650 u = strchr(x+1, ':');
3651 if (!u) {
3652 log_error("/etc/passwd entry has invalid password field.");
3653 return -EIO;
3654 }
3655
3656 u++;
3657 g = strchr(u, ':');
3658 if (!g) {
3659 log_error("/etc/passwd entry has invalid UID field.");
3660 return -EIO;
3661 }
3662
3663 *g = 0;
3664 g++;
3665 x = strchr(g, ':');
3666 if (!x) {
3667 log_error("/etc/passwd entry has invalid GID field.");
3668 return -EIO;
3669 }
3670
3671 *x = 0;
3672 h = strchr(x+1, ':');
3673 if (!h) {
3674 log_error("/etc/passwd entry has invalid GECOS field.");
3675 return -EIO;
3676 }
3677
3678 h++;
3679 x = strchr(h, ':');
3680 if (!x) {
3681 log_error("/etc/passwd entry has invalid home directory field.");
3682 return -EIO;
3683 }
3684
3685 *x = 0;
3686
3687 r = parse_uid(u, &uid);
3688 if (r < 0) {
3689 log_error("Failed to parse UID of user.");
3690 return -EIO;
3691 }
3692
3693 r = parse_gid(g, &gid);
3694 if (r < 0) {
3695 log_error("Failed to parse GID of user.");
3696 return -EIO;
3697 }
3698
3699 home = strdup(h);
3700 if (!home)
3701 return log_oom();
3702
3703 /* Second, get group memberships */
3704 fd = spawn_getent("initgroups", arg_user, &pid);
3705 if (fd < 0)
3706 return fd;
3707
3708 fclose(f);
3709 f = fdopen(fd, "r");
3710 if (!f)
3711 return log_oom();
3712 fd = -1;
3713
3714 if (!fgets(line, sizeof(line), f)) {
3715 if (!ferror(f)) {
3716 log_error("Failed to resolve user %s.", arg_user);
3717 return -ESRCH;
3718 }
3719
56f64d95 3720 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3721 return -errno;
3722 }
3723
3724 truncate_nl(line);
3725
820d3acf 3726 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3727
3728 /* Skip over the username and subsequent separator whitespace */
3729 x = line;
3730 x += strcspn(x, WHITESPACE);
3731 x += strspn(x, WHITESPACE);
3732
a2a5291b 3733 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3734 char c[l+1];
3735
a2a5291b 3736 memcpy(c, word, l);
0cb9fbcd
LP
3737 c[l] = 0;
3738
3739 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3740 return log_oom();
3741
3742 r = parse_uid(c, &uids[n_uids++]);
3743 if (r < 0) {
3744 log_error("Failed to parse group data from getent.");
3745 return -EIO;
3746 }
3747 }
3748
3749 r = mkdir_parents(home, 0775);
f647962d
MS
3750 if (r < 0)
3751 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3752
3753 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3754 if (r < 0 && r != -EEXIST)
3755 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd
LP
3756
3757 fchown(STDIN_FILENO, uid, gid);
3758 fchown(STDOUT_FILENO, uid, gid);
3759 fchown(STDERR_FILENO, uid, gid);
3760
4a62c710
MS
3761 if (setgroups(n_uids, uids) < 0)
3762 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3763
4a62c710
MS
3764 if (setresgid(gid, gid, gid) < 0)
3765 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3766
4a62c710
MS
3767 if (setresuid(uid, uid, uid) < 0)
3768 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3769
3770 if (_home) {
3771 *_home = home;
3772 home = NULL;
3773 }
3774
3775 return 0;
3776}
3777
113cea80 3778/*
6d416b9c
LS
3779 * Return values:
3780 * < 0 : wait_for_terminate() failed to get the state of the
3781 * container, the container was terminated by a signal, or
3782 * failed for an unknown reason. No change is made to the
3783 * container argument.
3784 * > 0 : The program executed in the container terminated with an
3785 * error. The exit code of the program executed in the
919699ec
LP
3786 * container is returned. The container argument has been set
3787 * to CONTAINER_TERMINATED.
6d416b9c
LS
3788 * 0 : The container is being rebooted, has been shut down or exited
3789 * successfully. The container argument has been set to either
3790 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3791 *
6d416b9c
LS
3792 * That is, success is indicated by a return value of zero, and an
3793 * error is indicated by a non-zero value.
113cea80
DH
3794 */
3795static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3796 siginfo_t status;
919699ec 3797 int r;
113cea80
DH
3798
3799 r = wait_for_terminate(pid, &status);
f647962d
MS
3800 if (r < 0)
3801 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3802
3803 switch (status.si_code) {
fddbb89c 3804
113cea80 3805 case CLD_EXITED:
919699ec
LP
3806 if (status.si_status == 0) {
3807 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3808
fddbb89c 3809 } else
919699ec 3810 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3811
919699ec
LP
3812 *container = CONTAINER_TERMINATED;
3813 return status.si_status;
113cea80
DH
3814
3815 case CLD_KILLED:
3816 if (status.si_status == SIGINT) {
113cea80 3817
919699ec 3818 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3819 *container = CONTAINER_TERMINATED;
919699ec
LP
3820 return 0;
3821
113cea80 3822 } else if (status.si_status == SIGHUP) {
113cea80 3823
919699ec 3824 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3825 *container = CONTAINER_REBOOTED;
919699ec 3826 return 0;
113cea80 3827 }
919699ec 3828
113cea80
DH
3829 /* CLD_KILLED fallthrough */
3830
3831 case CLD_DUMPED:
fddbb89c 3832 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3833 return -EIO;
113cea80
DH
3834
3835 default:
fddbb89c 3836 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3837 return -EIO;
113cea80
DH
3838 }
3839
3840 return r;
3841}
3842
e866af3a
DH
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
3844
023fb90b
LP
3845static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3846 pid_t pid;
3847
3848 pid = PTR_TO_UINT32(userdata);
3849 if (pid > 0) {
c6c8f6e2 3850 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
3851 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3852 sd_event_source_set_userdata(s, NULL);
3853 return 0;
3854 }
3855 }
3856
3857 sd_event_exit(sd_event_source_get_event(s), 0);
3858 return 0;
3859}
3860
ec16945e 3861static int determine_names(void) {
1b9cebf6 3862 int r;
ec16945e
LP
3863
3864 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3865 if (arg_machine) {
3866 _cleanup_(image_unrefp) Image *i = NULL;
3867
3868 r = image_find(arg_machine, &i);
3869 if (r < 0)
3870 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3871 else if (r == 0) {
3872 log_error("No image for machine '%s': %m", arg_machine);
3873 return -ENOENT;
3874 }
3875
aceac2f0 3876 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3877 r = set_sanitized_path(&arg_image, i->path);
3878 else
3879 r = set_sanitized_path(&arg_directory, i->path);
3880 if (r < 0)
3881 return log_error_errno(r, "Invalid image directory: %m");
3882
aee327b8
LP
3883 if (!arg_ephemeral)
3884 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 3885 } else
ec16945e
LP
3886 arg_directory = get_current_dir_name();
3887
1b9cebf6
LP
3888 if (!arg_directory && !arg_machine) {
3889 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3890 return -EINVAL;
3891 }
3892 }
3893
3894 if (!arg_machine) {
b9ba4dab
LP
3895 if (arg_directory && path_equal(arg_directory, "/"))
3896 arg_machine = gethostname_malloc();
3897 else
3898 arg_machine = strdup(basename(arg_image ?: arg_directory));
3899
ec16945e
LP
3900 if (!arg_machine)
3901 return log_oom();
3902
3903 hostname_cleanup(arg_machine, false);
3904 if (!machine_name_is_valid(arg_machine)) {
3905 log_error("Failed to determine machine name automatically, please use -M.");
3906 return -EINVAL;
3907 }
b9ba4dab
LP
3908
3909 if (arg_ephemeral) {
3910 char *b;
3911
3912 /* Add a random suffix when this is an
3913 * ephemeral machine, so that we can run many
3914 * instances at once without manually having
3915 * to specify -M each time. */
3916
3917 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3918 return log_oom();
3919
3920 free(arg_machine);
3921 arg_machine = b;
3922 }
ec16945e
LP
3923 }
3924
3925 return 0;
3926}
3927
6dac160c
LP
3928static int determine_uid_shift(void) {
3929 int r;
3930
3931 if (!arg_userns)
3932 return 0;
3933
3934 if (arg_uid_shift == UID_INVALID) {
3935 struct stat st;
3936
3937 r = stat(arg_directory, &st);
3938 if (r < 0)
3939 return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3940
3941 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3942
3943 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3944 log_error("UID and GID base of %s don't match.", arg_directory);
3945 return -EINVAL;
3946 }
3947
3948 arg_uid_range = UINT32_C(0x10000);
3949 }
3950
3951 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3952 log_error("UID base too high for UID range.");
3953 return -EINVAL;
3954 }
3955
3956 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3957 return 0;
3958}
3959
88213476 3960int main(int argc, char *argv[]) {
69c79d3c 3961
611b312b 3962 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
727fd4fd 3963 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
63cc4c31 3964 _cleanup_close_ int master = -1, image_fd = -1;
69c79d3c 3965 _cleanup_fdset_free_ FDSet *fds = NULL;
ec16945e 3966 int r, n_fd_passed, loop_nr = -1;
1b9e5b12 3967 char veth_name[IFNAMSIZ];
ec16945e 3968 bool secondary = false, remove_subvol = false;
e866af3a 3969 sigset_t mask, mask_chld;
69c79d3c 3970 pid_t pid = 0;
ec16945e 3971 int ret = EXIT_SUCCESS;
6d0b55c2 3972 union in_addr_union exposed = {};
30535c16 3973 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
9c857b9d 3974 bool interactive;
88213476
LP
3975
3976 log_parse_environment();
3977 log_open();
3978
ec16945e
LP
3979 r = parse_argv(argc, argv);
3980 if (r <= 0)
88213476 3981 goto finish;
88213476 3982
ec16945e
LP
3983 r = determine_names();
3984 if (r < 0)
3985 goto finish;
7027ff61 3986
88213476
LP
3987 if (geteuid() != 0) {
3988 log_error("Need to be root.");
ec16945e 3989 r = -EPERM;
88213476
LP
3990 goto finish;
3991 }
3992
1b9e5b12
LP
3993 log_close();
3994 n_fd_passed = sd_listen_fds(false);
3995 if (n_fd_passed > 0) {
ec16945e
LP
3996 r = fdset_new_listen_fds(&fds, false);
3997 if (r < 0) {
3998 log_error_errno(r, "Failed to collect file descriptors: %m");
1b9e5b12
LP
3999 goto finish;
4000 }
88213476 4001 }
1b9e5b12
LP
4002 fdset_close_others(fds);
4003 log_open();
88213476 4004
1b9e5b12 4005 if (arg_directory) {
ec16945e
LP
4006 assert(!arg_image);
4007
c4e34a61
LP
4008 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4009 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
ec16945e 4010 r = -EINVAL;
6b9132a9
LP
4011 goto finish;
4012 }
1b9e5b12 4013
30535c16 4014 if (arg_ephemeral) {
8a16a7b4 4015 _cleanup_free_ char *np = NULL;
ec16945e 4016
c4e34a61
LP
4017 /* If the specified path is a mount point we
4018 * generate the new snapshot immediately
4019 * inside it under a random name. However if
4020 * the specified is not a mount point we
4021 * create the new snapshot in the parent
4022 * directory, just next to it. */
4023 r = path_is_mount_point(arg_directory, false);
4024 if (r < 0) {
4025 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4026 goto finish;
4027 }
4028 if (r > 0)
4029 r = tempfn_random_child(arg_directory, &np);
4030 else
4031 r = tempfn_random(arg_directory, &np);
ec16945e
LP
4032 if (r < 0) {
4033 log_error_errno(r, "Failed to generate name for snapshot: %m");
4034 goto finish;
4035 }
4036
30535c16
LP
4037 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4038 if (r < 0) {
4039 log_error_errno(r, "Failed to lock %s: %m", np);
4040 goto finish;
4041 }
4042
f70a17f8 4043 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
ec16945e 4044 if (r < 0) {
ec16945e
LP
4045 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4046 goto finish;
4047 }
4048
4049 free(arg_directory);
4050 arg_directory = np;
8a16a7b4 4051 np = NULL;
ec16945e
LP
4052
4053 remove_subvol = true;
30535c16
LP
4054
4055 } else {
4056 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4057 if (r == -EBUSY) {
4058 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4059 goto finish;
4060 }
4061 if (r < 0) {
4062 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4063 return r;
4064 }
4065
4066 if (arg_template) {
f70a17f8 4067 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
30535c16
LP
4068 if (r == -EEXIST) {
4069 if (!arg_quiet)
4070 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4071 } else if (r < 0) {
83521414 4072 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4073 goto finish;
4074 } else {
4075 if (!arg_quiet)
4076 log_info("Populated %s from template %s.", arg_directory, arg_template);
4077 }
4078 }
ec16945e
LP
4079 }
4080
1b9e5b12
LP
4081 if (arg_boot) {
4082 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 4083 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 4084 r = -EINVAL;
1b9e5b12
LP
4085 goto finish;
4086 }
4087 } else {
4088 const char *p;
4089
63c372cb 4090 p = strjoina(arg_directory,
1b9e5b12
LP
4091 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4092 if (access(p, F_OK) < 0) {
4093 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 4094 r = -EINVAL;
1b9e5b12 4095 goto finish;
1b9e5b12
LP
4096 }
4097 }
ec16945e 4098
6b9132a9 4099 } else {
1b9e5b12 4100 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 4101
ec16945e
LP
4102 assert(arg_image);
4103 assert(!arg_template);
4104
30535c16
LP
4105 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4106 if (r == -EBUSY) {
4107 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4108 goto finish;
4109 }
4110 if (r < 0) {
4111 r = log_error_errno(r, "Failed to create image lock: %m");
4112 goto finish;
4113 }
4114
1b9e5b12 4115 if (!mkdtemp(template)) {
56f64d95 4116 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 4117 r = -errno;
6b9132a9 4118 goto finish;
1b9e5b12 4119 }
6b9132a9 4120
1b9e5b12
LP
4121 arg_directory = strdup(template);
4122 if (!arg_directory) {
4123 r = log_oom();
4124 goto finish;
6b9132a9 4125 }
88213476 4126
1b9e5b12
LP
4127 image_fd = setup_image(&device_path, &loop_nr);
4128 if (image_fd < 0) {
4129 r = image_fd;
842f3b0f
LP
4130 goto finish;
4131 }
1b9e5b12 4132
4d9f07b4
LP
4133 r = dissect_image(image_fd,
4134 &root_device, &root_device_rw,
4135 &home_device, &home_device_rw,
4136 &srv_device, &srv_device_rw,
4137 &secondary);
1b9e5b12
LP
4138 if (r < 0)
4139 goto finish;
842f3b0f 4140 }
842f3b0f 4141
6dac160c
LP
4142 r = determine_uid_shift();
4143 if (r < 0)
4144 goto finish;
4145
5a8af538
LP
4146 r = custom_mounts_prepare();
4147 if (r < 0)
4148 goto finish;
4149
9c857b9d
LP
4150 interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
4151
db7feb7e
LP
4152 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4153 if (master < 0) {
ec16945e 4154 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4155 goto finish;
4156 }
4157
611b312b
LP
4158 r = ptsname_malloc(master, &console);
4159 if (r < 0) {
4160 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
4161 goto finish;
4162 }
4163
a258bf26 4164 if (unlockpt(master) < 0) {
ec16945e 4165 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4166 goto finish;
4167 }
4168
9c857b9d
LP
4169 if (!arg_quiet)
4170 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4171 arg_machine, arg_image ?: arg_directory);
4172
a258bf26
LP
4173 assert_se(sigemptyset(&mask) == 0);
4174 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
4175 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
4176
023fb90b
LP
4177 assert_se(sigemptyset(&mask_chld) == 0);
4178 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4179
d87be9b0 4180 for (;;) {
6d0b55c2 4181 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
113cea80 4182 ContainerStatus container_status;
7566e267 4183 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e866af3a
DH
4184 struct sigaction sa = {
4185 .sa_handler = nop_handler,
4186 .sa_flags = SA_NOCLDSTOP,
4187 };
4188
7566e267 4189 r = barrier_create(&barrier);
a2da110b 4190 if (r < 0) {
da927ba9 4191 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
4192 goto finish;
4193 }
4194
6d0b55c2
LP
4195 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4196 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4197 goto finish;
4198 }
4199
4200 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4201 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4202 goto finish;
4203 }
4204
e866af3a
DH
4205 /* Child can be killed before execv(), so handle SIGCHLD
4206 * in order to interrupt parent's blocking calls and
4207 * give it a chance to call wait() and terminate. */
4208 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4209 if (r < 0) {
ec16945e 4210 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
4211 goto finish;
4212 }
4213
e866af3a
DH
4214 r = sigaction(SIGCHLD, &sa, NULL);
4215 if (r < 0) {
ec16945e 4216 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
4217 goto finish;
4218 }
4219
60e1651a
KW
4220 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4221 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
4222 (arg_private_network ? CLONE_NEWNET : 0), NULL);
d87be9b0
LP
4223 if (pid < 0) {
4224 if (errno == EINVAL)
ec16945e 4225 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 4226 else
ec16945e 4227 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 4228
d87be9b0
LP
4229 goto finish;
4230 }
a258bf26 4231
d87be9b0
LP
4232 if (pid == 0) {
4233 /* child */
0cb9fbcd 4234 _cleanup_free_ char *home = NULL;
5674767e 4235 unsigned n_env = 2;
d87be9b0 4236 const char *envp[] = {
e10a55fd 4237 "PATH=" DEFAULT_PATH_SPLIT_USR,
d87be9b0
LP
4238 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4239 NULL, /* TERM */
4240 NULL, /* HOME */
4241 NULL, /* USER */
4242 NULL, /* LOGNAME */
4243 NULL, /* container_uuid */
842f3b0f
LP
4244 NULL, /* LISTEN_FDS */
4245 NULL, /* LISTEN_PID */
d87be9b0
LP
4246 NULL
4247 };
f4889f65 4248 char **env_use;
a258bf26 4249
a2da110b
DH
4250 barrier_set_role(&barrier, BARRIER_CHILD);
4251
5674767e
ZJS
4252 envp[n_env] = strv_find_prefix(environ, "TERM=");
4253 if (envp[n_env])
4254 n_env ++;
a258bf26 4255
03e334a1 4256 master = safe_close(master);
a258bf26 4257
03e334a1 4258 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 4259 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
a258bf26 4260
d87be9b0 4261 reset_all_signal_handlers();
1b6d7fa7 4262 reset_signal_mask();
f5c1b9ee 4263
9c857b9d
LP
4264 if (interactive) {
4265 close_nointr(STDIN_FILENO);
4266 close_nointr(STDOUT_FILENO);
4267 close_nointr(STDERR_FILENO);
842f3b0f 4268
9c857b9d
LP
4269 r = open_terminal(console, O_RDWR);
4270 if (r != STDIN_FILENO) {
4271 if (r >= 0) {
4272 safe_close(r);
4273 r = -EINVAL;
4274 }
842f3b0f 4275
9c857b9d
LP
4276 log_error_errno(r, "Failed to open console: %m");
4277 _exit(EXIT_FAILURE);
4278 }
4279
4280 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4281 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
4282 log_error_errno(errno, "Failed to duplicate console: %m");
4283 _exit(EXIT_FAILURE);
4284 }
842f3b0f 4285 }
bc2f673e 4286
d87be9b0 4287 if (setsid() < 0) {
56f64d95 4288 log_error_errno(errno, "setsid() failed: %m");
a2da110b 4289 _exit(EXIT_FAILURE);
bc2f673e
LP
4290 }
4291
db999e0f 4292 if (reset_audit_loginuid() < 0)
a2da110b 4293 _exit(EXIT_FAILURE);
db999e0f 4294
d87be9b0 4295 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
56f64d95 4296 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
a2da110b 4297 _exit(EXIT_FAILURE);
d87be9b0 4298 }
e58a1277 4299
6dac160c
LP
4300 if (arg_private_network)
4301 loopback_setup();
4302
d87be9b0
LP
4303 /* Mark everything as slave, so that we still
4304 * receive mounts from the real root, but don't
4305 * propagate mounts to the real root. */
4306 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
56f64d95 4307 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
a2da110b 4308 _exit(EXIT_FAILURE);
d87be9b0 4309 }
04bc4a3f 4310
727fd4fd
LP
4311 if (mount_devices(arg_directory,
4312 root_device, root_device_rw,
4313 home_device, home_device_rw,
4314 srv_device, srv_device_rw) < 0)
a2da110b 4315 _exit(EXIT_FAILURE);
1b9e5b12 4316
d87be9b0 4317 /* Turn directory into bind mount */
4543768d 4318 if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
56f64d95 4319 log_error_errno(errno, "Failed to make bind mount: %m");
a2da110b 4320 _exit(EXIT_FAILURE);
d87be9b0 4321 }
88213476 4322
4d9f07b4
LP
4323 r = setup_volatile(arg_directory);
4324 if (r < 0)
a2da110b 4325 _exit(EXIT_FAILURE);
4d9f07b4
LP
4326
4327 if (setup_volatile_state(arg_directory) < 0)
a2da110b 4328 _exit(EXIT_FAILURE);
4d9f07b4
LP
4329
4330 r = base_filesystem_create(arg_directory);
4331 if (r < 0)
a2da110b 4332 _exit(EXIT_FAILURE);
4d9f07b4 4333
d6797c92 4334 if (arg_read_only) {
ec16945e
LP
4335 r = bind_remount_recursive(arg_directory, true);
4336 if (r < 0) {
4337 log_error_errno(r, "Failed to make tree read-only: %m");
a2da110b 4338 _exit(EXIT_FAILURE);
d87be9b0 4339 }
d6797c92 4340 }
2547bb41 4341
d87be9b0 4342 if (mount_all(arg_directory) < 0)
a2da110b 4343 _exit(EXIT_FAILURE);
57fb9fb5 4344
d87be9b0 4345 if (copy_devnodes(arg_directory) < 0)
a2da110b 4346 _exit(EXIT_FAILURE);
a258bf26 4347
f2d88580 4348 if (setup_ptmx(arg_directory) < 0)
a2da110b 4349 _exit(EXIT_FAILURE);
f2d88580 4350
d87be9b0 4351 dev_setup(arg_directory);
88213476 4352
785890ac
LP
4353 if (setup_propagate(arg_directory) < 0)
4354 _exit(EXIT_FAILURE);
4355
28650077 4356 if (setup_seccomp() < 0)
a2da110b 4357 _exit(EXIT_FAILURE);
24fb1112 4358
d87be9b0 4359 if (setup_dev_console(arg_directory, console) < 0)
a2da110b 4360 _exit(EXIT_FAILURE);
88213476 4361
d87be9b0 4362 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
a2da110b 4363 _exit(EXIT_FAILURE);
03e334a1 4364 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
a258bf26 4365
6d0b55c2
LP
4366 if (send_rtnl(rtnl_socket_pair[1]) < 0)
4367 _exit(EXIT_FAILURE);
4368 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4369
b12afc8c
LP
4370 /* Tell the parent that we are ready, and that
4371 * it can cgroupify us so that we lack access
4372 * to certain devices and resources. */
6dac160c 4373 (void) barrier_place(&barrier); /* #1 */
b12afc8c 4374
d87be9b0 4375 if (setup_boot_id(arg_directory) < 0)
a2da110b 4376 _exit(EXIT_FAILURE);
a41fe3a2 4377
d87be9b0 4378 if (setup_timezone(arg_directory) < 0)
a2da110b 4379 _exit(EXIT_FAILURE);
88213476 4380
d87be9b0 4381 if (setup_resolv_conf(arg_directory) < 0)
a2da110b 4382 _exit(EXIT_FAILURE);
687d0825 4383
d87be9b0 4384 if (setup_journal(arg_directory) < 0)
a2da110b 4385 _exit(EXIT_FAILURE);
687d0825 4386
5a8af538 4387 if (mount_custom(arg_directory) < 0)
a2da110b 4388 _exit(EXIT_FAILURE);
06c17c39 4389
b12afc8c
LP
4390 /* Wait until we are cgroup-ified, so that we
4391 * can mount the right cgroup path writable */
6dac160c 4392 (void) barrier_place_and_sync(&barrier); /* #2 */
b12afc8c
LP
4393
4394 if (mount_cgroup(arg_directory) < 0)
4395 _exit(EXIT_FAILURE);
d96c1ecf 4396
d87be9b0 4397 if (chdir(arg_directory) < 0) {
56f64d95 4398 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
a2da110b 4399 _exit(EXIT_FAILURE);
687d0825
MV
4400 }
4401
d87be9b0 4402 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
56f64d95 4403 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
a2da110b 4404 _exit(EXIT_FAILURE);
687d0825
MV
4405 }
4406
d87be9b0 4407 if (chroot(".") < 0) {
56f64d95 4408 log_error_errno(errno, "chroot() failed: %m");
a2da110b 4409 _exit(EXIT_FAILURE);
687d0825
MV
4410 }
4411
d87be9b0 4412 if (chdir("/") < 0) {
56f64d95 4413 log_error_errno(errno, "chdir() failed: %m");
a2da110b 4414 _exit(EXIT_FAILURE);
687d0825
MV
4415 }
4416
6dac160c
LP
4417 if (arg_userns) {
4418 if (unshare(CLONE_NEWUSER) < 0) {
4419 log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4420 _exit(EXIT_FAILURE);
4421 }
d87be9b0 4422
6dac160c
LP
4423 /* Tell the parent, that it now can
4424 * write the UID map. */
4425 (void) barrier_place(&barrier); /* #3 */
4426
4427 /* Wait until the parent wrote the UID
4428 * map */
4429 (void) barrier_place_and_sync(&barrier); /* #4 */
4430 }
4431
4432 umask(0022);
d87be9b0
LP
4433
4434 if (drop_capabilities() < 0) {
56f64d95 4435 log_error_errno(errno, "drop_capabilities() failed: %m");
a2da110b 4436 _exit(EXIT_FAILURE);
687d0825 4437 }
687d0825 4438
6dac160c
LP
4439 setup_hostname();
4440
4441 if (arg_personality != 0xffffffffLU) {
4442 if (personality(arg_personality) < 0) {
4443 log_error_errno(errno, "personality() failed: %m");
4444 _exit(EXIT_FAILURE);
4445 }
4446 } else if (secondary) {
4447 if (personality(PER_LINUX32) < 0) {
4448 log_error_errno(errno, "personality() failed: %m");
4449 _exit(EXIT_FAILURE);
4450 }
4451 }
4452
4453#ifdef HAVE_SELINUX
4454 if (arg_selinux_context)
4455 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4456 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4457 _exit(EXIT_FAILURE);
4458 }
4459#endif
4460
0cb9fbcd
LP
4461 r = change_uid_gid(&home);
4462 if (r < 0)
a2da110b 4463 _exit(EXIT_FAILURE);
d87be9b0 4464
842f3b0f
LP
4465 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4466 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4467 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
0d0f0c50 4468 log_oom();
a2da110b 4469 _exit(EXIT_FAILURE);
144f0fc0 4470 }
687d0825 4471
9444b1f2 4472 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
9f24adc2
LP
4473 char as_uuid[37];
4474
4475 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
842f3b0f 4476 log_oom();
a2da110b 4477 _exit(EXIT_FAILURE);
842f3b0f
LP
4478 }
4479 }
4480
4481 if (fdset_size(fds) > 0) {
ec16945e
LP
4482 r = fdset_cloexec(fds, false);
4483 if (r < 0) {
4484 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
a2da110b 4485 _exit(EXIT_FAILURE);
842f3b0f
LP
4486 }
4487
4488 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
d1826146 4489 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
d87be9b0 4490 log_oom();
a2da110b 4491 _exit(EXIT_FAILURE);
d87be9b0
LP
4492 }
4493 }
4494
f4889f65
LP
4495 if (!strv_isempty(arg_setenv)) {
4496 char **n;
4497
4498 n = strv_env_merge(2, envp, arg_setenv);
4499 if (!n) {
4500 log_oom();
a2da110b 4501 _exit(EXIT_FAILURE);
f4889f65
LP
4502 }
4503
4504 env_use = n;
4505 } else
4506 env_use = (char**) envp;
4507
6dac160c
LP
4508 /* Let the parent know that we are ready and
4509 * wait until the parent is ready with the
4510 * setup, too... */
4511 (void) barrier_place_and_sync(&barrier); /* #5 */
d96c1ecf 4512
d87be9b0
LP
4513 if (arg_boot) {
4514 char **a;
4515 size_t l;
88213476 4516
d87be9b0 4517 /* Automatically search for the init system */
0f0dbc46 4518
d87be9b0
LP
4519 l = 1 + argc - optind;
4520 a = newa(char*, l + 1);
4521 memcpy(a + 1, argv + optind, l * sizeof(char*));
0f0dbc46 4522
d87be9b0 4523 a[0] = (char*) "/usr/lib/systemd/systemd";
f4889f65 4524 execve(a[0], a, env_use);
0f0dbc46 4525
d87be9b0 4526 a[0] = (char*) "/lib/systemd/systemd";
f4889f65 4527 execve(a[0], a, env_use);
0f0dbc46 4528
d87be9b0 4529 a[0] = (char*) "/sbin/init";
f4889f65 4530 execve(a[0], a, env_use);
d87be9b0 4531 } else if (argc > optind)
f4889f65 4532 execvpe(argv[optind], argv + optind, env_use);
d87be9b0
LP
4533 else {
4534 chdir(home ? home : "/root");
f4889f65 4535 execle("/bin/bash", "-bash", NULL, env_use);
262d10e6 4536 execle("/bin/sh", "-sh", NULL, env_use);
d87be9b0
LP
4537 }
4538
56f64d95 4539 log_error_errno(errno, "execv() failed: %m");
d87be9b0 4540 _exit(EXIT_FAILURE);
da5b3bad 4541 }
88213476 4542
a2da110b 4543 barrier_set_role(&barrier, BARRIER_PARENT);
842f3b0f
LP
4544 fdset_free(fds);
4545 fds = NULL;
4546
6d0b55c2
LP
4547 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4548 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4549
6dac160c
LP
4550 (void) barrier_place(&barrier); /* #1 */
4551
b12afc8c
LP
4552 /* Wait for the most basic Child-setup to be done,
4553 * before we add hardware to it, and place it in a
4554 * cgroup. */
6dac160c 4555 if (barrier_sync(&barrier)) { /* #1 */
5aa4bb6b 4556 int ifi = 0;
354bfd2b 4557
840295fc
LP
4558 r = move_network_interfaces(pid);
4559 if (r < 0)
4560 goto finish;
aa28aefe 4561
5aa4bb6b 4562 r = setup_veth(pid, veth_name, &ifi);
840295fc
LP
4563 if (r < 0)
4564 goto finish;
ab046dde 4565
5aa4bb6b 4566 r = setup_bridge(veth_name, &ifi);
840295fc
LP
4567 if (r < 0)
4568 goto finish;
ab046dde 4569
840295fc
LP
4570 r = setup_macvlan(pid);
4571 if (r < 0)
4572 goto finish;
c74e630d 4573
4bbfe7ad
TG
4574 r = setup_ipvlan(pid);
4575 if (r < 0)
4576 goto finish;
4577
5aa4bb6b
LP
4578 r = register_machine(pid, ifi);
4579 if (r < 0)
4580 goto finish;
4581
6dac160c
LP
4582 /* Notify the child that the parent is ready with all
4583 * its setup, and that the child can now hand over
4584 * control to the code to run inside the container. */
4585 (void) barrier_place(&barrier); /* #2 */
4586
4587 if (arg_userns) {
4588 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4589
4590 (void) barrier_place_and_sync(&barrier); /* #3 */
4591
4592 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4593 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4594 r = write_string_file(uid_map, line);
4595 if (r < 0) {
4596 log_error_errno(r, "Failed to write UID map: %m");
4597 goto finish;
4598 }
4599
4600 /* We always assign the same UID and GID ranges */
4601 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4602 r = write_string_file(uid_map, line);
4603 if (r < 0) {
4604 log_error_errno(r, "Failed to write GID map: %m");
4605 goto finish;
4606 }
4607
4608 (void) barrier_place(&barrier); /* #4 */
4609 }
4610
840295fc
LP
4611 /* Block SIGCHLD here, before notifying child.
4612 * process_pty() will handle it with the other signals. */
4613 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4614 if (r < 0)
4615 goto finish;
e866af3a 4616
840295fc
LP
4617 /* Reset signal to default */
4618 r = default_signals(SIGCHLD, -1);
4619 if (r < 0)
4620 goto finish;
e866af3a 4621
6dac160c
LP
4622 /* Let the child know that we are ready and wait until the child is completely ready now. */
4623 if (barrier_place_and_sync(&barrier)) { /* #5 */
6d0b55c2
LP
4624 _cleanup_event_unref_ sd_event *event = NULL;
4625 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4626 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4627 char last_char = 0;
b12afc8c 4628
733d15ac
LP
4629 sd_notifyf(false,
4630 "READY=1\n"
4631 "STATUS=Container running.\n"
4632 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4633
6d0b55c2
LP
4634 r = sd_event_new(&event);
4635 if (r < 0) {
4636 log_error_errno(r, "Failed to get default event source: %m");
4637 goto finish;
4638 }
88213476 4639
c6c8f6e2 4640 if (arg_kill_signal > 0) {
6d0b55c2
LP
4641 /* Try to kill the init system on SIGINT or SIGTERM */
4642 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4643 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4644 } else {
4645 /* Immediately exit */
4646 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4647 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4648 }
023fb90b 4649
6d0b55c2
LP
4650 /* simply exit on sigchld */
4651 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4652
6d0b55c2
LP
4653 if (arg_expose_ports) {
4654 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4655 if (r < 0)
4656 goto finish;
023fb90b 4657
6d0b55c2
LP
4658 (void) expose_ports(rtnl, &exposed);
4659 }
023fb90b 4660
6d0b55c2 4661 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4662
9c857b9d 4663 r = pty_forward_new(event, master, true, !interactive, &forward);
6d0b55c2
LP
4664 if (r < 0) {
4665 log_error_errno(r, "Failed to create PTY forwarder: %m");
4666 goto finish;
4667 }
023fb90b 4668
6d0b55c2
LP
4669 r = sd_event_loop(event);
4670 if (r < 0) {
4671 log_error_errno(r, "Failed to run event loop: %m");
4672 goto finish;
4673 }
4674
4675 pty_forward_get_last_char(forward, &last_char);
4676
4677 forward = pty_forward_free(forward);
4678
4679 if (!arg_quiet && last_char != '\n')
4680 putc('\n', stdout);
04d39279 4681
6d0b55c2
LP
4682 /* Kill if it is not dead yet anyway */
4683 terminate_machine(pid);
4684 }
840295fc 4685 }
1f0cd86b 4686
840295fc 4687 /* Normally redundant, but better safe than sorry */
04d39279 4688 kill(pid, SIGKILL);
a258bf26 4689
113cea80 4690 r = wait_for_container(pid, &container_status);
04d39279
LP
4691 pid = 0;
4692
ec16945e 4693 if (r < 0)
ce9f1527
LP
4694 /* We failed to wait for the container, or the
4695 * container exited abnormally */
ec16945e
LP
4696 goto finish;
4697 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4698 /* The container exited with a non-zero
4699 * status, or with zero status and no reboot
4700 * was requested. */
ec16945e 4701 ret = r;
d87be9b0 4702 break;
ec16945e 4703 }
88213476 4704
113cea80 4705 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4706
4707 if (arg_keep_unit) {
4708 /* Special handling if we are running as a
4709 * service: instead of simply restarting the
4710 * machine we want to restart the entire
4711 * service, so let's inform systemd about this
4712 * with the special exit code 133. The service
4713 * file uses RestartForceExitStatus=133 so
4714 * that this results in a full nspawn
4715 * restart. This is necessary since we might
4716 * have cgroup parameters set we want to have
4717 * flushed out. */
ec16945e
LP
4718 ret = 133;
4719 r = 0;
ce38dbc8
LP
4720 break;
4721 }
6d0b55c2
LP
4722
4723 flush_ports(&exposed);
d87be9b0 4724 }
88213476
LP
4725
4726finish:
af4ec430
LP
4727 sd_notify(false,
4728 "STOPPING=1\n"
4729 "STATUS=Terminating...");
4730
1b9e5b12
LP
4731 loop_remove(loop_nr, &image_fd);
4732
9444b1f2
LP
4733 if (pid > 0)
4734 kill(pid, SIGKILL);
88213476 4735
ec16945e
LP
4736 if (remove_subvol && arg_directory) {
4737 int k;
4738
d9e2daaf 4739 k = btrfs_subvol_remove(arg_directory, true);
ec16945e
LP
4740 if (k < 0)
4741 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4742 }
4743
785890ac
LP
4744 if (arg_machine) {
4745 const char *p;
4746
63c372cb 4747 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4748 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4749 }
4750
04d391da 4751 free(arg_directory);
ec16945e
LP
4752 free(arg_template);
4753 free(arg_image);
7027ff61 4754 free(arg_machine);
c74e630d
LP
4755 free(arg_user);
4756 strv_free(arg_setenv);
4757 strv_free(arg_network_interfaces);
4758 strv_free(arg_network_macvlan);
4bbfe7ad 4759 strv_free(arg_network_ipvlan);
5a8af538 4760 custom_mount_free_all();
88213476 4761
6d0b55c2
LP
4762 flush_ports(&exposed);
4763
4764 while (arg_expose_ports) {
4765 ExposePort *p = arg_expose_ports;
4766 LIST_REMOVE(ports, arg_expose_ports, p);
4767 free(p);
4768 }
4769
ec16945e 4770 return r < 0 ? EXIT_FAILURE : ret;
88213476 4771}