]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
NEWS: give packagers a heads-up on the upcoming python bindings removal
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
1f0cd86b
LP
55#include "sd-daemon.h"
56#include "sd-bus.h"
57#include "sd-id128.h"
1c4baffc 58#include "sd-netlink.h"
958b66ea 59#include "random-util.h"
88213476
LP
60#include "log.h"
61#include "util.h"
49e942b2 62#include "mkdir.h"
c6878637 63#include "rm-rf.h"
6b2d0e85 64#include "macro.h"
94d82985 65#include "missing.h"
04d391da 66#include "cgroup-util.h"
a258bf26 67#include "strv.h"
9eb977db 68#include "path-util.h"
a41fe3a2 69#include "loopback-setup.h"
4fc9982c 70#include "dev-setup.h"
842f3b0f 71#include "fdset.h"
acbeb427 72#include "build.h"
a5c32cff 73#include "fileio.h"
40ca29a1 74#include "bus-util.h"
1f0cd86b 75#include "bus-error.h"
4ba93280 76#include "ptyfwd.h"
f4889f65 77#include "env-util.h"
1c4baffc 78#include "netlink-util.h"
7e227024 79#include "udev-util.h"
1b9e5b12
LP
80#include "blkid-util.h"
81#include "gpt.h"
01dde061 82#include "siphash24.h"
849958d1 83#include "copy.h"
3577de7a 84#include "base-filesystem.h"
a2da110b 85#include "barrier.h"
023fb90b 86#include "event-util.h"
f01ae826 87#include "capability.h"
2822da4f 88#include "cap-list.h"
ec16945e 89#include "btrfs-util.h"
1b9cebf6 90#include "machine-image.h"
6d0b55c2
LP
91#include "list.h"
92#include "in-addr-util.h"
12c2884c 93#include "firewall-util.h"
6d0b55c2 94#include "local-addresses.h"
6482f626 95#include "formats-util.h"
0b452006 96#include "process-util.h"
288a74cc 97#include "terminal-util.h"
958b66ea 98#include "hostname-util.h"
24882e06 99#include "signal-util.h"
f2d88580 100
e9642be2
LP
101#ifdef HAVE_SECCOMP
102#include "seccomp-util.h"
103#endif
104
6d0b55c2
LP
105typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110} ExposePort;
111
113cea80
DH
/* How a container run ended.
 * NOTE(review): the users of this enum are outside this part of the file;
 * the meaning is inferred from the enumerator names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED,
} ContainerStatus;
116
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* "no" */
        LINK_AUTO,      /* "auto", the default */
        LINK_HOST,      /* "host" / "try-host" */
        LINK_GUEST,     /* "guest" / "try-guest" / -j */
} LinkJournal;
88213476 123
4d9f07b4
LP
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO = 0,   /* default, or an explicit boolean false */
        VOLATILE_YES,      /* no argument, or a boolean true */
        VOLATILE_STATE     /* "state" */
} Volatile;
129
5a8af538
LP
/* Kind of user-requested mount. The numeric order is also used as the
 * tie-breaker when two mounts share the same destination. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND = 0,   /* --bind= / --bind-ro= */
        CUSTOM_MOUNT_TMPFS,      /* --tmpfs= */
        CUSTOM_MOUNT_OVERLAY     /* --overlay= / --overlay-ro= */
} CustomMountType;
135
136typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144} CustomMount;
145
88213476 146static char *arg_directory = NULL;
ec16945e 147static char *arg_template = NULL;
687d0825 148static char *arg_user = NULL;
9444b1f2 149static sd_id128_t arg_uuid = {};
7027ff61 150static char *arg_machine = NULL;
c74e630d
LP
151static const char *arg_selinux_context = NULL;
152static const char *arg_selinux_apifs_context = NULL;
9444b1f2 153static const char *arg_slice = NULL;
ff01d048 154static bool arg_private_network = false;
bc2f673e 155static bool arg_read_only = false;
0f0dbc46 156static bool arg_boot = false;
ec16945e 157static bool arg_ephemeral = false;
57fb9fb5 158static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 159static bool arg_link_journal_try = false;
5076f0cc
LP
160static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 182 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
5a8af538
LP
187static CustomMount *arg_custom_mounts = NULL;
188static unsigned arg_n_custom_mounts = 0;
f4889f65 189static char **arg_setenv = NULL;
284c0b91 190static bool arg_quiet = false;
8a96d94e 191static bool arg_share_system = false;
eb91eb18 192static bool arg_register = true;
89f7c846 193static bool arg_keep_unit = false;
aa28aefe 194static char **arg_network_interfaces = NULL;
c74e630d 195static char **arg_network_macvlan = NULL;
4bbfe7ad 196static char **arg_network_ipvlan = NULL;
69c79d3c 197static bool arg_network_veth = false;
c74e630d 198static const char *arg_network_bridge = NULL;
050f7277 199static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 200static char *arg_image = NULL;
4d9f07b4 201static Volatile arg_volatile = VOLATILE_NO;
6d0b55c2 202static ExposePort *arg_expose_ports = NULL;
f36933fe 203static char **arg_property = NULL;
6dac160c
LP
204static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205static bool arg_userns = false;
c6c8f6e2 206static int arg_kill_signal = 0;
88213476 207
/* Prints the command line synopsis and the list of supported options to
 * stdout. Option names start in column 2 (short) or 5 (long only);
 * descriptions start in column 28. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --property=NAME=VALUE  Set scope unit property\n"
               "     --private-users[=UIDBASE[:NUIDS]]\n"
               "                            Run within user namespace\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-ipvlan=INTERFACE\n"
               "                            Create an ipvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
               "                            try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               /* The options part is optional: a bare PATH gets "mode=0755" */
               "     --tmpfs=PATH[:OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --overlay=PATH[:PATH...]:PATH\n"
               "                            Create an overlay mount from the host to\n"
               "                            the container\n"
               "     --overlay-ro=PATH[:PATH...]:PATH\n"
               "                            Similar, but creates a read-only overlay mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --personality=ARCH     Pick personality for this container\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               , program_invocation_short_name);
}
277
5a8af538
LP
278static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292}
293
294static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 free(arg_custom_mounts);
313 arg_custom_mounts = NULL;
314 arg_n_custom_mounts = 0;
315}
316
317static int custom_mount_compare(const void *a, const void *b) {
318 const CustomMount *x = a, *y = b;
319 int r;
320
321 r = path_compare(x->destination, y->destination);
322 if (r != 0)
323 return r;
324
325 if (x->type < y->type)
326 return -1;
327 if (x->type > y->type)
328 return 1;
329
330 return 0;
331}
332
333static int custom_mounts_prepare(void) {
334 unsigned i;
335 int r;
336
337 /* Ensure the mounts are applied prefix first. */
338 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
339
340 /* Allocate working directories for the overlay file systems that need it */
341 for (i = 0; i < arg_n_custom_mounts; i++) {
342 CustomMount *m = &arg_custom_mounts[i];
343
825d5287
RM
344 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
345 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
346 return -EINVAL;
347 }
348
5a8af538
LP
349 if (m->type != CUSTOM_MOUNT_OVERLAY)
350 continue;
351
352 if (m->work_dir)
353 continue;
354
355 if (m->read_only)
356 continue;
357
14bcf25c 358 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
359 if (r < 0)
360 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
361 }
362
363 return 0;
364}
365
ec16945e
LP
/* Replaces the string in *b with a cleaned-up, absolute version of path.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT) it is
 * merely made absolute relative to the current working directory. The
 * previous value of *b is freed. Returns 0 on success, -ENOMEM if out of
 * memory, or the negative errno of a failed canonicalization. On failure *b
 * is left unmodified. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Nothing there (yet), the best we can do is an absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
386
88213476
LP
387static int parse_argv(int argc, char *argv[]) {
388
a41fe3a2 389 enum {
acbeb427
ZJS
390 ARG_VERSION = 0x100,
391 ARG_PRIVATE_NETWORK,
bc2f673e 392 ARG_UUID,
5076f0cc 393 ARG_READ_ONLY,
57fb9fb5 394 ARG_CAPABILITY,
420c7379 395 ARG_DROP_CAPABILITY,
17fe0523
LP
396 ARG_LINK_JOURNAL,
397 ARG_BIND,
f4889f65 398 ARG_BIND_RO,
06c17c39 399 ARG_TMPFS,
5a8af538
LP
400 ARG_OVERLAY,
401 ARG_OVERLAY_RO,
f4889f65 402 ARG_SETENV,
eb91eb18 403 ARG_SHARE_SYSTEM,
89f7c846 404 ARG_REGISTER,
aa28aefe 405 ARG_KEEP_UNIT,
69c79d3c 406 ARG_NETWORK_INTERFACE,
c74e630d 407 ARG_NETWORK_MACVLAN,
4bbfe7ad 408 ARG_NETWORK_IPVLAN,
ab046dde 409 ARG_NETWORK_BRIDGE,
6afc95b7 410 ARG_PERSONALITY,
4d9f07b4 411 ARG_VOLATILE,
ec16945e 412 ARG_TEMPLATE,
f36933fe 413 ARG_PROPERTY,
6dac160c 414 ARG_PRIVATE_USERS,
c6c8f6e2 415 ARG_KILL_SIGNAL,
a41fe3a2
LP
416 };
417
88213476 418 static const struct option options[] = {
aa28aefe
LP
419 { "help", no_argument, NULL, 'h' },
420 { "version", no_argument, NULL, ARG_VERSION },
421 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
422 { "template", required_argument, NULL, ARG_TEMPLATE },
423 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
424 { "user", required_argument, NULL, 'u' },
425 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
426 { "boot", no_argument, NULL, 'b' },
427 { "uuid", required_argument, NULL, ARG_UUID },
428 { "read-only", no_argument, NULL, ARG_READ_ONLY },
429 { "capability", required_argument, NULL, ARG_CAPABILITY },
430 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
431 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
432 { "bind", required_argument, NULL, ARG_BIND },
433 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 434 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
435 { "overlay", required_argument, NULL, ARG_OVERLAY },
436 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
437 { "machine", required_argument, NULL, 'M' },
438 { "slice", required_argument, NULL, 'S' },
439 { "setenv", required_argument, NULL, ARG_SETENV },
440 { "selinux-context", required_argument, NULL, 'Z' },
441 { "selinux-apifs-context", required_argument, NULL, 'L' },
442 { "quiet", no_argument, NULL, 'q' },
443 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
444 { "register", required_argument, NULL, ARG_REGISTER },
445 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
446 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 447 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 448 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 449 { "network-veth", no_argument, NULL, 'n' },
ab046dde 450 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 451 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 452 { "image", required_argument, NULL, 'i' },
4d9f07b4 453 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 454 { "port", required_argument, NULL, 'p' },
f36933fe 455 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 456 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 457 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
eb9da376 458 {}
88213476
LP
459 };
460
9444b1f2 461 int c, r;
a42c8b54 462 uint64_t plus = 0, minus = 0;
88213476
LP
463
464 assert(argc >= 0);
465 assert(argv);
466
0dfaa006 467 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
468
469 switch (c) {
470
471 case 'h':
601185b4
ZJS
472 help();
473 return 0;
88213476 474
acbeb427
ZJS
475 case ARG_VERSION:
476 puts(PACKAGE_STRING);
477 puts(SYSTEMD_FEATURES);
478 return 0;
479
88213476 480 case 'D':
ec16945e
LP
481 r = set_sanitized_path(&arg_directory, optarg);
482 if (r < 0)
483 return log_error_errno(r, "Invalid root directory: %m");
484
485 break;
486
487 case ARG_TEMPLATE:
488 r = set_sanitized_path(&arg_template, optarg);
489 if (r < 0)
490 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
491
492 break;
493
1b9e5b12 494 case 'i':
ec16945e
LP
495 r = set_sanitized_path(&arg_image, optarg);
496 if (r < 0)
497 return log_error_errno(r, "Invalid image path: %m");
498
499 break;
500
501 case 'x':
502 arg_ephemeral = true;
1b9e5b12
LP
503 break;
504
687d0825
MV
505 case 'u':
506 free(arg_user);
7027ff61
LP
507 arg_user = strdup(optarg);
508 if (!arg_user)
509 return log_oom();
687d0825
MV
510
511 break;
512
ab046dde 513 case ARG_NETWORK_BRIDGE:
c74e630d 514 arg_network_bridge = optarg;
ab046dde
TG
515
516 /* fall through */
517
0dfaa006 518 case 'n':
69c79d3c
LP
519 arg_network_veth = true;
520 arg_private_network = true;
521 break;
522
aa28aefe 523 case ARG_NETWORK_INTERFACE:
c74e630d
LP
524 if (strv_extend(&arg_network_interfaces, optarg) < 0)
525 return log_oom();
526
527 arg_private_network = true;
528 break;
529
530 case ARG_NETWORK_MACVLAN:
531 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
532 return log_oom();
533
4bbfe7ad
TG
534 arg_private_network = true;
535 break;
536
537 case ARG_NETWORK_IPVLAN:
538 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
539 return log_oom();
540
aa28aefe
LP
541 /* fall through */
542
ff01d048
LP
543 case ARG_PRIVATE_NETWORK:
544 arg_private_network = true;
a41fe3a2
LP
545 break;
546
0f0dbc46
LP
547 case 'b':
548 arg_boot = true;
549 break;
550
144f0fc0 551 case ARG_UUID:
9444b1f2
LP
552 r = sd_id128_from_string(optarg, &arg_uuid);
553 if (r < 0) {
aa96c6cb 554 log_error("Invalid UUID: %s", optarg);
9444b1f2 555 return r;
aa96c6cb 556 }
9444b1f2 557 break;
aa96c6cb 558
9444b1f2 559 case 'S':
c74e630d 560 arg_slice = optarg;
144f0fc0
LP
561 break;
562
7027ff61 563 case 'M':
eb91eb18
LP
564 if (isempty(optarg)) {
565 free(arg_machine);
566 arg_machine = NULL;
567 } else {
0c3c4284 568 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
569 log_error("Invalid machine name: %s", optarg);
570 return -EINVAL;
571 }
7027ff61 572
0c3c4284
LP
573 r = free_and_strdup(&arg_machine, optarg);
574 if (r < 0)
eb91eb18
LP
575 return log_oom();
576
577 break;
578 }
7027ff61 579
82adf6af
LP
580 case 'Z':
581 arg_selinux_context = optarg;
a8828ed9
DW
582 break;
583
82adf6af
LP
584 case 'L':
585 arg_selinux_apifs_context = optarg;
a8828ed9
DW
586 break;
587
bc2f673e
LP
588 case ARG_READ_ONLY:
589 arg_read_only = true;
590 break;
591
420c7379
LP
592 case ARG_CAPABILITY:
593 case ARG_DROP_CAPABILITY: {
a2a5291b 594 const char *state, *word;
5076f0cc
LP
595 size_t length;
596
597 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 598 _cleanup_free_ char *t;
5076f0cc
LP
599
600 t = strndup(word, length);
0d0f0c50
SL
601 if (!t)
602 return log_oom();
5076f0cc 603
39ed67d1
LP
604 if (streq(t, "all")) {
605 if (c == ARG_CAPABILITY)
a42c8b54 606 plus = (uint64_t) -1;
39ed67d1 607 else
a42c8b54 608 minus = (uint64_t) -1;
39ed67d1 609 } else {
2822da4f
LP
610 int cap;
611
612 cap = capability_from_name(t);
613 if (cap < 0) {
39ed67d1
LP
614 log_error("Failed to parse capability %s.", t);
615 return -EINVAL;
616 }
617
618 if (c == ARG_CAPABILITY)
a42c8b54 619 plus |= 1ULL << (uint64_t) cap;
39ed67d1 620 else
a42c8b54 621 minus |= 1ULL << (uint64_t) cap;
5076f0cc 622 }
5076f0cc
LP
623 }
624
625 break;
626 }
627
57fb9fb5
LP
628 case 'j':
629 arg_link_journal = LINK_GUEST;
574edc90 630 arg_link_journal_try = true;
57fb9fb5
LP
631 break;
632
633 case ARG_LINK_JOURNAL:
53e438e3 634 if (streq(optarg, "auto")) {
57fb9fb5 635 arg_link_journal = LINK_AUTO;
53e438e3
LP
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "no")) {
57fb9fb5 638 arg_link_journal = LINK_NO;
53e438e3
LP
639 arg_link_journal_try = false;
640 } else if (streq(optarg, "guest")) {
57fb9fb5 641 arg_link_journal = LINK_GUEST;
53e438e3
LP
642 arg_link_journal_try = false;
643 } else if (streq(optarg, "host")) {
57fb9fb5 644 arg_link_journal = LINK_HOST;
53e438e3
LP
645 arg_link_journal_try = false;
646 } else if (streq(optarg, "try-guest")) {
574edc90
MP
647 arg_link_journal = LINK_GUEST;
648 arg_link_journal_try = true;
649 } else if (streq(optarg, "try-host")) {
650 arg_link_journal = LINK_HOST;
651 arg_link_journal_try = true;
652 } else {
57fb9fb5
LP
653 log_error("Failed to parse link journal mode %s", optarg);
654 return -EINVAL;
655 }
656
657 break;
658
17fe0523
LP
659 case ARG_BIND:
660 case ARG_BIND_RO: {
5a8af538
LP
661 _cleanup_free_ char *source = NULL, *destination = NULL;
662 CustomMount *m;
17fe0523 663 char *e;
17fe0523
LP
664
665 e = strchr(optarg, ':');
666 if (e) {
5a8af538
LP
667 source = strndup(optarg, e - optarg);
668 destination = strdup(e + 1);
17fe0523 669 } else {
5a8af538
LP
670 source = strdup(optarg);
671 destination = strdup(optarg);
17fe0523
LP
672 }
673
5a8af538 674 if (!source || !destination)
17fe0523
LP
675 return log_oom();
676
5a8af538 677 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
17fe0523
LP
678 log_error("Invalid bind mount specification: %s", optarg);
679 return -EINVAL;
680 }
681
5a8af538
LP
682 m = custom_mount_add(CUSTOM_MOUNT_BIND);
683 if (!m)
b3451bed 684 return log_oom();
17fe0523 685
5a8af538
LP
686 m->source = source;
687 m->destination = destination;
688 m->read_only = c == ARG_BIND_RO;
689
690 source = destination = NULL;
17fe0523
LP
691
692 break;
693 }
694
06c17c39 695 case ARG_TMPFS: {
5a8af538
LP
696 _cleanup_free_ char *path = NULL, *opts = NULL;
697 CustomMount *m;
06c17c39
LP
698 char *e;
699
700 e = strchr(optarg, ':');
701 if (e) {
5a8af538
LP
702 path = strndup(optarg, e - optarg);
703 opts = strdup(e + 1);
06c17c39 704 } else {
5a8af538
LP
705 path = strdup(optarg);
706 opts = strdup("mode=0755");
06c17c39
LP
707 }
708
5a8af538 709 if (!path || !opts)
06c17c39
LP
710 return log_oom();
711
5a8af538 712 if (!path_is_absolute(path)) {
06c17c39
LP
713 log_error("Invalid tmpfs specification: %s", optarg);
714 return -EINVAL;
715 }
716
5a8af538
LP
717 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
718 if (!m)
06c17c39
LP
719 return log_oom();
720
5a8af538
LP
721 m->destination = path;
722 m->options = opts;
06c17c39 723
5a8af538
LP
724 path = opts = NULL;
725
726 break;
727 }
728
729 case ARG_OVERLAY:
730 case ARG_OVERLAY_RO: {
731 _cleanup_free_ char *upper = NULL, *destination = NULL;
732 _cleanup_strv_free_ char **lower = NULL;
733 CustomMount *m;
734 unsigned n = 0;
735 char **i;
736
737 lower = strv_split(optarg, ":");
738 if (!lower)
06c17c39
LP
739 return log_oom();
740
5a8af538
LP
741 STRV_FOREACH(i, lower) {
742 if (!path_is_absolute(*i)) {
743 log_error("Overlay path %s is not absolute.", *i);
744 return -EINVAL;
745 }
746
747 n++;
748 }
749
750 if (n < 2) {
751 log_error("--overlay= needs at least two colon-separated directories specified.");
752 return -EINVAL;
753 }
754
755 if (n == 2) {
756 /* If two parameters are specified,
757 * the first one is the lower, the
758 * second one the upper directory. And
759 * we'll also define the the
760 * destination mount point the same as
761 * the upper. */
762 upper = lower[1];
763 lower[1] = NULL;
764
765 destination = strdup(upper);
766 if (!destination)
767 return log_oom();
768
769 } else {
770 upper = lower[n - 2];
771 destination = lower[n - 1];
772 lower[n - 2] = NULL;
773 }
774
775 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
776 if (!m)
777 return log_oom();
778
779 m->destination = destination;
780 m->source = upper;
781 m->lower = lower;
782 m->read_only = c == ARG_OVERLAY_RO;
783
784 upper = destination = NULL;
785 lower = NULL;
06c17c39
LP
786
787 break;
788 }
789
f4889f65
LP
790 case ARG_SETENV: {
791 char **n;
792
793 if (!env_assignment_is_valid(optarg)) {
794 log_error("Environment variable assignment '%s' is not valid.", optarg);
795 return -EINVAL;
796 }
797
798 n = strv_env_set(arg_setenv, optarg);
799 if (!n)
800 return log_oom();
801
802 strv_free(arg_setenv);
803 arg_setenv = n;
804 break;
805 }
806
284c0b91
LP
807 case 'q':
808 arg_quiet = true;
809 break;
810
8a96d94e
LP
811 case ARG_SHARE_SYSTEM:
812 arg_share_system = true;
813 break;
814
eb91eb18
LP
815 case ARG_REGISTER:
816 r = parse_boolean(optarg);
817 if (r < 0) {
818 log_error("Failed to parse --register= argument: %s", optarg);
819 return r;
820 }
821
822 arg_register = r;
823 break;
824
89f7c846
LP
825 case ARG_KEEP_UNIT:
826 arg_keep_unit = true;
827 break;
828
6afc95b7
LP
829 case ARG_PERSONALITY:
830
ac45f971 831 arg_personality = personality_from_string(optarg);
050f7277 832 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
833 log_error("Unknown or unsupported personality '%s'.", optarg);
834 return -EINVAL;
835 }
836
837 break;
838
4d9f07b4
LP
839 case ARG_VOLATILE:
840
841 if (!optarg)
842 arg_volatile = VOLATILE_YES;
843 else {
844 r = parse_boolean(optarg);
845 if (r < 0) {
846 if (streq(optarg, "state"))
847 arg_volatile = VOLATILE_STATE;
848 else {
849 log_error("Failed to parse --volatile= argument: %s", optarg);
850 return r;
851 }
852 } else
853 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
854 }
855
856 break;
857
6d0b55c2
LP
858 case 'p': {
859 const char *split, *e;
860 uint16_t container_port, host_port;
861 int protocol;
862 ExposePort *p;
863
864 if ((e = startswith(optarg, "tcp:")))
865 protocol = IPPROTO_TCP;
866 else if ((e = startswith(optarg, "udp:")))
867 protocol = IPPROTO_UDP;
868 else {
869 e = optarg;
870 protocol = IPPROTO_TCP;
871 }
872
873 split = strchr(e, ':');
874 if (split) {
875 char v[split - e + 1];
876
877 memcpy(v, e, split - e);
878 v[split - e] = 0;
879
880 r = safe_atou16(v, &host_port);
881 if (r < 0 || host_port <= 0) {
882 log_error("Failed to parse host port: %s", optarg);
883 return -EINVAL;
884 }
885
886 r = safe_atou16(split + 1, &container_port);
887 } else {
888 r = safe_atou16(e, &container_port);
889 host_port = container_port;
890 }
891
892 if (r < 0 || container_port <= 0) {
893 log_error("Failed to parse host port: %s", optarg);
894 return -EINVAL;
895 }
896
897 LIST_FOREACH(ports, p, arg_expose_ports) {
898 if (p->protocol == protocol && p->host_port == host_port) {
899 log_error("Duplicate port specification: %s", optarg);
900 return -EINVAL;
901 }
902 }
903
904 p = new(ExposePort, 1);
905 if (!p)
906 return log_oom();
907
908 p->protocol = protocol;
909 p->host_port = host_port;
910 p->container_port = container_port;
911
912 LIST_PREPEND(ports, arg_expose_ports, p);
913
914 break;
915 }
916
f36933fe
LP
917 case ARG_PROPERTY:
918 if (strv_extend(&arg_property, optarg) < 0)
919 return log_oom();
920
921 break;
922
6dac160c
LP
923 case ARG_PRIVATE_USERS:
924 if (optarg) {
925 _cleanup_free_ char *buffer = NULL;
926 const char *range, *shift;
927
928 range = strchr(optarg, ':');
929 if (range) {
930 buffer = strndup(optarg, range - optarg);
931 if (!buffer)
932 return log_oom();
933 shift = buffer;
934
935 range++;
936 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
937 log_error("Failed to parse UID range: %s", range);
938 return -EINVAL;
939 }
940 } else
941 shift = optarg;
942
943 if (parse_uid(shift, &arg_uid_shift) < 0) {
944 log_error("Failed to parse UID: %s", optarg);
945 return -EINVAL;
946 }
947 }
948
949 arg_userns = true;
950 break;
951
c6c8f6e2
LP
952 case ARG_KILL_SIGNAL:
953 arg_kill_signal = signal_from_string_try_harder(optarg);
954 if (arg_kill_signal < 0) {
955 log_error("Cannot parse signal: %s", optarg);
956 return -EINVAL;
957 }
958
959 break;
960
88213476
LP
961 case '?':
962 return -EINVAL;
963
964 default:
eb9da376 965 assert_not_reached("Unhandled option");
88213476 966 }
88213476 967
eb91eb18
LP
968 if (arg_share_system)
969 arg_register = false;
970
971 if (arg_boot && arg_share_system) {
972 log_error("--boot and --share-system may not be combined.");
973 return -EINVAL;
974 }
975
89f7c846
LP
976 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
977 log_error("--keep-unit may not be used when invoked from a user session.");
978 return -EINVAL;
979 }
980
1b9e5b12
LP
981 if (arg_directory && arg_image) {
982 log_error("--directory= and --image= may not be combined.");
983 return -EINVAL;
984 }
985
ec16945e
LP
986 if (arg_template && arg_image) {
987 log_error("--template= and --image= may not be combined.");
988 return -EINVAL;
989 }
990
991 if (arg_template && !(arg_directory || arg_machine)) {
992 log_error("--template= needs --directory= or --machine=.");
993 return -EINVAL;
994 }
995
996 if (arg_ephemeral && arg_template) {
997 log_error("--ephemeral and --template= may not be combined.");
998 return -EINVAL;
999 }
1000
1001 if (arg_ephemeral && arg_image) {
1002 log_error("--ephemeral and --image= may not be combined.");
1003 return -EINVAL;
1004 }
1005
df9a75e4
LP
1006 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1007 log_error("--ephemeral and --link-journal= may not be combined.");
1008 return -EINVAL;
1009 }
1010
4d9f07b4
LP
1011 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1012 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1013 return -EINVAL;
1014 }
1015
6d0b55c2
LP
1016 if (arg_expose_ports && !arg_private_network) {
1017 log_error("Cannot use --port= without private networking.");
1018 return -EINVAL;
1019 }
1020
b774fb7f
DH
1021 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1022 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1023
a42c8b54
LP
1024 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1025
c6c8f6e2
LP
1026 if (arg_boot && arg_kill_signal <= 0)
1027 arg_kill_signal = SIGRTMIN+3;
1028
88213476
LP
1029 return 1;
1030}
1031
03cfe0d5
LP
1032static int tmpfs_patch_options(const char *options, char **ret) {
1033 char *buf = NULL;
1034
1035 if (arg_userns && arg_uid_shift != 0) {
825d5287 1036 assert(arg_uid_shift != UID_INVALID);
03cfe0d5
LP
1037
1038 if (options)
f001a835 1039 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
03cfe0d5 1040 else
f001a835 1041 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
03cfe0d5
LP
1042 if (!buf)
1043 return -ENOMEM;
1044
1045 options = buf;
1046 }
1047
1048#ifdef HAVE_SELINUX
1049 if (arg_selinux_apifs_context) {
1050 char *t;
1051
1052 if (options)
1053 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1054 else
1055 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1056 if (!t) {
1057 free(buf);
1058 return -ENOMEM;
1059 }
1060
1061 free(buf);
1062 buf = t;
1063 }
1064#endif
1065
1066 *ret = buf;
1067 return !!buf;
1068}
1069
1070static int mount_all(const char *dest, bool userns) {
88213476
LP
1071
1072 typedef struct MountPoint {
1073 const char *what;
1074 const char *where;
1075 const char *type;
1076 const char *options;
1077 unsigned long flags;
3bd66c05 1078 bool fatal;
03cfe0d5 1079 bool userns;
88213476
LP
1080 } MountPoint;
1081
1082 static const MountPoint mount_table[] = {
3c59d4f2
RM
1083 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1084 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1085 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1086 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1087 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1088 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1089 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1090 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1091 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
9b634ea5 1092#ifdef HAVE_SELINUX
3c59d4f2
RM
1093 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1094 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
9b634ea5 1095#endif
88213476
LP
1096 };
1097
1098 unsigned k;
03cfe0d5 1099 int r;
88213476
LP
1100
1101 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
d15d65a0 1102 _cleanup_free_ char *where = NULL, *options = NULL;
d002827b 1103 const char *o;
88213476 1104
03cfe0d5
LP
1105 if (userns != mount_table[k].userns)
1106 continue;
1107
1108 where = prefix_root(dest, mount_table[k].where);
17fe0523
LP
1109 if (!where)
1110 return log_oom();
88213476 1111
e26d6ce5 1112 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
03cfe0d5
LP
1113 if (r < 0 && r != -ENOENT)
1114 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
88213476 1115
9c1c7f71 1116 /* Skip this entry if it is not a remount. */
03cfe0d5 1117 if (mount_table[k].what && r > 0)
014a9c77
LP
1118 continue;
1119
03cfe0d5
LP
1120 r = mkdir_p(where, 0755);
1121 if (r < 0) {
1122 if (mount_table[k].fatal)
1123 return log_error_errno(r, "Failed to create directory %s: %m", where);
79d80fc1 1124
03cfe0d5 1125 log_warning_errno(r, "Failed to create directory %s: %m", where);
79d80fc1
TG
1126 continue;
1127 }
88213476 1128
03cfe0d5
LP
1129 o = mount_table[k].options;
1130 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1131 r = tmpfs_patch_options(o, &options);
1132 if (r < 0)
6dac160c 1133 return log_oom();
03cfe0d5
LP
1134 if (r > 0)
1135 o = options;
6dac160c 1136 }
a8828ed9 1137
88213476
LP
1138 if (mount(mount_table[k].what,
1139 where,
1140 mount_table[k].type,
1141 mount_table[k].flags,
79d80fc1 1142 o) < 0) {
88213476 1143
03cfe0d5
LP
1144 if (mount_table[k].fatal)
1145 return log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 1146
03cfe0d5 1147 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
88213476 1148 }
88213476
LP
1149 }
1150
03cfe0d5 1151 return 0;
e58a1277 1152}
f8440af5 1153
5a8af538
LP
1154static int mount_bind(const char *dest, CustomMount *m) {
1155 struct stat source_st, dest_st;
03cfe0d5 1156 const char *where;
5a8af538 1157 int r;
17fe0523 1158
5a8af538 1159 assert(m);
d2421337 1160
5a8af538
LP
1161 if (stat(m->source, &source_st) < 0)
1162 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
17fe0523 1163
03cfe0d5 1164 where = prefix_roota(dest, m->destination);
06c17c39 1165
03cfe0d5 1166 if (stat(where, &dest_st) >= 0) {
5a8af538
LP
1167 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1168 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1169 return -EINVAL;
2ed4e5e0 1170 }
06c17c39 1171
5a8af538
LP
1172 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1173 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1174 return -EINVAL;
d2421337 1175 }
17fe0523 1176
5a8af538
LP
1177 } else if (errno == ENOENT) {
1178 r = mkdir_parents_label(where, 0755);
1179 if (r < 0)
1180 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1181 } else {
1182 log_error_errno(errno, "Failed to stat %s: %m", where);
1183 return -errno;
1184 }
17fe0523 1185
5a8af538
LP
1186 /* Create the mount point. Any non-directory file can be
1187 * mounted on any non-directory file (regular, fifo, socket,
1188 * char, block).
1189 */
1190 if (S_ISDIR(source_st.st_mode))
1191 r = mkdir_label(where, 0755);
1192 else
1193 r = touch(where);
1194 if (r < 0 && r != -EEXIST)
1195 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1196
1197 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1198 return log_error_errno(errno, "mount(%s) failed: %m", where);
1199
1200 if (m->read_only) {
1201 r = bind_remount_recursive(where, true);
1202 if (r < 0)
1203 return log_error_errno(r, "Read-only bind mount failed: %m");
1204 }
1205
1206 return 0;
1207}
1208
1209static int mount_tmpfs(const char *dest, CustomMount *m) {
03cfe0d5
LP
1210 const char *where, *options;
1211 _cleanup_free_ char *buf = NULL;
5a8af538
LP
1212 int r;
1213
1214 assert(dest);
1215 assert(m);
1216
03cfe0d5 1217 where = prefix_roota(dest, m->destination);
5a8af538 1218
03cfe0d5 1219 r = mkdir_p_label(where, 0755);
5a8af538
LP
1220 if (r < 0 && r != -EEXIST)
1221 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1222
03cfe0d5
LP
1223 r = tmpfs_patch_options(m->options, &buf);
1224 if (r < 0)
1225 return log_oom();
1226 options = r > 0 ? buf : m->options;
1227
1228 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
5a8af538
LP
1229 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1230
1231 return 0;
1232}
1233
1234static int mount_overlay(const char *dest, CustomMount *m) {
1235 _cleanup_free_ char *lower = NULL;
03cfe0d5 1236 const char *where, *options;
5a8af538
LP
1237 int r;
1238
1239 assert(dest);
1240 assert(m);
1241
03cfe0d5 1242 where = prefix_roota(dest, m->destination);
5a8af538
LP
1243
1244 r = mkdir_label(where, 0755);
1245 if (r < 0 && r != -EEXIST)
1246 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1247
1248 (void) mkdir_p_label(m->source, 0755);
1249
1250 strv_reverse(m->lower);
1251 lower = strv_join(m->lower, ":");
1252 strv_reverse(m->lower);
5a8af538
LP
1253 if (!lower)
1254 return log_oom();
1255
1256 if (m->read_only)
1257 options = strjoina("lowerdir=", m->source, ":", lower);
1258 else {
1259 assert(m->work_dir);
1260 (void) mkdir_label(m->work_dir, 0700);
1261
1262 options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
1263 }
1264
1265 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1266 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1267
1268 return 0;
1269}
1270
1271static int mount_custom(const char *dest) {
1272 unsigned i;
1273 int r;
1274
1275 assert(dest);
1276
1277 for (i = 0; i < arg_n_custom_mounts; i++) {
1278 CustomMount *m = &arg_custom_mounts[i];
1279
1280 switch (m->type) {
1281
1282 case CUSTOM_MOUNT_BIND:
1283 r = mount_bind(dest, m);
1284 break;
1285
1286 case CUSTOM_MOUNT_TMPFS:
1287 r = mount_tmpfs(dest, m);
1288 break;
1289
1290 case CUSTOM_MOUNT_OVERLAY:
1291 r = mount_overlay(dest, m);
1292 break;
1293
1294 default:
1295 assert_not_reached("Unknown custom mount type");
17fe0523 1296 }
5a8af538
LP
1297
1298 if (r < 0)
1299 return r;
17fe0523
LP
1300 }
1301
1302 return 0;
1303}
1304
b12afc8c
LP
1305static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1306 char *to;
1307 int r;
1308
63c372cb 1309 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
b12afc8c 1310
e26d6ce5 1311 r = path_is_mount_point(to, 0);
da00518b 1312 if (r < 0 && r != -ENOENT)
b12afc8c
LP
1313 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1314 if (r > 0)
1315 return 0;
1316
1317 mkdir_p(to, 0755);
1318
c0534580
LP
1319 /* The superblock mount options of the mount point need to be
1320 * identical to the hosts', and hence writable... */
1321 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
b12afc8c
LP
1322 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1323
c0534580
LP
1324 /* ... hence let's only make the bind mount read-only, not the
1325 * superblock. */
1326 if (read_only) {
1327 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1328 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1329 }
b12afc8c
LP
1330 return 1;
1331}
1332
1333static int mount_cgroup(const char *dest) {
1334 _cleanup_set_free_free_ Set *controllers = NULL;
03cfe0d5 1335 const char *cgroup_root;
b12afc8c
LP
1336 int r;
1337
1338 controllers = set_new(&string_hash_ops);
1339 if (!controllers)
1340 return log_oom();
1341
1342 r = cg_kernel_controllers(controllers);
1343 if (r < 0)
1344 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1345
b12afc8c
LP
1346 for (;;) {
1347 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1348
1349 controller = set_steal_first(controllers);
1350 if (!controller)
1351 break;
1352
03cfe0d5 1353 origin = prefix_root("/sys/fs/cgroup/", controller);
b12afc8c
LP
1354 if (!origin)
1355 return log_oom();
1356
1357 r = readlink_malloc(origin, &combined);
1358 if (r == -EINVAL) {
1359 /* Not a symbolic link, but directly a single cgroup hierarchy */
1360
1361 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1362 if (r < 0)
1363 return r;
1364
1365 } else if (r < 0)
1366 return log_error_errno(r, "Failed to read link %s: %m", origin);
1367 else {
1368 _cleanup_free_ char *target = NULL;
1369
03cfe0d5 1370 target = prefix_root(dest, origin);
b12afc8c
LP
1371 if (!target)
1372 return log_oom();
1373
1374 /* A symbolic link, a combination of controllers in one hierarchy */
1375
1376 if (!filename_is_valid(combined)) {
1377 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1378 continue;
1379 }
1380
1381 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1382 if (r < 0)
1383 return r;
1384
875e1014
ILG
1385 r = symlink_idempotent(combined, target);
1386 if (r == -EINVAL) {
1387 log_error("Invalid existing symlink for combined hierarchy");
1388 return r;
1389 }
1390 if (r < 0)
1391 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1392 }
1393 }
1394
c0534580 1395 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
b12afc8c
LP
1396 if (r < 0)
1397 return r;
1398
03cfe0d5
LP
1399 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1400 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1401 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1402
1403 return 0;
1404}
1405
1406static int mount_systemd_cgroup_writable(const char *dest) {
1407 _cleanup_free_ char *own_cgroup_path = NULL;
1408 const char *systemd_root, *systemd_own;
1409 int r;
1410
1411 assert(dest);
1412
1413 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1414 if (r < 0)
1415 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1416
b12afc8c 1417 /* Make our own cgroup a (writable) bind mount */
63c372cb 1418 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
b12afc8c
LP
1419 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1420 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1421
1422 /* And then remount the systemd cgroup root read-only */
03cfe0d5 1423 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
b12afc8c
LP
1424 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1425 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1426
03cfe0d5
LP
1427 return 0;
1428}
1429
1430static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1431 assert(p);
1432
1433 if (!arg_userns)
1434 return 0;
1435
1436 if (uid == UID_INVALID && gid == GID_INVALID)
1437 return 0;
1438
1439 if (uid != UID_INVALID) {
1440 uid += arg_uid_shift;
1441
1442 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1443 return -EOVERFLOW;
1444 }
1445
1446 if (gid != GID_INVALID) {
1447 gid += (gid_t) arg_uid_shift;
1448
1449 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1450 return -EOVERFLOW;
1451 }
1452
1453 if (lchown(p, uid, gid) < 0)
1454 return -errno;
b12afc8c
LP
1455
1456 return 0;
1457}
1458
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and hands it
 * to userns_lchown() for the ownership change. An already existing
 * directory is left untouched (including its ownership) and reported as
 * success. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1471
e58a1277 1472static int setup_timezone(const char *dest) {
03cfe0d5
LP
1473 _cleanup_free_ char *p = NULL, *q = NULL;
1474 const char *where, *check, *what;
d4036145
LP
1475 char *z, *y;
1476 int r;
f8440af5 1477
e58a1277
LP
1478 assert(dest);
1479
1480 /* Fix the timezone, if possible */
d4036145
LP
1481 r = readlink_malloc("/etc/localtime", &p);
1482 if (r < 0) {
1483 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1484 return 0;
1485 }
1486
1487 z = path_startswith(p, "../usr/share/zoneinfo/");
1488 if (!z)
1489 z = path_startswith(p, "/usr/share/zoneinfo/");
1490 if (!z) {
1491 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1492 return 0;
1493 }
1494
03cfe0d5 1495 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1496 r = readlink_malloc(where, &q);
1497 if (r >= 0) {
1498 y = path_startswith(q, "../usr/share/zoneinfo/");
1499 if (!y)
1500 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1501
d4036145
LP
1502 /* Already pointing to the right place? Then do nothing .. */
1503 if (y && streq(y, z))
1504 return 0;
1505 }
1506
03cfe0d5
LP
1507 check = strjoina("/usr/share/zoneinfo/", z);
1508 check = prefix_root(dest, check);
1509 if (laccess(check, F_OK) < 0) {
d4036145
LP
1510 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1511 return 0;
1512 }
68fb0892 1513
79d80fc1
TG
1514 r = unlink(where);
1515 if (r < 0 && errno != ENOENT) {
56f64d95 1516 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1517 return 0;
1518 }
4d9f07b4 1519
03cfe0d5 1520 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1521 if (symlink(what, where) < 0) {
56f64d95 1522 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1523 return 0;
1524 }
e58a1277 1525
03cfe0d5
LP
1526 r = userns_lchown(where, 0, 0);
1527 if (r < 0)
1528 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1529
e58a1277 1530 return 0;
88213476
LP
1531}
1532
2547bb41 1533static int setup_resolv_conf(const char *dest) {
03cfe0d5 1534 const char *where = NULL;
79d80fc1 1535 int r;
2547bb41
LP
1536
1537 assert(dest);
1538
1539 if (arg_private_network)
1540 return 0;
1541
1542 /* Fix resolv.conf, if possible */
03cfe0d5 1543 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1544
f2068bcc 1545 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1546 if (r < 0) {
68a313c5
LP
1547 /* If the file already exists as symlink, let's
1548 * suppress the warning, under the assumption that
1549 * resolved or something similar runs inside and the
1550 * symlink points there.
1551 *
1552 * If the disk image is read-only, there's also no
1553 * point in complaining.
1554 */
1555 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1556 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1557 return 0;
1558 }
2547bb41 1559
03cfe0d5
LP
1560 r = userns_lchown(where, 0, 0);
1561 if (r < 0)
1562 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1563
2547bb41
LP
1564 return 0;
1565}
1566
4d9f07b4 1567static int setup_volatile_state(const char *directory) {
03cfe0d5
LP
1568 _cleanup_free_ char *buf = NULL;
1569 const char *p, *options;
4d9f07b4
LP
1570 int r;
1571
1572 assert(directory);
1573
1574 if (arg_volatile != VOLATILE_STATE)
1575 return 0;
1576
1577 /* --volatile=state means we simply overmount /var
1578 with a tmpfs, and the rest read-only. */
1579
1580 r = bind_remount_recursive(directory, true);
f647962d
MS
1581 if (r < 0)
1582 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4 1583
03cfe0d5 1584 p = prefix_roota(directory, "/var");
79d80fc1 1585 r = mkdir(p, 0755);
4a62c710
MS
1586 if (r < 0 && errno != EEXIST)
1587 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1588
03cfe0d5
LP
1589 options = "mode=755";
1590 r = tmpfs_patch_options(options, &buf);
1591 if (r < 0)
1592 return log_oom();
1593 if (r > 0)
1594 options = buf;
1595
1596 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
4a62c710 1597 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1598
1599 return 0;
1600}
1601
1602static int setup_volatile(const char *directory) {
1603 bool tmpfs_mounted = false, bind_mounted = false;
1604 char template[] = "/tmp/nspawn-volatile-XXXXXX";
03cfe0d5
LP
1605 _cleanup_free_ char *buf = NULL;
1606 const char *f, *t, *options;
4d9f07b4
LP
1607 int r;
1608
1609 assert(directory);
1610
1611 if (arg_volatile != VOLATILE_YES)
1612 return 0;
1613
1614 /* --volatile=yes means we mount a tmpfs to the root dir, and
1615 the original /usr to use inside it, and that read-only. */
1616
4a62c710
MS
1617 if (!mkdtemp(template))
1618 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4 1619
03cfe0d5
LP
1620 options = "mode=755";
1621 r = tmpfs_patch_options(options, &buf);
1622 if (r < 0)
1623 return log_oom();
1624 if (r > 0)
1625 options = buf;
1626
1627 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1628 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1629 goto fail;
1630 }
1631
1632 tmpfs_mounted = true;
1633
03cfe0d5
LP
1634 f = prefix_roota(directory, "/usr");
1635 t = prefix_roota(template, "/usr");
4d9f07b4 1636
79d80fc1
TG
1637 r = mkdir(t, 0755);
1638 if (r < 0 && errno != EEXIST) {
03cfe0d5 1639 r = log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1640 goto fail;
1641 }
1642
4543768d 1643 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
03cfe0d5 1644 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1645 goto fail;
1646 }
1647
1648 bind_mounted = true;
1649
1650 r = bind_remount_recursive(t, true);
1651 if (r < 0) {
da927ba9 1652 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1653 goto fail;
1654 }
1655
1656 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
03cfe0d5 1657 r = log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1658 goto fail;
1659 }
1660
03cfe0d5 1661 (void) rmdir(template);
4d9f07b4
LP
1662
1663 return 0;
1664
1665fail:
1666 if (bind_mounted)
03cfe0d5
LP
1667 (void) umount(t);
1668
4d9f07b4 1669 if (tmpfs_mounted)
03cfe0d5
LP
1670 (void) umount(template);
1671 (void) rmdir(template);
4d9f07b4
LP
1672 return r;
1673}
1674
9f24adc2 1675static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1676 assert(s);
9f24adc2
LP
1677
1678 snprintf(s, 37,
1679 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1680 SD_ID128_FORMAT_VAL(id));
1681
1682 return s;
1683}
1684
04bc4a3f 1685static int setup_boot_id(const char *dest) {
03cfe0d5 1686 const char *from, *to;
39883f62 1687 sd_id128_t rnd = {};
04bc4a3f
LP
1688 char as_uuid[37];
1689 int r;
1690
eb91eb18
LP
1691 if (arg_share_system)
1692 return 0;
1693
04bc4a3f
LP
1694 /* Generate a new randomized boot ID, so that each boot-up of
1695 * the container gets a new one */
1696
03cfe0d5
LP
1697 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1698 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1699
1700 r = sd_id128_randomize(&rnd);
f647962d
MS
1701 if (r < 0)
1702 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1703
9f24adc2 1704 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1705
574d5f2d 1706 r = write_string_file(from, as_uuid);
f647962d
MS
1707 if (r < 0)
1708 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1709
03cfe0d5
LP
1710 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1711 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1712 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1713 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1714
1715 unlink(from);
04bc4a3f
LP
1716 return r;
1717}
1718
e58a1277 1719static int copy_devnodes(const char *dest) {
88213476
LP
1720
1721 static const char devnodes[] =
1722 "null\0"
1723 "zero\0"
1724 "full\0"
1725 "random\0"
1726 "urandom\0"
85614d66
TG
1727 "tty\0"
1728 "net/tun\0";
88213476
LP
1729
1730 const char *d;
e58a1277 1731 int r = 0;
7fd1b19b 1732 _cleanup_umask_ mode_t u;
a258bf26
LP
1733
1734 assert(dest);
124640f1
LP
1735
1736 u = umask(0000);
88213476 1737
03cfe0d5
LP
1738 /* Create /dev/net, so that we can create /dev/net/tun in it */
1739 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1740 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1741
88213476 1742 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1743 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1744 struct stat st;
88213476 1745
7f112f50 1746 from = strappend("/dev/", d);
03cfe0d5 1747 to = prefix_root(dest, from);
88213476
LP
1748
1749 if (stat(from, &st) < 0) {
1750
4a62c710
MS
1751 if (errno != ENOENT)
1752 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1753
a258bf26 1754 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1755
03cfe0d5 1756 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1757 return -EIO;
a258bf26 1758
85614d66 1759 } else {
81f5049b
AC
1760 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1761 if (errno != EPERM)
1762 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1763
1764 /* Some systems abusively restrict mknod but
1765 * allow bind mounts. */
1766 r = touch(to);
1767 if (r < 0)
1768 return log_error_errno(r, "touch (%s) failed: %m", to);
1769 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1770 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1771 }
6278cf60 1772
03cfe0d5
LP
1773 r = userns_lchown(to, 0, 0);
1774 if (r < 0)
1775 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1776 }
88213476
LP
1777 }
1778
e58a1277
LP
1779 return r;
1780}
88213476 1781
03cfe0d5
LP
1782static int setup_pts(const char *dest) {
1783 _cleanup_free_ char *options = NULL;
1784 const char *p;
1785
1786#ifdef HAVE_SELINUX
1787 if (arg_selinux_apifs_context)
1788 (void) asprintf(&options,
1789 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT ",context=\"%s\"",
1790 arg_uid_shift,
1791 arg_uid_shift + TTY_GID,
1792 arg_selinux_apifs_context);
1793 else
1794#endif
1795 (void) asprintf(&options,
1796 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT,
1797 arg_uid_shift,
1798 arg_uid_shift + TTY_GID);
f2d88580 1799
03cfe0d5 1800 if (!options)
f2d88580
LP
1801 return log_oom();
1802
03cfe0d5 1803 /* Mount /dev/pts itself */
cc9fce65 1804 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1805 if (mkdir(p, 0755) < 0)
1806 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1807 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1808 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1809 if (userns_lchown(p, 0, 0) < 0)
1810 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1811
1812 /* Create /dev/ptmx symlink */
1813 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1814 if (symlink("pts/ptmx", p) < 0)
1815 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
03cfe0d5
LP
1816 if (userns_lchown(p, 0, 0) < 0)
1817 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
f2d88580 1818
03cfe0d5
LP
1819 /* And fix /dev/pts/ptmx ownership */
1820 p = prefix_roota(dest, "/dev/pts/ptmx");
1821 if (userns_lchown(p, 0, 0) < 0)
1822 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1823
f2d88580
LP
1824 return 0;
1825}
1826
e58a1277 1827static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1828 _cleanup_umask_ mode_t u;
1829 const char *to;
e58a1277 1830 int r;
e58a1277
LP
1831
1832 assert(dest);
1833 assert(console);
1834
1835 u = umask(0000);
1836
03cfe0d5 1837 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1838 if (r < 0)
1839 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1840
a258bf26
LP
1841 /* We need to bind mount the right tty to /dev/console since
1842 * ptys can only exist on pts file systems. To have something
81f5049b 1843 * to bind mount things on we create a empty regular file. */
a258bf26 1844
03cfe0d5 1845 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1846 r = touch(to);
1847 if (r < 0)
1848 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1849
4543768d 1850 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1851 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1852
25ea79fe 1853 return 0;
e58a1277
LP
1854}
1855
1856static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1857 const char *from, *to;
7fd1b19b 1858 _cleanup_umask_ mode_t u;
03cfe0d5 1859 int fd, k;
e58a1277
LP
1860 union {
1861 struct cmsghdr cmsghdr;
1862 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1863 } control = {};
1864 struct msghdr mh = {
1865 .msg_control = &control,
1866 .msg_controllen = sizeof(control),
1867 };
e58a1277
LP
1868 struct cmsghdr *cmsg;
1869
e58a1277 1870 assert(kmsg_socket >= 0);
a258bf26 1871
e58a1277 1872 u = umask(0000);
a258bf26 1873
03cfe0d5 1874 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1875 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1876 * on the reading side behave very similar to /proc/kmsg,
1877 * their writing side behaves differently from /dev/kmsg in
1878 * that writing blocks when nothing is reading. In order to
1879 * avoid any problems with containers deadlocking due to this
1880 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1881 from = prefix_roota(dest, "/run/kmsg");
1882 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1883
4a62c710 1884 if (mkfifo(from, 0600) < 0)
03cfe0d5 1885 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1886 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1887 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1888
1889 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1890 if (fd < 0)
1891 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1892
e58a1277
LP
1893 cmsg = CMSG_FIRSTHDR(&mh);
1894 cmsg->cmsg_level = SOL_SOCKET;
1895 cmsg->cmsg_type = SCM_RIGHTS;
1896 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1897 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1898
1899 mh.msg_controllen = cmsg->cmsg_len;
1900
1901 /* Store away the fd in the socket, so that it stays open as
1902 * long as we run the child */
6d0b55c2 1903 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1904 safe_close(fd);
e58a1277 1905
4a62c710
MS
1906 if (k < 0)
1907 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1908
03cfe0d5
LP
1909 /* And now make the FIFO unavailable as /run/kmsg... */
1910 (void) unlink(from);
1911
25ea79fe 1912 return 0;
88213476
LP
1913}
1914
6d0b55c2
LP
1915static int send_rtnl(int send_fd) {
1916 union {
1917 struct cmsghdr cmsghdr;
1918 uint8_t buf[CMSG_SPACE(sizeof(int))];
1919 } control = {};
1920 struct msghdr mh = {
1921 .msg_control = &control,
1922 .msg_controllen = sizeof(control),
1923 };
1924 struct cmsghdr *cmsg;
1925 _cleanup_close_ int fd = -1;
1926 ssize_t k;
1927
1928 assert(send_fd >= 0);
1929
1930 if (!arg_expose_ports)
1931 return 0;
1932
1933 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1934 if (fd < 0)
03cfe0d5 1935 return log_error_errno(errno, "Failed to allocate container netlink: %m");
6d0b55c2
LP
1936
1937 cmsg = CMSG_FIRSTHDR(&mh);
1938 cmsg->cmsg_level = SOL_SOCKET;
1939 cmsg->cmsg_type = SCM_RIGHTS;
1940 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1941 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1942
1943 mh.msg_controllen = cmsg->cmsg_len;
1944
1945 /* Store away the fd in the socket, so that it stays open as
1946 * long as we run the child */
1947 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1948 if (k < 0)
1949 return log_error_errno(errno, "Failed to send netlink fd: %m");
1950
1951 return 0;
1952}
1953
1954static int flush_ports(union in_addr_union *exposed) {
1955 ExposePort *p;
1956 int r, af = AF_INET;
1957
1958 assert(exposed);
1959
1960 if (!arg_expose_ports)
1961 return 0;
1962
1963 if (in_addr_is_null(af, exposed))
1964 return 0;
1965
1966 log_debug("Lost IP address.");
1967
1968 LIST_FOREACH(ports, p, arg_expose_ports) {
1969 r = fw_add_local_dnat(false,
1970 af,
1971 p->protocol,
1972 NULL,
1973 NULL, 0,
1974 NULL, 0,
1975 p->host_port,
1976 exposed,
1977 p->container_port,
1978 NULL);
1979 if (r < 0)
1980 log_warning_errno(r, "Failed to modify firewall: %m");
1981 }
1982
1983 *exposed = IN_ADDR_NULL;
1984 return 0;
1985}
1986
1c4baffc 1987static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
6d0b55c2
LP
1988 _cleanup_free_ struct local_address *addresses = NULL;
1989 _cleanup_free_ char *pretty = NULL;
1990 union in_addr_union new_exposed;
1991 ExposePort *p;
1992 bool add;
1993 int af = AF_INET, r;
1994
1995 assert(exposed);
1996
1997 /* Invoked each time an address is added or removed inside the
1998 * container */
1999
2000 if (!arg_expose_ports)
2001 return 0;
2002
2003 r = local_addresses(rtnl, 0, af, &addresses);
2004 if (r < 0)
2005 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2006
2007 add = r > 0 &&
2008 addresses[0].family == af &&
2009 addresses[0].scope < RT_SCOPE_LINK;
2010
2011 if (!add)
2012 return flush_ports(exposed);
2013
2014 new_exposed = addresses[0].address;
2015 if (in_addr_equal(af, exposed, &new_exposed))
2016 return 0;
2017
2018 in_addr_to_string(af, &new_exposed, &pretty);
2019 log_debug("New container IP is %s.", strna(pretty));
2020
2021 LIST_FOREACH(ports, p, arg_expose_ports) {
2022
2023 r = fw_add_local_dnat(true,
2024 af,
2025 p->protocol,
2026 NULL,
2027 NULL, 0,
2028 NULL, 0,
2029 p->host_port,
2030 &new_exposed,
2031 p->container_port,
2032 in_addr_is_null(af, exposed) ? NULL : exposed);
2033 if (r < 0)
2034 log_warning_errno(r, "Failed to modify firewall: %m");
2035 }
2036
2037 *exposed = new_exposed;
2038 return 0;
2039}
2040
1c4baffc 2041static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2042 union in_addr_union *exposed = userdata;
2043
2044 assert(rtnl);
2045 assert(m);
2046 assert(exposed);
2047
2048 expose_ports(rtnl, exposed);
2049 return 0;
2050}
2051
1c4baffc 2052static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
6d0b55c2
LP
2053 union {
2054 struct cmsghdr cmsghdr;
2055 uint8_t buf[CMSG_SPACE(sizeof(int))];
2056 } control = {};
2057 struct msghdr mh = {
2058 .msg_control = &control,
2059 .msg_controllen = sizeof(control),
2060 };
2061 struct cmsghdr *cmsg;
1c4baffc 2062 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
6d0b55c2
LP
2063 int fd, r;
2064 ssize_t k;
2065
2066 assert(event);
2067 assert(recv_fd >= 0);
2068 assert(ret);
2069
2070 if (!arg_expose_ports)
2071 return 0;
2072
2073 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2074 if (k < 0)
2075 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2076
2077 cmsg = CMSG_FIRSTHDR(&mh);
2078 assert(cmsg->cmsg_level == SOL_SOCKET);
2079 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 2080 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
2081 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2082
1c4baffc 2083 r = sd_netlink_open_fd(&rtnl, fd);
6d0b55c2
LP
2084 if (r < 0) {
2085 safe_close(fd);
2086 return log_error_errno(r, "Failed to create rtnl object: %m");
2087 }
2088
1c4baffc 2089 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
6d0b55c2
LP
2090 if (r < 0)
2091 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2092
1c4baffc 2093 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
6d0b55c2
LP
2094 if (r < 0)
2095 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2096
1c4baffc 2097 r = sd_netlink_attach_event(rtnl, event, 0);
6d0b55c2
LP
2098 if (r < 0)
2099 return log_error_errno(r, "Failed to add to even loop: %m");
2100
2101 *ret = rtnl;
2102 rtnl = NULL;
2103
2104 return 0;
2105}
2106
3a74cea5 2107static int setup_hostname(void) {
3a74cea5 2108
eb91eb18
LP
2109 if (arg_share_system)
2110 return 0;
2111
605f81a8 2112 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 2113 return -errno;
3a74cea5 2114
7027ff61 2115 return 0;
3a74cea5
LP
2116}
2117
57fb9fb5 2118static int setup_journal(const char *directory) {
4d680aee 2119 sd_id128_t machine_id, this_id;
03cfe0d5
LP
2120 _cleanup_free_ char *b = NULL, *d = NULL;
2121 const char *etc_machine_id, *p, *q;
27407a01 2122 char *id;
57fb9fb5
LP
2123 int r;
2124
df9a75e4
LP
2125 /* Don't link journals in ephemeral mode */
2126 if (arg_ephemeral)
2127 return 0;
2128
03cfe0d5 2129 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 2130
03cfe0d5 2131 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
2132 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2133 return 0;
f647962d 2134 else if (r < 0)
03cfe0d5 2135 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 2136
27407a01
ZJS
2137 id = strstrip(b);
2138 if (isempty(id) && arg_link_journal == LINK_AUTO)
2139 return 0;
57fb9fb5 2140
27407a01
ZJS
2141 /* Verify validity */
2142 r = sd_id128_from_string(id, &machine_id);
f647962d 2143 if (r < 0)
03cfe0d5 2144 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 2145
4d680aee 2146 r = sd_id128_get_machine(&this_id);
f647962d
MS
2147 if (r < 0)
2148 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
2149
2150 if (sd_id128_equal(machine_id, this_id)) {
2151 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2152 "Host and machine ids are equal (%s): refusing to link journals", id);
2153 if (arg_link_journal == LINK_AUTO)
2154 return 0;
df9a75e4 2155 return -EEXIST;
4d680aee
ZJS
2156 }
2157
2158 if (arg_link_journal == LINK_NO)
2159 return 0;
2160
03cfe0d5
LP
2161 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2162 if (r < 0)
2163 return log_error_errno(r, "Failed to create /var: %m");
2164
2165 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2166 if (r < 0)
2167 return log_error_errno(r, "Failed to create /var/log: %m");
2168
2169 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2170 if (r < 0)
2171 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2172
2173 p = strjoina("/var/log/journal/", id);
2174 q = prefix_roota(directory, p);
27407a01 2175
e26d6ce5 2176 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
2177 if (arg_link_journal != LINK_AUTO) {
2178 log_error("%s: already a mount point, refusing to use for journal", p);
2179 return -EEXIST;
2180 }
2181
2182 return 0;
57fb9fb5
LP
2183 }
2184
e26d6ce5 2185 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 2186 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
2187 log_error("%s: already a mount point, refusing to use for journal", q);
2188 return -EEXIST;
57fb9fb5
LP
2189 }
2190
27407a01 2191 return 0;
57fb9fb5
LP
2192 }
2193
2194 r = readlink_and_make_absolute(p, &d);
2195 if (r >= 0) {
2196 if ((arg_link_journal == LINK_GUEST ||
2197 arg_link_journal == LINK_AUTO) &&
2198 path_equal(d, q)) {
2199
03cfe0d5 2200 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2201 if (r < 0)
56f64d95 2202 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2203 return 0;
57fb9fb5
LP
2204 }
2205
4a62c710
MS
2206 if (unlink(p) < 0)
2207 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2208 } else if (r == -EINVAL) {
2209
2210 if (arg_link_journal == LINK_GUEST &&
2211 rmdir(p) < 0) {
2212
27407a01
ZJS
2213 if (errno == ENOTDIR) {
2214 log_error("%s already exists and is neither a symlink nor a directory", p);
2215 return r;
2216 } else {
56f64d95 2217 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 2218 return -errno;
57fb9fb5 2219 }
57fb9fb5
LP
2220 }
2221 } else if (r != -ENOENT) {
56f64d95 2222 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 2223 return r;
57fb9fb5
LP
2224 }
2225
2226 if (arg_link_journal == LINK_GUEST) {
2227
2228 if (symlink(q, p) < 0) {
574edc90 2229 if (arg_link_journal_try) {
56f64d95 2230 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
2231 return 0;
2232 } else {
56f64d95 2233 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
2234 return -errno;
2235 }
57fb9fb5
LP
2236 }
2237
03cfe0d5 2238 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2239 if (r < 0)
56f64d95 2240 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2241 return 0;
57fb9fb5
LP
2242 }
2243
2244 if (arg_link_journal == LINK_HOST) {
574edc90
MP
2245 /* don't create parents here -- if the host doesn't have
2246 * permanent journal set up, don't force it here */
2247 r = mkdir(p, 0755);
57fb9fb5 2248 if (r < 0) {
574edc90 2249 if (arg_link_journal_try) {
56f64d95 2250 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
2251 return 0;
2252 } else {
56f64d95 2253 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
2254 return r;
2255 }
57fb9fb5
LP
2256 }
2257
27407a01
ZJS
2258 } else if (access(p, F_OK) < 0)
2259 return 0;
57fb9fb5 2260
cdb2b9d0
LP
2261 if (dir_is_empty(q) == 0)
2262 log_warning("%s is not empty, proceeding anyway.", q);
2263
03cfe0d5 2264 r = userns_mkdir(directory, p, 0755, 0, 0);
57fb9fb5 2265 if (r < 0) {
56f64d95 2266 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 2267 return r;
57fb9fb5
LP
2268 }
2269
4543768d 2270 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 2271 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2272
27407a01 2273 return 0;
57fb9fb5
LP
2274}
2275
88213476 2276static int drop_capabilities(void) {
5076f0cc 2277 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
2278}
2279
5aa4bb6b 2280static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 2281 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
03976f7b 2282 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
2283 int r;
2284
eb91eb18
LP
2285 if (!arg_register)
2286 return 0;
2287
1c03020c 2288 r = sd_bus_default_system(&bus);
f647962d
MS
2289 if (r < 0)
2290 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 2291
89f7c846
LP
2292 if (arg_keep_unit) {
2293 r = sd_bus_call_method(
2294 bus,
2295 "org.freedesktop.machine1",
2296 "/org/freedesktop/machine1",
2297 "org.freedesktop.machine1.Manager",
5aa4bb6b 2298 "RegisterMachineWithNetwork",
89f7c846
LP
2299 &error,
2300 NULL,
5aa4bb6b 2301 "sayssusai",
89f7c846
LP
2302 arg_machine,
2303 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2304 "nspawn",
2305 "container",
2306 (uint32_t) pid,
5aa4bb6b
LP
2307 strempty(arg_directory),
2308 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 2309 } else {
9457ac5b 2310 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 2311 char **i;
ce5b3ad4 2312 unsigned j;
9457ac5b
LP
2313
2314 r = sd_bus_message_new_method_call(
89f7c846 2315 bus,
9457ac5b 2316 &m,
89f7c846
LP
2317 "org.freedesktop.machine1",
2318 "/org/freedesktop/machine1",
2319 "org.freedesktop.machine1.Manager",
5aa4bb6b 2320 "CreateMachineWithNetwork");
f647962d 2321 if (r < 0)
f36933fe 2322 return bus_log_create_error(r);
9457ac5b
LP
2323
2324 r = sd_bus_message_append(
2325 m,
5aa4bb6b 2326 "sayssusai",
89f7c846
LP
2327 arg_machine,
2328 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2329 "nspawn",
2330 "container",
2331 (uint32_t) pid,
5aa4bb6b
LP
2332 strempty(arg_directory),
2333 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 2334 if (r < 0)
f36933fe 2335 return bus_log_create_error(r);
9457ac5b
LP
2336
2337 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 2338 if (r < 0)
f36933fe 2339 return bus_log_create_error(r);
9457ac5b
LP
2340
2341 if (!isempty(arg_slice)) {
2342 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 2343 if (r < 0)
f36933fe 2344 return bus_log_create_error(r);
9457ac5b
LP
2345 }
2346
2347 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 2348 if (r < 0)
f36933fe 2349 return bus_log_create_error(r);
9457ac5b 2350
773ce3d8
LP
2351 /* If you make changes here, also make sure to update
2352 * systemd-nspawn@.service, to keep the device
2353 * policies in sync regardless if we are run with or
2354 * without the --keep-unit switch. */
63cc4c31 2355 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
2356 /* Allow the container to
2357 * access and create the API
2358 * device nodes, so that
2359 * PrivateDevices= in the
2360 * container can work
2361 * fine */
2362 "/dev/null", "rwm",
2363 "/dev/zero", "rwm",
2364 "/dev/full", "rwm",
2365 "/dev/random", "rwm",
2366 "/dev/urandom", "rwm",
2367 "/dev/tty", "rwm",
864e1706 2368 "/dev/net/tun", "rwm",
9457ac5b
LP
2369 /* Allow the container
2370 * access to ptys. However,
2371 * do not permit the
2372 * container to ever create
2373 * these device nodes. */
2374 "/dev/pts/ptmx", "rw",
63cc4c31 2375 "char-pts", "rw");
f647962d 2376 if (r < 0)
27023c0e
LP
2377 return bus_log_create_error(r);
2378
ce5b3ad4
SJ
2379 for (j = 0; j < arg_n_custom_mounts; j++) {
2380 CustomMount *cm = &arg_custom_mounts[j];
2381
2382 if (cm->type != CUSTOM_MOUNT_BIND)
2383 continue;
2384
2385 r = is_device_node(cm->source);
2386 if (r < 0)
2387 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2388
2389 if (r) {
2390 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2391 cm->source, cm->read_only ? "r" : "rw");
2392 if (r < 0)
2393 return log_error_errno(r, "Failed to append message arguments: %m");
2394 }
2395 }
2396
27023c0e
LP
2397 if (arg_kill_signal != 0) {
2398 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2399 if (r < 0)
2400 return bus_log_create_error(r);
2401
2402 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2403 if (r < 0)
2404 return bus_log_create_error(r);
2405 }
9457ac5b 2406
f36933fe
LP
2407 STRV_FOREACH(i, arg_property) {
2408 r = sd_bus_message_open_container(m, 'r', "sv");
2409 if (r < 0)
2410 return bus_log_create_error(r);
2411
2412 r = bus_append_unit_property_assignment(m, *i);
2413 if (r < 0)
2414 return r;
2415
2416 r = sd_bus_message_close_container(m);
2417 if (r < 0)
2418 return bus_log_create_error(r);
2419 }
2420
9457ac5b 2421 r = sd_bus_message_close_container(m);
f647962d 2422 if (r < 0)
f36933fe 2423 return bus_log_create_error(r);
9457ac5b
LP
2424
2425 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
2426 }
2427
9444b1f2 2428 if (r < 0) {
1f0cd86b
LP
2429 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2430 return r;
2431 }
2432
2433 return 0;
2434}
2435
2436static int terminate_machine(pid_t pid) {
2437 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2438 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
03976f7b 2439 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
2440 const char *path;
2441 int r;
2442
eb91eb18
LP
2443 if (!arg_register)
2444 return 0;
2445
1a2399e5
LP
2446 /* If we are reusing the unit, then just exit, systemd will do
2447 * the right thing when we exit. */
2448 if (arg_keep_unit)
2449 return 0;
2450
76b54375 2451 r = sd_bus_default_system(&bus);
f647962d
MS
2452 if (r < 0)
2453 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
2454
2455 r = sd_bus_call_method(
2456 bus,
2457 "org.freedesktop.machine1",
2458 "/org/freedesktop/machine1",
2459 "org.freedesktop.machine1.Manager",
2460 "GetMachineByPID",
2461 &error,
2462 &reply,
2463 "u",
2464 (uint32_t) pid);
2465 if (r < 0) {
2466 /* Note that the machine might already have been
2467 * cleaned up automatically, hence don't consider it a
2468 * failure if we cannot get the machine object. */
2469 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2470 return 0;
2471 }
2472
2473 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2474 if (r < 0)
2475 return bus_log_parse_error(r);
9444b1f2 2476
1f0cd86b
LP
2477 r = sd_bus_call_method(
2478 bus,
2479 "org.freedesktop.machine1",
2480 path,
2481 "org.freedesktop.machine1.Machine",
2482 "Terminate",
2483 &error,
2484 NULL,
2485 NULL);
2486 if (r < 0) {
2487 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2488 return 0;
2489 }
2490
9444b1f2
LP
2491 return 0;
2492}
2493
db999e0f
LP
2494static int reset_audit_loginuid(void) {
2495 _cleanup_free_ char *p = NULL;
2496 int r;
2497
2498 if (arg_share_system)
2499 return 0;
2500
2501 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2502 if (r == -ENOENT)
db999e0f 2503 return 0;
f647962d
MS
2504 if (r < 0)
2505 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2506
2507 /* Already reset? */
2508 if (streq(p, "4294967295"))
2509 return 0;
2510
2511 r = write_string_file("/proc/self/loginuid", "4294967295");
2512 if (r < 0) {
10a87006
LP
2513 log_error_errno(r,
2514 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2515 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2516 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2517 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2518 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2519
db999e0f 2520 sleep(5);
77b6e194 2521 }
db999e0f
LP
2522
2523 return 0;
77b6e194
LP
2524}
2525
4f758c23
LP
2526#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2527#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 2528#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2529
a90e2305 2530static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2531 uint8_t result[8];
2532 size_t l, sz;
a90e2305
LP
2533 uint8_t *v, *i;
2534 int r;
01dde061
TG
2535
2536 l = strlen(arg_machine);
2537 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2538 if (idx > 0)
2539 sz += sizeof(idx);
a90e2305 2540
01dde061
TG
2541 v = alloca(sz);
2542
2543 /* fetch some persistent data unique to the host */
2544 r = sd_id128_get_machine((sd_id128_t*) v);
2545 if (r < 0)
2546 return r;
2547
2548 /* combine with some data unique (on this host) to this
2549 * container instance */
a90e2305
LP
2550 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2551 if (idx > 0) {
2552 idx = htole64(idx);
2553 memcpy(i, &idx, sizeof(idx));
2554 }
01dde061
TG
2555
2556 /* Let's hash the host machine ID plus the container name. We
2557 * use a fixed, but originally randomly created hash key here. */
4f758c23 2558 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2559
2560 assert_cc(ETH_ALEN <= sizeof(result));
2561 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2562
2563 /* see eth_random_addr in the kernel */
2564 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2565 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2566
2567 return 0;
2568}
2569
5aa4bb6b 2570static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1c4baffc
TG
2571 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2572 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4f758c23 2573 struct ether_addr mac_host, mac_container;
5aa4bb6b 2574 int r, i;
69c79d3c
LP
2575
2576 if (!arg_private_network)
2577 return 0;
2578
2579 if (!arg_network_veth)
2580 return 0;
2581
08af0da2
LP
2582 /* Use two different interface name prefixes depending whether
2583 * we are in bridge mode or not. */
c00524c9 2584 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2585 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2586
e867ceb6
LP
2587 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2588 if (r < 0)
2589 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2590
e867ceb6
LP
2591 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2592 if (r < 0)
2593 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2594
1c4baffc 2595 r = sd_netlink_open(&rtnl);
f647962d
MS
2596 if (r < 0)
2597 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2598
151b9b96 2599 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2600 if (r < 0)
2601 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2602
1c4baffc 2603 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2604 if (r < 0)
2605 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2606
1c4baffc 2607 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2608 if (r < 0)
2609 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2610
1c4baffc 2611 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2612 if (r < 0)
2613 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2614
1c4baffc 2615 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2616 if (r < 0)
2617 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2618
1c4baffc 2619 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2620 if (r < 0)
2621 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2622
1c4baffc 2623 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2624 if (r < 0)
2625 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2626
1c4baffc 2627 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2628 if (r < 0)
2629 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2630
1c4baffc 2631 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2632 if (r < 0)
2633 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c 2634
1c4baffc 2635 r = sd_netlink_message_close_container(m);
f647962d
MS
2636 if (r < 0)
2637 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2638
1c4baffc 2639 r = sd_netlink_message_close_container(m);
f647962d
MS
2640 if (r < 0)
2641 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2642
1c4baffc 2643 r = sd_netlink_message_close_container(m);
f647962d
MS
2644 if (r < 0)
2645 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2646
1c4baffc 2647 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d 2648 if (r < 0)
637aa8a3 2649 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
69c79d3c 2650
5aa4bb6b 2651 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2652 if (i <= 0)
2653 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2654
2655 *ifi = i;
2656
69c79d3c
LP
2657 return 0;
2658}
2659
5aa4bb6b 2660static int setup_bridge(const char veth_name[], int *ifi) {
1c4baffc
TG
2661 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2662 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
ab046dde
TG
2663 int r, bridge;
2664
2665 if (!arg_private_network)
2666 return 0;
2667
2668 if (!arg_network_veth)
2669 return 0;
2670
2671 if (!arg_network_bridge)
2672 return 0;
2673
2674 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2675 if (bridge <= 0)
2676 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2677
5aa4bb6b
LP
2678 *ifi = bridge;
2679
1c4baffc 2680 r = sd_netlink_open(&rtnl);
f647962d
MS
2681 if (r < 0)
2682 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2683
151b9b96 2684 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2685 if (r < 0)
2686 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2687
039dd4af 2688 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2689 if (r < 0)
2690 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2691
1c4baffc 2692 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2693 if (r < 0)
2694 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde 2695
1c4baffc 2696 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2697 if (r < 0)
2698 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde 2699
1c4baffc 2700 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2701 if (r < 0)
2702 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2703
2704 return 0;
2705}
2706
c74e630d
LP
2707static int parse_interface(struct udev *udev, const char *name) {
2708 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2709 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2710 int ifi;
2711
2712 ifi = (int) if_nametoindex(name);
4a62c710
MS
2713 if (ifi <= 0)
2714 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2715
2716 sprintf(ifi_str, "n%i", ifi);
2717 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2718 if (!d)
2719 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2720
2721 if (udev_device_get_is_initialized(d) <= 0) {
2722 log_error("Network interface %s is not initialized yet.", name);
2723 return -EBUSY;
2724 }
2725
2726 return ifi;
2727}
2728
69c79d3c 2729static int move_network_interfaces(pid_t pid) {
7e227024 2730 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2731 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
aa28aefe
LP
2732 char **i;
2733 int r;
2734
2735 if (!arg_private_network)
2736 return 0;
2737
2738 if (strv_isempty(arg_network_interfaces))
2739 return 0;
2740
1c4baffc 2741 r = sd_netlink_open(&rtnl);
f647962d
MS
2742 if (r < 0)
2743 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2744
7e227024
LP
2745 udev = udev_new();
2746 if (!udev) {
2747 log_error("Failed to connect to udev.");
2748 return -ENOMEM;
2749 }
2750
aa28aefe 2751 STRV_FOREACH(i, arg_network_interfaces) {
1c4baffc 2752 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
b88eb17a 2753 int ifi;
aa28aefe 2754
c74e630d
LP
2755 ifi = parse_interface(udev, *i);
2756 if (ifi < 0)
2757 return ifi;
2758
3125b3ef 2759 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2760 if (r < 0)
2761 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2762
1c4baffc 2763 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2764 if (r < 0)
2765 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2766
1c4baffc 2767 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2768 if (r < 0)
2769 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2770 }
7e227024 2771
c74e630d
LP
2772 return 0;
2773}
2774
2775static int setup_macvlan(pid_t pid) {
2776 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2777 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
e867ceb6 2778 unsigned idx = 0;
c74e630d
LP
2779 char **i;
2780 int r;
2781
2782 if (!arg_private_network)
2783 return 0;
2784
2785 if (strv_isempty(arg_network_macvlan))
2786 return 0;
2787
1c4baffc 2788 r = sd_netlink_open(&rtnl);
f647962d
MS
2789 if (r < 0)
2790 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2791
2792 udev = udev_new();
2793 if (!udev) {
2794 log_error("Failed to connect to udev.");
2795 return -ENOMEM;
2796 }
2797
2798 STRV_FOREACH(i, arg_network_macvlan) {
1c4baffc 2799 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
c74e630d 2800 _cleanup_free_ char *n = NULL;
e867ceb6 2801 struct ether_addr mac;
c74e630d
LP
2802 int ifi;
2803
2804 ifi = parse_interface(udev, *i);
2805 if (ifi < 0)
2806 return ifi;
2807
e867ceb6
LP
2808 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2809 if (r < 0)
2810 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2811
c74e630d 2812 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2813 if (r < 0)
2814 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2815
1c4baffc 2816 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2817 if (r < 0)
2818 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2819
2820 n = strappend("mv-", *i);
2821 if (!n)
2822 return log_oom();
2823
2824 strshorten(n, IFNAMSIZ-1);
2825
1c4baffc 2826 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2827 if (r < 0)
2828 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2829
1c4baffc 2830 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
e867ceb6
LP
2831 if (r < 0)
2832 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2833
1c4baffc 2834 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2835 if (r < 0)
2836 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d 2837
1c4baffc 2838 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2839 if (r < 0)
2840 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2841
1c4baffc 2842 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2843 if (r < 0)
2844 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2845
1c4baffc 2846 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2847 if (r < 0)
2848 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d 2849
1c4baffc 2850 r = sd_netlink_message_close_container(m);
f647962d
MS
2851 if (r < 0)
2852 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d 2853
1c4baffc 2854 r = sd_netlink_message_close_container(m);
f647962d
MS
2855 if (r < 0)
2856 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe 2857
1c4baffc 2858 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2859 if (r < 0)
2860 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2861 }
2862
2863 return 0;
2864}
2865
4bbfe7ad
TG
2866static int setup_ipvlan(pid_t pid) {
2867 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2868 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4bbfe7ad
TG
2869 char **i;
2870 int r;
2871
2872 if (!arg_private_network)
2873 return 0;
2874
2875 if (strv_isempty(arg_network_ipvlan))
2876 return 0;
2877
1c4baffc 2878 r = sd_netlink_open(&rtnl);
4bbfe7ad
TG
2879 if (r < 0)
2880 return log_error_errno(r, "Failed to connect to netlink: %m");
2881
2882 udev = udev_new();
2883 if (!udev) {
2884 log_error("Failed to connect to udev.");
2885 return -ENOMEM;
2886 }
2887
2888 STRV_FOREACH(i, arg_network_ipvlan) {
1c4baffc 2889 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
4bbfe7ad
TG
2890 _cleanup_free_ char *n = NULL;
2891 int ifi;
2892
2893 ifi = parse_interface(udev, *i);
2894 if (ifi < 0)
2895 return ifi;
2896
2897 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2898 if (r < 0)
2899 return log_error_errno(r, "Failed to allocate netlink message: %m");
2900
1c4baffc 2901 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
4bbfe7ad
TG
2902 if (r < 0)
2903 return log_error_errno(r, "Failed to add netlink interface index: %m");
2904
2905 n = strappend("iv-", *i);
2906 if (!n)
2907 return log_oom();
2908
2909 strshorten(n, IFNAMSIZ-1);
2910
1c4baffc 2911 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
4bbfe7ad
TG
2912 if (r < 0)
2913 return log_error_errno(r, "Failed to add netlink interface name: %m");
2914
1c4baffc 2915 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
4bbfe7ad
TG
2916 if (r < 0)
2917 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2918
1c4baffc 2919 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
4bbfe7ad
TG
2920 if (r < 0)
2921 return log_error_errno(r, "Failed to open netlink container: %m");
2922
1c4baffc 2923 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
4bbfe7ad
TG
2924 if (r < 0)
2925 return log_error_errno(r, "Failed to open netlink container: %m");
2926
1c4baffc 2927 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
4bbfe7ad
TG
2928 if (r < 0)
2929 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2930
1c4baffc 2931 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
2932 if (r < 0)
2933 return log_error_errno(r, "Failed to close netlink container: %m");
2934
1c4baffc 2935 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
2936 if (r < 0)
2937 return log_error_errno(r, "Failed to close netlink container: %m");
2938
1c4baffc 2939 r = sd_netlink_call(rtnl, m, 0, NULL);
4bbfe7ad
TG
2940 if (r < 0)
2941 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2942 }
2943
2944 return 0;
2945}
2946
/* Builds and loads the seccomp filter for the container payload.
 *
 * Default action is "allow". Syscalls listed below are answered with EPERM
 * unless the capability they are keyed on is part of arg_retain, and
 * socket(AF_NETLINK, *, NETLINK_AUDIT) is answered with EAFNOSUPPORT.
 *
 * Returns 0 on success (or when built without HAVE_SECCOMP, or when loading
 * the filter fails with -EINVAL), a negative value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* Each syscall is only blocked while the associated capability is not retained. */
        static const struct {
                uint64_t capability;
                int syscall_num;
        } gated_syscalls[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(gated_syscalls); k++) {

                /* Capability retained by the container: leave the syscall alone */
                if (arg_retain & (1ULL << gated_syscalls[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated_syscalls[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace audit
         * hookup fails when run inside one. We don't care and simply turn
         * off creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                /* Not treated as fatal: continue without a filter */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3041
785890ac
LP
3042static int setup_propagate(const char *root) {
3043 const char *p, *q;
3044
3045 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3046 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 3047 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
3048 (void) mkdir_p(p, 0600);
3049
03cfe0d5
LP
3050 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3051 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3052
3053 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3054 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3055
3056 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3057 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 3058
03cfe0d5 3059 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
3060 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3061 return log_error_errno(errno, "Failed to install propagation bind mount.");
3062
3063 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3064 return log_error_errno(errno, "Failed to make propagation mount read-only");
3065
3066 return 0;
3067}
3068
1b9e5b12
LP
3069static int setup_image(char **device_path, int *loop_nr) {
3070 struct loop_info64 info = {
3071 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3072 };
3073 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3074 _cleanup_free_ char* loopdev = NULL;
3075 struct stat st;
3076 int r, nr;
3077
3078 assert(device_path);
3079 assert(loop_nr);
ec16945e 3080 assert(arg_image);
1b9e5b12
LP
3081
3082 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
3083 if (fd < 0)
3084 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 3085
4a62c710
MS
3086 if (fstat(fd, &st) < 0)
3087 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
3088
3089 if (S_ISBLK(st.st_mode)) {
3090 char *p;
3091
3092 p = strdup(arg_image);
3093 if (!p)
3094 return log_oom();
3095
3096 *device_path = p;
3097
3098 *loop_nr = -1;
3099
3100 r = fd;
3101 fd = -1;
3102
3103 return r;
3104 }
3105
3106 if (!S_ISREG(st.st_mode)) {
56f64d95 3107 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
3108 return -EINVAL;
3109 }
3110
3111 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
3112 if (control < 0)
3113 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
3114
3115 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
3116 if (nr < 0)
3117 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
3118
3119 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3120 return log_oom();
3121
3122 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
3123 if (loop < 0)
3124 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 3125
4a62c710
MS
3126 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3127 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
3128
3129 if (arg_read_only)
3130 info.lo_flags |= LO_FLAGS_READ_ONLY;
3131
4a62c710
MS
3132 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3133 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
3134
3135 *device_path = loopdev;
3136 loopdev = NULL;
3137
3138 *loop_nr = nr;
3139
3140 r = loop;
3141 loop = -1;
3142
3143 return r;
3144}
3145
ada4799a
LP
/* Explanatory text appended to the partition table related error messages,
 * describing which disk image layouts are accepted. */
#define PARTITION_TABLE_BLURB                                                   \
        "Note that the disk image needs to either contain only a single MBR "  \
        "partition of\n"                                                        \
        "type 0x83 that is marked bootable, or a single GPT partition of "     \
        "type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
3152
1b9e5b12
LP
/* Identifies the root, /home and /srv partitions of the disk image open on fd.
 *
 * The partition table is read with libblkid and must be GPT or MBR ("dos").
 * The kernel's partition devices are enumerated through udev and matched
 * against the blkid partition list.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. Partitions are
 * classified by type UUID (home, srv, native root, secondary root, generic
 * Linux); for each typed class the partition with the lowest partition
 * number wins.
 * MBR: only partitions whose flags equal 0x80 (bootable) and whose type is
 * 0x83 are considered, and they are treated as "generic".
 *
 * Root selection order: native root, then secondary root, then a generic
 * partition, which is only accepted if there is exactly one.
 *
 * Outputs (strings are newly allocated and owned by the caller):
 *   *root_device / *root_device_rw  always set on success
 *   *secondary                      true iff the secondary root was chosen
 *   *home_device / *home_device_rw  set only if a home partition was found
 *   *srv_device / *srv_device_rw    set only if a srv partition was found
 *
 * Returns 0 on success, a negative errno-style value on failure, and
 * -EOPNOTSUPP when built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails; streq_ptr() is used below for that reason */
        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait (at most 10 rounds) until the kernel shows as many partition
         * devices as blkid found in the partition table. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev list is
                 * expected to hold one entry more than that; presumably the
                 * extra entry is the whole device itself (the loop further
                 * down skips the entry whose devnum equals st.st_rdev). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Must be stored in "generic", not "root":
                                 * otherwise the check above never fires and
                                 * multiple bootable partitions go undetected. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3525
/* Mounts the block device "what" below "where".
 *
 * what:      device node to mount
 * where:     base directory of the mount target
 * directory: optional suffix appended to "where" (e.g. "/home"); NULL mounts
 *            directly on "where"
 * rw:        mount writable; forced to false when arg_read_only is set
 *
 * The file system type is probed with libblkid. LUKS volumes are refused.
 * The mount always uses MS_NODEV.
 *
 * Returns 0 on success, a negative errno-style value on failure, and
 * -EOPNOTSUPP when built without HAVE_BLKID. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A plain error
                 * (-1) is handled by the branch below so that its cause is
                 * reported; this matches the check used when probing the
                 * partition table. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3589
727fd4fd
LP
3590static int mount_devices(
3591 const char *where,
3592 const char *root_device, bool root_device_rw,
3593 const char *home_device, bool home_device_rw,
3594 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
3595 int r;
3596
3597 assert(where);
3598
3599 if (root_device) {
727fd4fd 3600 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
3601 if (r < 0)
3602 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
3603 }
3604
3605 if (home_device) {
727fd4fd 3606 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
3607 if (r < 0)
3608 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
3609 }
3610
3611 if (srv_device) {
727fd4fd 3612 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
3613 if (r < 0)
3614 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
3615 }
3616
3617 return 0;
3618}
3619
3620static void loop_remove(int nr, int *image_fd) {
3621 _cleanup_close_ int control = -1;
e8c8ddcc 3622 int r;
1b9e5b12
LP
3623
3624 if (nr < 0)
3625 return;
3626
3627 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3628 r = ioctl(*image_fd, LOOP_CLR_FD);
3629 if (r < 0)
5e4074aa 3630 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3631 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3632 }
3633
3634 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3635 if (control < 0) {
56f64d95 3636 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3637 return;
e8c8ddcc 3638 }
1b9e5b12 3639
e8c8ddcc
TG
3640 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3641 if (r < 0)
5e4074aa 3642 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3643}
3644
0cb9fbcd
LP
3645static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3646 int pipe_fds[2];
3647 pid_t pid;
3648
3649 assert(database);
3650 assert(key);
3651 assert(rpid);
3652
4a62c710
MS
3653 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3654 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3655
3656 pid = fork();
4a62c710
MS
3657 if (pid < 0)
3658 return log_error_errno(errno, "Failed to fork getent child: %m");
3659 else if (pid == 0) {
0cb9fbcd
LP
3660 int nullfd;
3661 char *empty_env = NULL;
3662
3663 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3664 _exit(EXIT_FAILURE);
3665
3666 if (pipe_fds[0] > 2)
03e334a1 3667 safe_close(pipe_fds[0]);
0cb9fbcd 3668 if (pipe_fds[1] > 2)
03e334a1 3669 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3670
3671 nullfd = open("/dev/null", O_RDWR);
3672 if (nullfd < 0)
3673 _exit(EXIT_FAILURE);
3674
3675 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3676 _exit(EXIT_FAILURE);
3677
3678 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3679 _exit(EXIT_FAILURE);
3680
3681 if (nullfd > 2)
03e334a1 3682 safe_close(nullfd);
0cb9fbcd 3683
ce30c8dc
LP
3684 (void) reset_all_signal_handlers();
3685 (void) reset_signal_mask();
0cb9fbcd
LP
3686 close_all_fds(NULL, 0);
3687
4de82926
MM
3688 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3689 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3690 _exit(EXIT_FAILURE);
3691 }
3692
03e334a1 3693 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3694
3695 *rpid = pid;
3696
3697 return pipe_fds[0];
3698}
3699
3700static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3701 char line[LINE_MAX], *x, *u, *g, *h;
3702 const char *word, *state;
0cb9fbcd
LP
3703 _cleanup_free_ uid_t *uids = NULL;
3704 _cleanup_free_ char *home = NULL;
3705 _cleanup_fclose_ FILE *f = NULL;
3706 _cleanup_close_ int fd = -1;
3707 unsigned n_uids = 0;
70f539ca 3708 size_t sz = 0, l;
0cb9fbcd
LP
3709 uid_t uid;
3710 gid_t gid;
3711 pid_t pid;
3712 int r;
3713
3714 assert(_home);
3715
3716 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3717 /* Reset everything fully to 0, just in case */
3718
03cfe0d5
LP
3719 r = reset_uid_gid();
3720 if (r < 0)
3721 return log_error_errno(r, "Failed to become root: %m");
0cb9fbcd
LP
3722
3723 *_home = NULL;
3724 return 0;
3725 }
3726
3727 /* First, get user credentials */
3728 fd = spawn_getent("passwd", arg_user, &pid);
3729 if (fd < 0)
3730 return fd;
3731
3732 f = fdopen(fd, "r");
3733 if (!f)
3734 return log_oom();
3735 fd = -1;
3736
3737 if (!fgets(line, sizeof(line), f)) {
3738
3739 if (!ferror(f)) {
3740 log_error("Failed to resolve user %s.", arg_user);
3741 return -ESRCH;
3742 }
3743
56f64d95 3744 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3745 return -errno;
3746 }
3747
3748 truncate_nl(line);
3749
820d3acf 3750 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3751
3752 x = strchr(line, ':');
3753 if (!x) {
3754 log_error("/etc/passwd entry has invalid user field.");
3755 return -EIO;
3756 }
3757
3758 u = strchr(x+1, ':');
3759 if (!u) {
3760 log_error("/etc/passwd entry has invalid password field.");
3761 return -EIO;
3762 }
3763
3764 u++;
3765 g = strchr(u, ':');
3766 if (!g) {
3767 log_error("/etc/passwd entry has invalid UID field.");
3768 return -EIO;
3769 }
3770
3771 *g = 0;
3772 g++;
3773 x = strchr(g, ':');
3774 if (!x) {
3775 log_error("/etc/passwd entry has invalid GID field.");
3776 return -EIO;
3777 }
3778
3779 *x = 0;
3780 h = strchr(x+1, ':');
3781 if (!h) {
3782 log_error("/etc/passwd entry has invalid GECOS field.");
3783 return -EIO;
3784 }
3785
3786 h++;
3787 x = strchr(h, ':');
3788 if (!x) {
3789 log_error("/etc/passwd entry has invalid home directory field.");
3790 return -EIO;
3791 }
3792
3793 *x = 0;
3794
3795 r = parse_uid(u, &uid);
3796 if (r < 0) {
3797 log_error("Failed to parse UID of user.");
3798 return -EIO;
3799 }
3800
3801 r = parse_gid(g, &gid);
3802 if (r < 0) {
3803 log_error("Failed to parse GID of user.");
3804 return -EIO;
3805 }
3806
3807 home = strdup(h);
3808 if (!home)
3809 return log_oom();
3810
3811 /* Second, get group memberships */
3812 fd = spawn_getent("initgroups", arg_user, &pid);
3813 if (fd < 0)
3814 return fd;
3815
3816 fclose(f);
3817 f = fdopen(fd, "r");
3818 if (!f)
3819 return log_oom();
3820 fd = -1;
3821
3822 if (!fgets(line, sizeof(line), f)) {
3823 if (!ferror(f)) {
3824 log_error("Failed to resolve user %s.", arg_user);
3825 return -ESRCH;
3826 }
3827
56f64d95 3828 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3829 return -errno;
3830 }
3831
3832 truncate_nl(line);
3833
820d3acf 3834 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3835
3836 /* Skip over the username and subsequent separator whitespace */
3837 x = line;
3838 x += strcspn(x, WHITESPACE);
3839 x += strspn(x, WHITESPACE);
3840
a2a5291b 3841 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3842 char c[l+1];
3843
a2a5291b 3844 memcpy(c, word, l);
0cb9fbcd
LP
3845 c[l] = 0;
3846
3847 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3848 return log_oom();
3849
3850 r = parse_uid(c, &uids[n_uids++]);
3851 if (r < 0) {
3852 log_error("Failed to parse group data from getent.");
3853 return -EIO;
3854 }
3855 }
3856
3857 r = mkdir_parents(home, 0775);
f647962d
MS
3858 if (r < 0)
3859 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3860
3861 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3862 if (r < 0 && r != -EEXIST)
3863 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd 3864
03cfe0d5
LP
3865 (void) fchown(STDIN_FILENO, uid, gid);
3866 (void) fchown(STDOUT_FILENO, uid, gid);
3867 (void) fchown(STDERR_FILENO, uid, gid);
0cb9fbcd 3868
4a62c710
MS
3869 if (setgroups(n_uids, uids) < 0)
3870 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3871
4a62c710
MS
3872 if (setresgid(gid, gid, gid) < 0)
3873 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3874
4a62c710
MS
3875 if (setresuid(uid, uid, uid) < 0)
3876 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3877
3878 if (_home) {
3879 *_home = home;
3880 home = NULL;
3881 }
3882
3883 return 0;
3884}
3885
113cea80 3886/*
6d416b9c
LS
3887 * Return values:
3888 * < 0 : wait_for_terminate() failed to get the state of the
3889 * container, the container was terminated by a signal, or
3890 * failed for an unknown reason. No change is made to the
3891 * container argument.
3892 * > 0 : The program executed in the container terminated with an
3893 * error. The exit code of the program executed in the
919699ec
LP
3894 * container is returned. The container argument has been set
3895 * to CONTAINER_TERMINATED.
6d416b9c
LS
3896 * 0 : The container is being rebooted, has been shut down or exited
3897 * successfully. The container argument has been set to either
3898 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3899 *
6d416b9c
LS
3900 * That is, success is indicated by a return value of zero, and an
3901 * error is indicated by a non-zero value.
113cea80
DH
3902 */
3903static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3904 siginfo_t status;
919699ec 3905 int r;
113cea80
DH
3906
3907 r = wait_for_terminate(pid, &status);
f647962d
MS
3908 if (r < 0)
3909 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3910
3911 switch (status.si_code) {
fddbb89c 3912
113cea80 3913 case CLD_EXITED:
919699ec
LP
3914 if (status.si_status == 0) {
3915 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3916
fddbb89c 3917 } else
919699ec 3918 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3919
919699ec
LP
3920 *container = CONTAINER_TERMINATED;
3921 return status.si_status;
113cea80
DH
3922
3923 case CLD_KILLED:
3924 if (status.si_status == SIGINT) {
113cea80 3925
919699ec 3926 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3927 *container = CONTAINER_TERMINATED;
919699ec
LP
3928 return 0;
3929
113cea80 3930 } else if (status.si_status == SIGHUP) {
113cea80 3931
919699ec 3932 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3933 *container = CONTAINER_REBOOTED;
919699ec 3934 return 0;
113cea80 3935 }
919699ec 3936
113cea80
DH
3937 /* CLD_KILLED fallthrough */
3938
3939 case CLD_DUMPED:
fddbb89c 3940 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3941 return -EIO;
113cea80
DH
3942
3943 default:
fddbb89c 3944 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3945 return -EIO;
113cea80
DH
3946 }
3947
3948 return r;
3949}
3950
e866af3a
DH
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
        /* intentionally empty */
}
3952
023fb90b
LP
3953static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3954 pid_t pid;
3955
3956 pid = PTR_TO_UINT32(userdata);
3957 if (pid > 0) {
c6c8f6e2 3958 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
3959 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3960 sd_event_source_set_userdata(s, NULL);
3961 return 0;
3962 }
3963 }
3964
3965 sd_event_exit(sd_event_source_get_event(s), 0);
3966 return 0;
3967}
3968
ec16945e 3969static int determine_names(void) {
1b9cebf6 3970 int r;
ec16945e
LP
3971
3972 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3973 if (arg_machine) {
3974 _cleanup_(image_unrefp) Image *i = NULL;
3975
3976 r = image_find(arg_machine, &i);
3977 if (r < 0)
3978 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3979 else if (r == 0) {
3980 log_error("No image for machine '%s': %m", arg_machine);
3981 return -ENOENT;
3982 }
3983
aceac2f0 3984 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3985 r = set_sanitized_path(&arg_image, i->path);
3986 else
3987 r = set_sanitized_path(&arg_directory, i->path);
3988 if (r < 0)
3989 return log_error_errno(r, "Invalid image directory: %m");
3990
aee327b8
LP
3991 if (!arg_ephemeral)
3992 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 3993 } else
ec16945e
LP
3994 arg_directory = get_current_dir_name();
3995
1b9cebf6
LP
3996 if (!arg_directory && !arg_machine) {
3997 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3998 return -EINVAL;
3999 }
4000 }
4001
4002 if (!arg_machine) {
b9ba4dab
LP
4003 if (arg_directory && path_equal(arg_directory, "/"))
4004 arg_machine = gethostname_malloc();
4005 else
4006 arg_machine = strdup(basename(arg_image ?: arg_directory));
4007
ec16945e
LP
4008 if (!arg_machine)
4009 return log_oom();
4010
4011 hostname_cleanup(arg_machine, false);
4012 if (!machine_name_is_valid(arg_machine)) {
4013 log_error("Failed to determine machine name automatically, please use -M.");
4014 return -EINVAL;
4015 }
b9ba4dab
LP
4016
4017 if (arg_ephemeral) {
4018 char *b;
4019
4020 /* Add a random suffix when this is an
4021 * ephemeral machine, so that we can run many
4022 * instances at once without manually having
4023 * to specify -M each time. */
4024
4025 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4026 return log_oom();
4027
4028 free(arg_machine);
4029 arg_machine = b;
4030 }
ec16945e
LP
4031 }
4032
4033 return 0;
4034}
4035
03cfe0d5 4036static int determine_uid_shift(const char *directory) {
6dac160c
LP
4037 int r;
4038
03cfe0d5
LP
4039 if (!arg_userns) {
4040 arg_uid_shift = 0;
6dac160c 4041 return 0;
03cfe0d5 4042 }
6dac160c
LP
4043
4044 if (arg_uid_shift == UID_INVALID) {
4045 struct stat st;
4046
03cfe0d5 4047 r = stat(directory, &st);
6dac160c 4048 if (r < 0)
03cfe0d5 4049 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
4050
4051 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4052
4053 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 4054 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
4055 return -EINVAL;
4056 }
4057
4058 arg_uid_range = UINT32_C(0x10000);
4059 }
4060
4061 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4062 log_error("UID base too high for UID range.");
4063 return -EINVAL;
4064 }
4065
4066 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4067 return 0;
4068}
4069
03cfe0d5
LP
4070static int inner_child(
4071 Barrier *barrier,
4072 const char *directory,
4073 bool secondary,
4074 int kmsg_socket,
4075 int rtnl_socket,
4076 FDSet *fds,
4077 int argc,
4078 char *argv[]) {
69c79d3c 4079
03cfe0d5
LP
4080 _cleanup_free_ char *home = NULL;
4081 unsigned n_env = 2;
4082 const char *envp[] = {
4083 "PATH=" DEFAULT_PATH_SPLIT_USR,
4084 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4085 NULL, /* TERM */
4086 NULL, /* HOME */
4087 NULL, /* USER */
4088 NULL, /* LOGNAME */
4089 NULL, /* container_uuid */
4090 NULL, /* LISTEN_FDS */
4091 NULL, /* LISTEN_PID */
4092 NULL
4093 };
88213476 4094
2371271c 4095 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 4096 int r;
88213476 4097
03cfe0d5
LP
4098 assert(barrier);
4099 assert(directory);
4100 assert(kmsg_socket >= 0);
88213476 4101
03cfe0d5
LP
4102 if (arg_userns) {
4103 /* Tell the parent, that it now can write the UID map. */
4104 (void) barrier_place(barrier); /* #1 */
7027ff61 4105
03cfe0d5
LP
4106 /* Wait until the parent wrote the UID map */
4107 if (!barrier_place_and_sync(barrier)) { /* #2 */
4108 log_error("Parent died too early");
4109 return -ESRCH;
4110 }
88213476
LP
4111 }
4112
03cfe0d5
LP
4113 r = mount_all(NULL, true);
4114 if (r < 0)
4115 return r;
4116
4117 /* Wait until we are cgroup-ified, so that we
4118 * can mount the right cgroup path writable */
4119 if (!barrier_place_and_sync(barrier)) { /* #3 */
4120 log_error("Parent died too early");
4121 return -ESRCH;
88213476
LP
4122 }
4123
03cfe0d5
LP
4124 r = mount_systemd_cgroup_writable("");
4125 if (r < 0)
4126 return r;
ec16945e 4127
03cfe0d5
LP
4128 r = reset_uid_gid();
4129 if (r < 0)
4130 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 4131
03cfe0d5
LP
4132 r = setup_boot_id(NULL);
4133 if (r < 0)
4134 return r;
ec16945e 4135
03cfe0d5
LP
4136 r = setup_kmsg(NULL, kmsg_socket);
4137 if (r < 0)
4138 return r;
4139 kmsg_socket = safe_close(kmsg_socket);
ec16945e 4140
03cfe0d5 4141 umask(0022);
30535c16 4142
03cfe0d5
LP
4143 if (setsid() < 0)
4144 return log_error_errno(errno, "setsid() failed: %m");
4145
4146 if (arg_private_network)
4147 loopback_setup();
4148
4149 r = send_rtnl(rtnl_socket);
4150 if (r < 0)
4151 return r;
4152 rtnl_socket = safe_close(rtnl_socket);
4153
4154 if (drop_capabilities() < 0)
4155 return log_error_errno(errno, "drop_capabilities() failed: %m");
4156
4157 setup_hostname();
4158
050f7277 4159 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
4160 if (personality(arg_personality) < 0)
4161 return log_error_errno(errno, "personality() failed: %m");
4162 } else if (secondary) {
4163 if (personality(PER_LINUX32) < 0)
4164 return log_error_errno(errno, "personality() failed: %m");
4165 }
4166
4167#ifdef HAVE_SELINUX
4168 if (arg_selinux_context)
4169 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4170 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4171#endif
4172
4173 r = change_uid_gid(&home);
4174 if (r < 0)
4175 return r;
4176
4177 envp[n_env] = strv_find_prefix(environ, "TERM=");
4178 if (envp[n_env])
4179 n_env ++;
4180
4181 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4182 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4183 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4184 return log_oom();
4185
4186 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4187 char as_uuid[37];
4188
4189 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4190 return log_oom();
4191 }
4192
4193 if (fdset_size(fds) > 0) {
4194 r = fdset_cloexec(fds, false);
4195 if (r < 0)
4196 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4197
4198 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4199 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4200 return log_oom();
4201 }
4202
2371271c
TG
4203 env_use = strv_env_merge(2, envp, arg_setenv);
4204 if (!env_use)
4205 return log_oom();
03cfe0d5
LP
4206
4207 /* Let the parent know that we are ready and
4208 * wait until the parent is ready with the
4209 * setup, too... */
4210 if (!barrier_place_and_sync(barrier)) { /* #4 */
4211 log_error("Parent died too early");
4212 return -ESRCH;
4213 }
4214
4215 /* Now, explicitly close the log, so that we
4216 * then can close all remaining fds. Closing
4217 * the log explicitly first has the benefit
4218 * that the logging subsystem knows about it,
4219 * and is thus ready to be reopened should we
4220 * need it again. Note that the other fds
4221 * closed here are at least the locking and
4222 * barrier fds. */
4223 log_close();
4224 (void) fdset_close_others(fds);
4225
4226 if (arg_boot) {
4227 char **a;
4228 size_t m;
4229
4230 /* Automatically search for the init system */
4231
4232 m = 1 + argc - optind;
4233 a = newa(char*, m + 1);
4234 memcpy(a + 1, argv + optind, m * sizeof(char*));
4235
4236 a[0] = (char*) "/usr/lib/systemd/systemd";
4237 execve(a[0], a, env_use);
4238
4239 a[0] = (char*) "/lib/systemd/systemd";
4240 execve(a[0], a, env_use);
4241
4242 a[0] = (char*) "/sbin/init";
4243 execve(a[0], a, env_use);
4244 } else if (argc > optind)
4245 execvpe(argv[optind], argv + optind, env_use);
4246 else {
4247 chdir(home ? home : "/root");
4248 execle("/bin/bash", "-bash", NULL, env_use);
4249 execle("/bin/sh", "-sh", NULL, env_use);
4250 }
4251
4252 (void) log_open();
4253 return log_error_errno(errno, "execv() failed: %m");
4254}
4255
4256static int outer_child(
4257 Barrier *barrier,
4258 const char *directory,
4259 const char *console,
4260 const char *root_device, bool root_device_rw,
4261 const char *home_device, bool home_device_rw,
4262 const char *srv_device, bool srv_device_rw,
4263 bool interactive,
4264 bool secondary,
4265 int pid_socket,
4266 int kmsg_socket,
4267 int rtnl_socket,
825d5287 4268 int uid_shift_socket,
03cfe0d5
LP
4269 FDSet *fds,
4270 int argc,
4271 char *argv[]) {
4272
4273 pid_t pid;
4274 ssize_t l;
4275 int r;
4276
4277 assert(barrier);
4278 assert(directory);
4279 assert(console);
4280 assert(pid_socket >= 0);
4281 assert(kmsg_socket >= 0);
4282
4283 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4284 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4285
4286 if (interactive) {
4287 close_nointr(STDIN_FILENO);
4288 close_nointr(STDOUT_FILENO);
4289 close_nointr(STDERR_FILENO);
4290
4291 r = open_terminal(console, O_RDWR);
4292 if (r != STDIN_FILENO) {
4293 if (r >= 0) {
4294 safe_close(r);
4295 r = -EINVAL;
4296 }
4297
4298 return log_error_errno(r, "Failed to open console: %m");
4299 }
4300
4301 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4302 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4303 return log_error_errno(errno, "Failed to duplicate console: %m");
4304 }
4305
4306 r = reset_audit_loginuid();
4307 if (r < 0)
4308 return r;
4309
4310 /* Mark everything as slave, so that we still
4311 * receive mounts from the real root, but don't
4312 * propagate mounts to the real root. */
4313 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4314 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4315
4316 r = mount_devices(directory,
4317 root_device, root_device_rw,
4318 home_device, home_device_rw,
4319 srv_device, srv_device_rw);
4320 if (r < 0)
4321 return r;
4322
391567f4
LP
4323 r = determine_uid_shift(directory);
4324 if (r < 0)
4325 return r;
4326
825d5287
RM
4327 if (arg_userns) {
4328 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4329 if (l < 0)
4330 return log_error_errno(errno, "Failed to send UID shift: %m");
4331 if (l != sizeof(arg_uid_shift)) {
4332 log_error("Short write while sending UID shift.");
4333 return -EIO;
4334 }
4335 }
4336
03cfe0d5
LP
4337 /* Turn directory into bind mount */
4338 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4339 return log_error_errno(errno, "Failed to make bind mount: %m");
4340
03cfe0d5
LP
4341 r = setup_volatile(directory);
4342 if (r < 0)
4343 return r;
4344
03cfe0d5
LP
4345 r = setup_volatile_state(directory);
4346 if (r < 0)
4347 return r;
4348
03cfe0d5
LP
4349 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4350 if (r < 0)
4351 return r;
4352
03cfe0d5
LP
4353 if (arg_read_only) {
4354 r = bind_remount_recursive(directory, true);
4355 if (r < 0)
4356 return log_error_errno(r, "Failed to make tree read-only: %m");
4357 }
4358
03cfe0d5
LP
4359 r = mount_all(directory, false);
4360 if (r < 0)
4361 return r;
4362
4363 if (copy_devnodes(directory) < 0)
4364 return r;
4365
4366 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4367
4368 if (setup_pts(directory) < 0)
4369 return r;
4370
4371 r = setup_propagate(directory);
4372 if (r < 0)
4373 return r;
4374
4375 r = setup_dev_console(directory, console);
4376 if (r < 0)
4377 return r;
4378
4379 r = setup_seccomp();
4380 if (r < 0)
4381 return r;
4382
4383 r = setup_timezone(directory);
4384 if (r < 0)
4385 return r;
4386
4387 r = setup_resolv_conf(directory);
4388 if (r < 0)
4389 return r;
4390
4391 r = setup_journal(directory);
4392 if (r < 0)
4393 return r;
4394
4395 r = mount_custom(directory);
4396 if (r < 0)
4397 return r;
4398
4399 r = mount_cgroup(directory);
4400 if (r < 0)
4401 return r;
4402
4403 r = mount_move_root(directory);
4404 if (r < 0)
4405 return log_error_errno(r, "Failed to move root directory: %m");
4406
4407 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4408 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4409 (arg_private_network ? CLONE_NEWNET : 0) |
4410 (arg_userns ? CLONE_NEWUSER : 0),
4411 NULL);
4412 if (pid < 0)
4413 return log_error_errno(errno, "Failed to fork inner child: %m");
4414
4415 if (pid == 0) {
4416 pid_socket = safe_close(pid_socket);
825d5287 4417 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
4418
4419 /* The inner child has all namespaces that are
4420 * requested, so that we all are owned by the user if
4421 * user namespaces are turned on. */
4422
4423 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4424 if (r < 0)
4425 _exit(EXIT_FAILURE);
4426
4427 _exit(EXIT_SUCCESS);
4428 }
4429
4430 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4431 if (l < 0)
4432 return log_error_errno(errno, "Failed to send PID: %m");
4433 if (l != sizeof(pid)) {
4434 log_error("Short write while sending PID.");
4435 return -EIO;
4436 }
4437
4438 pid_socket = safe_close(pid_socket);
4439
4440 return 0;
4441}
4442
4443static int setup_uid_map(pid_t pid) {
4444 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4445 int r;
4446
4447 assert(pid > 1);
4448
4449 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4450 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4451 r = write_string_file(uid_map, line);
4452 if (r < 0)
4453 return log_error_errno(r, "Failed to write UID map: %m");
4454
4455 /* We always assign the same UID and GID ranges */
4456 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4457 r = write_string_file(uid_map, line);
4458 if (r < 0)
4459 return log_error_errno(r, "Failed to write GID map: %m");
4460
4461 return 0;
4462}
4463
4464static int chown_cgroup(pid_t pid) {
4465 _cleanup_free_ char *path = NULL, *fs = NULL;
4466 _cleanup_close_ int fd = -1;
4467 const char *fn;
4468 int r;
4469
4470 r = cg_pid_get_path(NULL, pid, &path);
4471 if (r < 0)
4472 return log_error_errno(r, "Failed to get container cgroup path: %m");
4473
4474 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4475 if (r < 0)
4476 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4477
4478 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4479 if (fd < 0)
4480 return log_error_errno(errno, "Failed to open %s: %m", fs);
4481
4482 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4483 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4484 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4485
4486 return 0;
4487}
4488
4489int main(int argc, char *argv[]) {
4490
4491 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4492 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4493 _cleanup_close_ int master = -1, image_fd = -1;
4494 _cleanup_fdset_free_ FDSet *fds = NULL;
4495 int r, n_fd_passed, loop_nr = -1;
4496 char veth_name[IFNAMSIZ];
4497 bool secondary = false, remove_subvol = false;
72c0a2c2 4498 sigset_t mask_chld;
03cfe0d5
LP
4499 pid_t pid = 0;
4500 int ret = EXIT_SUCCESS;
4501 union in_addr_union exposed = {};
4502 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4503 bool interactive;
4504
4505 log_parse_environment();
4506 log_open();
4507
4508 r = parse_argv(argc, argv);
4509 if (r <= 0)
4510 goto finish;
4511
4512 r = determine_names();
4513 if (r < 0)
4514 goto finish;
4515
4516 if (geteuid() != 0) {
4517 log_error("Need to be root.");
4518 r = -EPERM;
4519 goto finish;
4520 }
4521
4522 n_fd_passed = sd_listen_fds(false);
4523 if (n_fd_passed > 0) {
4524 r = fdset_new_listen_fds(&fds, false);
4525 if (r < 0) {
4526 log_error_errno(r, "Failed to collect file descriptors: %m");
4527 goto finish;
4528 }
4529 }
4530
4531 if (arg_directory) {
4532 assert(!arg_image);
4533
4534 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4535 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4536 r = -EINVAL;
4537 goto finish;
4538 }
4539
4540 if (arg_ephemeral) {
4541 _cleanup_free_ char *np = NULL;
4542
4543 /* If the specified path is a mount point we
4544 * generate the new snapshot immediately
4545 * inside it under a random name. However if
4546 * the specified is not a mount point we
4547 * create the new snapshot in the parent
4548 * directory, just next to it. */
e26d6ce5 4549 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
4550 if (r < 0) {
4551 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4552 goto finish;
4553 }
4554 if (r > 0)
770b5ce4 4555 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4556 else
770b5ce4 4557 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
4558 if (r < 0) {
4559 log_error_errno(r, "Failed to generate name for snapshot: %m");
4560 goto finish;
4561 }
4562
4563 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4564 if (r < 0) {
4565 log_error_errno(r, "Failed to lock %s: %m", np);
4566 goto finish;
4567 }
4568
4569 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4570 if (r < 0) {
4571 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4572 goto finish;
ec16945e
LP
4573 }
4574
4575 free(arg_directory);
4576 arg_directory = np;
8a16a7b4 4577 np = NULL;
ec16945e
LP
4578
4579 remove_subvol = true;
30535c16
LP
4580
4581 } else {
4582 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4583 if (r == -EBUSY) {
4584 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4585 goto finish;
4586 }
4587 if (r < 0) {
4588 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4589 return r;
4590 }
4591
4592 if (arg_template) {
f70a17f8 4593 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
30535c16
LP
4594 if (r == -EEXIST) {
4595 if (!arg_quiet)
4596 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4597 } else if (r < 0) {
83521414 4598 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4599 goto finish;
4600 } else {
4601 if (!arg_quiet)
4602 log_info("Populated %s from template %s.", arg_directory, arg_template);
4603 }
4604 }
ec16945e
LP
4605 }
4606
1b9e5b12
LP
4607 if (arg_boot) {
4608 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 4609 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 4610 r = -EINVAL;
1b9e5b12
LP
4611 goto finish;
4612 }
4613 } else {
4614 const char *p;
4615
63c372cb 4616 p = strjoina(arg_directory,
1b9e5b12
LP
4617 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4618 if (access(p, F_OK) < 0) {
4619 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 4620 r = -EINVAL;
1b9e5b12 4621 goto finish;
1b9e5b12
LP
4622 }
4623 }
ec16945e 4624
6b9132a9 4625 } else {
1b9e5b12 4626 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 4627
ec16945e
LP
4628 assert(arg_image);
4629 assert(!arg_template);
4630
30535c16
LP
4631 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4632 if (r == -EBUSY) {
4633 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4634 goto finish;
4635 }
4636 if (r < 0) {
4637 r = log_error_errno(r, "Failed to create image lock: %m");
4638 goto finish;
4639 }
4640
1b9e5b12 4641 if (!mkdtemp(template)) {
56f64d95 4642 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 4643 r = -errno;
6b9132a9 4644 goto finish;
1b9e5b12 4645 }
6b9132a9 4646
1b9e5b12
LP
4647 arg_directory = strdup(template);
4648 if (!arg_directory) {
4649 r = log_oom();
4650 goto finish;
6b9132a9 4651 }
88213476 4652
1b9e5b12
LP
4653 image_fd = setup_image(&device_path, &loop_nr);
4654 if (image_fd < 0) {
4655 r = image_fd;
842f3b0f
LP
4656 goto finish;
4657 }
1b9e5b12 4658
4d9f07b4
LP
4659 r = dissect_image(image_fd,
4660 &root_device, &root_device_rw,
4661 &home_device, &home_device_rw,
4662 &srv_device, &srv_device_rw,
4663 &secondary);
1b9e5b12
LP
4664 if (r < 0)
4665 goto finish;
842f3b0f 4666 }
842f3b0f 4667
5a8af538
LP
4668 r = custom_mounts_prepare();
4669 if (r < 0)
4670 goto finish;
4671
03cfe0d5
LP
4672 interactive =
4673 isatty(STDIN_FILENO) > 0 &&
4674 isatty(STDOUT_FILENO) > 0;
9c857b9d 4675
db7feb7e
LP
4676 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4677 if (master < 0) {
ec16945e 4678 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4679 goto finish;
4680 }
4681
611b312b
LP
4682 r = ptsname_malloc(master, &console);
4683 if (r < 0) {
4684 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
4685 goto finish;
4686 }
4687
a258bf26 4688 if (unlockpt(master) < 0) {
ec16945e 4689 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4690 goto finish;
4691 }
4692
9c857b9d
LP
4693 if (!arg_quiet)
4694 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4695 arg_machine, arg_image ?: arg_directory);
4696
72c0a2c2 4697 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4698
023fb90b
LP
4699 assert_se(sigemptyset(&mask_chld) == 0);
4700 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4701
03cfe0d5
LP
4702 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4703 r = log_error_errno(errno, "Failed to become subreaper: %m");
4704 goto finish;
4705 }
4706
d87be9b0 4707 for (;;) {
825d5287
RM
4708 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4709 uid_shift_socket_pair[2] = { -1, -1 };
113cea80 4710 ContainerStatus container_status;
7566e267 4711 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 4712 static const struct sigaction sa = {
e866af3a
DH
4713 .sa_handler = nop_handler,
4714 .sa_flags = SA_NOCLDSTOP,
4715 };
03cfe0d5
LP
4716 int ifi = 0;
4717 ssize_t l;
dbb60d69
LP
4718 _cleanup_event_unref_ sd_event *event = NULL;
4719 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4720 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4721 char last_char = 0;
e866af3a 4722
7566e267 4723 r = barrier_create(&barrier);
a2da110b 4724 if (r < 0) {
da927ba9 4725 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
4726 goto finish;
4727 }
4728
6d0b55c2
LP
4729 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4730 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4731 goto finish;
4732 }
4733
4734 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4735 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4736 goto finish;
4737 }
4738
03cfe0d5
LP
4739 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4740 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4741 goto finish;
4742 }
4743
825d5287
RM
4744 if (arg_userns)
4745 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4746 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4747 goto finish;
4748 }
4749
e866af3a
DH
4750 /* Child can be killed before execv(), so handle SIGCHLD
4751 * in order to interrupt parent's blocking calls and
4752 * give it a chance to call wait() and terminate. */
4753 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4754 if (r < 0) {
ec16945e 4755 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
4756 goto finish;
4757 }
4758
e866af3a
DH
4759 r = sigaction(SIGCHLD, &sa, NULL);
4760 if (r < 0) {
ec16945e 4761 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
4762 goto finish;
4763 }
4764
03cfe0d5 4765 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
4766 if (pid < 0) {
4767 if (errno == EINVAL)
ec16945e 4768 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 4769 else
ec16945e 4770 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 4771
d87be9b0
LP
4772 goto finish;
4773 }
a258bf26 4774
d87be9b0 4775 if (pid == 0) {
03cfe0d5 4776 /* The outer child only has a file system namespace. */
a2da110b
DH
4777 barrier_set_role(&barrier, BARRIER_CHILD);
4778
03e334a1 4779 master = safe_close(master);
a258bf26 4780
03e334a1 4781 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 4782 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 4783 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 4784 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 4785
ce30c8dc
LP
4786 (void) reset_all_signal_handlers();
4787 (void) reset_signal_mask();
f5c1b9ee 4788
03cfe0d5
LP
4789 r = outer_child(&barrier,
4790 arg_directory,
4791 console,
4792 root_device, root_device_rw,
4793 home_device, home_device_rw,
4794 srv_device, srv_device_rw,
4795 interactive,
4796 secondary,
4797 pid_socket_pair[1],
4798 kmsg_socket_pair[1],
4799 rtnl_socket_pair[1],
825d5287 4800 uid_shift_socket_pair[1],
03cfe0d5
LP
4801 fds,
4802 argc, argv);
0cb9fbcd 4803 if (r < 0)
a2da110b 4804 _exit(EXIT_FAILURE);
d87be9b0 4805
03cfe0d5 4806 _exit(EXIT_SUCCESS);
da5b3bad 4807 }
88213476 4808
a2da110b 4809 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 4810
842f3b0f
LP
4811 fdset_free(fds);
4812 fds = NULL;
4813
6d0b55c2
LP
4814 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4815 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 4816 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
6d0b55c2 4817
03cfe0d5
LP
4818 /* Wait for the outer child. */
4819 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4820 if (r < 0)
4821 goto finish;
4822 if (r != 0) {
4823 r = -EIO;
4824 goto finish;
4825 }
4826 pid = 0;
6dac160c 4827
03cfe0d5
LP
4828 /* And now retrieve the PID of the inner child. */
4829 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4830 if (l < 0) {
4831 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4832 goto finish;
4833 }
4834 if (l != sizeof(pid)) {
4835 log_error("Short read while reading inner child PID: %m");
4836 r = EIO;
4837 goto finish;
4838 }
354bfd2b 4839
03cfe0d5 4840 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 4841
03cfe0d5
LP
4842 if (arg_userns) {
4843 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4844 log_error("Child died too early.");
4845 r = -ESRCH;
840295fc 4846 goto finish;
03cfe0d5 4847 }
ab046dde 4848
825d5287
RM
4849 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4850 if (l < 0) {
4851 r = log_error_errno(errno, "Failed to read UID shift: %m");
4852 goto finish;
4853 }
4854 if (l != sizeof(arg_uid_shift)) {
4855 log_error("Short read while reading UID shift: %m");
4856 r = EIO;
4857 goto finish;
4858 }
4859
03cfe0d5 4860 r = setup_uid_map(pid);
840295fc
LP
4861 if (r < 0)
4862 goto finish;
ab046dde 4863
03cfe0d5
LP
4864 (void) barrier_place(&barrier); /* #2 */
4865 }
c74e630d 4866
03cfe0d5
LP
4867 r = move_network_interfaces(pid);
4868 if (r < 0)
4869 goto finish;
4bbfe7ad 4870
03cfe0d5
LP
4871 r = setup_veth(pid, veth_name, &ifi);
4872 if (r < 0)
4873 goto finish;
5aa4bb6b 4874
03cfe0d5
LP
4875 r = setup_bridge(veth_name, &ifi);
4876 if (r < 0)
4877 goto finish;
6dac160c 4878
03cfe0d5
LP
4879 r = setup_macvlan(pid);
4880 if (r < 0)
4881 goto finish;
6dac160c 4882
03cfe0d5
LP
4883 r = setup_ipvlan(pid);
4884 if (r < 0)
4885 goto finish;
6dac160c 4886
03cfe0d5
LP
4887 r = register_machine(pid, ifi);
4888 if (r < 0)
4889 goto finish;
6dac160c 4890
03cfe0d5
LP
4891 r = chown_cgroup(pid);
4892 if (r < 0)
4893 goto finish;
6dac160c 4894
03cfe0d5
LP
4895 /* Notify the child that the parent is ready with all
4896 * its setup (including cgroup-ification), and that
4897 * the child can now hand over control to the code to
4898 * run inside the container. */
4899 (void) barrier_place(&barrier); /* #3 */
6dac160c 4900
03cfe0d5
LP
4901 /* Block SIGCHLD here, before notifying child.
4902 * process_pty() will handle it with the other signals. */
4903 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 4904
03cfe0d5
LP
4905 /* Reset signal to default */
4906 r = default_signals(SIGCHLD, -1);
4907 if (r < 0) {
4908 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4909 goto finish;
4910 }
e866af3a 4911
03cfe0d5
LP
4912 /* Let the child know that we are ready and wait that the child is completely ready now. */
4913 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4914 log_error("Client died too early.");
4915 r = -ESRCH;
4916 goto finish;
4917 }
b12afc8c 4918
03cfe0d5
LP
4919 sd_notifyf(false,
4920 "READY=1\n"
4921 "STATUS=Container running.\n"
4922 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4923
03cfe0d5
LP
4924 r = sd_event_new(&event);
4925 if (r < 0) {
4926 log_error_errno(r, "Failed to get default event source: %m");
4927 goto finish;
4928 }
88213476 4929
03cfe0d5
LP
4930 if (arg_kill_signal > 0) {
4931 /* Try to kill the init system on SIGINT or SIGTERM */
4932 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4933 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4934 } else {
4935 /* Immediately exit */
4936 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4937 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4938 }
023fb90b 4939
03cfe0d5
LP
4940 /* simply exit on sigchld */
4941 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4942
03cfe0d5
LP
4943 if (arg_expose_ports) {
4944 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4945 if (r < 0)
4946 goto finish;
023fb90b 4947
03cfe0d5
LP
4948 (void) expose_ports(rtnl, &exposed);
4949 }
023fb90b 4950
03cfe0d5 4951 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4952
03cfe0d5
LP
4953 r = pty_forward_new(event, master, true, !interactive, &forward);
4954 if (r < 0) {
4955 log_error_errno(r, "Failed to create PTY forwarder: %m");
4956 goto finish;
4957 }
023fb90b 4958
03cfe0d5
LP
4959 r = sd_event_loop(event);
4960 if (r < 0) {
4961 log_error_errno(r, "Failed to run event loop: %m");
4962 goto finish;
4963 }
6d0b55c2 4964
03cfe0d5 4965 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 4966
03cfe0d5 4967 forward = pty_forward_free(forward);
6d0b55c2 4968
03cfe0d5
LP
4969 if (!arg_quiet && last_char != '\n')
4970 putc('\n', stdout);
04d39279 4971
03cfe0d5
LP
4972 /* Kill if it is not dead yet anyway */
4973 terminate_machine(pid);
1f0cd86b 4974
840295fc 4975 /* Normally redundant, but better safe than sorry */
04d39279 4976 kill(pid, SIGKILL);
a258bf26 4977
113cea80 4978 r = wait_for_container(pid, &container_status);
04d39279
LP
4979 pid = 0;
4980
ec16945e 4981 if (r < 0)
ce9f1527
LP
4982 /* We failed to wait for the container, or the
4983 * container exited abnormally */
ec16945e
LP
4984 goto finish;
4985 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4986 /* The container exited with a non-zero
4987 * status, or with zero status and no reboot
4988 * was requested. */
ec16945e 4989 ret = r;
d87be9b0 4990 break;
ec16945e 4991 }
88213476 4992
113cea80 4993 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4994
4995 if (arg_keep_unit) {
4996 /* Special handling if we are running as a
4997 * service: instead of simply restarting the
4998 * machine we want to restart the entire
4999 * service, so let's inform systemd about this
5000 * with the special exit code 133. The service
5001 * file uses RestartForceExitStatus=133 so
5002 * that this results in a full nspawn
5003 * restart. This is necessary since we might
5004 * have cgroup parameters set we want to have
5005 * flushed out. */
ec16945e
LP
5006 ret = 133;
5007 r = 0;
ce38dbc8
LP
5008 break;
5009 }
6d0b55c2
LP
5010
5011 flush_ports(&exposed);
d87be9b0 5012 }
88213476
LP
5013
5014finish:
af4ec430
LP
5015 sd_notify(false,
5016 "STOPPING=1\n"
5017 "STATUS=Terminating...");
5018
9444b1f2
LP
5019 if (pid > 0)
5020 kill(pid, SIGKILL);
88213476 5021
503546da
LP
5022 /* Try to flush whatever is still queued in the pty */
5023 if (master >= 0)
5024 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5025
03cfe0d5
LP
5026 loop_remove(loop_nr, &image_fd);
5027
ec16945e
LP
5028 if (remove_subvol && arg_directory) {
5029 int k;
5030
d9e2daaf 5031 k = btrfs_subvol_remove(arg_directory, true);
ec16945e
LP
5032 if (k < 0)
5033 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5034 }
5035
785890ac
LP
5036 if (arg_machine) {
5037 const char *p;
5038
63c372cb 5039 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5040 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5041 }
5042
04d391da 5043 free(arg_directory);
ec16945e
LP
5044 free(arg_template);
5045 free(arg_image);
7027ff61 5046 free(arg_machine);
c74e630d
LP
5047 free(arg_user);
5048 strv_free(arg_setenv);
5049 strv_free(arg_network_interfaces);
5050 strv_free(arg_network_macvlan);
4bbfe7ad 5051 strv_free(arg_network_ipvlan);
5a8af538 5052 custom_mount_free_all();
88213476 5053
6d0b55c2
LP
5054 flush_ports(&exposed);
5055
5056 while (arg_expose_ports) {
5057 ExposePort *p = arg_expose_ports;
5058 LIST_REMOVE(ports, arg_expose_ports, p);
5059 free(p);
5060 }
5061
ec16945e 5062 return r < 0 ? EXIT_FAILURE : ret;
88213476 5063}