]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
hostname-util: get rid of unused parameter of hostname_cleanup()
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
1f0cd86b
LP
55#include "sd-daemon.h"
56#include "sd-bus.h"
57#include "sd-id128.h"
1c4baffc 58#include "sd-netlink.h"
958b66ea 59#include "random-util.h"
88213476
LP
60#include "log.h"
61#include "util.h"
49e942b2 62#include "mkdir.h"
c6878637 63#include "rm-rf.h"
6b2d0e85 64#include "macro.h"
94d82985 65#include "missing.h"
04d391da 66#include "cgroup-util.h"
a258bf26 67#include "strv.h"
9eb977db 68#include "path-util.h"
a41fe3a2 69#include "loopback-setup.h"
4fc9982c 70#include "dev-setup.h"
842f3b0f 71#include "fdset.h"
acbeb427 72#include "build.h"
a5c32cff 73#include "fileio.h"
40ca29a1 74#include "bus-util.h"
1f0cd86b 75#include "bus-error.h"
4ba93280 76#include "ptyfwd.h"
f4889f65 77#include "env-util.h"
1c4baffc 78#include "netlink-util.h"
7e227024 79#include "udev-util.h"
1b9e5b12
LP
80#include "blkid-util.h"
81#include "gpt.h"
01dde061 82#include "siphash24.h"
849958d1 83#include "copy.h"
3577de7a 84#include "base-filesystem.h"
a2da110b 85#include "barrier.h"
023fb90b 86#include "event-util.h"
f01ae826 87#include "capability.h"
2822da4f 88#include "cap-list.h"
ec16945e 89#include "btrfs-util.h"
1b9cebf6 90#include "machine-image.h"
6d0b55c2
LP
91#include "list.h"
92#include "in-addr-util.h"
12c2884c 93#include "firewall-util.h"
6d0b55c2 94#include "local-addresses.h"
6482f626 95#include "formats-util.h"
0b452006 96#include "process-util.h"
288a74cc 97#include "terminal-util.h"
958b66ea 98#include "hostname-util.h"
24882e06 99#include "signal-util.h"
f2d88580 100
e9642be2
LP
101#ifdef HAVE_SECCOMP
102#include "seccomp-util.h"
103#endif
104
6d0b55c2
LP
105typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110} ExposePort;
111
113cea80
DH
/* Final state of a container run.
 * NOTE(review): the consumers of this enum are outside the visible part of the
 * file; presumably it distinguishes a plain exit from a reboot request -- confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
116
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). The "try-"
 * variants of the option map to LINK_GUEST/LINK_HOST plus a separate
 * arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO,    /* --link-journal=no */
        LINK_AUTO,  /* --link-journal=auto; also the built-in default */
        LINK_HOST,  /* --link-journal=host / try-host */
        LINK_GUEST  /* --link-journal=guest / try-guest / -j */
} LinkJournal;
88213476 123
4d9f07b4
LP
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,    /* default, or --volatile=<false boolean> */
        VOLATILE_YES,   /* --volatile without argument, or with a true boolean */
        VOLATILE_STATE, /* --volatile=state */
} Volatile;
129
5a8af538
LP
/* Kind of user-requested mount. The numeric order matters: it is used as the
 * secondary sort key when two mounts share the same destination. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND,    /* --bind= / --bind-ro= */
        CUSTOM_MOUNT_TMPFS,   /* --tmpfs= */
        CUSTOM_MOUNT_OVERLAY, /* --overlay= / --overlay-ro= */
} CustomMountType;
135
136typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144} CustomMount;
145
88213476 146static char *arg_directory = NULL;
ec16945e 147static char *arg_template = NULL;
687d0825 148static char *arg_user = NULL;
9444b1f2 149static sd_id128_t arg_uuid = {};
7027ff61 150static char *arg_machine = NULL;
c74e630d
LP
151static const char *arg_selinux_context = NULL;
152static const char *arg_selinux_apifs_context = NULL;
9444b1f2 153static const char *arg_slice = NULL;
ff01d048 154static bool arg_private_network = false;
bc2f673e 155static bool arg_read_only = false;
0f0dbc46 156static bool arg_boot = false;
ec16945e 157static bool arg_ephemeral = false;
57fb9fb5 158static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 159static bool arg_link_journal_try = false;
5076f0cc
LP
160static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 182 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
5a8af538
LP
187static CustomMount *arg_custom_mounts = NULL;
188static unsigned arg_n_custom_mounts = 0;
f4889f65 189static char **arg_setenv = NULL;
284c0b91 190static bool arg_quiet = false;
8a96d94e 191static bool arg_share_system = false;
eb91eb18 192static bool arg_register = true;
89f7c846 193static bool arg_keep_unit = false;
aa28aefe 194static char **arg_network_interfaces = NULL;
c74e630d 195static char **arg_network_macvlan = NULL;
4bbfe7ad 196static char **arg_network_ipvlan = NULL;
69c79d3c 197static bool arg_network_veth = false;
c74e630d 198static const char *arg_network_bridge = NULL;
050f7277 199static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 200static char *arg_image = NULL;
4d9f07b4 201static Volatile arg_volatile = VOLATILE_NO;
6d0b55c2 202static ExposePort *arg_expose_ports = NULL;
f36933fe 203static char **arg_property = NULL;
6dac160c
LP
204static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205static bool arg_userns = false;
c6c8f6e2 206static int arg_kill_signal = 0;
88213476 207
601185b4 208static void help(void) {
88213476
LP
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
211 " -h --help Show this help\n"
212 " --version Print version string\n"
69c79d3c 213 " -q --quiet Do not show status information\n"
1b9e5b12 214 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 222 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 223 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 224 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 225 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
69c79d3c
LP
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
c74e630d
LP
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
4bbfe7ad
TG
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
0dfaa006 238 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 239 " and container\n"
ab046dde 240 " --network-bridge=INTERFACE\n"
32457153 241 " Add a virtual ethernet connection between host\n"
ab046dde
TG
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
6d0b55c2 244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 245 " Expose a container IP port on the host\n"
82adf6af
LP
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
a8828ed9
DW
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 259 " --read-only Mount the root directory read-only\n"
a8828ed9
DW
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
261 " the container\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
06c17c39 263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
266 " the container\n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
284c0b91 269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 270 " --share-system Share system namespaces with host\n"
eb91eb18 271 " --register=BOOLEAN Register container as machine\n"
89f7c846 272 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 273 " the service unit nspawn is running in\n"
6d0b55c2
LP
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name);
88213476
LP
276}
277
5a8af538
LP
278static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292}
293
294static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 free(arg_custom_mounts);
313 arg_custom_mounts = NULL;
314 arg_n_custom_mounts = 0;
315}
316
317static int custom_mount_compare(const void *a, const void *b) {
318 const CustomMount *x = a, *y = b;
319 int r;
320
321 r = path_compare(x->destination, y->destination);
322 if (r != 0)
323 return r;
324
325 if (x->type < y->type)
326 return -1;
327 if (x->type > y->type)
328 return 1;
329
330 return 0;
331}
332
333static int custom_mounts_prepare(void) {
334 unsigned i;
335 int r;
336
337 /* Ensure the mounts are applied prefix first. */
338 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
339
340 /* Allocate working directories for the overlay file systems that need it */
341 for (i = 0; i < arg_n_custom_mounts; i++) {
342 CustomMount *m = &arg_custom_mounts[i];
343
825d5287
RM
344 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
345 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
346 return -EINVAL;
347 }
348
5a8af538
LP
349 if (m->type != CUSTOM_MOUNT_OVERLAY)
350 continue;
351
352 if (m->work_dir)
353 continue;
354
355 if (m->read_only)
356 continue;
357
14bcf25c 358 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
359 if (r < 0)
360 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
361 }
362
363 return 0;
364}
365
ec16945e
LP
/* Replace *b with a cleaned-up, absolute version of path.
 *
 * The path is canonicalized if it exists. If it does not exist (ENOENT) it
 * is merely made absolute relative to the current working directory. In
 * both cases the result is passed through path_kill_slashes(). The previous
 * value of *b is freed; *b takes ownership of the new string.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the negative errno
 * reported by canonicalize_file_name() for any failure other than ENOENT.
 * On failure *b is left unchanged. */
static int set_sanitized_path(char **b, const char *path) {
        char *p;

        assert(b);
        assert(path);

        p = canonicalize_file_name(path);
        if (!p) {
                if (errno != ENOENT)
                        return -errno;

                p = path_make_absolute_cwd(path);
                if (!p)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(p);
        return 0;
}
386
88213476
LP
387static int parse_argv(int argc, char *argv[]) {
388
a41fe3a2 389 enum {
acbeb427
ZJS
390 ARG_VERSION = 0x100,
391 ARG_PRIVATE_NETWORK,
bc2f673e 392 ARG_UUID,
5076f0cc 393 ARG_READ_ONLY,
57fb9fb5 394 ARG_CAPABILITY,
420c7379 395 ARG_DROP_CAPABILITY,
17fe0523
LP
396 ARG_LINK_JOURNAL,
397 ARG_BIND,
f4889f65 398 ARG_BIND_RO,
06c17c39 399 ARG_TMPFS,
5a8af538
LP
400 ARG_OVERLAY,
401 ARG_OVERLAY_RO,
f4889f65 402 ARG_SETENV,
eb91eb18 403 ARG_SHARE_SYSTEM,
89f7c846 404 ARG_REGISTER,
aa28aefe 405 ARG_KEEP_UNIT,
69c79d3c 406 ARG_NETWORK_INTERFACE,
c74e630d 407 ARG_NETWORK_MACVLAN,
4bbfe7ad 408 ARG_NETWORK_IPVLAN,
ab046dde 409 ARG_NETWORK_BRIDGE,
6afc95b7 410 ARG_PERSONALITY,
4d9f07b4 411 ARG_VOLATILE,
ec16945e 412 ARG_TEMPLATE,
f36933fe 413 ARG_PROPERTY,
6dac160c 414 ARG_PRIVATE_USERS,
c6c8f6e2 415 ARG_KILL_SIGNAL,
a41fe3a2
LP
416 };
417
88213476 418 static const struct option options[] = {
aa28aefe
LP
419 { "help", no_argument, NULL, 'h' },
420 { "version", no_argument, NULL, ARG_VERSION },
421 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
422 { "template", required_argument, NULL, ARG_TEMPLATE },
423 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
424 { "user", required_argument, NULL, 'u' },
425 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
426 { "boot", no_argument, NULL, 'b' },
427 { "uuid", required_argument, NULL, ARG_UUID },
428 { "read-only", no_argument, NULL, ARG_READ_ONLY },
429 { "capability", required_argument, NULL, ARG_CAPABILITY },
430 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
431 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
432 { "bind", required_argument, NULL, ARG_BIND },
433 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 434 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
435 { "overlay", required_argument, NULL, ARG_OVERLAY },
436 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
437 { "machine", required_argument, NULL, 'M' },
438 { "slice", required_argument, NULL, 'S' },
439 { "setenv", required_argument, NULL, ARG_SETENV },
440 { "selinux-context", required_argument, NULL, 'Z' },
441 { "selinux-apifs-context", required_argument, NULL, 'L' },
442 { "quiet", no_argument, NULL, 'q' },
443 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
444 { "register", required_argument, NULL, ARG_REGISTER },
445 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
446 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 447 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 448 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 449 { "network-veth", no_argument, NULL, 'n' },
ab046dde 450 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 451 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 452 { "image", required_argument, NULL, 'i' },
4d9f07b4 453 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 454 { "port", required_argument, NULL, 'p' },
f36933fe 455 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 456 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 457 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
eb9da376 458 {}
88213476
LP
459 };
460
9444b1f2 461 int c, r;
a42c8b54 462 uint64_t plus = 0, minus = 0;
88213476
LP
463
464 assert(argc >= 0);
465 assert(argv);
466
0dfaa006 467 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
468
469 switch (c) {
470
471 case 'h':
601185b4
ZJS
472 help();
473 return 0;
88213476 474
acbeb427
ZJS
475 case ARG_VERSION:
476 puts(PACKAGE_STRING);
477 puts(SYSTEMD_FEATURES);
478 return 0;
479
88213476 480 case 'D':
ec16945e
LP
481 r = set_sanitized_path(&arg_directory, optarg);
482 if (r < 0)
483 return log_error_errno(r, "Invalid root directory: %m");
484
485 break;
486
487 case ARG_TEMPLATE:
488 r = set_sanitized_path(&arg_template, optarg);
489 if (r < 0)
490 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
491
492 break;
493
1b9e5b12 494 case 'i':
ec16945e
LP
495 r = set_sanitized_path(&arg_image, optarg);
496 if (r < 0)
497 return log_error_errno(r, "Invalid image path: %m");
498
499 break;
500
501 case 'x':
502 arg_ephemeral = true;
1b9e5b12
LP
503 break;
504
687d0825
MV
505 case 'u':
506 free(arg_user);
7027ff61
LP
507 arg_user = strdup(optarg);
508 if (!arg_user)
509 return log_oom();
687d0825
MV
510
511 break;
512
ab046dde 513 case ARG_NETWORK_BRIDGE:
c74e630d 514 arg_network_bridge = optarg;
ab046dde
TG
515
516 /* fall through */
517
0dfaa006 518 case 'n':
69c79d3c
LP
519 arg_network_veth = true;
520 arg_private_network = true;
521 break;
522
aa28aefe 523 case ARG_NETWORK_INTERFACE:
c74e630d
LP
524 if (strv_extend(&arg_network_interfaces, optarg) < 0)
525 return log_oom();
526
527 arg_private_network = true;
528 break;
529
530 case ARG_NETWORK_MACVLAN:
531 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
532 return log_oom();
533
4bbfe7ad
TG
534 arg_private_network = true;
535 break;
536
537 case ARG_NETWORK_IPVLAN:
538 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
539 return log_oom();
540
aa28aefe
LP
541 /* fall through */
542
ff01d048
LP
543 case ARG_PRIVATE_NETWORK:
544 arg_private_network = true;
a41fe3a2
LP
545 break;
546
0f0dbc46
LP
547 case 'b':
548 arg_boot = true;
549 break;
550
144f0fc0 551 case ARG_UUID:
9444b1f2
LP
552 r = sd_id128_from_string(optarg, &arg_uuid);
553 if (r < 0) {
aa96c6cb 554 log_error("Invalid UUID: %s", optarg);
9444b1f2 555 return r;
aa96c6cb 556 }
9444b1f2 557 break;
aa96c6cb 558
9444b1f2 559 case 'S':
c74e630d 560 arg_slice = optarg;
144f0fc0
LP
561 break;
562
7027ff61 563 case 'M':
eb91eb18
LP
564 if (isempty(optarg)) {
565 free(arg_machine);
566 arg_machine = NULL;
567 } else {
0c3c4284 568 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
569 log_error("Invalid machine name: %s", optarg);
570 return -EINVAL;
571 }
7027ff61 572
0c3c4284
LP
573 r = free_and_strdup(&arg_machine, optarg);
574 if (r < 0)
eb91eb18
LP
575 return log_oom();
576
577 break;
578 }
7027ff61 579
82adf6af
LP
580 case 'Z':
581 arg_selinux_context = optarg;
a8828ed9
DW
582 break;
583
82adf6af
LP
584 case 'L':
585 arg_selinux_apifs_context = optarg;
a8828ed9
DW
586 break;
587
bc2f673e
LP
588 case ARG_READ_ONLY:
589 arg_read_only = true;
590 break;
591
420c7379
LP
592 case ARG_CAPABILITY:
593 case ARG_DROP_CAPABILITY: {
a2a5291b 594 const char *state, *word;
5076f0cc
LP
595 size_t length;
596
597 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 598 _cleanup_free_ char *t;
5076f0cc
LP
599
600 t = strndup(word, length);
0d0f0c50
SL
601 if (!t)
602 return log_oom();
5076f0cc 603
39ed67d1
LP
604 if (streq(t, "all")) {
605 if (c == ARG_CAPABILITY)
a42c8b54 606 plus = (uint64_t) -1;
39ed67d1 607 else
a42c8b54 608 minus = (uint64_t) -1;
39ed67d1 609 } else {
2822da4f
LP
610 int cap;
611
612 cap = capability_from_name(t);
613 if (cap < 0) {
39ed67d1
LP
614 log_error("Failed to parse capability %s.", t);
615 return -EINVAL;
616 }
617
618 if (c == ARG_CAPABILITY)
a42c8b54 619 plus |= 1ULL << (uint64_t) cap;
39ed67d1 620 else
a42c8b54 621 minus |= 1ULL << (uint64_t) cap;
5076f0cc 622 }
5076f0cc
LP
623 }
624
625 break;
626 }
627
57fb9fb5
LP
628 case 'j':
629 arg_link_journal = LINK_GUEST;
574edc90 630 arg_link_journal_try = true;
57fb9fb5
LP
631 break;
632
633 case ARG_LINK_JOURNAL:
53e438e3 634 if (streq(optarg, "auto")) {
57fb9fb5 635 arg_link_journal = LINK_AUTO;
53e438e3
LP
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "no")) {
57fb9fb5 638 arg_link_journal = LINK_NO;
53e438e3
LP
639 arg_link_journal_try = false;
640 } else if (streq(optarg, "guest")) {
57fb9fb5 641 arg_link_journal = LINK_GUEST;
53e438e3
LP
642 arg_link_journal_try = false;
643 } else if (streq(optarg, "host")) {
57fb9fb5 644 arg_link_journal = LINK_HOST;
53e438e3
LP
645 arg_link_journal_try = false;
646 } else if (streq(optarg, "try-guest")) {
574edc90
MP
647 arg_link_journal = LINK_GUEST;
648 arg_link_journal_try = true;
649 } else if (streq(optarg, "try-host")) {
650 arg_link_journal = LINK_HOST;
651 arg_link_journal_try = true;
652 } else {
57fb9fb5
LP
653 log_error("Failed to parse link journal mode %s", optarg);
654 return -EINVAL;
655 }
656
657 break;
658
17fe0523
LP
659 case ARG_BIND:
660 case ARG_BIND_RO: {
5a8af538
LP
661 _cleanup_free_ char *source = NULL, *destination = NULL;
662 CustomMount *m;
17fe0523 663 char *e;
17fe0523
LP
664
665 e = strchr(optarg, ':');
666 if (e) {
5a8af538
LP
667 source = strndup(optarg, e - optarg);
668 destination = strdup(e + 1);
17fe0523 669 } else {
5a8af538
LP
670 source = strdup(optarg);
671 destination = strdup(optarg);
17fe0523
LP
672 }
673
5a8af538 674 if (!source || !destination)
17fe0523
LP
675 return log_oom();
676
5a8af538 677 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
17fe0523
LP
678 log_error("Invalid bind mount specification: %s", optarg);
679 return -EINVAL;
680 }
681
5a8af538
LP
682 m = custom_mount_add(CUSTOM_MOUNT_BIND);
683 if (!m)
b3451bed 684 return log_oom();
17fe0523 685
5a8af538
LP
686 m->source = source;
687 m->destination = destination;
688 m->read_only = c == ARG_BIND_RO;
689
690 source = destination = NULL;
17fe0523
LP
691
692 break;
693 }
694
06c17c39 695 case ARG_TMPFS: {
5a8af538
LP
696 _cleanup_free_ char *path = NULL, *opts = NULL;
697 CustomMount *m;
06c17c39
LP
698 char *e;
699
700 e = strchr(optarg, ':');
701 if (e) {
5a8af538
LP
702 path = strndup(optarg, e - optarg);
703 opts = strdup(e + 1);
06c17c39 704 } else {
5a8af538
LP
705 path = strdup(optarg);
706 opts = strdup("mode=0755");
06c17c39
LP
707 }
708
5a8af538 709 if (!path || !opts)
06c17c39
LP
710 return log_oom();
711
5a8af538 712 if (!path_is_absolute(path)) {
06c17c39
LP
713 log_error("Invalid tmpfs specification: %s", optarg);
714 return -EINVAL;
715 }
716
5a8af538
LP
717 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
718 if (!m)
06c17c39
LP
719 return log_oom();
720
5a8af538
LP
721 m->destination = path;
722 m->options = opts;
06c17c39 723
5a8af538
LP
724 path = opts = NULL;
725
726 break;
727 }
728
729 case ARG_OVERLAY:
730 case ARG_OVERLAY_RO: {
731 _cleanup_free_ char *upper = NULL, *destination = NULL;
732 _cleanup_strv_free_ char **lower = NULL;
733 CustomMount *m;
734 unsigned n = 0;
735 char **i;
736
737 lower = strv_split(optarg, ":");
738 if (!lower)
06c17c39
LP
739 return log_oom();
740
5a8af538
LP
741 STRV_FOREACH(i, lower) {
742 if (!path_is_absolute(*i)) {
743 log_error("Overlay path %s is not absolute.", *i);
744 return -EINVAL;
745 }
746
747 n++;
748 }
749
750 if (n < 2) {
751 log_error("--overlay= needs at least two colon-separated directories specified.");
752 return -EINVAL;
753 }
754
755 if (n == 2) {
756 /* If two parameters are specified,
757 * the first one is the lower, the
758 * second one the upper directory. And
af86c440
ZJS
759 * we'll also define the destination
760 * mount point the same as the upper. */
5a8af538
LP
761 upper = lower[1];
762 lower[1] = NULL;
763
764 destination = strdup(upper);
765 if (!destination)
766 return log_oom();
767
768 } else {
769 upper = lower[n - 2];
770 destination = lower[n - 1];
771 lower[n - 2] = NULL;
772 }
773
774 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
775 if (!m)
776 return log_oom();
777
778 m->destination = destination;
779 m->source = upper;
780 m->lower = lower;
781 m->read_only = c == ARG_OVERLAY_RO;
782
783 upper = destination = NULL;
784 lower = NULL;
06c17c39
LP
785
786 break;
787 }
788
f4889f65
LP
789 case ARG_SETENV: {
790 char **n;
791
792 if (!env_assignment_is_valid(optarg)) {
793 log_error("Environment variable assignment '%s' is not valid.", optarg);
794 return -EINVAL;
795 }
796
797 n = strv_env_set(arg_setenv, optarg);
798 if (!n)
799 return log_oom();
800
801 strv_free(arg_setenv);
802 arg_setenv = n;
803 break;
804 }
805
284c0b91
LP
806 case 'q':
807 arg_quiet = true;
808 break;
809
8a96d94e
LP
810 case ARG_SHARE_SYSTEM:
811 arg_share_system = true;
812 break;
813
eb91eb18
LP
814 case ARG_REGISTER:
815 r = parse_boolean(optarg);
816 if (r < 0) {
817 log_error("Failed to parse --register= argument: %s", optarg);
818 return r;
819 }
820
821 arg_register = r;
822 break;
823
89f7c846
LP
824 case ARG_KEEP_UNIT:
825 arg_keep_unit = true;
826 break;
827
6afc95b7
LP
828 case ARG_PERSONALITY:
829
ac45f971 830 arg_personality = personality_from_string(optarg);
050f7277 831 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
832 log_error("Unknown or unsupported personality '%s'.", optarg);
833 return -EINVAL;
834 }
835
836 break;
837
4d9f07b4
LP
838 case ARG_VOLATILE:
839
840 if (!optarg)
841 arg_volatile = VOLATILE_YES;
842 else {
843 r = parse_boolean(optarg);
844 if (r < 0) {
845 if (streq(optarg, "state"))
846 arg_volatile = VOLATILE_STATE;
847 else {
848 log_error("Failed to parse --volatile= argument: %s", optarg);
849 return r;
850 }
851 } else
852 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
853 }
854
855 break;
856
6d0b55c2
LP
857 case 'p': {
858 const char *split, *e;
859 uint16_t container_port, host_port;
860 int protocol;
861 ExposePort *p;
862
863 if ((e = startswith(optarg, "tcp:")))
864 protocol = IPPROTO_TCP;
865 else if ((e = startswith(optarg, "udp:")))
866 protocol = IPPROTO_UDP;
867 else {
868 e = optarg;
869 protocol = IPPROTO_TCP;
870 }
871
872 split = strchr(e, ':');
873 if (split) {
874 char v[split - e + 1];
875
876 memcpy(v, e, split - e);
877 v[split - e] = 0;
878
879 r = safe_atou16(v, &host_port);
880 if (r < 0 || host_port <= 0) {
881 log_error("Failed to parse host port: %s", optarg);
882 return -EINVAL;
883 }
884
885 r = safe_atou16(split + 1, &container_port);
886 } else {
887 r = safe_atou16(e, &container_port);
888 host_port = container_port;
889 }
890
891 if (r < 0 || container_port <= 0) {
892 log_error("Failed to parse host port: %s", optarg);
893 return -EINVAL;
894 }
895
896 LIST_FOREACH(ports, p, arg_expose_ports) {
897 if (p->protocol == protocol && p->host_port == host_port) {
898 log_error("Duplicate port specification: %s", optarg);
899 return -EINVAL;
900 }
901 }
902
903 p = new(ExposePort, 1);
904 if (!p)
905 return log_oom();
906
907 p->protocol = protocol;
908 p->host_port = host_port;
909 p->container_port = container_port;
910
911 LIST_PREPEND(ports, arg_expose_ports, p);
912
913 break;
914 }
915
f36933fe
LP
916 case ARG_PROPERTY:
917 if (strv_extend(&arg_property, optarg) < 0)
918 return log_oom();
919
920 break;
921
6dac160c
LP
922 case ARG_PRIVATE_USERS:
923 if (optarg) {
924 _cleanup_free_ char *buffer = NULL;
925 const char *range, *shift;
926
927 range = strchr(optarg, ':');
928 if (range) {
929 buffer = strndup(optarg, range - optarg);
930 if (!buffer)
931 return log_oom();
932 shift = buffer;
933
934 range++;
935 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
936 log_error("Failed to parse UID range: %s", range);
937 return -EINVAL;
938 }
939 } else
940 shift = optarg;
941
942 if (parse_uid(shift, &arg_uid_shift) < 0) {
943 log_error("Failed to parse UID: %s", optarg);
944 return -EINVAL;
945 }
946 }
947
948 arg_userns = true;
949 break;
950
c6c8f6e2
LP
951 case ARG_KILL_SIGNAL:
952 arg_kill_signal = signal_from_string_try_harder(optarg);
953 if (arg_kill_signal < 0) {
954 log_error("Cannot parse signal: %s", optarg);
955 return -EINVAL;
956 }
957
958 break;
959
88213476
LP
960 case '?':
961 return -EINVAL;
962
963 default:
eb9da376 964 assert_not_reached("Unhandled option");
88213476 965 }
88213476 966
eb91eb18
LP
967 if (arg_share_system)
968 arg_register = false;
969
970 if (arg_boot && arg_share_system) {
971 log_error("--boot and --share-system may not be combined.");
972 return -EINVAL;
973 }
974
89f7c846
LP
975 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
976 log_error("--keep-unit may not be used when invoked from a user session.");
977 return -EINVAL;
978 }
979
1b9e5b12
LP
980 if (arg_directory && arg_image) {
981 log_error("--directory= and --image= may not be combined.");
982 return -EINVAL;
983 }
984
ec16945e
LP
985 if (arg_template && arg_image) {
986 log_error("--template= and --image= may not be combined.");
987 return -EINVAL;
988 }
989
990 if (arg_template && !(arg_directory || arg_machine)) {
991 log_error("--template= needs --directory= or --machine=.");
992 return -EINVAL;
993 }
994
995 if (arg_ephemeral && arg_template) {
996 log_error("--ephemeral and --template= may not be combined.");
997 return -EINVAL;
998 }
999
1000 if (arg_ephemeral && arg_image) {
1001 log_error("--ephemeral and --image= may not be combined.");
1002 return -EINVAL;
1003 }
1004
df9a75e4
LP
1005 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1006 log_error("--ephemeral and --link-journal= may not be combined.");
1007 return -EINVAL;
1008 }
1009
4d9f07b4
LP
1010 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1011 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1012 return -EINVAL;
1013 }
1014
6d0b55c2
LP
1015 if (arg_expose_ports && !arg_private_network) {
1016 log_error("Cannot use --port= without private networking.");
1017 return -EINVAL;
1018 }
1019
b774fb7f
DH
1020 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1021 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1022
a42c8b54
LP
1023 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1024
c6c8f6e2
LP
1025 if (arg_boot && arg_kill_signal <= 0)
1026 arg_kill_signal = SIGRTMIN+3;
1027
88213476
LP
1028 return 1;
1029}
1030
03cfe0d5
LP
1031static int tmpfs_patch_options(const char *options, char **ret) {
1032 char *buf = NULL;
1033
1034 if (arg_userns && arg_uid_shift != 0) {
825d5287 1035 assert(arg_uid_shift != UID_INVALID);
03cfe0d5
LP
1036
1037 if (options)
f001a835 1038 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
03cfe0d5 1039 else
f001a835 1040 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
03cfe0d5
LP
1041 if (!buf)
1042 return -ENOMEM;
1043
1044 options = buf;
1045 }
1046
1047#ifdef HAVE_SELINUX
1048 if (arg_selinux_apifs_context) {
1049 char *t;
1050
1051 if (options)
1052 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1053 else
1054 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1055 if (!t) {
1056 free(buf);
1057 return -ENOMEM;
1058 }
1059
1060 free(buf);
1061 buf = t;
1062 }
1063#endif
1064
1065 *ret = buf;
1066 return !!buf;
1067}
1068
1069static int mount_all(const char *dest, bool userns) {
88213476
LP
1070
1071 typedef struct MountPoint {
1072 const char *what;
1073 const char *where;
1074 const char *type;
1075 const char *options;
1076 unsigned long flags;
3bd66c05 1077 bool fatal;
03cfe0d5 1078 bool userns;
88213476
LP
1079 } MountPoint;
1080
1081 static const MountPoint mount_table[] = {
3c59d4f2
RM
1082 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1083 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1084 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1085 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1086 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1087 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1088 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1089 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1090 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
9b634ea5 1091#ifdef HAVE_SELINUX
3c59d4f2
RM
1092 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1093 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
9b634ea5 1094#endif
88213476
LP
1095 };
1096
1097 unsigned k;
03cfe0d5 1098 int r;
88213476
LP
1099
1100 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
d15d65a0 1101 _cleanup_free_ char *where = NULL, *options = NULL;
d002827b 1102 const char *o;
88213476 1103
03cfe0d5
LP
1104 if (userns != mount_table[k].userns)
1105 continue;
1106
1107 where = prefix_root(dest, mount_table[k].where);
17fe0523
LP
1108 if (!where)
1109 return log_oom();
88213476 1110
e26d6ce5 1111 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
03cfe0d5
LP
1112 if (r < 0 && r != -ENOENT)
1113 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
88213476 1114
9c1c7f71 1115 /* Skip this entry if it is not a remount. */
03cfe0d5 1116 if (mount_table[k].what && r > 0)
014a9c77
LP
1117 continue;
1118
03cfe0d5
LP
1119 r = mkdir_p(where, 0755);
1120 if (r < 0) {
1121 if (mount_table[k].fatal)
1122 return log_error_errno(r, "Failed to create directory %s: %m", where);
79d80fc1 1123
03cfe0d5 1124 log_warning_errno(r, "Failed to create directory %s: %m", where);
79d80fc1
TG
1125 continue;
1126 }
88213476 1127
03cfe0d5
LP
1128 o = mount_table[k].options;
1129 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1130 r = tmpfs_patch_options(o, &options);
1131 if (r < 0)
6dac160c 1132 return log_oom();
03cfe0d5
LP
1133 if (r > 0)
1134 o = options;
6dac160c 1135 }
a8828ed9 1136
88213476
LP
1137 if (mount(mount_table[k].what,
1138 where,
1139 mount_table[k].type,
1140 mount_table[k].flags,
79d80fc1 1141 o) < 0) {
88213476 1142
03cfe0d5
LP
1143 if (mount_table[k].fatal)
1144 return log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 1145
03cfe0d5 1146 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
88213476 1147 }
88213476
LP
1148 }
1149
03cfe0d5 1150 return 0;
e58a1277 1151}
f8440af5 1152
5a8af538
LP
1153static int mount_bind(const char *dest, CustomMount *m) {
1154 struct stat source_st, dest_st;
03cfe0d5 1155 const char *where;
5a8af538 1156 int r;
17fe0523 1157
5a8af538 1158 assert(m);
d2421337 1159
5a8af538
LP
1160 if (stat(m->source, &source_st) < 0)
1161 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
17fe0523 1162
03cfe0d5 1163 where = prefix_roota(dest, m->destination);
06c17c39 1164
03cfe0d5 1165 if (stat(where, &dest_st) >= 0) {
5a8af538
LP
1166 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1167 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1168 return -EINVAL;
2ed4e5e0 1169 }
06c17c39 1170
5a8af538
LP
1171 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1172 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1173 return -EINVAL;
d2421337 1174 }
17fe0523 1175
5a8af538
LP
1176 } else if (errno == ENOENT) {
1177 r = mkdir_parents_label(where, 0755);
1178 if (r < 0)
1179 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1180 } else {
1181 log_error_errno(errno, "Failed to stat %s: %m", where);
1182 return -errno;
1183 }
17fe0523 1184
5a8af538
LP
1185 /* Create the mount point. Any non-directory file can be
1186 * mounted on any non-directory file (regular, fifo, socket,
1187 * char, block).
1188 */
1189 if (S_ISDIR(source_st.st_mode))
1190 r = mkdir_label(where, 0755);
1191 else
1192 r = touch(where);
1193 if (r < 0 && r != -EEXIST)
1194 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1195
1196 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1197 return log_error_errno(errno, "mount(%s) failed: %m", where);
1198
1199 if (m->read_only) {
1200 r = bind_remount_recursive(where, true);
1201 if (r < 0)
1202 return log_error_errno(r, "Read-only bind mount failed: %m");
1203 }
1204
1205 return 0;
1206}
1207
1208static int mount_tmpfs(const char *dest, CustomMount *m) {
03cfe0d5
LP
1209 const char *where, *options;
1210 _cleanup_free_ char *buf = NULL;
5a8af538
LP
1211 int r;
1212
1213 assert(dest);
1214 assert(m);
1215
03cfe0d5 1216 where = prefix_roota(dest, m->destination);
5a8af538 1217
03cfe0d5 1218 r = mkdir_p_label(where, 0755);
5a8af538
LP
1219 if (r < 0 && r != -EEXIST)
1220 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1221
03cfe0d5
LP
1222 r = tmpfs_patch_options(m->options, &buf);
1223 if (r < 0)
1224 return log_oom();
1225 options = r > 0 ? buf : m->options;
1226
1227 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
5a8af538
LP
1228 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1229
1230 return 0;
1231}
1232
1233static int mount_overlay(const char *dest, CustomMount *m) {
1234 _cleanup_free_ char *lower = NULL;
03cfe0d5 1235 const char *where, *options;
5a8af538
LP
1236 int r;
1237
1238 assert(dest);
1239 assert(m);
1240
03cfe0d5 1241 where = prefix_roota(dest, m->destination);
5a8af538
LP
1242
1243 r = mkdir_label(where, 0755);
1244 if (r < 0 && r != -EEXIST)
1245 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1246
1247 (void) mkdir_p_label(m->source, 0755);
1248
1249 strv_reverse(m->lower);
1250 lower = strv_join(m->lower, ":");
1251 strv_reverse(m->lower);
5a8af538
LP
1252 if (!lower)
1253 return log_oom();
1254
1255 if (m->read_only)
1256 options = strjoina("lowerdir=", m->source, ":", lower);
1257 else {
1258 assert(m->work_dir);
1259 (void) mkdir_label(m->work_dir, 0700);
1260
1261 options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
1262 }
1263
1264 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1265 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1266
1267 return 0;
1268}
1269
1270static int mount_custom(const char *dest) {
1271 unsigned i;
1272 int r;
1273
1274 assert(dest);
1275
1276 for (i = 0; i < arg_n_custom_mounts; i++) {
1277 CustomMount *m = &arg_custom_mounts[i];
1278
1279 switch (m->type) {
1280
1281 case CUSTOM_MOUNT_BIND:
1282 r = mount_bind(dest, m);
1283 break;
1284
1285 case CUSTOM_MOUNT_TMPFS:
1286 r = mount_tmpfs(dest, m);
1287 break;
1288
1289 case CUSTOM_MOUNT_OVERLAY:
1290 r = mount_overlay(dest, m);
1291 break;
1292
1293 default:
1294 assert_not_reached("Unknown custom mount type");
17fe0523 1295 }
5a8af538
LP
1296
1297 if (r < 0)
1298 return r;
17fe0523
LP
1299 }
1300
1301 return 0;
1302}
1303
b12afc8c
LP
1304static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1305 char *to;
1306 int r;
1307
63c372cb 1308 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
b12afc8c 1309
e26d6ce5 1310 r = path_is_mount_point(to, 0);
da00518b 1311 if (r < 0 && r != -ENOENT)
b12afc8c
LP
1312 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1313 if (r > 0)
1314 return 0;
1315
1316 mkdir_p(to, 0755);
1317
c0534580
LP
1318 /* The superblock mount options of the mount point need to be
1319 * identical to the hosts', and hence writable... */
1320 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
b12afc8c
LP
1321 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1322
c0534580
LP
1323 /* ... hence let's only make the bind mount read-only, not the
1324 * superblock. */
1325 if (read_only) {
1326 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1327 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1328 }
b12afc8c
LP
1329 return 1;
1330}
1331
1332static int mount_cgroup(const char *dest) {
1333 _cleanup_set_free_free_ Set *controllers = NULL;
03cfe0d5 1334 const char *cgroup_root;
b12afc8c
LP
1335 int r;
1336
1337 controllers = set_new(&string_hash_ops);
1338 if (!controllers)
1339 return log_oom();
1340
1341 r = cg_kernel_controllers(controllers);
1342 if (r < 0)
1343 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1344
b12afc8c
LP
1345 for (;;) {
1346 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1347
1348 controller = set_steal_first(controllers);
1349 if (!controller)
1350 break;
1351
03cfe0d5 1352 origin = prefix_root("/sys/fs/cgroup/", controller);
b12afc8c
LP
1353 if (!origin)
1354 return log_oom();
1355
1356 r = readlink_malloc(origin, &combined);
1357 if (r == -EINVAL) {
1358 /* Not a symbolic link, but directly a single cgroup hierarchy */
1359
1360 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1361 if (r < 0)
1362 return r;
1363
1364 } else if (r < 0)
1365 return log_error_errno(r, "Failed to read link %s: %m", origin);
1366 else {
1367 _cleanup_free_ char *target = NULL;
1368
03cfe0d5 1369 target = prefix_root(dest, origin);
b12afc8c
LP
1370 if (!target)
1371 return log_oom();
1372
1373 /* A symbolic link, a combination of controllers in one hierarchy */
1374
1375 if (!filename_is_valid(combined)) {
1376 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1377 continue;
1378 }
1379
1380 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1381 if (r < 0)
1382 return r;
1383
875e1014
ILG
1384 r = symlink_idempotent(combined, target);
1385 if (r == -EINVAL) {
1386 log_error("Invalid existing symlink for combined hierarchy");
1387 return r;
1388 }
1389 if (r < 0)
1390 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1391 }
1392 }
1393
c0534580 1394 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
b12afc8c
LP
1395 if (r < 0)
1396 return r;
1397
03cfe0d5
LP
1398 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1399 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1400 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1401
1402 return 0;
1403}
1404
1405static int mount_systemd_cgroup_writable(const char *dest) {
1406 _cleanup_free_ char *own_cgroup_path = NULL;
1407 const char *systemd_root, *systemd_own;
1408 int r;
1409
1410 assert(dest);
1411
1412 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1413 if (r < 0)
1414 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1415
b12afc8c 1416 /* Make our own cgroup a (writable) bind mount */
63c372cb 1417 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
b12afc8c
LP
1418 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1419 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1420
1421 /* And then remount the systemd cgroup root read-only */
03cfe0d5 1422 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
b12afc8c
LP
1423 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1424 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1425
03cfe0d5
LP
1426 return 0;
1427}
1428
1429static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1430 assert(p);
1431
1432 if (!arg_userns)
1433 return 0;
1434
1435 if (uid == UID_INVALID && gid == GID_INVALID)
1436 return 0;
1437
1438 if (uid != UID_INVALID) {
1439 uid += arg_uid_shift;
1440
1441 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1442 return -EOVERFLOW;
1443 }
1444
1445 if (gid != GID_INVALID) {
1446 gid += (gid_t) arg_uid_shift;
1447
1448 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1449 return -EOVERFLOW;
1450 }
1451
1452 if (lchown(p, uid, gid) < 0)
1453 return -errno;
b12afc8c
LP
1454
1455 return 0;
1456}
1457
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and hands it
 * to userns_lchown() for ownership adjustment. A directory that exists
 * already is left alone (ownership included) and reported as success.
 * Returns 0 on success, a negative errno-style value on failure. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1470
e58a1277 1471static int setup_timezone(const char *dest) {
03cfe0d5
LP
1472 _cleanup_free_ char *p = NULL, *q = NULL;
1473 const char *where, *check, *what;
d4036145
LP
1474 char *z, *y;
1475 int r;
f8440af5 1476
e58a1277
LP
1477 assert(dest);
1478
1479 /* Fix the timezone, if possible */
d4036145
LP
1480 r = readlink_malloc("/etc/localtime", &p);
1481 if (r < 0) {
1482 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1483 return 0;
1484 }
1485
1486 z = path_startswith(p, "../usr/share/zoneinfo/");
1487 if (!z)
1488 z = path_startswith(p, "/usr/share/zoneinfo/");
1489 if (!z) {
1490 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1491 return 0;
1492 }
1493
03cfe0d5 1494 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1495 r = readlink_malloc(where, &q);
1496 if (r >= 0) {
1497 y = path_startswith(q, "../usr/share/zoneinfo/");
1498 if (!y)
1499 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1500
d4036145
LP
1501 /* Already pointing to the right place? Then do nothing .. */
1502 if (y && streq(y, z))
1503 return 0;
1504 }
1505
03cfe0d5
LP
1506 check = strjoina("/usr/share/zoneinfo/", z);
1507 check = prefix_root(dest, check);
1508 if (laccess(check, F_OK) < 0) {
d4036145
LP
1509 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1510 return 0;
1511 }
68fb0892 1512
79d80fc1
TG
1513 r = unlink(where);
1514 if (r < 0 && errno != ENOENT) {
56f64d95 1515 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1516 return 0;
1517 }
4d9f07b4 1518
03cfe0d5 1519 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1520 if (symlink(what, where) < 0) {
56f64d95 1521 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1522 return 0;
1523 }
e58a1277 1524
03cfe0d5
LP
1525 r = userns_lchown(where, 0, 0);
1526 if (r < 0)
1527 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1528
e58a1277 1529 return 0;
88213476
LP
1530}
1531
2547bb41 1532static int setup_resolv_conf(const char *dest) {
03cfe0d5 1533 const char *where = NULL;
79d80fc1 1534 int r;
2547bb41
LP
1535
1536 assert(dest);
1537
1538 if (arg_private_network)
1539 return 0;
1540
1541 /* Fix resolv.conf, if possible */
03cfe0d5 1542 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1543
f2068bcc 1544 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1545 if (r < 0) {
68a313c5
LP
1546 /* If the file already exists as symlink, let's
1547 * suppress the warning, under the assumption that
1548 * resolved or something similar runs inside and the
1549 * symlink points there.
1550 *
1551 * If the disk image is read-only, there's also no
1552 * point in complaining.
1553 */
1554 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1555 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1556 return 0;
1557 }
2547bb41 1558
03cfe0d5
LP
1559 r = userns_lchown(where, 0, 0);
1560 if (r < 0)
1561 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1562
2547bb41
LP
1563 return 0;
1564}
1565
4d9f07b4 1566static int setup_volatile_state(const char *directory) {
03cfe0d5
LP
1567 _cleanup_free_ char *buf = NULL;
1568 const char *p, *options;
4d9f07b4
LP
1569 int r;
1570
1571 assert(directory);
1572
1573 if (arg_volatile != VOLATILE_STATE)
1574 return 0;
1575
1576 /* --volatile=state means we simply overmount /var
1577 with a tmpfs, and the rest read-only. */
1578
1579 r = bind_remount_recursive(directory, true);
f647962d
MS
1580 if (r < 0)
1581 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4 1582
03cfe0d5 1583 p = prefix_roota(directory, "/var");
79d80fc1 1584 r = mkdir(p, 0755);
4a62c710
MS
1585 if (r < 0 && errno != EEXIST)
1586 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1587
03cfe0d5
LP
1588 options = "mode=755";
1589 r = tmpfs_patch_options(options, &buf);
1590 if (r < 0)
1591 return log_oom();
1592 if (r > 0)
1593 options = buf;
1594
1595 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
4a62c710 1596 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1597
1598 return 0;
1599}
1600
1601static int setup_volatile(const char *directory) {
1602 bool tmpfs_mounted = false, bind_mounted = false;
1603 char template[] = "/tmp/nspawn-volatile-XXXXXX";
03cfe0d5
LP
1604 _cleanup_free_ char *buf = NULL;
1605 const char *f, *t, *options;
4d9f07b4
LP
1606 int r;
1607
1608 assert(directory);
1609
1610 if (arg_volatile != VOLATILE_YES)
1611 return 0;
1612
1613 /* --volatile=yes means we mount a tmpfs to the root dir, and
1614 the original /usr to use inside it, and that read-only. */
1615
4a62c710
MS
1616 if (!mkdtemp(template))
1617 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4 1618
03cfe0d5
LP
1619 options = "mode=755";
1620 r = tmpfs_patch_options(options, &buf);
1621 if (r < 0)
1622 return log_oom();
1623 if (r > 0)
1624 options = buf;
1625
1626 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1627 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1628 goto fail;
1629 }
1630
1631 tmpfs_mounted = true;
1632
03cfe0d5
LP
1633 f = prefix_roota(directory, "/usr");
1634 t = prefix_roota(template, "/usr");
4d9f07b4 1635
79d80fc1
TG
1636 r = mkdir(t, 0755);
1637 if (r < 0 && errno != EEXIST) {
03cfe0d5 1638 r = log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1639 goto fail;
1640 }
1641
4543768d 1642 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
03cfe0d5 1643 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1644 goto fail;
1645 }
1646
1647 bind_mounted = true;
1648
1649 r = bind_remount_recursive(t, true);
1650 if (r < 0) {
da927ba9 1651 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1652 goto fail;
1653 }
1654
1655 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
03cfe0d5 1656 r = log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1657 goto fail;
1658 }
1659
03cfe0d5 1660 (void) rmdir(template);
4d9f07b4
LP
1661
1662 return 0;
1663
1664fail:
1665 if (bind_mounted)
03cfe0d5
LP
1666 (void) umount(t);
1667
4d9f07b4 1668 if (tmpfs_mounted)
03cfe0d5
LP
1669 (void) umount(template);
1670 (void) rmdir(template);
4d9f07b4
LP
1671 return r;
1672}
1673
9f24adc2 1674static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1675 assert(s);
9f24adc2
LP
1676
1677 snprintf(s, 37,
1678 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1679 SD_ID128_FORMAT_VAL(id));
1680
1681 return s;
1682}
1683
04bc4a3f 1684static int setup_boot_id(const char *dest) {
03cfe0d5 1685 const char *from, *to;
39883f62 1686 sd_id128_t rnd = {};
04bc4a3f
LP
1687 char as_uuid[37];
1688 int r;
1689
eb91eb18
LP
1690 if (arg_share_system)
1691 return 0;
1692
04bc4a3f
LP
1693 /* Generate a new randomized boot ID, so that each boot-up of
1694 * the container gets a new one */
1695
03cfe0d5
LP
1696 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1697 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1698
1699 r = sd_id128_randomize(&rnd);
f647962d
MS
1700 if (r < 0)
1701 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1702
9f24adc2 1703 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1704
4c1fc3e4 1705 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1706 if (r < 0)
1707 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1708
03cfe0d5
LP
1709 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1710 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1711 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1712 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1713
1714 unlink(from);
04bc4a3f
LP
1715 return r;
1716}
1717
e58a1277 1718static int copy_devnodes(const char *dest) {
88213476
LP
1719
1720 static const char devnodes[] =
1721 "null\0"
1722 "zero\0"
1723 "full\0"
1724 "random\0"
1725 "urandom\0"
85614d66
TG
1726 "tty\0"
1727 "net/tun\0";
88213476
LP
1728
1729 const char *d;
e58a1277 1730 int r = 0;
7fd1b19b 1731 _cleanup_umask_ mode_t u;
a258bf26
LP
1732
1733 assert(dest);
124640f1
LP
1734
1735 u = umask(0000);
88213476 1736
03cfe0d5
LP
1737 /* Create /dev/net, so that we can create /dev/net/tun in it */
1738 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1739 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1740
88213476 1741 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1742 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1743 struct stat st;
88213476 1744
7f112f50 1745 from = strappend("/dev/", d);
03cfe0d5 1746 to = prefix_root(dest, from);
88213476
LP
1747
1748 if (stat(from, &st) < 0) {
1749
4a62c710
MS
1750 if (errno != ENOENT)
1751 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1752
a258bf26 1753 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1754
03cfe0d5 1755 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1756 return -EIO;
a258bf26 1757
85614d66 1758 } else {
81f5049b
AC
1759 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1760 if (errno != EPERM)
1761 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1762
1763 /* Some systems abusively restrict mknod but
1764 * allow bind mounts. */
1765 r = touch(to);
1766 if (r < 0)
1767 return log_error_errno(r, "touch (%s) failed: %m", to);
1768 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1769 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1770 }
6278cf60 1771
03cfe0d5
LP
1772 r = userns_lchown(to, 0, 0);
1773 if (r < 0)
1774 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1775 }
88213476
LP
1776 }
1777
e58a1277
LP
1778 return r;
1779}
88213476 1780
03cfe0d5
LP
1781static int setup_pts(const char *dest) {
1782 _cleanup_free_ char *options = NULL;
1783 const char *p;
1784
1785#ifdef HAVE_SELINUX
1786 if (arg_selinux_apifs_context)
1787 (void) asprintf(&options,
3dce8915 1788 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1789 arg_uid_shift + TTY_GID,
1790 arg_selinux_apifs_context);
1791 else
1792#endif
1793 (void) asprintf(&options,
3dce8915 1794 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1795 arg_uid_shift + TTY_GID);
f2d88580 1796
03cfe0d5 1797 if (!options)
f2d88580
LP
1798 return log_oom();
1799
03cfe0d5 1800 /* Mount /dev/pts itself */
cc9fce65 1801 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1802 if (mkdir(p, 0755) < 0)
1803 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1804 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1805 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1806 if (userns_lchown(p, 0, 0) < 0)
1807 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1808
1809 /* Create /dev/ptmx symlink */
1810 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1811 if (symlink("pts/ptmx", p) < 0)
1812 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
03cfe0d5
LP
1813 if (userns_lchown(p, 0, 0) < 0)
1814 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
f2d88580 1815
03cfe0d5
LP
1816 /* And fix /dev/pts/ptmx ownership */
1817 p = prefix_roota(dest, "/dev/pts/ptmx");
1818 if (userns_lchown(p, 0, 0) < 0)
1819 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1820
f2d88580
LP
1821 return 0;
1822}
1823
e58a1277 1824static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1825 _cleanup_umask_ mode_t u;
1826 const char *to;
e58a1277 1827 int r;
e58a1277
LP
1828
1829 assert(dest);
1830 assert(console);
1831
1832 u = umask(0000);
1833
03cfe0d5 1834 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1835 if (r < 0)
1836 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1837
a258bf26
LP
1838 /* We need to bind mount the right tty to /dev/console since
1839 * ptys can only exist on pts file systems. To have something
81f5049b 1840 * to bind mount things on we create a empty regular file. */
a258bf26 1841
03cfe0d5 1842 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1843 r = touch(to);
1844 if (r < 0)
1845 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1846
4543768d 1847 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1848 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1849
25ea79fe 1850 return 0;
e58a1277
LP
1851}
1852
1853static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1854 const char *from, *to;
7fd1b19b 1855 _cleanup_umask_ mode_t u;
03cfe0d5 1856 int fd, k;
e58a1277
LP
1857 union {
1858 struct cmsghdr cmsghdr;
1859 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1860 } control = {};
1861 struct msghdr mh = {
1862 .msg_control = &control,
1863 .msg_controllen = sizeof(control),
1864 };
e58a1277
LP
1865 struct cmsghdr *cmsg;
1866
e58a1277 1867 assert(kmsg_socket >= 0);
a258bf26 1868
e58a1277 1869 u = umask(0000);
a258bf26 1870
03cfe0d5 1871 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1872 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1873 * on the reading side behave very similar to /proc/kmsg,
1874 * their writing side behaves differently from /dev/kmsg in
1875 * that writing blocks when nothing is reading. In order to
1876 * avoid any problems with containers deadlocking due to this
1877 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1878 from = prefix_roota(dest, "/run/kmsg");
1879 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1880
4a62c710 1881 if (mkfifo(from, 0600) < 0)
03cfe0d5 1882 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1883 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1884 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1885
1886 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1887 if (fd < 0)
1888 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1889
e58a1277
LP
1890 cmsg = CMSG_FIRSTHDR(&mh);
1891 cmsg->cmsg_level = SOL_SOCKET;
1892 cmsg->cmsg_type = SCM_RIGHTS;
1893 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1894 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1895
1896 mh.msg_controllen = cmsg->cmsg_len;
1897
1898 /* Store away the fd in the socket, so that it stays open as
1899 * long as we run the child */
6d0b55c2 1900 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1901 safe_close(fd);
e58a1277 1902
4a62c710
MS
1903 if (k < 0)
1904 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1905
03cfe0d5
LP
1906 /* And now make the FIFO unavailable as /run/kmsg... */
1907 (void) unlink(from);
1908
25ea79fe 1909 return 0;
88213476
LP
1910}
1911
6d0b55c2
LP
1912static int send_rtnl(int send_fd) {
1913 union {
1914 struct cmsghdr cmsghdr;
1915 uint8_t buf[CMSG_SPACE(sizeof(int))];
1916 } control = {};
1917 struct msghdr mh = {
1918 .msg_control = &control,
1919 .msg_controllen = sizeof(control),
1920 };
1921 struct cmsghdr *cmsg;
1922 _cleanup_close_ int fd = -1;
1923 ssize_t k;
1924
1925 assert(send_fd >= 0);
1926
1927 if (!arg_expose_ports)
1928 return 0;
1929
1930 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1931 if (fd < 0)
03cfe0d5 1932 return log_error_errno(errno, "Failed to allocate container netlink: %m");
6d0b55c2
LP
1933
1934 cmsg = CMSG_FIRSTHDR(&mh);
1935 cmsg->cmsg_level = SOL_SOCKET;
1936 cmsg->cmsg_type = SCM_RIGHTS;
1937 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1938 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1939
1940 mh.msg_controllen = cmsg->cmsg_len;
1941
1942 /* Store away the fd in the socket, so that it stays open as
1943 * long as we run the child */
1944 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1945 if (k < 0)
1946 return log_error_errno(errno, "Failed to send netlink fd: %m");
1947
1948 return 0;
1949}
1950
1951static int flush_ports(union in_addr_union *exposed) {
1952 ExposePort *p;
1953 int r, af = AF_INET;
1954
1955 assert(exposed);
1956
1957 if (!arg_expose_ports)
1958 return 0;
1959
1960 if (in_addr_is_null(af, exposed))
1961 return 0;
1962
1963 log_debug("Lost IP address.");
1964
1965 LIST_FOREACH(ports, p, arg_expose_ports) {
1966 r = fw_add_local_dnat(false,
1967 af,
1968 p->protocol,
1969 NULL,
1970 NULL, 0,
1971 NULL, 0,
1972 p->host_port,
1973 exposed,
1974 p->container_port,
1975 NULL);
1976 if (r < 0)
1977 log_warning_errno(r, "Failed to modify firewall: %m");
1978 }
1979
1980 *exposed = IN_ADDR_NULL;
1981 return 0;
1982}
1983
1c4baffc 1984static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
6d0b55c2
LP
1985 _cleanup_free_ struct local_address *addresses = NULL;
1986 _cleanup_free_ char *pretty = NULL;
1987 union in_addr_union new_exposed;
1988 ExposePort *p;
1989 bool add;
1990 int af = AF_INET, r;
1991
1992 assert(exposed);
1993
1994 /* Invoked each time an address is added or removed inside the
1995 * container */
1996
1997 if (!arg_expose_ports)
1998 return 0;
1999
2000 r = local_addresses(rtnl, 0, af, &addresses);
2001 if (r < 0)
2002 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2003
2004 add = r > 0 &&
2005 addresses[0].family == af &&
2006 addresses[0].scope < RT_SCOPE_LINK;
2007
2008 if (!add)
2009 return flush_ports(exposed);
2010
2011 new_exposed = addresses[0].address;
2012 if (in_addr_equal(af, exposed, &new_exposed))
2013 return 0;
2014
2015 in_addr_to_string(af, &new_exposed, &pretty);
2016 log_debug("New container IP is %s.", strna(pretty));
2017
2018 LIST_FOREACH(ports, p, arg_expose_ports) {
2019
2020 r = fw_add_local_dnat(true,
2021 af,
2022 p->protocol,
2023 NULL,
2024 NULL, 0,
2025 NULL, 0,
2026 p->host_port,
2027 &new_exposed,
2028 p->container_port,
2029 in_addr_is_null(af, exposed) ? NULL : exposed);
2030 if (r < 0)
2031 log_warning_errno(r, "Failed to modify firewall: %m");
2032 }
2033
2034 *exposed = new_exposed;
2035 return 0;
2036}
2037
1c4baffc 2038static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2039 union in_addr_union *exposed = userdata;
2040
2041 assert(rtnl);
2042 assert(m);
2043 assert(exposed);
2044
2045 expose_ports(rtnl, exposed);
2046 return 0;
2047}
2048
1c4baffc 2049static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
6d0b55c2
LP
2050 union {
2051 struct cmsghdr cmsghdr;
2052 uint8_t buf[CMSG_SPACE(sizeof(int))];
2053 } control = {};
2054 struct msghdr mh = {
2055 .msg_control = &control,
2056 .msg_controllen = sizeof(control),
2057 };
2058 struct cmsghdr *cmsg;
1c4baffc 2059 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
6d0b55c2
LP
2060 int fd, r;
2061 ssize_t k;
2062
2063 assert(event);
2064 assert(recv_fd >= 0);
2065 assert(ret);
2066
2067 if (!arg_expose_ports)
2068 return 0;
2069
2070 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2071 if (k < 0)
2072 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2073
2074 cmsg = CMSG_FIRSTHDR(&mh);
2075 assert(cmsg->cmsg_level == SOL_SOCKET);
2076 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 2077 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
2078 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2079
1c4baffc 2080 r = sd_netlink_open_fd(&rtnl, fd);
6d0b55c2
LP
2081 if (r < 0) {
2082 safe_close(fd);
2083 return log_error_errno(r, "Failed to create rtnl object: %m");
2084 }
2085
1c4baffc 2086 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
6d0b55c2
LP
2087 if (r < 0)
2088 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2089
1c4baffc 2090 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
6d0b55c2
LP
2091 if (r < 0)
2092 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2093
1c4baffc 2094 r = sd_netlink_attach_event(rtnl, event, 0);
6d0b55c2
LP
2095 if (r < 0)
2096 return log_error_errno(r, "Failed to add to even loop: %m");
2097
2098 *ret = rtnl;
2099 rtnl = NULL;
2100
2101 return 0;
2102}
2103
3a74cea5 2104static int setup_hostname(void) {
3a74cea5 2105
eb91eb18
LP
2106 if (arg_share_system)
2107 return 0;
2108
605f81a8 2109 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 2110 return -errno;
3a74cea5 2111
7027ff61 2112 return 0;
3a74cea5
LP
2113}
2114
57fb9fb5 2115static int setup_journal(const char *directory) {
4d680aee 2116 sd_id128_t machine_id, this_id;
03cfe0d5
LP
2117 _cleanup_free_ char *b = NULL, *d = NULL;
2118 const char *etc_machine_id, *p, *q;
27407a01 2119 char *id;
57fb9fb5
LP
2120 int r;
2121
df9a75e4
LP
2122 /* Don't link journals in ephemeral mode */
2123 if (arg_ephemeral)
2124 return 0;
2125
03cfe0d5 2126 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 2127
03cfe0d5 2128 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
2129 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2130 return 0;
f647962d 2131 else if (r < 0)
03cfe0d5 2132 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 2133
27407a01
ZJS
2134 id = strstrip(b);
2135 if (isempty(id) && arg_link_journal == LINK_AUTO)
2136 return 0;
57fb9fb5 2137
27407a01
ZJS
2138 /* Verify validity */
2139 r = sd_id128_from_string(id, &machine_id);
f647962d 2140 if (r < 0)
03cfe0d5 2141 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 2142
4d680aee 2143 r = sd_id128_get_machine(&this_id);
f647962d
MS
2144 if (r < 0)
2145 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
2146
2147 if (sd_id128_equal(machine_id, this_id)) {
2148 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2149 "Host and machine ids are equal (%s): refusing to link journals", id);
2150 if (arg_link_journal == LINK_AUTO)
2151 return 0;
df9a75e4 2152 return -EEXIST;
4d680aee
ZJS
2153 }
2154
2155 if (arg_link_journal == LINK_NO)
2156 return 0;
2157
03cfe0d5
LP
2158 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2159 if (r < 0)
2160 return log_error_errno(r, "Failed to create /var: %m");
2161
2162 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2163 if (r < 0)
2164 return log_error_errno(r, "Failed to create /var/log: %m");
2165
2166 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2167 if (r < 0)
2168 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2169
2170 p = strjoina("/var/log/journal/", id);
2171 q = prefix_roota(directory, p);
27407a01 2172
e26d6ce5 2173 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
2174 if (arg_link_journal != LINK_AUTO) {
2175 log_error("%s: already a mount point, refusing to use for journal", p);
2176 return -EEXIST;
2177 }
2178
2179 return 0;
57fb9fb5
LP
2180 }
2181
e26d6ce5 2182 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 2183 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
2184 log_error("%s: already a mount point, refusing to use for journal", q);
2185 return -EEXIST;
57fb9fb5
LP
2186 }
2187
27407a01 2188 return 0;
57fb9fb5
LP
2189 }
2190
2191 r = readlink_and_make_absolute(p, &d);
2192 if (r >= 0) {
2193 if ((arg_link_journal == LINK_GUEST ||
2194 arg_link_journal == LINK_AUTO) &&
2195 path_equal(d, q)) {
2196
03cfe0d5 2197 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2198 if (r < 0)
56f64d95 2199 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2200 return 0;
57fb9fb5
LP
2201 }
2202
4a62c710
MS
2203 if (unlink(p) < 0)
2204 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2205 } else if (r == -EINVAL) {
2206
2207 if (arg_link_journal == LINK_GUEST &&
2208 rmdir(p) < 0) {
2209
27407a01
ZJS
2210 if (errno == ENOTDIR) {
2211 log_error("%s already exists and is neither a symlink nor a directory", p);
2212 return r;
2213 } else {
56f64d95 2214 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 2215 return -errno;
57fb9fb5 2216 }
57fb9fb5
LP
2217 }
2218 } else if (r != -ENOENT) {
56f64d95 2219 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 2220 return r;
57fb9fb5
LP
2221 }
2222
2223 if (arg_link_journal == LINK_GUEST) {
2224
2225 if (symlink(q, p) < 0) {
574edc90 2226 if (arg_link_journal_try) {
56f64d95 2227 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
2228 return 0;
2229 } else {
56f64d95 2230 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
2231 return -errno;
2232 }
57fb9fb5
LP
2233 }
2234
03cfe0d5 2235 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2236 if (r < 0)
56f64d95 2237 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2238 return 0;
57fb9fb5
LP
2239 }
2240
2241 if (arg_link_journal == LINK_HOST) {
574edc90
MP
2242 /* don't create parents here -- if the host doesn't have
2243 * permanent journal set up, don't force it here */
2244 r = mkdir(p, 0755);
57fb9fb5 2245 if (r < 0) {
574edc90 2246 if (arg_link_journal_try) {
56f64d95 2247 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
2248 return 0;
2249 } else {
56f64d95 2250 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
2251 return r;
2252 }
57fb9fb5
LP
2253 }
2254
27407a01
ZJS
2255 } else if (access(p, F_OK) < 0)
2256 return 0;
57fb9fb5 2257
cdb2b9d0
LP
2258 if (dir_is_empty(q) == 0)
2259 log_warning("%s is not empty, proceeding anyway.", q);
2260
03cfe0d5 2261 r = userns_mkdir(directory, p, 0755, 0, 0);
57fb9fb5 2262 if (r < 0) {
56f64d95 2263 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 2264 return r;
57fb9fb5
LP
2265 }
2266
4543768d 2267 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 2268 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2269
27407a01 2270 return 0;
57fb9fb5
LP
2271}
2272
88213476 2273static int drop_capabilities(void) {
5076f0cc 2274 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
2275}
2276
5aa4bb6b 2277static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 2278 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
03976f7b 2279 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
2280 int r;
2281
eb91eb18
LP
2282 if (!arg_register)
2283 return 0;
2284
1c03020c 2285 r = sd_bus_default_system(&bus);
f647962d
MS
2286 if (r < 0)
2287 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 2288
89f7c846
LP
2289 if (arg_keep_unit) {
2290 r = sd_bus_call_method(
2291 bus,
2292 "org.freedesktop.machine1",
2293 "/org/freedesktop/machine1",
2294 "org.freedesktop.machine1.Manager",
5aa4bb6b 2295 "RegisterMachineWithNetwork",
89f7c846
LP
2296 &error,
2297 NULL,
5aa4bb6b 2298 "sayssusai",
89f7c846
LP
2299 arg_machine,
2300 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2301 "nspawn",
2302 "container",
2303 (uint32_t) pid,
5aa4bb6b
LP
2304 strempty(arg_directory),
2305 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 2306 } else {
9457ac5b 2307 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 2308 char **i;
ce5b3ad4 2309 unsigned j;
9457ac5b
LP
2310
2311 r = sd_bus_message_new_method_call(
89f7c846 2312 bus,
9457ac5b 2313 &m,
89f7c846
LP
2314 "org.freedesktop.machine1",
2315 "/org/freedesktop/machine1",
2316 "org.freedesktop.machine1.Manager",
5aa4bb6b 2317 "CreateMachineWithNetwork");
f647962d 2318 if (r < 0)
f36933fe 2319 return bus_log_create_error(r);
9457ac5b
LP
2320
2321 r = sd_bus_message_append(
2322 m,
5aa4bb6b 2323 "sayssusai",
89f7c846
LP
2324 arg_machine,
2325 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2326 "nspawn",
2327 "container",
2328 (uint32_t) pid,
5aa4bb6b
LP
2329 strempty(arg_directory),
2330 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 2331 if (r < 0)
f36933fe 2332 return bus_log_create_error(r);
9457ac5b
LP
2333
2334 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 2335 if (r < 0)
f36933fe 2336 return bus_log_create_error(r);
9457ac5b
LP
2337
2338 if (!isempty(arg_slice)) {
2339 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 2340 if (r < 0)
f36933fe 2341 return bus_log_create_error(r);
9457ac5b
LP
2342 }
2343
2344 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 2345 if (r < 0)
f36933fe 2346 return bus_log_create_error(r);
9457ac5b 2347
773ce3d8
LP
2348 /* If you make changes here, also make sure to update
2349 * systemd-nspawn@.service, to keep the device
2350 * policies in sync regardless if we are run with or
2351 * without the --keep-unit switch. */
63cc4c31 2352 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
2353 /* Allow the container to
2354 * access and create the API
2355 * device nodes, so that
2356 * PrivateDevices= in the
2357 * container can work
2358 * fine */
2359 "/dev/null", "rwm",
2360 "/dev/zero", "rwm",
2361 "/dev/full", "rwm",
2362 "/dev/random", "rwm",
2363 "/dev/urandom", "rwm",
2364 "/dev/tty", "rwm",
864e1706 2365 "/dev/net/tun", "rwm",
9457ac5b
LP
2366 /* Allow the container
2367 * access to ptys. However,
2368 * do not permit the
2369 * container to ever create
2370 * these device nodes. */
2371 "/dev/pts/ptmx", "rw",
63cc4c31 2372 "char-pts", "rw");
f647962d 2373 if (r < 0)
27023c0e
LP
2374 return bus_log_create_error(r);
2375
ce5b3ad4
SJ
2376 for (j = 0; j < arg_n_custom_mounts; j++) {
2377 CustomMount *cm = &arg_custom_mounts[j];
2378
2379 if (cm->type != CUSTOM_MOUNT_BIND)
2380 continue;
2381
2382 r = is_device_node(cm->source);
2383 if (r < 0)
2384 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2385
2386 if (r) {
2387 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2388 cm->source, cm->read_only ? "r" : "rw");
2389 if (r < 0)
2390 return log_error_errno(r, "Failed to append message arguments: %m");
2391 }
2392 }
2393
27023c0e
LP
2394 if (arg_kill_signal != 0) {
2395 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2396 if (r < 0)
2397 return bus_log_create_error(r);
2398
2399 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2400 if (r < 0)
2401 return bus_log_create_error(r);
2402 }
9457ac5b 2403
f36933fe
LP
2404 STRV_FOREACH(i, arg_property) {
2405 r = sd_bus_message_open_container(m, 'r', "sv");
2406 if (r < 0)
2407 return bus_log_create_error(r);
2408
2409 r = bus_append_unit_property_assignment(m, *i);
2410 if (r < 0)
2411 return r;
2412
2413 r = sd_bus_message_close_container(m);
2414 if (r < 0)
2415 return bus_log_create_error(r);
2416 }
2417
9457ac5b 2418 r = sd_bus_message_close_container(m);
f647962d 2419 if (r < 0)
f36933fe 2420 return bus_log_create_error(r);
9457ac5b
LP
2421
2422 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
2423 }
2424
9444b1f2 2425 if (r < 0) {
1f0cd86b
LP
2426 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2427 return r;
2428 }
2429
2430 return 0;
2431}
2432
2433static int terminate_machine(pid_t pid) {
2434 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2435 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
03976f7b 2436 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
2437 const char *path;
2438 int r;
2439
eb91eb18
LP
2440 if (!arg_register)
2441 return 0;
2442
1a2399e5
LP
2443 /* If we are reusing the unit, then just exit, systemd will do
2444 * the right thing when we exit. */
2445 if (arg_keep_unit)
2446 return 0;
2447
76b54375 2448 r = sd_bus_default_system(&bus);
f647962d
MS
2449 if (r < 0)
2450 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
2451
2452 r = sd_bus_call_method(
2453 bus,
2454 "org.freedesktop.machine1",
2455 "/org/freedesktop/machine1",
2456 "org.freedesktop.machine1.Manager",
2457 "GetMachineByPID",
2458 &error,
2459 &reply,
2460 "u",
2461 (uint32_t) pid);
2462 if (r < 0) {
2463 /* Note that the machine might already have been
2464 * cleaned up automatically, hence don't consider it a
2465 * failure if we cannot get the machine object. */
2466 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2467 return 0;
2468 }
2469
2470 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2471 if (r < 0)
2472 return bus_log_parse_error(r);
9444b1f2 2473
1f0cd86b
LP
2474 r = sd_bus_call_method(
2475 bus,
2476 "org.freedesktop.machine1",
2477 path,
2478 "org.freedesktop.machine1.Machine",
2479 "Terminate",
2480 &error,
2481 NULL,
2482 NULL);
2483 if (r < 0) {
2484 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2485 return 0;
2486 }
2487
9444b1f2
LP
2488 return 0;
2489}
2490
db999e0f
LP
2491static int reset_audit_loginuid(void) {
2492 _cleanup_free_ char *p = NULL;
2493 int r;
2494
2495 if (arg_share_system)
2496 return 0;
2497
2498 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2499 if (r == -ENOENT)
db999e0f 2500 return 0;
f647962d
MS
2501 if (r < 0)
2502 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2503
2504 /* Already reset? */
2505 if (streq(p, "4294967295"))
2506 return 0;
2507
ad118bda 2508 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 2509 if (r < 0) {
10a87006
LP
2510 log_error_errno(r,
2511 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2512 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2513 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2514 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2515 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2516
db999e0f 2517 sleep(5);
77b6e194 2518 }
db999e0f
LP
2519
2520 return 0;
77b6e194
LP
2521}
2522
4f758c23
LP
/* Fixed keys passed as hash_key to generate_mac(); one per kind of
 * interface (host side of the veth pair, container side of the veth
 * pair, macvlan interfaces). */
#define HOST_HASH_KEY      SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY   SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2526
a90e2305 2527static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2528 uint8_t result[8];
2529 size_t l, sz;
a90e2305
LP
2530 uint8_t *v, *i;
2531 int r;
01dde061
TG
2532
2533 l = strlen(arg_machine);
2534 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2535 if (idx > 0)
2536 sz += sizeof(idx);
a90e2305 2537
01dde061
TG
2538 v = alloca(sz);
2539
2540 /* fetch some persistent data unique to the host */
2541 r = sd_id128_get_machine((sd_id128_t*) v);
2542 if (r < 0)
2543 return r;
2544
2545 /* combine with some data unique (on this host) to this
2546 * container instance */
a90e2305
LP
2547 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2548 if (idx > 0) {
2549 idx = htole64(idx);
2550 memcpy(i, &idx, sizeof(idx));
2551 }
01dde061
TG
2552
2553 /* Let's hash the host machine ID plus the container name. We
2554 * use a fixed, but originally randomly created hash key here. */
4f758c23 2555 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2556
2557 assert_cc(ETH_ALEN <= sizeof(result));
2558 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2559
2560 /* see eth_random_addr in the kernel */
2561 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2562 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2563
2564 return 0;
2565}
2566
5aa4bb6b 2567static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1c4baffc
TG
2568 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2569 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4f758c23 2570 struct ether_addr mac_host, mac_container;
5aa4bb6b 2571 int r, i;
69c79d3c
LP
2572
2573 if (!arg_private_network)
2574 return 0;
2575
2576 if (!arg_network_veth)
2577 return 0;
2578
08af0da2
LP
2579 /* Use two different interface name prefixes depending whether
2580 * we are in bridge mode or not. */
c00524c9 2581 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2582 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2583
e867ceb6
LP
2584 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2585 if (r < 0)
2586 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2587
e867ceb6
LP
2588 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2589 if (r < 0)
2590 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2591
1c4baffc 2592 r = sd_netlink_open(&rtnl);
f647962d
MS
2593 if (r < 0)
2594 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2595
151b9b96 2596 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2597 if (r < 0)
2598 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2599
1c4baffc 2600 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2601 if (r < 0)
2602 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2603
1c4baffc 2604 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2605 if (r < 0)
2606 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2607
1c4baffc 2608 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2609 if (r < 0)
2610 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2611
1c4baffc 2612 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2613 if (r < 0)
2614 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2615
1c4baffc 2616 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2617 if (r < 0)
2618 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2619
1c4baffc 2620 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2621 if (r < 0)
2622 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2623
1c4baffc 2624 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2625 if (r < 0)
2626 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2627
1c4baffc 2628 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2629 if (r < 0)
2630 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c 2631
1c4baffc 2632 r = sd_netlink_message_close_container(m);
f647962d
MS
2633 if (r < 0)
2634 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2635
1c4baffc 2636 r = sd_netlink_message_close_container(m);
f647962d
MS
2637 if (r < 0)
2638 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2639
1c4baffc 2640 r = sd_netlink_message_close_container(m);
f647962d
MS
2641 if (r < 0)
2642 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2643
1c4baffc 2644 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d 2645 if (r < 0)
637aa8a3 2646 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
69c79d3c 2647
5aa4bb6b 2648 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2649 if (i <= 0)
2650 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2651
2652 *ifi = i;
2653
69c79d3c
LP
2654 return 0;
2655}
2656
5aa4bb6b 2657static int setup_bridge(const char veth_name[], int *ifi) {
1c4baffc
TG
2658 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2659 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
ab046dde
TG
2660 int r, bridge;
2661
2662 if (!arg_private_network)
2663 return 0;
2664
2665 if (!arg_network_veth)
2666 return 0;
2667
2668 if (!arg_network_bridge)
2669 return 0;
2670
2671 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2672 if (bridge <= 0)
2673 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2674
5aa4bb6b
LP
2675 *ifi = bridge;
2676
1c4baffc 2677 r = sd_netlink_open(&rtnl);
f647962d
MS
2678 if (r < 0)
2679 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2680
151b9b96 2681 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2682 if (r < 0)
2683 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2684
039dd4af 2685 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2686 if (r < 0)
2687 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2688
1c4baffc 2689 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2690 if (r < 0)
2691 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde 2692
1c4baffc 2693 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2694 if (r < 0)
2695 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde 2696
1c4baffc 2697 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2698 if (r < 0)
2699 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2700
2701 return 0;
2702}
2703
c74e630d
LP
2704static int parse_interface(struct udev *udev, const char *name) {
2705 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2706 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2707 int ifi;
2708
2709 ifi = (int) if_nametoindex(name);
4a62c710
MS
2710 if (ifi <= 0)
2711 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2712
2713 sprintf(ifi_str, "n%i", ifi);
2714 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2715 if (!d)
2716 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2717
2718 if (udev_device_get_is_initialized(d) <= 0) {
2719 log_error("Network interface %s is not initialized yet.", name);
2720 return -EBUSY;
2721 }
2722
2723 return ifi;
2724}
2725
69c79d3c 2726static int move_network_interfaces(pid_t pid) {
7e227024 2727 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2728 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
aa28aefe
LP
2729 char **i;
2730 int r;
2731
2732 if (!arg_private_network)
2733 return 0;
2734
2735 if (strv_isempty(arg_network_interfaces))
2736 return 0;
2737
1c4baffc 2738 r = sd_netlink_open(&rtnl);
f647962d
MS
2739 if (r < 0)
2740 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2741
7e227024
LP
2742 udev = udev_new();
2743 if (!udev) {
2744 log_error("Failed to connect to udev.");
2745 return -ENOMEM;
2746 }
2747
aa28aefe 2748 STRV_FOREACH(i, arg_network_interfaces) {
1c4baffc 2749 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
b88eb17a 2750 int ifi;
aa28aefe 2751
c74e630d
LP
2752 ifi = parse_interface(udev, *i);
2753 if (ifi < 0)
2754 return ifi;
2755
3125b3ef 2756 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2757 if (r < 0)
2758 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2759
1c4baffc 2760 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2761 if (r < 0)
2762 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2763
1c4baffc 2764 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2765 if (r < 0)
2766 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2767 }
7e227024 2768
c74e630d
LP
2769 return 0;
2770}
2771
2772static int setup_macvlan(pid_t pid) {
2773 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2774 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
e867ceb6 2775 unsigned idx = 0;
c74e630d
LP
2776 char **i;
2777 int r;
2778
2779 if (!arg_private_network)
2780 return 0;
2781
2782 if (strv_isempty(arg_network_macvlan))
2783 return 0;
2784
1c4baffc 2785 r = sd_netlink_open(&rtnl);
f647962d
MS
2786 if (r < 0)
2787 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2788
2789 udev = udev_new();
2790 if (!udev) {
2791 log_error("Failed to connect to udev.");
2792 return -ENOMEM;
2793 }
2794
2795 STRV_FOREACH(i, arg_network_macvlan) {
1c4baffc 2796 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
c74e630d 2797 _cleanup_free_ char *n = NULL;
e867ceb6 2798 struct ether_addr mac;
c74e630d
LP
2799 int ifi;
2800
2801 ifi = parse_interface(udev, *i);
2802 if (ifi < 0)
2803 return ifi;
2804
e867ceb6
LP
2805 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2806 if (r < 0)
2807 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2808
c74e630d 2809 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2810 if (r < 0)
2811 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2812
1c4baffc 2813 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2814 if (r < 0)
2815 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2816
2817 n = strappend("mv-", *i);
2818 if (!n)
2819 return log_oom();
2820
2821 strshorten(n, IFNAMSIZ-1);
2822
1c4baffc 2823 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2824 if (r < 0)
2825 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2826
1c4baffc 2827 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
e867ceb6
LP
2828 if (r < 0)
2829 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2830
1c4baffc 2831 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2832 if (r < 0)
2833 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d 2834
1c4baffc 2835 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2836 if (r < 0)
2837 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2838
1c4baffc 2839 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2840 if (r < 0)
2841 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2842
1c4baffc 2843 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2844 if (r < 0)
2845 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d 2846
1c4baffc 2847 r = sd_netlink_message_close_container(m);
f647962d
MS
2848 if (r < 0)
2849 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d 2850
1c4baffc 2851 r = sd_netlink_message_close_container(m);
f647962d
MS
2852 if (r < 0)
2853 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe 2854
1c4baffc 2855 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2856 if (r < 0)
2857 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2858 }
2859
2860 return 0;
2861}
2862
4bbfe7ad
TG
2863static int setup_ipvlan(pid_t pid) {
2864 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2865 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4bbfe7ad
TG
2866 char **i;
2867 int r;
2868
2869 if (!arg_private_network)
2870 return 0;
2871
2872 if (strv_isempty(arg_network_ipvlan))
2873 return 0;
2874
1c4baffc 2875 r = sd_netlink_open(&rtnl);
4bbfe7ad
TG
2876 if (r < 0)
2877 return log_error_errno(r, "Failed to connect to netlink: %m");
2878
2879 udev = udev_new();
2880 if (!udev) {
2881 log_error("Failed to connect to udev.");
2882 return -ENOMEM;
2883 }
2884
2885 STRV_FOREACH(i, arg_network_ipvlan) {
1c4baffc 2886 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
4bbfe7ad
TG
2887 _cleanup_free_ char *n = NULL;
2888 int ifi;
2889
2890 ifi = parse_interface(udev, *i);
2891 if (ifi < 0)
2892 return ifi;
2893
2894 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2895 if (r < 0)
2896 return log_error_errno(r, "Failed to allocate netlink message: %m");
2897
1c4baffc 2898 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
4bbfe7ad
TG
2899 if (r < 0)
2900 return log_error_errno(r, "Failed to add netlink interface index: %m");
2901
2902 n = strappend("iv-", *i);
2903 if (!n)
2904 return log_oom();
2905
2906 strshorten(n, IFNAMSIZ-1);
2907
1c4baffc 2908 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
4bbfe7ad
TG
2909 if (r < 0)
2910 return log_error_errno(r, "Failed to add netlink interface name: %m");
2911
1c4baffc 2912 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
4bbfe7ad
TG
2913 if (r < 0)
2914 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2915
1c4baffc 2916 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
4bbfe7ad
TG
2917 if (r < 0)
2918 return log_error_errno(r, "Failed to open netlink container: %m");
2919
1c4baffc 2920 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
4bbfe7ad
TG
2921 if (r < 0)
2922 return log_error_errno(r, "Failed to open netlink container: %m");
2923
1c4baffc 2924 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
4bbfe7ad
TG
2925 if (r < 0)
2926 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2927
1c4baffc 2928 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
2929 if (r < 0)
2930 return log_error_errno(r, "Failed to close netlink container: %m");
2931
1c4baffc 2932 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
2933 if (r < 0)
2934 return log_error_errno(r, "Failed to close netlink container: %m");
2935
1c4baffc 2936 r = sd_netlink_call(rtnl, m, 0, NULL);
4bbfe7ad
TG
2937 if (r < 0)
2938 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2939 }
2940
2941 return 0;
2942}
2943
/* Installs the container's seccomp filter: a default-allow filter that
 * denies a fixed list of syscalls (unless the capability guarding each one
 * is retained in arg_retain) and makes creation of NETLINK_AUDIT sockets
 * fail with EAFNOSUPPORT.
 *
 * Returns 0 on success or when built without HAVE_SECCOMP, and a negative
 * value on failure. An -EINVAL result from seccomp_load() is logged at
 * debug level and treated as success. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* Each syscall is only denied when its guarding capability is
         * not part of arg_retain. */
        static const struct {
                uint64_t capability;
                int syscall_num;
        } denied[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(denied); idx++) {

                /* The container keeps this capability, hence leave the syscall alone */
                if (arg_retain & (1ULL << denied[idx].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied[idx].syscall_num, 0);

                /* -EFAULT indicates an unknown syscall, which is silently skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work inside containers, and much of the
         * userspace audit hookup fails there. Hence refuse creation of
         * audit sockets: socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3038
785890ac
LP
3039static int setup_propagate(const char *root) {
3040 const char *p, *q;
3041
3042 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3043 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 3044 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
3045 (void) mkdir_p(p, 0600);
3046
03cfe0d5
LP
3047 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3048 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3049
3050 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3051 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3052
3053 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3054 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 3055
03cfe0d5 3056 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
3057 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3058 return log_error_errno(errno, "Failed to install propagation bind mount.");
3059
3060 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3061 return log_error_errno(errno, "Failed to make propagation mount read-only");
3062
3063 return 0;
3064}
3065
1b9e5b12
LP
3066static int setup_image(char **device_path, int *loop_nr) {
3067 struct loop_info64 info = {
3068 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3069 };
3070 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3071 _cleanup_free_ char* loopdev = NULL;
3072 struct stat st;
3073 int r, nr;
3074
3075 assert(device_path);
3076 assert(loop_nr);
ec16945e 3077 assert(arg_image);
1b9e5b12
LP
3078
3079 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
3080 if (fd < 0)
3081 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 3082
4a62c710
MS
3083 if (fstat(fd, &st) < 0)
3084 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
3085
3086 if (S_ISBLK(st.st_mode)) {
3087 char *p;
3088
3089 p = strdup(arg_image);
3090 if (!p)
3091 return log_oom();
3092
3093 *device_path = p;
3094
3095 *loop_nr = -1;
3096
3097 r = fd;
3098 fd = -1;
3099
3100 return r;
3101 }
3102
3103 if (!S_ISREG(st.st_mode)) {
56f64d95 3104 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
3105 return -EINVAL;
3106 }
3107
3108 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
3109 if (control < 0)
3110 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
3111
3112 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
3113 if (nr < 0)
3114 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
3115
3116 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3117 return log_oom();
3118
3119 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
3120 if (loop < 0)
3121 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 3122
4a62c710
MS
3123 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3124 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
3125
3126 if (arg_read_only)
3127 info.lo_flags |= LO_FLAGS_READ_ONLY;
3128
4a62c710
MS
3129 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3130 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
3131
3132 *device_path = loopdev;
3133 loopdev = NULL;
3134
3135 *loop_nr = nr;
3136
3137 r = loop;
3138 loop = -1;
3139
3140 return r;
3141}
3142
ada4799a
LP
/* Explanatory text appended to the partition table related error messages
 * of dissect_image(). */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Probes the partition table of the block device open as "fd" and picks
 * the partitions to mount as root, /home and /srv.
 *
 * On GPT disks partitions are matched by type UUID (GPT_HOME, GPT_SRV,
 * GPT_ROOT_NATIVE, GPT_ROOT_SECONDARY, GPT_LINUX_GENERIC); partitions
 * carrying GPT_FLAG_NO_AUTO are ignored, and for each specific type the
 * partition with the lowest partition number wins. On MBR disks only
 * partitions of type 0x83 whose flags equal 0x80 are considered, as generic
 * partitions.
 *
 * The root device is chosen in this order: native root, secondary root,
 * single generic partition. More than one generic partition (with no
 * specific root partition) is an error.
 *
 * On success *root_device (and, if found, *home_device and *srv_device)
 * receive newly allocated device node paths owned by the caller, the
 * matching *_rw flags say whether the partition may be mounted writable,
 * and *secondary says whether the secondary root partition was picked.
 *
 * Returns 0 on success, a negative value on failure, and -EOPNOTSUPP when
 * built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel exposes as many partition devices as
         * blkid found in the partition table, giving up after 10 rounds. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Store the node as generic partition
                                 * (not as root), so that a second
                                 * bootable Linux partition is detected
                                 * by the check above and refused below,
                                 * instead of silently replacing the
                                 * first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3522
/* Mounts the block device "what" on "where" (or on "where" + "directory"
 * if "directory" is non-NULL), using the file system type detected by
 * blkid. The mount is read-only if "rw" is false or arg_read_only is set,
 * and always uses MS_NODEV.
 *
 * Returns 0 on success, -EINVAL if the file system type cannot be
 * determined, -EOPNOTSUPP for LUKS volumes or when built without
 * HAVE_BLKID, and another negative value on other failures. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);

        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2
         * if the result is ambivalent; -1 is a real error and is handled
         * by the generic branch below. This matches the check done in
         * dissect_image(). */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3586
727fd4fd
LP
3587static int mount_devices(
3588 const char *where,
3589 const char *root_device, bool root_device_rw,
3590 const char *home_device, bool home_device_rw,
3591 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
3592 int r;
3593
3594 assert(where);
3595
3596 if (root_device) {
727fd4fd 3597 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
3598 if (r < 0)
3599 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
3600 }
3601
3602 if (home_device) {
727fd4fd 3603 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
3604 if (r < 0)
3605 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
3606 }
3607
3608 if (srv_device) {
727fd4fd 3609 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
3610 if (r < 0)
3611 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
3612 }
3613
3614 return 0;
3615}
3616
3617static void loop_remove(int nr, int *image_fd) {
3618 _cleanup_close_ int control = -1;
e8c8ddcc 3619 int r;
1b9e5b12
LP
3620
3621 if (nr < 0)
3622 return;
3623
3624 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3625 r = ioctl(*image_fd, LOOP_CLR_FD);
3626 if (r < 0)
5e4074aa 3627 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3628 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3629 }
3630
3631 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3632 if (control < 0) {
56f64d95 3633 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3634 return;
e8c8ddcc 3635 }
1b9e5b12 3636
e8c8ddcc
TG
3637 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3638 if (r < 0)
5e4074aa 3639 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3640}
3641
0cb9fbcd
LP
3642static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3643 int pipe_fds[2];
3644 pid_t pid;
3645
3646 assert(database);
3647 assert(key);
3648 assert(rpid);
3649
4a62c710
MS
3650 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3651 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3652
3653 pid = fork();
4a62c710
MS
3654 if (pid < 0)
3655 return log_error_errno(errno, "Failed to fork getent child: %m");
3656 else if (pid == 0) {
0cb9fbcd
LP
3657 int nullfd;
3658 char *empty_env = NULL;
3659
3660 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3661 _exit(EXIT_FAILURE);
3662
3663 if (pipe_fds[0] > 2)
03e334a1 3664 safe_close(pipe_fds[0]);
0cb9fbcd 3665 if (pipe_fds[1] > 2)
03e334a1 3666 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3667
3668 nullfd = open("/dev/null", O_RDWR);
3669 if (nullfd < 0)
3670 _exit(EXIT_FAILURE);
3671
3672 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3673 _exit(EXIT_FAILURE);
3674
3675 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3676 _exit(EXIT_FAILURE);
3677
3678 if (nullfd > 2)
03e334a1 3679 safe_close(nullfd);
0cb9fbcd 3680
ce30c8dc
LP
3681 (void) reset_all_signal_handlers();
3682 (void) reset_signal_mask();
0cb9fbcd
LP
3683 close_all_fds(NULL, 0);
3684
4de82926
MM
3685 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3686 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3687 _exit(EXIT_FAILURE);
3688 }
3689
03e334a1 3690 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3691
3692 *rpid = pid;
3693
3694 return pipe_fds[0];
3695}
3696
3697static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3698 char line[LINE_MAX], *x, *u, *g, *h;
3699 const char *word, *state;
0cb9fbcd
LP
3700 _cleanup_free_ uid_t *uids = NULL;
3701 _cleanup_free_ char *home = NULL;
3702 _cleanup_fclose_ FILE *f = NULL;
3703 _cleanup_close_ int fd = -1;
3704 unsigned n_uids = 0;
70f539ca 3705 size_t sz = 0, l;
0cb9fbcd
LP
3706 uid_t uid;
3707 gid_t gid;
3708 pid_t pid;
3709 int r;
3710
3711 assert(_home);
3712
3713 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3714 /* Reset everything fully to 0, just in case */
3715
03cfe0d5
LP
3716 r = reset_uid_gid();
3717 if (r < 0)
3718 return log_error_errno(r, "Failed to become root: %m");
0cb9fbcd
LP
3719
3720 *_home = NULL;
3721 return 0;
3722 }
3723
3724 /* First, get user credentials */
3725 fd = spawn_getent("passwd", arg_user, &pid);
3726 if (fd < 0)
3727 return fd;
3728
3729 f = fdopen(fd, "r");
3730 if (!f)
3731 return log_oom();
3732 fd = -1;
3733
3734 if (!fgets(line, sizeof(line), f)) {
3735
3736 if (!ferror(f)) {
3737 log_error("Failed to resolve user %s.", arg_user);
3738 return -ESRCH;
3739 }
3740
56f64d95 3741 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3742 return -errno;
3743 }
3744
3745 truncate_nl(line);
3746
820d3acf 3747 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3748
3749 x = strchr(line, ':');
3750 if (!x) {
3751 log_error("/etc/passwd entry has invalid user field.");
3752 return -EIO;
3753 }
3754
3755 u = strchr(x+1, ':');
3756 if (!u) {
3757 log_error("/etc/passwd entry has invalid password field.");
3758 return -EIO;
3759 }
3760
3761 u++;
3762 g = strchr(u, ':');
3763 if (!g) {
3764 log_error("/etc/passwd entry has invalid UID field.");
3765 return -EIO;
3766 }
3767
3768 *g = 0;
3769 g++;
3770 x = strchr(g, ':');
3771 if (!x) {
3772 log_error("/etc/passwd entry has invalid GID field.");
3773 return -EIO;
3774 }
3775
3776 *x = 0;
3777 h = strchr(x+1, ':');
3778 if (!h) {
3779 log_error("/etc/passwd entry has invalid GECOS field.");
3780 return -EIO;
3781 }
3782
3783 h++;
3784 x = strchr(h, ':');
3785 if (!x) {
3786 log_error("/etc/passwd entry has invalid home directory field.");
3787 return -EIO;
3788 }
3789
3790 *x = 0;
3791
3792 r = parse_uid(u, &uid);
3793 if (r < 0) {
3794 log_error("Failed to parse UID of user.");
3795 return -EIO;
3796 }
3797
3798 r = parse_gid(g, &gid);
3799 if (r < 0) {
3800 log_error("Failed to parse GID of user.");
3801 return -EIO;
3802 }
3803
3804 home = strdup(h);
3805 if (!home)
3806 return log_oom();
3807
3808 /* Second, get group memberships */
3809 fd = spawn_getent("initgroups", arg_user, &pid);
3810 if (fd < 0)
3811 return fd;
3812
3813 fclose(f);
3814 f = fdopen(fd, "r");
3815 if (!f)
3816 return log_oom();
3817 fd = -1;
3818
3819 if (!fgets(line, sizeof(line), f)) {
3820 if (!ferror(f)) {
3821 log_error("Failed to resolve user %s.", arg_user);
3822 return -ESRCH;
3823 }
3824
56f64d95 3825 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3826 return -errno;
3827 }
3828
3829 truncate_nl(line);
3830
820d3acf 3831 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3832
3833 /* Skip over the username and subsequent separator whitespace */
3834 x = line;
3835 x += strcspn(x, WHITESPACE);
3836 x += strspn(x, WHITESPACE);
3837
a2a5291b 3838 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3839 char c[l+1];
3840
a2a5291b 3841 memcpy(c, word, l);
0cb9fbcd
LP
3842 c[l] = 0;
3843
3844 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3845 return log_oom();
3846
3847 r = parse_uid(c, &uids[n_uids++]);
3848 if (r < 0) {
3849 log_error("Failed to parse group data from getent.");
3850 return -EIO;
3851 }
3852 }
3853
3854 r = mkdir_parents(home, 0775);
f647962d
MS
3855 if (r < 0)
3856 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3857
3858 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3859 if (r < 0 && r != -EEXIST)
3860 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd 3861
03cfe0d5
LP
3862 (void) fchown(STDIN_FILENO, uid, gid);
3863 (void) fchown(STDOUT_FILENO, uid, gid);
3864 (void) fchown(STDERR_FILENO, uid, gid);
0cb9fbcd 3865
4a62c710
MS
3866 if (setgroups(n_uids, uids) < 0)
3867 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3868
4a62c710
MS
3869 if (setresgid(gid, gid, gid) < 0)
3870 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3871
4a62c710
MS
3872 if (setresuid(uid, uid, uid) < 0)
3873 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3874
3875 if (_home) {
3876 *_home = home;
3877 home = NULL;
3878 }
3879
3880 return 0;
3881}
3882
113cea80 3883/*
6d416b9c
LS
3884 * Return values:
3885 * < 0 : wait_for_terminate() failed to get the state of the
3886 * container, the container was terminated by a signal, or
3887 * failed for an unknown reason. No change is made to the
3888 * container argument.
3889 * > 0 : The program executed in the container terminated with an
3890 * error. The exit code of the program executed in the
919699ec
LP
3891 * container is returned. The container argument has been set
3892 * to CONTAINER_TERMINATED.
6d416b9c
LS
3893 * 0 : The container is being rebooted, has been shut down or exited
3894 * successfully. The container argument has been set to either
3895 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3896 *
6d416b9c
LS
3897 * That is, success is indicated by a return value of zero, and an
3898 * error is indicated by a non-zero value.
113cea80
DH
3899 */
3900static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3901 siginfo_t status;
919699ec 3902 int r;
113cea80
DH
3903
3904 r = wait_for_terminate(pid, &status);
f647962d
MS
3905 if (r < 0)
3906 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3907
3908 switch (status.si_code) {
fddbb89c 3909
113cea80 3910 case CLD_EXITED:
919699ec
LP
3911 if (status.si_status == 0) {
3912 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3913
fddbb89c 3914 } else
919699ec 3915 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3916
919699ec
LP
3917 *container = CONTAINER_TERMINATED;
3918 return status.si_status;
113cea80
DH
3919
3920 case CLD_KILLED:
3921 if (status.si_status == SIGINT) {
113cea80 3922
919699ec 3923 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3924 *container = CONTAINER_TERMINATED;
919699ec
LP
3925 return 0;
3926
113cea80 3927 } else if (status.si_status == SIGHUP) {
113cea80 3928
919699ec 3929 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3930 *container = CONTAINER_REBOOTED;
919699ec 3931 return 0;
113cea80 3932 }
919699ec 3933
113cea80
DH
3934 /* CLD_KILLED fallthrough */
3935
3936 case CLD_DUMPED:
fddbb89c 3937 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3938 return -EIO;
113cea80
DH
3939
3940 default:
fddbb89c 3941 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3942 return -EIO;
113cea80
DH
3943 }
3944
3945 return r;
3946}
3947
e866af3a
DH
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
3949
023fb90b
LP
3950static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3951 pid_t pid;
3952
3953 pid = PTR_TO_UINT32(userdata);
3954 if (pid > 0) {
c6c8f6e2 3955 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
3956 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3957 sd_event_source_set_userdata(s, NULL);
3958 return 0;
3959 }
3960 }
3961
3962 sd_event_exit(sd_event_source_get_event(s), 0);
3963 return 0;
3964}
3965
ec16945e 3966static int determine_names(void) {
1b9cebf6 3967 int r;
ec16945e
LP
3968
3969 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3970 if (arg_machine) {
3971 _cleanup_(image_unrefp) Image *i = NULL;
3972
3973 r = image_find(arg_machine, &i);
3974 if (r < 0)
3975 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3976 else if (r == 0) {
3977 log_error("No image for machine '%s': %m", arg_machine);
3978 return -ENOENT;
3979 }
3980
aceac2f0 3981 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3982 r = set_sanitized_path(&arg_image, i->path);
3983 else
3984 r = set_sanitized_path(&arg_directory, i->path);
3985 if (r < 0)
3986 return log_error_errno(r, "Invalid image directory: %m");
3987
aee327b8
LP
3988 if (!arg_ephemeral)
3989 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 3990 } else
ec16945e
LP
3991 arg_directory = get_current_dir_name();
3992
1b9cebf6
LP
3993 if (!arg_directory && !arg_machine) {
3994 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3995 return -EINVAL;
3996 }
3997 }
3998
3999 if (!arg_machine) {
b9ba4dab
LP
4000 if (arg_directory && path_equal(arg_directory, "/"))
4001 arg_machine = gethostname_malloc();
4002 else
4003 arg_machine = strdup(basename(arg_image ?: arg_directory));
4004
ec16945e
LP
4005 if (!arg_machine)
4006 return log_oom();
4007
ae691c1d 4008 hostname_cleanup(arg_machine);
ec16945e
LP
4009 if (!machine_name_is_valid(arg_machine)) {
4010 log_error("Failed to determine machine name automatically, please use -M.");
4011 return -EINVAL;
4012 }
b9ba4dab
LP
4013
4014 if (arg_ephemeral) {
4015 char *b;
4016
4017 /* Add a random suffix when this is an
4018 * ephemeral machine, so that we can run many
4019 * instances at once without manually having
4020 * to specify -M each time. */
4021
4022 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4023 return log_oom();
4024
4025 free(arg_machine);
4026 arg_machine = b;
4027 }
ec16945e
LP
4028 }
4029
4030 return 0;
4031}
4032
03cfe0d5 4033static int determine_uid_shift(const char *directory) {
6dac160c
LP
4034 int r;
4035
03cfe0d5
LP
4036 if (!arg_userns) {
4037 arg_uid_shift = 0;
6dac160c 4038 return 0;
03cfe0d5 4039 }
6dac160c
LP
4040
4041 if (arg_uid_shift == UID_INVALID) {
4042 struct stat st;
4043
03cfe0d5 4044 r = stat(directory, &st);
6dac160c 4045 if (r < 0)
03cfe0d5 4046 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
4047
4048 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4049
4050 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 4051 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
4052 return -EINVAL;
4053 }
4054
4055 arg_uid_range = UINT32_C(0x10000);
4056 }
4057
4058 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4059 log_error("UID base too high for UID range.");
4060 return -EINVAL;
4061 }
4062
4063 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4064 return 0;
4065}
4066
03cfe0d5
LP
4067static int inner_child(
4068 Barrier *barrier,
4069 const char *directory,
4070 bool secondary,
4071 int kmsg_socket,
4072 int rtnl_socket,
4073 FDSet *fds,
4074 int argc,
4075 char *argv[]) {
69c79d3c 4076
03cfe0d5
LP
4077 _cleanup_free_ char *home = NULL;
4078 unsigned n_env = 2;
4079 const char *envp[] = {
4080 "PATH=" DEFAULT_PATH_SPLIT_USR,
4081 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4082 NULL, /* TERM */
4083 NULL, /* HOME */
4084 NULL, /* USER */
4085 NULL, /* LOGNAME */
4086 NULL, /* container_uuid */
4087 NULL, /* LISTEN_FDS */
4088 NULL, /* LISTEN_PID */
4089 NULL
4090 };
88213476 4091
2371271c 4092 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 4093 int r;
88213476 4094
03cfe0d5
LP
4095 assert(barrier);
4096 assert(directory);
4097 assert(kmsg_socket >= 0);
88213476 4098
03cfe0d5
LP
4099 if (arg_userns) {
4100 /* Tell the parent, that it now can write the UID map. */
4101 (void) barrier_place(barrier); /* #1 */
7027ff61 4102
03cfe0d5
LP
4103 /* Wait until the parent wrote the UID map */
4104 if (!barrier_place_and_sync(barrier)) { /* #2 */
4105 log_error("Parent died too early");
4106 return -ESRCH;
4107 }
88213476
LP
4108 }
4109
03cfe0d5
LP
4110 r = mount_all(NULL, true);
4111 if (r < 0)
4112 return r;
4113
4114 /* Wait until we are cgroup-ified, so that we
4115 * can mount the right cgroup path writable */
4116 if (!barrier_place_and_sync(barrier)) { /* #3 */
4117 log_error("Parent died too early");
4118 return -ESRCH;
88213476
LP
4119 }
4120
03cfe0d5
LP
4121 r = mount_systemd_cgroup_writable("");
4122 if (r < 0)
4123 return r;
ec16945e 4124
03cfe0d5
LP
4125 r = reset_uid_gid();
4126 if (r < 0)
4127 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 4128
03cfe0d5
LP
4129 r = setup_boot_id(NULL);
4130 if (r < 0)
4131 return r;
ec16945e 4132
03cfe0d5
LP
4133 r = setup_kmsg(NULL, kmsg_socket);
4134 if (r < 0)
4135 return r;
4136 kmsg_socket = safe_close(kmsg_socket);
ec16945e 4137
03cfe0d5 4138 umask(0022);
30535c16 4139
03cfe0d5
LP
4140 if (setsid() < 0)
4141 return log_error_errno(errno, "setsid() failed: %m");
4142
4143 if (arg_private_network)
4144 loopback_setup();
4145
4146 r = send_rtnl(rtnl_socket);
4147 if (r < 0)
4148 return r;
4149 rtnl_socket = safe_close(rtnl_socket);
4150
4151 if (drop_capabilities() < 0)
4152 return log_error_errno(errno, "drop_capabilities() failed: %m");
4153
4154 setup_hostname();
4155
050f7277 4156 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
4157 if (personality(arg_personality) < 0)
4158 return log_error_errno(errno, "personality() failed: %m");
4159 } else if (secondary) {
4160 if (personality(PER_LINUX32) < 0)
4161 return log_error_errno(errno, "personality() failed: %m");
4162 }
4163
4164#ifdef HAVE_SELINUX
4165 if (arg_selinux_context)
4166 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4167 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4168#endif
4169
4170 r = change_uid_gid(&home);
4171 if (r < 0)
4172 return r;
4173
4174 envp[n_env] = strv_find_prefix(environ, "TERM=");
4175 if (envp[n_env])
4176 n_env ++;
4177
4178 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4179 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4180 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4181 return log_oom();
4182
4183 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4184 char as_uuid[37];
4185
4186 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4187 return log_oom();
4188 }
4189
4190 if (fdset_size(fds) > 0) {
4191 r = fdset_cloexec(fds, false);
4192 if (r < 0)
4193 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4194
4195 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4196 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4197 return log_oom();
4198 }
4199
2371271c
TG
4200 env_use = strv_env_merge(2, envp, arg_setenv);
4201 if (!env_use)
4202 return log_oom();
03cfe0d5
LP
4203
4204 /* Let the parent know that we are ready and
4205 * wait until the parent is ready with the
4206 * setup, too... */
4207 if (!barrier_place_and_sync(barrier)) { /* #4 */
4208 log_error("Parent died too early");
4209 return -ESRCH;
4210 }
4211
4212 /* Now, explicitly close the log, so that we
4213 * then can close all remaining fds. Closing
4214 * the log explicitly first has the benefit
4215 * that the logging subsystem knows about it,
4216 * and is thus ready to be reopened should we
4217 * need it again. Note that the other fds
4218 * closed here are at least the locking and
4219 * barrier fds. */
4220 log_close();
4221 (void) fdset_close_others(fds);
4222
4223 if (arg_boot) {
4224 char **a;
4225 size_t m;
4226
4227 /* Automatically search for the init system */
4228
4229 m = 1 + argc - optind;
4230 a = newa(char*, m + 1);
4231 memcpy(a + 1, argv + optind, m * sizeof(char*));
4232
4233 a[0] = (char*) "/usr/lib/systemd/systemd";
4234 execve(a[0], a, env_use);
4235
4236 a[0] = (char*) "/lib/systemd/systemd";
4237 execve(a[0], a, env_use);
4238
4239 a[0] = (char*) "/sbin/init";
4240 execve(a[0], a, env_use);
4241 } else if (argc > optind)
4242 execvpe(argv[optind], argv + optind, env_use);
4243 else {
4244 chdir(home ? home : "/root");
4245 execle("/bin/bash", "-bash", NULL, env_use);
4246 execle("/bin/sh", "-sh", NULL, env_use);
4247 }
4248
4249 (void) log_open();
4250 return log_error_errno(errno, "execv() failed: %m");
4251}
4252
4253static int outer_child(
4254 Barrier *barrier,
4255 const char *directory,
4256 const char *console,
4257 const char *root_device, bool root_device_rw,
4258 const char *home_device, bool home_device_rw,
4259 const char *srv_device, bool srv_device_rw,
4260 bool interactive,
4261 bool secondary,
4262 int pid_socket,
4263 int kmsg_socket,
4264 int rtnl_socket,
825d5287 4265 int uid_shift_socket,
03cfe0d5
LP
4266 FDSet *fds,
4267 int argc,
4268 char *argv[]) {
4269
4270 pid_t pid;
4271 ssize_t l;
4272 int r;
4273
4274 assert(barrier);
4275 assert(directory);
4276 assert(console);
4277 assert(pid_socket >= 0);
4278 assert(kmsg_socket >= 0);
4279
4280 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4281 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4282
4283 if (interactive) {
4284 close_nointr(STDIN_FILENO);
4285 close_nointr(STDOUT_FILENO);
4286 close_nointr(STDERR_FILENO);
4287
4288 r = open_terminal(console, O_RDWR);
4289 if (r != STDIN_FILENO) {
4290 if (r >= 0) {
4291 safe_close(r);
4292 r = -EINVAL;
4293 }
4294
4295 return log_error_errno(r, "Failed to open console: %m");
4296 }
4297
4298 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4299 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4300 return log_error_errno(errno, "Failed to duplicate console: %m");
4301 }
4302
4303 r = reset_audit_loginuid();
4304 if (r < 0)
4305 return r;
4306
4307 /* Mark everything as slave, so that we still
4308 * receive mounts from the real root, but don't
4309 * propagate mounts to the real root. */
4310 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4311 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4312
4313 r = mount_devices(directory,
4314 root_device, root_device_rw,
4315 home_device, home_device_rw,
4316 srv_device, srv_device_rw);
4317 if (r < 0)
4318 return r;
4319
391567f4
LP
4320 r = determine_uid_shift(directory);
4321 if (r < 0)
4322 return r;
4323
825d5287
RM
4324 if (arg_userns) {
4325 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4326 if (l < 0)
4327 return log_error_errno(errno, "Failed to send UID shift: %m");
4328 if (l != sizeof(arg_uid_shift)) {
4329 log_error("Short write while sending UID shift.");
4330 return -EIO;
4331 }
4332 }
4333
03cfe0d5
LP
4334 /* Turn directory into bind mount */
4335 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4336 return log_error_errno(errno, "Failed to make bind mount: %m");
4337
03cfe0d5
LP
4338 r = setup_volatile(directory);
4339 if (r < 0)
4340 return r;
4341
03cfe0d5
LP
4342 r = setup_volatile_state(directory);
4343 if (r < 0)
4344 return r;
4345
03cfe0d5
LP
4346 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4347 if (r < 0)
4348 return r;
4349
03cfe0d5
LP
4350 if (arg_read_only) {
4351 r = bind_remount_recursive(directory, true);
4352 if (r < 0)
4353 return log_error_errno(r, "Failed to make tree read-only: %m");
4354 }
4355
03cfe0d5
LP
4356 r = mount_all(directory, false);
4357 if (r < 0)
4358 return r;
4359
4360 if (copy_devnodes(directory) < 0)
4361 return r;
4362
4363 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4364
4365 if (setup_pts(directory) < 0)
4366 return r;
4367
4368 r = setup_propagate(directory);
4369 if (r < 0)
4370 return r;
4371
4372 r = setup_dev_console(directory, console);
4373 if (r < 0)
4374 return r;
4375
4376 r = setup_seccomp();
4377 if (r < 0)
4378 return r;
4379
4380 r = setup_timezone(directory);
4381 if (r < 0)
4382 return r;
4383
4384 r = setup_resolv_conf(directory);
4385 if (r < 0)
4386 return r;
4387
4388 r = setup_journal(directory);
4389 if (r < 0)
4390 return r;
4391
4392 r = mount_custom(directory);
4393 if (r < 0)
4394 return r;
4395
4396 r = mount_cgroup(directory);
4397 if (r < 0)
4398 return r;
4399
4400 r = mount_move_root(directory);
4401 if (r < 0)
4402 return log_error_errno(r, "Failed to move root directory: %m");
4403
4404 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4405 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4406 (arg_private_network ? CLONE_NEWNET : 0) |
4407 (arg_userns ? CLONE_NEWUSER : 0),
4408 NULL);
4409 if (pid < 0)
4410 return log_error_errno(errno, "Failed to fork inner child: %m");
4411
4412 if (pid == 0) {
4413 pid_socket = safe_close(pid_socket);
825d5287 4414 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
4415
4416 /* The inner child has all namespaces that are
4417 * requested, so that we all are owned by the user if
4418 * user namespaces are turned on. */
4419
4420 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4421 if (r < 0)
4422 _exit(EXIT_FAILURE);
4423
4424 _exit(EXIT_SUCCESS);
4425 }
4426
4427 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4428 if (l < 0)
4429 return log_error_errno(errno, "Failed to send PID: %m");
4430 if (l != sizeof(pid)) {
4431 log_error("Short write while sending PID.");
4432 return -EIO;
4433 }
4434
4435 pid_socket = safe_close(pid_socket);
4436
4437 return 0;
4438}
4439
4440static int setup_uid_map(pid_t pid) {
4441 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4442 int r;
4443
4444 assert(pid > 1);
4445
4446 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4447 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 4448 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
4449 if (r < 0)
4450 return log_error_errno(r, "Failed to write UID map: %m");
4451
4452 /* We always assign the same UID and GID ranges */
4453 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 4454 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
4455 if (r < 0)
4456 return log_error_errno(r, "Failed to write GID map: %m");
4457
4458 return 0;
4459}
4460
4461static int chown_cgroup(pid_t pid) {
4462 _cleanup_free_ char *path = NULL, *fs = NULL;
4463 _cleanup_close_ int fd = -1;
4464 const char *fn;
4465 int r;
4466
4467 r = cg_pid_get_path(NULL, pid, &path);
4468 if (r < 0)
4469 return log_error_errno(r, "Failed to get container cgroup path: %m");
4470
4471 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4472 if (r < 0)
4473 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4474
4475 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4476 if (fd < 0)
4477 return log_error_errno(errno, "Failed to open %s: %m", fs);
4478
4479 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4480 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4481 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4482
4483 return 0;
4484}
4485
4486int main(int argc, char *argv[]) {
4487
4488 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4489 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4490 _cleanup_close_ int master = -1, image_fd = -1;
4491 _cleanup_fdset_free_ FDSet *fds = NULL;
4492 int r, n_fd_passed, loop_nr = -1;
4493 char veth_name[IFNAMSIZ];
4494 bool secondary = false, remove_subvol = false;
72c0a2c2 4495 sigset_t mask_chld;
03cfe0d5
LP
4496 pid_t pid = 0;
4497 int ret = EXIT_SUCCESS;
4498 union in_addr_union exposed = {};
4499 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4500 bool interactive;
4501
4502 log_parse_environment();
4503 log_open();
4504
4505 r = parse_argv(argc, argv);
4506 if (r <= 0)
4507 goto finish;
4508
4509 r = determine_names();
4510 if (r < 0)
4511 goto finish;
4512
4513 if (geteuid() != 0) {
4514 log_error("Need to be root.");
4515 r = -EPERM;
4516 goto finish;
4517 }
4518
4519 n_fd_passed = sd_listen_fds(false);
4520 if (n_fd_passed > 0) {
4521 r = fdset_new_listen_fds(&fds, false);
4522 if (r < 0) {
4523 log_error_errno(r, "Failed to collect file descriptors: %m");
4524 goto finish;
4525 }
4526 }
4527
4528 if (arg_directory) {
4529 assert(!arg_image);
4530
4531 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4532 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4533 r = -EINVAL;
4534 goto finish;
4535 }
4536
4537 if (arg_ephemeral) {
4538 _cleanup_free_ char *np = NULL;
4539
4540 /* If the specified path is a mount point we
4541 * generate the new snapshot immediately
4542 * inside it under a random name. However if
4543 * the specified is not a mount point we
4544 * create the new snapshot in the parent
4545 * directory, just next to it. */
e26d6ce5 4546 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
4547 if (r < 0) {
4548 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4549 goto finish;
4550 }
4551 if (r > 0)
770b5ce4 4552 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4553 else
770b5ce4 4554 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
4555 if (r < 0) {
4556 log_error_errno(r, "Failed to generate name for snapshot: %m");
4557 goto finish;
4558 }
4559
4560 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4561 if (r < 0) {
4562 log_error_errno(r, "Failed to lock %s: %m", np);
4563 goto finish;
4564 }
4565
4566 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4567 if (r < 0) {
4568 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4569 goto finish;
ec16945e
LP
4570 }
4571
4572 free(arg_directory);
4573 arg_directory = np;
8a16a7b4 4574 np = NULL;
ec16945e
LP
4575
4576 remove_subvol = true;
30535c16
LP
4577
4578 } else {
4579 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4580 if (r == -EBUSY) {
4581 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4582 goto finish;
4583 }
4584 if (r < 0) {
4585 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4586 return r;
4587 }
4588
4589 if (arg_template) {
f70a17f8 4590 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
30535c16
LP
4591 if (r == -EEXIST) {
4592 if (!arg_quiet)
4593 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4594 } else if (r < 0) {
83521414 4595 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4596 goto finish;
4597 } else {
4598 if (!arg_quiet)
4599 log_info("Populated %s from template %s.", arg_directory, arg_template);
4600 }
4601 }
ec16945e
LP
4602 }
4603
1b9e5b12
LP
4604 if (arg_boot) {
4605 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 4606 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 4607 r = -EINVAL;
1b9e5b12
LP
4608 goto finish;
4609 }
4610 } else {
4611 const char *p;
4612
63c372cb 4613 p = strjoina(arg_directory,
1b9e5b12
LP
4614 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4615 if (access(p, F_OK) < 0) {
4616 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 4617 r = -EINVAL;
1b9e5b12 4618 goto finish;
1b9e5b12
LP
4619 }
4620 }
ec16945e 4621
6b9132a9 4622 } else {
1b9e5b12 4623 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 4624
ec16945e
LP
4625 assert(arg_image);
4626 assert(!arg_template);
4627
30535c16
LP
4628 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4629 if (r == -EBUSY) {
4630 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4631 goto finish;
4632 }
4633 if (r < 0) {
4634 r = log_error_errno(r, "Failed to create image lock: %m");
4635 goto finish;
4636 }
4637
1b9e5b12 4638 if (!mkdtemp(template)) {
56f64d95 4639 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 4640 r = -errno;
6b9132a9 4641 goto finish;
1b9e5b12 4642 }
6b9132a9 4643
1b9e5b12
LP
4644 arg_directory = strdup(template);
4645 if (!arg_directory) {
4646 r = log_oom();
4647 goto finish;
6b9132a9 4648 }
88213476 4649
1b9e5b12
LP
4650 image_fd = setup_image(&device_path, &loop_nr);
4651 if (image_fd < 0) {
4652 r = image_fd;
842f3b0f
LP
4653 goto finish;
4654 }
1b9e5b12 4655
4d9f07b4
LP
4656 r = dissect_image(image_fd,
4657 &root_device, &root_device_rw,
4658 &home_device, &home_device_rw,
4659 &srv_device, &srv_device_rw,
4660 &secondary);
1b9e5b12
LP
4661 if (r < 0)
4662 goto finish;
842f3b0f 4663 }
842f3b0f 4664
5a8af538
LP
4665 r = custom_mounts_prepare();
4666 if (r < 0)
4667 goto finish;
4668
03cfe0d5
LP
4669 interactive =
4670 isatty(STDIN_FILENO) > 0 &&
4671 isatty(STDOUT_FILENO) > 0;
9c857b9d 4672
db7feb7e
LP
4673 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4674 if (master < 0) {
ec16945e 4675 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4676 goto finish;
4677 }
4678
611b312b
LP
4679 r = ptsname_malloc(master, &console);
4680 if (r < 0) {
4681 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
4682 goto finish;
4683 }
4684
a258bf26 4685 if (unlockpt(master) < 0) {
ec16945e 4686 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4687 goto finish;
4688 }
4689
9c857b9d
LP
4690 if (!arg_quiet)
4691 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4692 arg_machine, arg_image ?: arg_directory);
4693
72c0a2c2 4694 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4695
023fb90b
LP
4696 assert_se(sigemptyset(&mask_chld) == 0);
4697 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4698
03cfe0d5
LP
4699 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4700 r = log_error_errno(errno, "Failed to become subreaper: %m");
4701 goto finish;
4702 }
4703
d87be9b0 4704 for (;;) {
825d5287
RM
4705 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4706 uid_shift_socket_pair[2] = { -1, -1 };
113cea80 4707 ContainerStatus container_status;
7566e267 4708 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 4709 static const struct sigaction sa = {
e866af3a
DH
4710 .sa_handler = nop_handler,
4711 .sa_flags = SA_NOCLDSTOP,
4712 };
03cfe0d5
LP
4713 int ifi = 0;
4714 ssize_t l;
dbb60d69
LP
4715 _cleanup_event_unref_ sd_event *event = NULL;
4716 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4717 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4718 char last_char = 0;
e866af3a 4719
7566e267 4720 r = barrier_create(&barrier);
a2da110b 4721 if (r < 0) {
da927ba9 4722 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
4723 goto finish;
4724 }
4725
6d0b55c2
LP
4726 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4727 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4728 goto finish;
4729 }
4730
4731 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4732 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4733 goto finish;
4734 }
4735
03cfe0d5
LP
4736 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4737 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4738 goto finish;
4739 }
4740
825d5287
RM
4741 if (arg_userns)
4742 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4743 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4744 goto finish;
4745 }
4746
e866af3a
DH
4747 /* Child can be killed before execv(), so handle SIGCHLD
4748 * in order to interrupt parent's blocking calls and
4749 * give it a chance to call wait() and terminate. */
4750 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4751 if (r < 0) {
ec16945e 4752 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
4753 goto finish;
4754 }
4755
e866af3a
DH
4756 r = sigaction(SIGCHLD, &sa, NULL);
4757 if (r < 0) {
ec16945e 4758 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
4759 goto finish;
4760 }
4761
03cfe0d5 4762 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
4763 if (pid < 0) {
4764 if (errno == EINVAL)
ec16945e 4765 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 4766 else
ec16945e 4767 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 4768
d87be9b0
LP
4769 goto finish;
4770 }
a258bf26 4771
d87be9b0 4772 if (pid == 0) {
03cfe0d5 4773 /* The outer child only has a file system namespace. */
a2da110b
DH
4774 barrier_set_role(&barrier, BARRIER_CHILD);
4775
03e334a1 4776 master = safe_close(master);
a258bf26 4777
03e334a1 4778 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 4779 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 4780 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 4781 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 4782
ce30c8dc
LP
4783 (void) reset_all_signal_handlers();
4784 (void) reset_signal_mask();
f5c1b9ee 4785
03cfe0d5
LP
4786 r = outer_child(&barrier,
4787 arg_directory,
4788 console,
4789 root_device, root_device_rw,
4790 home_device, home_device_rw,
4791 srv_device, srv_device_rw,
4792 interactive,
4793 secondary,
4794 pid_socket_pair[1],
4795 kmsg_socket_pair[1],
4796 rtnl_socket_pair[1],
825d5287 4797 uid_shift_socket_pair[1],
03cfe0d5
LP
4798 fds,
4799 argc, argv);
0cb9fbcd 4800 if (r < 0)
a2da110b 4801 _exit(EXIT_FAILURE);
d87be9b0 4802
03cfe0d5 4803 _exit(EXIT_SUCCESS);
da5b3bad 4804 }
88213476 4805
a2da110b 4806 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 4807
842f3b0f
LP
4808 fdset_free(fds);
4809 fds = NULL;
4810
6d0b55c2
LP
4811 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4812 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 4813 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
6d0b55c2 4814
03cfe0d5
LP
4815 /* Wait for the outer child. */
4816 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4817 if (r < 0)
4818 goto finish;
4819 if (r != 0) {
4820 r = -EIO;
4821 goto finish;
4822 }
4823 pid = 0;
6dac160c 4824
03cfe0d5
LP
4825 /* And now retrieve the PID of the inner child. */
4826 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4827 if (l < 0) {
4828 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4829 goto finish;
4830 }
4831 if (l != sizeof(pid)) {
4832 log_error("Short read while reading inner child PID: %m");
4833 r = EIO;
4834 goto finish;
4835 }
354bfd2b 4836
03cfe0d5 4837 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 4838
03cfe0d5
LP
4839 if (arg_userns) {
4840 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4841 log_error("Child died too early.");
4842 r = -ESRCH;
840295fc 4843 goto finish;
03cfe0d5 4844 }
ab046dde 4845
825d5287
RM
4846 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4847 if (l < 0) {
4848 r = log_error_errno(errno, "Failed to read UID shift: %m");
4849 goto finish;
4850 }
4851 if (l != sizeof(arg_uid_shift)) {
4852 log_error("Short read while reading UID shift: %m");
4853 r = EIO;
4854 goto finish;
4855 }
4856
03cfe0d5 4857 r = setup_uid_map(pid);
840295fc
LP
4858 if (r < 0)
4859 goto finish;
ab046dde 4860
03cfe0d5
LP
4861 (void) barrier_place(&barrier); /* #2 */
4862 }
c74e630d 4863
03cfe0d5
LP
4864 r = move_network_interfaces(pid);
4865 if (r < 0)
4866 goto finish;
4bbfe7ad 4867
03cfe0d5
LP
4868 r = setup_veth(pid, veth_name, &ifi);
4869 if (r < 0)
4870 goto finish;
5aa4bb6b 4871
03cfe0d5
LP
4872 r = setup_bridge(veth_name, &ifi);
4873 if (r < 0)
4874 goto finish;
6dac160c 4875
03cfe0d5
LP
4876 r = setup_macvlan(pid);
4877 if (r < 0)
4878 goto finish;
6dac160c 4879
03cfe0d5
LP
4880 r = setup_ipvlan(pid);
4881 if (r < 0)
4882 goto finish;
6dac160c 4883
03cfe0d5
LP
4884 r = register_machine(pid, ifi);
4885 if (r < 0)
4886 goto finish;
6dac160c 4887
03cfe0d5
LP
4888 r = chown_cgroup(pid);
4889 if (r < 0)
4890 goto finish;
6dac160c 4891
03cfe0d5
LP
4892 /* Notify the child that the parent is ready with all
4893 * its setup (including cgroup-ification), and that
4894 * the child can now hand over control to the code to
4895 * run inside the container. */
4896 (void) barrier_place(&barrier); /* #3 */
6dac160c 4897
03cfe0d5
LP
4898 /* Block SIGCHLD here, before notifying child.
4899 * process_pty() will handle it with the other signals. */
4900 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 4901
03cfe0d5
LP
4902 /* Reset signal to default */
4903 r = default_signals(SIGCHLD, -1);
4904 if (r < 0) {
4905 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4906 goto finish;
4907 }
e866af3a 4908
03cfe0d5
LP
4909 /* Let the child know that we are ready and wait that the child is completely ready now. */
4910 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4911 log_error("Client died too early.");
4912 r = -ESRCH;
4913 goto finish;
4914 }
b12afc8c 4915
03cfe0d5
LP
4916 sd_notifyf(false,
4917 "READY=1\n"
4918 "STATUS=Container running.\n"
4919 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4920
03cfe0d5
LP
4921 r = sd_event_new(&event);
4922 if (r < 0) {
4923 log_error_errno(r, "Failed to get default event source: %m");
4924 goto finish;
4925 }
88213476 4926
03cfe0d5
LP
4927 if (arg_kill_signal > 0) {
4928 /* Try to kill the init system on SIGINT or SIGTERM */
4929 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4930 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4931 } else {
4932 /* Immediately exit */
4933 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4934 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4935 }
023fb90b 4936
03cfe0d5
LP
4937 /* simply exit on sigchld */
4938 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4939
03cfe0d5
LP
4940 if (arg_expose_ports) {
4941 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4942 if (r < 0)
4943 goto finish;
023fb90b 4944
03cfe0d5
LP
4945 (void) expose_ports(rtnl, &exposed);
4946 }
023fb90b 4947
03cfe0d5 4948 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4949
03cfe0d5
LP
4950 r = pty_forward_new(event, master, true, !interactive, &forward);
4951 if (r < 0) {
4952 log_error_errno(r, "Failed to create PTY forwarder: %m");
4953 goto finish;
4954 }
023fb90b 4955
03cfe0d5
LP
4956 r = sd_event_loop(event);
4957 if (r < 0) {
4958 log_error_errno(r, "Failed to run event loop: %m");
4959 goto finish;
4960 }
6d0b55c2 4961
03cfe0d5 4962 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 4963
03cfe0d5 4964 forward = pty_forward_free(forward);
6d0b55c2 4965
03cfe0d5
LP
4966 if (!arg_quiet && last_char != '\n')
4967 putc('\n', stdout);
04d39279 4968
03cfe0d5
LP
4969 /* Kill if it is not dead yet anyway */
4970 terminate_machine(pid);
1f0cd86b 4971
840295fc 4972 /* Normally redundant, but better safe than sorry */
04d39279 4973 kill(pid, SIGKILL);
a258bf26 4974
113cea80 4975 r = wait_for_container(pid, &container_status);
04d39279
LP
4976 pid = 0;
4977
ec16945e 4978 if (r < 0)
ce9f1527
LP
4979 /* We failed to wait for the container, or the
4980 * container exited abnormally */
ec16945e
LP
4981 goto finish;
4982 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4983 /* The container exited with a non-zero
4984 * status, or with zero status and no reboot
4985 * was requested. */
ec16945e 4986 ret = r;
d87be9b0 4987 break;
ec16945e 4988 }
88213476 4989
113cea80 4990 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4991
4992 if (arg_keep_unit) {
4993 /* Special handling if we are running as a
4994 * service: instead of simply restarting the
4995 * machine we want to restart the entire
4996 * service, so let's inform systemd about this
4997 * with the special exit code 133. The service
4998 * file uses RestartForceExitStatus=133 so
4999 * that this results in a full nspawn
5000 * restart. This is necessary since we might
5001 * have cgroup parameters set we want to have
5002 * flushed out. */
ec16945e
LP
5003 ret = 133;
5004 r = 0;
ce38dbc8
LP
5005 break;
5006 }
6d0b55c2
LP
5007
5008 flush_ports(&exposed);
d87be9b0 5009 }
88213476
LP
5010
5011finish:
af4ec430
LP
5012 sd_notify(false,
5013 "STOPPING=1\n"
5014 "STATUS=Terminating...");
5015
9444b1f2
LP
5016 if (pid > 0)
5017 kill(pid, SIGKILL);
88213476 5018
503546da
LP
5019 /* Try to flush whatever is still queued in the pty */
5020 if (master >= 0)
5021 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5022
03cfe0d5
LP
5023 loop_remove(loop_nr, &image_fd);
5024
ec16945e
LP
5025 if (remove_subvol && arg_directory) {
5026 int k;
5027
d9e2daaf 5028 k = btrfs_subvol_remove(arg_directory, true);
ec16945e
LP
5029 if (k < 0)
5030 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5031 }
5032
785890ac
LP
5033 if (arg_machine) {
5034 const char *p;
5035
63c372cb 5036 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5037 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5038 }
5039
04d391da 5040 free(arg_directory);
ec16945e
LP
5041 free(arg_template);
5042 free(arg_image);
7027ff61 5043 free(arg_machine);
c74e630d
LP
5044 free(arg_user);
5045 strv_free(arg_setenv);
5046 strv_free(arg_network_interfaces);
5047 strv_free(arg_network_macvlan);
4bbfe7ad 5048 strv_free(arg_network_ipvlan);
5a8af538 5049 custom_mount_free_all();
88213476 5050
6d0b55c2
LP
5051 flush_ports(&exposed);
5052
5053 while (arg_expose_ports) {
5054 ExposePort *p = arg_expose_ports;
5055 LIST_REMOVE(ports, arg_expose_ports, p);
5056 free(p);
5057 }
5058
ec16945e 5059 return r < 0 ? EXIT_FAILURE : ret;
88213476 5060}