]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
libsystemd-network: plug memory leak
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
1f0cd86b
LP
55#include "sd-daemon.h"
56#include "sd-bus.h"
57#include "sd-id128.h"
1c4baffc 58#include "sd-netlink.h"
958b66ea 59#include "random-util.h"
88213476
LP
60#include "log.h"
61#include "util.h"
49e942b2 62#include "mkdir.h"
c6878637 63#include "rm-rf.h"
6b2d0e85 64#include "macro.h"
94d82985 65#include "missing.h"
04d391da 66#include "cgroup-util.h"
a258bf26 67#include "strv.h"
9eb977db 68#include "path-util.h"
a41fe3a2 69#include "loopback-setup.h"
4fc9982c 70#include "dev-setup.h"
842f3b0f 71#include "fdset.h"
acbeb427 72#include "build.h"
a5c32cff 73#include "fileio.h"
40ca29a1 74#include "bus-util.h"
1f0cd86b 75#include "bus-error.h"
4ba93280 76#include "ptyfwd.h"
f4889f65 77#include "env-util.h"
1c4baffc 78#include "netlink-util.h"
7e227024 79#include "udev-util.h"
1b9e5b12
LP
80#include "blkid-util.h"
81#include "gpt.h"
01dde061 82#include "siphash24.h"
849958d1 83#include "copy.h"
3577de7a 84#include "base-filesystem.h"
a2da110b 85#include "barrier.h"
023fb90b 86#include "event-util.h"
f01ae826 87#include "capability.h"
2822da4f 88#include "cap-list.h"
ec16945e 89#include "btrfs-util.h"
1b9cebf6 90#include "machine-image.h"
6d0b55c2
LP
91#include "list.h"
92#include "in-addr-util.h"
12c2884c 93#include "firewall-util.h"
6d0b55c2 94#include "local-addresses.h"
6482f626 95#include "formats-util.h"
0b452006 96#include "process-util.h"
288a74cc 97#include "terminal-util.h"
958b66ea 98#include "hostname-util.h"
24882e06 99#include "signal-util.h"
f2d88580 100
e9642be2
LP
101#ifdef HAVE_SECCOMP
102#include "seccomp-util.h"
103#endif
104
6d0b55c2
LP
105typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110} ExposePort;
111
113cea80
DH
/* Ways a container run can end.
 * NOTE(review): the users of this enum are outside the visible part of the
 * file; the meaning is inferred from the enumerator names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
116
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). The "try-"
 * variants of the option are tracked separately in arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default) */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest / -j */
} LinkJournal;
88213476 123
4d9f07b4
LP
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,    /* default, or --volatile=<false boolean> */
        VOLATILE_YES,   /* --volatile without argument, or a true boolean */
        VOLATILE_STATE, /* --volatile=state */
} Volatile;
129
5a8af538
LP
/* Kind of user-requested mount. The numeric order matters:
 * custom_mount_compare() uses it as tie-breaker for mounts that share the
 * same destination. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND,      /* --bind= / --bind-ro= */
        CUSTOM_MOUNT_TMPFS,     /* --tmpfs= */
        CUSTOM_MOUNT_OVERLAY,   /* --overlay= / --overlay-ro= */
} CustomMountType;
135
136typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144} CustomMount;
145
88213476 146static char *arg_directory = NULL;
ec16945e 147static char *arg_template = NULL;
687d0825 148static char *arg_user = NULL;
9444b1f2 149static sd_id128_t arg_uuid = {};
7027ff61 150static char *arg_machine = NULL;
c74e630d
LP
151static const char *arg_selinux_context = NULL;
152static const char *arg_selinux_apifs_context = NULL;
9444b1f2 153static const char *arg_slice = NULL;
ff01d048 154static bool arg_private_network = false;
bc2f673e 155static bool arg_read_only = false;
0f0dbc46 156static bool arg_boot = false;
ec16945e 157static bool arg_ephemeral = false;
57fb9fb5 158static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 159static bool arg_link_journal_try = false;
5076f0cc
LP
160static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 182 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
5a8af538
LP
187static CustomMount *arg_custom_mounts = NULL;
188static unsigned arg_n_custom_mounts = 0;
f4889f65 189static char **arg_setenv = NULL;
284c0b91 190static bool arg_quiet = false;
8a96d94e 191static bool arg_share_system = false;
eb91eb18 192static bool arg_register = true;
89f7c846 193static bool arg_keep_unit = false;
aa28aefe 194static char **arg_network_interfaces = NULL;
c74e630d 195static char **arg_network_macvlan = NULL;
4bbfe7ad 196static char **arg_network_ipvlan = NULL;
69c79d3c 197static bool arg_network_veth = false;
c74e630d 198static const char *arg_network_bridge = NULL;
050f7277 199static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 200static char *arg_image = NULL;
4d9f07b4 201static Volatile arg_volatile = VOLATILE_NO;
6d0b55c2 202static ExposePort *arg_expose_ports = NULL;
f36933fe 203static char **arg_property = NULL;
6dac160c
LP
204static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205static bool arg_userns = false;
c6c8f6e2 206static int arg_kill_signal = 0;
88213476 207
/* Print the command line synopsis and the list of supported options to
 * stdout. The program name shown is program_invocation_short_name. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --property=NAME=VALUE  Set scope unit property\n"
               "     --private-users[=UIDBASE[:NUIDS]]\n"
               "                            Run within user namespace\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-ipvlan=INTERFACE\n"
               "                            Create a ipvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
               "                            try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --overlay=PATH[:PATH...]:PATH\n"
               "                            Create an overlay mount from the host to \n"
               "                            the container\n"
               "     --overlay-ro=PATH[:PATH...]:PATH\n"
               "                            Similar, but creates a read-only overlay mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               , program_invocation_short_name);
}
277
5a8af538
LP
278static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292}
293
294static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
97b11eed 312 arg_custom_mounts = mfree(arg_custom_mounts);
5a8af538
LP
313 arg_n_custom_mounts = 0;
314}
315
316static int custom_mount_compare(const void *a, const void *b) {
317 const CustomMount *x = a, *y = b;
318 int r;
319
320 r = path_compare(x->destination, y->destination);
321 if (r != 0)
322 return r;
323
324 if (x->type < y->type)
325 return -1;
326 if (x->type > y->type)
327 return 1;
328
329 return 0;
330}
331
332static int custom_mounts_prepare(void) {
333 unsigned i;
334 int r;
335
336 /* Ensure the mounts are applied prefix first. */
337 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
338
339 /* Allocate working directories for the overlay file systems that need it */
340 for (i = 0; i < arg_n_custom_mounts; i++) {
341 CustomMount *m = &arg_custom_mounts[i];
342
825d5287
RM
343 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
344 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
345 return -EINVAL;
346 }
347
5a8af538
LP
348 if (m->type != CUSTOM_MOUNT_OVERLAY)
349 continue;
350
351 if (m->work_dir)
352 continue;
353
354 if (m->read_only)
355 continue;
356
14bcf25c 357 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
358 if (r < 0)
359 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
360 }
361
362 return 0;
363}
364
ec16945e
LP
/* Replace *b with a cleaned-up, absolute version of path.
 *
 * The path is canonicalized with canonicalize_file_name(). If that fails
 * with ENOENT, the path is merely made absolute relative to the current
 * working directory instead. The previous value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the negated errno
 * reported by canonicalize_file_name() for any error other than ENOENT. On
 * failure *b is left unchanged. */
static int set_sanitized_path(char **b, const char *path) {
        char *p;

        assert(b);
        assert(path);

        p = canonicalize_file_name(path);
        if (!p) {
                if (errno != ENOENT)
                        return -errno;

                p = path_make_absolute_cwd(path);
                if (!p)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(p);
        return 0;
}
385
88213476
LP
386static int parse_argv(int argc, char *argv[]) {
387
a41fe3a2 388 enum {
acbeb427
ZJS
389 ARG_VERSION = 0x100,
390 ARG_PRIVATE_NETWORK,
bc2f673e 391 ARG_UUID,
5076f0cc 392 ARG_READ_ONLY,
57fb9fb5 393 ARG_CAPABILITY,
420c7379 394 ARG_DROP_CAPABILITY,
17fe0523
LP
395 ARG_LINK_JOURNAL,
396 ARG_BIND,
f4889f65 397 ARG_BIND_RO,
06c17c39 398 ARG_TMPFS,
5a8af538
LP
399 ARG_OVERLAY,
400 ARG_OVERLAY_RO,
f4889f65 401 ARG_SETENV,
eb91eb18 402 ARG_SHARE_SYSTEM,
89f7c846 403 ARG_REGISTER,
aa28aefe 404 ARG_KEEP_UNIT,
69c79d3c 405 ARG_NETWORK_INTERFACE,
c74e630d 406 ARG_NETWORK_MACVLAN,
4bbfe7ad 407 ARG_NETWORK_IPVLAN,
ab046dde 408 ARG_NETWORK_BRIDGE,
6afc95b7 409 ARG_PERSONALITY,
4d9f07b4 410 ARG_VOLATILE,
ec16945e 411 ARG_TEMPLATE,
f36933fe 412 ARG_PROPERTY,
6dac160c 413 ARG_PRIVATE_USERS,
c6c8f6e2 414 ARG_KILL_SIGNAL,
a41fe3a2
LP
415 };
416
88213476 417 static const struct option options[] = {
aa28aefe
LP
418 { "help", no_argument, NULL, 'h' },
419 { "version", no_argument, NULL, ARG_VERSION },
420 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
421 { "template", required_argument, NULL, ARG_TEMPLATE },
422 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
423 { "user", required_argument, NULL, 'u' },
424 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
425 { "boot", no_argument, NULL, 'b' },
426 { "uuid", required_argument, NULL, ARG_UUID },
427 { "read-only", no_argument, NULL, ARG_READ_ONLY },
428 { "capability", required_argument, NULL, ARG_CAPABILITY },
429 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
430 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
431 { "bind", required_argument, NULL, ARG_BIND },
432 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 433 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
434 { "overlay", required_argument, NULL, ARG_OVERLAY },
435 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
436 { "machine", required_argument, NULL, 'M' },
437 { "slice", required_argument, NULL, 'S' },
438 { "setenv", required_argument, NULL, ARG_SETENV },
439 { "selinux-context", required_argument, NULL, 'Z' },
440 { "selinux-apifs-context", required_argument, NULL, 'L' },
441 { "quiet", no_argument, NULL, 'q' },
442 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
443 { "register", required_argument, NULL, ARG_REGISTER },
444 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
445 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 446 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 447 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 448 { "network-veth", no_argument, NULL, 'n' },
ab046dde 449 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 450 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 451 { "image", required_argument, NULL, 'i' },
4d9f07b4 452 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 453 { "port", required_argument, NULL, 'p' },
f36933fe 454 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 455 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 456 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
eb9da376 457 {}
88213476
LP
458 };
459
9444b1f2 460 int c, r;
a42c8b54 461 uint64_t plus = 0, minus = 0;
88213476
LP
462
463 assert(argc >= 0);
464 assert(argv);
465
0dfaa006 466 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
467
468 switch (c) {
469
470 case 'h':
601185b4
ZJS
471 help();
472 return 0;
88213476 473
acbeb427
ZJS
474 case ARG_VERSION:
475 puts(PACKAGE_STRING);
476 puts(SYSTEMD_FEATURES);
477 return 0;
478
88213476 479 case 'D':
ec16945e
LP
480 r = set_sanitized_path(&arg_directory, optarg);
481 if (r < 0)
482 return log_error_errno(r, "Invalid root directory: %m");
483
484 break;
485
486 case ARG_TEMPLATE:
487 r = set_sanitized_path(&arg_template, optarg);
488 if (r < 0)
489 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
490
491 break;
492
1b9e5b12 493 case 'i':
ec16945e
LP
494 r = set_sanitized_path(&arg_image, optarg);
495 if (r < 0)
496 return log_error_errno(r, "Invalid image path: %m");
497
498 break;
499
500 case 'x':
501 arg_ephemeral = true;
1b9e5b12
LP
502 break;
503
687d0825 504 case 'u':
2fc09a9c
DM
505 r = free_and_strdup(&arg_user, optarg);
506 if (r < 0)
7027ff61 507 return log_oom();
687d0825
MV
508
509 break;
510
ab046dde 511 case ARG_NETWORK_BRIDGE:
c74e630d 512 arg_network_bridge = optarg;
ab046dde
TG
513
514 /* fall through */
515
0dfaa006 516 case 'n':
69c79d3c
LP
517 arg_network_veth = true;
518 arg_private_network = true;
519 break;
520
aa28aefe 521 case ARG_NETWORK_INTERFACE:
c74e630d
LP
522 if (strv_extend(&arg_network_interfaces, optarg) < 0)
523 return log_oom();
524
525 arg_private_network = true;
526 break;
527
528 case ARG_NETWORK_MACVLAN:
529 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
530 return log_oom();
531
4bbfe7ad
TG
532 arg_private_network = true;
533 break;
534
535 case ARG_NETWORK_IPVLAN:
536 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
537 return log_oom();
538
aa28aefe
LP
539 /* fall through */
540
ff01d048
LP
541 case ARG_PRIVATE_NETWORK:
542 arg_private_network = true;
a41fe3a2
LP
543 break;
544
0f0dbc46
LP
545 case 'b':
546 arg_boot = true;
547 break;
548
144f0fc0 549 case ARG_UUID:
9444b1f2
LP
550 r = sd_id128_from_string(optarg, &arg_uuid);
551 if (r < 0) {
aa96c6cb 552 log_error("Invalid UUID: %s", optarg);
9444b1f2 553 return r;
aa96c6cb 554 }
9444b1f2 555 break;
aa96c6cb 556
9444b1f2 557 case 'S':
c74e630d 558 arg_slice = optarg;
144f0fc0
LP
559 break;
560
7027ff61 561 case 'M':
eb91eb18 562 if (isempty(optarg)) {
97b11eed 563 arg_machine = mfree(arg_machine);
eb91eb18 564 } else {
0c3c4284 565 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
566 log_error("Invalid machine name: %s", optarg);
567 return -EINVAL;
568 }
7027ff61 569
0c3c4284
LP
570 r = free_and_strdup(&arg_machine, optarg);
571 if (r < 0)
eb91eb18
LP
572 return log_oom();
573
574 break;
575 }
7027ff61 576
82adf6af
LP
577 case 'Z':
578 arg_selinux_context = optarg;
a8828ed9
DW
579 break;
580
82adf6af
LP
581 case 'L':
582 arg_selinux_apifs_context = optarg;
a8828ed9
DW
583 break;
584
bc2f673e
LP
585 case ARG_READ_ONLY:
586 arg_read_only = true;
587 break;
588
420c7379
LP
589 case ARG_CAPABILITY:
590 case ARG_DROP_CAPABILITY: {
a2a5291b 591 const char *state, *word;
5076f0cc
LP
592 size_t length;
593
594 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 595 _cleanup_free_ char *t;
5076f0cc
LP
596
597 t = strndup(word, length);
0d0f0c50
SL
598 if (!t)
599 return log_oom();
5076f0cc 600
39ed67d1
LP
601 if (streq(t, "all")) {
602 if (c == ARG_CAPABILITY)
a42c8b54 603 plus = (uint64_t) -1;
39ed67d1 604 else
a42c8b54 605 minus = (uint64_t) -1;
39ed67d1 606 } else {
2822da4f
LP
607 int cap;
608
609 cap = capability_from_name(t);
610 if (cap < 0) {
39ed67d1
LP
611 log_error("Failed to parse capability %s.", t);
612 return -EINVAL;
613 }
614
615 if (c == ARG_CAPABILITY)
a42c8b54 616 plus |= 1ULL << (uint64_t) cap;
39ed67d1 617 else
a42c8b54 618 minus |= 1ULL << (uint64_t) cap;
5076f0cc 619 }
5076f0cc
LP
620 }
621
622 break;
623 }
624
57fb9fb5
LP
625 case 'j':
626 arg_link_journal = LINK_GUEST;
574edc90 627 arg_link_journal_try = true;
57fb9fb5
LP
628 break;
629
630 case ARG_LINK_JOURNAL:
53e438e3 631 if (streq(optarg, "auto")) {
57fb9fb5 632 arg_link_journal = LINK_AUTO;
53e438e3
LP
633 arg_link_journal_try = false;
634 } else if (streq(optarg, "no")) {
57fb9fb5 635 arg_link_journal = LINK_NO;
53e438e3
LP
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "guest")) {
57fb9fb5 638 arg_link_journal = LINK_GUEST;
53e438e3
LP
639 arg_link_journal_try = false;
640 } else if (streq(optarg, "host")) {
57fb9fb5 641 arg_link_journal = LINK_HOST;
53e438e3
LP
642 arg_link_journal_try = false;
643 } else if (streq(optarg, "try-guest")) {
574edc90
MP
644 arg_link_journal = LINK_GUEST;
645 arg_link_journal_try = true;
646 } else if (streq(optarg, "try-host")) {
647 arg_link_journal = LINK_HOST;
648 arg_link_journal_try = true;
649 } else {
57fb9fb5
LP
650 log_error("Failed to parse link journal mode %s", optarg);
651 return -EINVAL;
652 }
653
654 break;
655
17fe0523
LP
656 case ARG_BIND:
657 case ARG_BIND_RO: {
5a8af538
LP
658 _cleanup_free_ char *source = NULL, *destination = NULL;
659 CustomMount *m;
17fe0523 660 char *e;
17fe0523
LP
661
662 e = strchr(optarg, ':');
663 if (e) {
5a8af538
LP
664 source = strndup(optarg, e - optarg);
665 destination = strdup(e + 1);
17fe0523 666 } else {
5a8af538
LP
667 source = strdup(optarg);
668 destination = strdup(optarg);
17fe0523
LP
669 }
670
5a8af538 671 if (!source || !destination)
17fe0523
LP
672 return log_oom();
673
5a8af538 674 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
17fe0523
LP
675 log_error("Invalid bind mount specification: %s", optarg);
676 return -EINVAL;
677 }
678
5a8af538
LP
679 m = custom_mount_add(CUSTOM_MOUNT_BIND);
680 if (!m)
b3451bed 681 return log_oom();
17fe0523 682
5a8af538
LP
683 m->source = source;
684 m->destination = destination;
685 m->read_only = c == ARG_BIND_RO;
686
687 source = destination = NULL;
17fe0523
LP
688
689 break;
690 }
691
06c17c39 692 case ARG_TMPFS: {
5a8af538
LP
693 _cleanup_free_ char *path = NULL, *opts = NULL;
694 CustomMount *m;
06c17c39
LP
695 char *e;
696
697 e = strchr(optarg, ':');
698 if (e) {
5a8af538
LP
699 path = strndup(optarg, e - optarg);
700 opts = strdup(e + 1);
06c17c39 701 } else {
5a8af538
LP
702 path = strdup(optarg);
703 opts = strdup("mode=0755");
06c17c39
LP
704 }
705
5a8af538 706 if (!path || !opts)
06c17c39
LP
707 return log_oom();
708
5a8af538 709 if (!path_is_absolute(path)) {
06c17c39
LP
710 log_error("Invalid tmpfs specification: %s", optarg);
711 return -EINVAL;
712 }
713
5a8af538
LP
714 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
715 if (!m)
06c17c39
LP
716 return log_oom();
717
5a8af538
LP
718 m->destination = path;
719 m->options = opts;
06c17c39 720
5a8af538
LP
721 path = opts = NULL;
722
723 break;
724 }
725
726 case ARG_OVERLAY:
727 case ARG_OVERLAY_RO: {
728 _cleanup_free_ char *upper = NULL, *destination = NULL;
729 _cleanup_strv_free_ char **lower = NULL;
730 CustomMount *m;
731 unsigned n = 0;
732 char **i;
733
734 lower = strv_split(optarg, ":");
735 if (!lower)
06c17c39
LP
736 return log_oom();
737
5a8af538
LP
738 STRV_FOREACH(i, lower) {
739 if (!path_is_absolute(*i)) {
740 log_error("Overlay path %s is not absolute.", *i);
741 return -EINVAL;
742 }
743
744 n++;
745 }
746
747 if (n < 2) {
748 log_error("--overlay= needs at least two colon-separated directories specified.");
749 return -EINVAL;
750 }
751
752 if (n == 2) {
753 /* If two parameters are specified,
754 * the first one is the lower, the
755 * second one the upper directory. And
af86c440
ZJS
756 * we'll also define the destination
757 * mount point the same as the upper. */
5a8af538
LP
758 upper = lower[1];
759 lower[1] = NULL;
760
761 destination = strdup(upper);
762 if (!destination)
763 return log_oom();
764
765 } else {
766 upper = lower[n - 2];
767 destination = lower[n - 1];
768 lower[n - 2] = NULL;
769 }
770
771 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
772 if (!m)
773 return log_oom();
774
775 m->destination = destination;
776 m->source = upper;
777 m->lower = lower;
778 m->read_only = c == ARG_OVERLAY_RO;
779
780 upper = destination = NULL;
781 lower = NULL;
06c17c39
LP
782
783 break;
784 }
785
f4889f65
LP
786 case ARG_SETENV: {
787 char **n;
788
789 if (!env_assignment_is_valid(optarg)) {
790 log_error("Environment variable assignment '%s' is not valid.", optarg);
791 return -EINVAL;
792 }
793
794 n = strv_env_set(arg_setenv, optarg);
795 if (!n)
796 return log_oom();
797
798 strv_free(arg_setenv);
799 arg_setenv = n;
800 break;
801 }
802
284c0b91
LP
803 case 'q':
804 arg_quiet = true;
805 break;
806
8a96d94e
LP
807 case ARG_SHARE_SYSTEM:
808 arg_share_system = true;
809 break;
810
eb91eb18
LP
811 case ARG_REGISTER:
812 r = parse_boolean(optarg);
813 if (r < 0) {
814 log_error("Failed to parse --register= argument: %s", optarg);
815 return r;
816 }
817
818 arg_register = r;
819 break;
820
89f7c846
LP
821 case ARG_KEEP_UNIT:
822 arg_keep_unit = true;
823 break;
824
6afc95b7
LP
825 case ARG_PERSONALITY:
826
ac45f971 827 arg_personality = personality_from_string(optarg);
050f7277 828 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
829 log_error("Unknown or unsupported personality '%s'.", optarg);
830 return -EINVAL;
831 }
832
833 break;
834
4d9f07b4
LP
835 case ARG_VOLATILE:
836
837 if (!optarg)
838 arg_volatile = VOLATILE_YES;
839 else {
840 r = parse_boolean(optarg);
841 if (r < 0) {
842 if (streq(optarg, "state"))
843 arg_volatile = VOLATILE_STATE;
844 else {
845 log_error("Failed to parse --volatile= argument: %s", optarg);
846 return r;
847 }
848 } else
849 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
850 }
851
852 break;
853
6d0b55c2
LP
854 case 'p': {
855 const char *split, *e;
856 uint16_t container_port, host_port;
857 int protocol;
858 ExposePort *p;
859
860 if ((e = startswith(optarg, "tcp:")))
861 protocol = IPPROTO_TCP;
862 else if ((e = startswith(optarg, "udp:")))
863 protocol = IPPROTO_UDP;
864 else {
865 e = optarg;
866 protocol = IPPROTO_TCP;
867 }
868
869 split = strchr(e, ':');
870 if (split) {
871 char v[split - e + 1];
872
873 memcpy(v, e, split - e);
874 v[split - e] = 0;
875
876 r = safe_atou16(v, &host_port);
877 if (r < 0 || host_port <= 0) {
878 log_error("Failed to parse host port: %s", optarg);
879 return -EINVAL;
880 }
881
882 r = safe_atou16(split + 1, &container_port);
883 } else {
884 r = safe_atou16(e, &container_port);
885 host_port = container_port;
886 }
887
888 if (r < 0 || container_port <= 0) {
889 log_error("Failed to parse host port: %s", optarg);
890 return -EINVAL;
891 }
892
893 LIST_FOREACH(ports, p, arg_expose_ports) {
894 if (p->protocol == protocol && p->host_port == host_port) {
895 log_error("Duplicate port specification: %s", optarg);
896 return -EINVAL;
897 }
898 }
899
900 p = new(ExposePort, 1);
901 if (!p)
902 return log_oom();
903
904 p->protocol = protocol;
905 p->host_port = host_port;
906 p->container_port = container_port;
907
908 LIST_PREPEND(ports, arg_expose_ports, p);
909
910 break;
911 }
912
f36933fe
LP
913 case ARG_PROPERTY:
914 if (strv_extend(&arg_property, optarg) < 0)
915 return log_oom();
916
917 break;
918
6dac160c
LP
919 case ARG_PRIVATE_USERS:
920 if (optarg) {
921 _cleanup_free_ char *buffer = NULL;
922 const char *range, *shift;
923
924 range = strchr(optarg, ':');
925 if (range) {
926 buffer = strndup(optarg, range - optarg);
927 if (!buffer)
928 return log_oom();
929 shift = buffer;
930
931 range++;
932 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
933 log_error("Failed to parse UID range: %s", range);
934 return -EINVAL;
935 }
936 } else
937 shift = optarg;
938
939 if (parse_uid(shift, &arg_uid_shift) < 0) {
940 log_error("Failed to parse UID: %s", optarg);
941 return -EINVAL;
942 }
943 }
944
945 arg_userns = true;
946 break;
947
c6c8f6e2
LP
948 case ARG_KILL_SIGNAL:
949 arg_kill_signal = signal_from_string_try_harder(optarg);
950 if (arg_kill_signal < 0) {
951 log_error("Cannot parse signal: %s", optarg);
952 return -EINVAL;
953 }
954
955 break;
956
88213476
LP
957 case '?':
958 return -EINVAL;
959
960 default:
eb9da376 961 assert_not_reached("Unhandled option");
88213476 962 }
88213476 963
eb91eb18
LP
964 if (arg_share_system)
965 arg_register = false;
966
967 if (arg_boot && arg_share_system) {
968 log_error("--boot and --share-system may not be combined.");
969 return -EINVAL;
970 }
971
89f7c846
LP
972 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
973 log_error("--keep-unit may not be used when invoked from a user session.");
974 return -EINVAL;
975 }
976
1b9e5b12
LP
977 if (arg_directory && arg_image) {
978 log_error("--directory= and --image= may not be combined.");
979 return -EINVAL;
980 }
981
ec16945e
LP
982 if (arg_template && arg_image) {
983 log_error("--template= and --image= may not be combined.");
984 return -EINVAL;
985 }
986
987 if (arg_template && !(arg_directory || arg_machine)) {
988 log_error("--template= needs --directory= or --machine=.");
989 return -EINVAL;
990 }
991
992 if (arg_ephemeral && arg_template) {
993 log_error("--ephemeral and --template= may not be combined.");
994 return -EINVAL;
995 }
996
997 if (arg_ephemeral && arg_image) {
998 log_error("--ephemeral and --image= may not be combined.");
999 return -EINVAL;
1000 }
1001
df9a75e4
LP
1002 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1003 log_error("--ephemeral and --link-journal= may not be combined.");
1004 return -EINVAL;
1005 }
1006
4d9f07b4
LP
1007 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1008 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1009 return -EINVAL;
1010 }
1011
6d0b55c2
LP
1012 if (arg_expose_ports && !arg_private_network) {
1013 log_error("Cannot use --port= without private networking.");
1014 return -EINVAL;
1015 }
1016
b774fb7f
DH
1017 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1018 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1019
a42c8b54
LP
1020 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1021
c6c8f6e2
LP
1022 if (arg_boot && arg_kill_signal <= 0)
1023 arg_kill_signal = SIGRTMIN+3;
1024
88213476
LP
1025 return 1;
1026}
1027
03cfe0d5
LP
1028static int tmpfs_patch_options(const char *options, char **ret) {
1029 char *buf = NULL;
1030
1031 if (arg_userns && arg_uid_shift != 0) {
825d5287 1032 assert(arg_uid_shift != UID_INVALID);
03cfe0d5
LP
1033
1034 if (options)
f001a835 1035 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
03cfe0d5 1036 else
f001a835 1037 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
03cfe0d5
LP
1038 if (!buf)
1039 return -ENOMEM;
1040
1041 options = buf;
1042 }
1043
1044#ifdef HAVE_SELINUX
1045 if (arg_selinux_apifs_context) {
1046 char *t;
1047
1048 if (options)
1049 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1050 else
1051 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1052 if (!t) {
1053 free(buf);
1054 return -ENOMEM;
1055 }
1056
1057 free(buf);
1058 buf = t;
1059 }
1060#endif
1061
1062 *ret = buf;
1063 return !!buf;
1064}
1065
1066static int mount_all(const char *dest, bool userns) {
88213476
LP
1067
1068 typedef struct MountPoint {
1069 const char *what;
1070 const char *where;
1071 const char *type;
1072 const char *options;
1073 unsigned long flags;
3bd66c05 1074 bool fatal;
03cfe0d5 1075 bool userns;
88213476
LP
1076 } MountPoint;
1077
1078 static const MountPoint mount_table[] = {
3c59d4f2
RM
1079 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1080 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1081 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1082 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1083 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1084 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1085 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1086 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1087 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
9b634ea5 1088#ifdef HAVE_SELINUX
3c59d4f2
RM
1089 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1090 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
9b634ea5 1091#endif
88213476
LP
1092 };
1093
1094 unsigned k;
03cfe0d5 1095 int r;
88213476
LP
1096
1097 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
d15d65a0 1098 _cleanup_free_ char *where = NULL, *options = NULL;
d002827b 1099 const char *o;
88213476 1100
03cfe0d5
LP
1101 if (userns != mount_table[k].userns)
1102 continue;
1103
1104 where = prefix_root(dest, mount_table[k].where);
17fe0523
LP
1105 if (!where)
1106 return log_oom();
88213476 1107
e26d6ce5 1108 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
03cfe0d5
LP
1109 if (r < 0 && r != -ENOENT)
1110 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
88213476 1111
9c1c7f71 1112 /* Skip this entry if it is not a remount. */
03cfe0d5 1113 if (mount_table[k].what && r > 0)
014a9c77
LP
1114 continue;
1115
03cfe0d5
LP
1116 r = mkdir_p(where, 0755);
1117 if (r < 0) {
1118 if (mount_table[k].fatal)
1119 return log_error_errno(r, "Failed to create directory %s: %m", where);
79d80fc1 1120
03cfe0d5 1121 log_warning_errno(r, "Failed to create directory %s: %m", where);
79d80fc1
TG
1122 continue;
1123 }
88213476 1124
03cfe0d5
LP
1125 o = mount_table[k].options;
1126 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1127 r = tmpfs_patch_options(o, &options);
1128 if (r < 0)
6dac160c 1129 return log_oom();
03cfe0d5
LP
1130 if (r > 0)
1131 o = options;
6dac160c 1132 }
a8828ed9 1133
88213476
LP
1134 if (mount(mount_table[k].what,
1135 where,
1136 mount_table[k].type,
1137 mount_table[k].flags,
79d80fc1 1138 o) < 0) {
88213476 1139
03cfe0d5
LP
1140 if (mount_table[k].fatal)
1141 return log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 1142
03cfe0d5 1143 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
88213476 1144 }
88213476
LP
1145 }
1146
03cfe0d5 1147 return 0;
e58a1277 1148}
f8440af5 1149
5a8af538
LP
1150static int mount_bind(const char *dest, CustomMount *m) {
1151 struct stat source_st, dest_st;
03cfe0d5 1152 const char *where;
5a8af538 1153 int r;
17fe0523 1154
5a8af538 1155 assert(m);
d2421337 1156
5a8af538
LP
1157 if (stat(m->source, &source_st) < 0)
1158 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
17fe0523 1159
03cfe0d5 1160 where = prefix_roota(dest, m->destination);
06c17c39 1161
03cfe0d5 1162 if (stat(where, &dest_st) >= 0) {
5a8af538
LP
1163 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1164 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1165 return -EINVAL;
2ed4e5e0 1166 }
06c17c39 1167
5a8af538
LP
1168 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1169 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1170 return -EINVAL;
d2421337 1171 }
17fe0523 1172
5a8af538
LP
1173 } else if (errno == ENOENT) {
1174 r = mkdir_parents_label(where, 0755);
1175 if (r < 0)
1176 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1177 } else {
1178 log_error_errno(errno, "Failed to stat %s: %m", where);
1179 return -errno;
1180 }
17fe0523 1181
5a8af538
LP
1182 /* Create the mount point. Any non-directory file can be
1183 * mounted on any non-directory file (regular, fifo, socket,
1184 * char, block).
1185 */
1186 if (S_ISDIR(source_st.st_mode))
1187 r = mkdir_label(where, 0755);
1188 else
1189 r = touch(where);
1190 if (r < 0 && r != -EEXIST)
1191 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1192
1193 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1194 return log_error_errno(errno, "mount(%s) failed: %m", where);
1195
1196 if (m->read_only) {
1197 r = bind_remount_recursive(where, true);
1198 if (r < 0)
1199 return log_error_errno(r, "Read-only bind mount failed: %m");
1200 }
1201
1202 return 0;
1203}
1204
1205static int mount_tmpfs(const char *dest, CustomMount *m) {
03cfe0d5
LP
1206 const char *where, *options;
1207 _cleanup_free_ char *buf = NULL;
5a8af538
LP
1208 int r;
1209
1210 assert(dest);
1211 assert(m);
1212
03cfe0d5 1213 where = prefix_roota(dest, m->destination);
5a8af538 1214
03cfe0d5 1215 r = mkdir_p_label(where, 0755);
5a8af538
LP
1216 if (r < 0 && r != -EEXIST)
1217 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1218
03cfe0d5
LP
1219 r = tmpfs_patch_options(m->options, &buf);
1220 if (r < 0)
1221 return log_oom();
1222 options = r > 0 ? buf : m->options;
1223
1224 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
5a8af538
LP
1225 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1226
1227 return 0;
1228}
1229
1230static int mount_overlay(const char *dest, CustomMount *m) {
1231 _cleanup_free_ char *lower = NULL;
03cfe0d5 1232 const char *where, *options;
5a8af538
LP
1233 int r;
1234
1235 assert(dest);
1236 assert(m);
1237
03cfe0d5 1238 where = prefix_roota(dest, m->destination);
5a8af538
LP
1239
1240 r = mkdir_label(where, 0755);
1241 if (r < 0 && r != -EEXIST)
1242 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1243
1244 (void) mkdir_p_label(m->source, 0755);
1245
1246 strv_reverse(m->lower);
1247 lower = strv_join(m->lower, ":");
1248 strv_reverse(m->lower);
5a8af538
LP
1249 if (!lower)
1250 return log_oom();
1251
1252 if (m->read_only)
1253 options = strjoina("lowerdir=", m->source, ":", lower);
1254 else {
1255 assert(m->work_dir);
1256 (void) mkdir_label(m->work_dir, 0700);
1257
1258 options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
1259 }
1260
1261 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1262 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1263
1264 return 0;
1265}
1266
1267static int mount_custom(const char *dest) {
1268 unsigned i;
1269 int r;
1270
1271 assert(dest);
1272
1273 for (i = 0; i < arg_n_custom_mounts; i++) {
1274 CustomMount *m = &arg_custom_mounts[i];
1275
1276 switch (m->type) {
1277
1278 case CUSTOM_MOUNT_BIND:
1279 r = mount_bind(dest, m);
1280 break;
1281
1282 case CUSTOM_MOUNT_TMPFS:
1283 r = mount_tmpfs(dest, m);
1284 break;
1285
1286 case CUSTOM_MOUNT_OVERLAY:
1287 r = mount_overlay(dest, m);
1288 break;
1289
1290 default:
1291 assert_not_reached("Unknown custom mount type");
17fe0523 1292 }
5a8af538
LP
1293
1294 if (r < 0)
1295 return r;
17fe0523
LP
1296 }
1297
1298 return 0;
1299}
1300
b12afc8c
LP
1301static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1302 char *to;
1303 int r;
1304
63c372cb 1305 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
b12afc8c 1306
e26d6ce5 1307 r = path_is_mount_point(to, 0);
da00518b 1308 if (r < 0 && r != -ENOENT)
b12afc8c
LP
1309 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1310 if (r > 0)
1311 return 0;
1312
1313 mkdir_p(to, 0755);
1314
c0534580
LP
1315 /* The superblock mount options of the mount point need to be
1316 * identical to the hosts', and hence writable... */
1317 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
b12afc8c
LP
1318 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1319
c0534580
LP
1320 /* ... hence let's only make the bind mount read-only, not the
1321 * superblock. */
1322 if (read_only) {
1323 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1324 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1325 }
b12afc8c
LP
1326 return 1;
1327}
1328
1329static int mount_cgroup(const char *dest) {
1330 _cleanup_set_free_free_ Set *controllers = NULL;
03cfe0d5 1331 const char *cgroup_root;
b12afc8c
LP
1332 int r;
1333
1334 controllers = set_new(&string_hash_ops);
1335 if (!controllers)
1336 return log_oom();
1337
1338 r = cg_kernel_controllers(controllers);
1339 if (r < 0)
1340 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1341
b12afc8c
LP
1342 for (;;) {
1343 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1344
1345 controller = set_steal_first(controllers);
1346 if (!controller)
1347 break;
1348
03cfe0d5 1349 origin = prefix_root("/sys/fs/cgroup/", controller);
b12afc8c
LP
1350 if (!origin)
1351 return log_oom();
1352
1353 r = readlink_malloc(origin, &combined);
1354 if (r == -EINVAL) {
1355 /* Not a symbolic link, but directly a single cgroup hierarchy */
1356
1357 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1358 if (r < 0)
1359 return r;
1360
1361 } else if (r < 0)
1362 return log_error_errno(r, "Failed to read link %s: %m", origin);
1363 else {
1364 _cleanup_free_ char *target = NULL;
1365
03cfe0d5 1366 target = prefix_root(dest, origin);
b12afc8c
LP
1367 if (!target)
1368 return log_oom();
1369
1370 /* A symbolic link, a combination of controllers in one hierarchy */
1371
1372 if (!filename_is_valid(combined)) {
1373 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1374 continue;
1375 }
1376
1377 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1378 if (r < 0)
1379 return r;
1380
875e1014
ILG
1381 r = symlink_idempotent(combined, target);
1382 if (r == -EINVAL) {
1383 log_error("Invalid existing symlink for combined hierarchy");
1384 return r;
1385 }
1386 if (r < 0)
1387 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1388 }
1389 }
1390
c0534580 1391 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
b12afc8c
LP
1392 if (r < 0)
1393 return r;
1394
03cfe0d5
LP
1395 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1396 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1397 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1398
1399 return 0;
1400}
1401
1402static int mount_systemd_cgroup_writable(const char *dest) {
1403 _cleanup_free_ char *own_cgroup_path = NULL;
1404 const char *systemd_root, *systemd_own;
1405 int r;
1406
1407 assert(dest);
1408
1409 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1410 if (r < 0)
1411 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1412
b12afc8c 1413 /* Make our own cgroup a (writable) bind mount */
63c372cb 1414 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
b12afc8c
LP
1415 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1416 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1417
1418 /* And then remount the systemd cgroup root read-only */
03cfe0d5 1419 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
b12afc8c
LP
1420 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1421 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1422
03cfe0d5
LP
1423 return 0;
1424}
1425
1426static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1427 assert(p);
1428
1429 if (!arg_userns)
1430 return 0;
1431
1432 if (uid == UID_INVALID && gid == GID_INVALID)
1433 return 0;
1434
1435 if (uid != UID_INVALID) {
1436 uid += arg_uid_shift;
1437
1438 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1439 return -EOVERFLOW;
1440 }
1441
1442 if (gid != GID_INVALID) {
1443 gid += (gid_t) arg_uid_shift;
1444
1445 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1446 return -EOVERFLOW;
1447 }
1448
1449 if (lchown(p, uid, gid) < 0)
1450 return -errno;
b12afc8c
LP
1451
1452 return 0;
1453}
1454
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and, if it
 * was newly created, hands it to userns_lchown() with the given IDs.
 *
 * Returns 0 if the directory already existed, -errno if mkdir() failed
 * otherwise, or the result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1467
e58a1277 1468static int setup_timezone(const char *dest) {
03cfe0d5
LP
1469 _cleanup_free_ char *p = NULL, *q = NULL;
1470 const char *where, *check, *what;
d4036145
LP
1471 char *z, *y;
1472 int r;
f8440af5 1473
e58a1277
LP
1474 assert(dest);
1475
1476 /* Fix the timezone, if possible */
d4036145
LP
1477 r = readlink_malloc("/etc/localtime", &p);
1478 if (r < 0) {
1479 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1480 return 0;
1481 }
1482
1483 z = path_startswith(p, "../usr/share/zoneinfo/");
1484 if (!z)
1485 z = path_startswith(p, "/usr/share/zoneinfo/");
1486 if (!z) {
1487 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1488 return 0;
1489 }
1490
03cfe0d5 1491 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1492 r = readlink_malloc(where, &q);
1493 if (r >= 0) {
1494 y = path_startswith(q, "../usr/share/zoneinfo/");
1495 if (!y)
1496 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1497
d4036145
LP
1498 /* Already pointing to the right place? Then do nothing .. */
1499 if (y && streq(y, z))
1500 return 0;
1501 }
1502
03cfe0d5
LP
1503 check = strjoina("/usr/share/zoneinfo/", z);
1504 check = prefix_root(dest, check);
1505 if (laccess(check, F_OK) < 0) {
d4036145
LP
1506 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1507 return 0;
1508 }
68fb0892 1509
79d80fc1
TG
1510 r = unlink(where);
1511 if (r < 0 && errno != ENOENT) {
56f64d95 1512 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1513 return 0;
1514 }
4d9f07b4 1515
03cfe0d5 1516 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1517 if (symlink(what, where) < 0) {
56f64d95 1518 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1519 return 0;
1520 }
e58a1277 1521
03cfe0d5
LP
1522 r = userns_lchown(where, 0, 0);
1523 if (r < 0)
1524 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1525
e58a1277 1526 return 0;
88213476
LP
1527}
1528
2547bb41 1529static int setup_resolv_conf(const char *dest) {
03cfe0d5 1530 const char *where = NULL;
79d80fc1 1531 int r;
2547bb41
LP
1532
1533 assert(dest);
1534
1535 if (arg_private_network)
1536 return 0;
1537
1538 /* Fix resolv.conf, if possible */
03cfe0d5 1539 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1540
f2068bcc 1541 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1542 if (r < 0) {
68a313c5
LP
1543 /* If the file already exists as symlink, let's
1544 * suppress the warning, under the assumption that
1545 * resolved or something similar runs inside and the
1546 * symlink points there.
1547 *
1548 * If the disk image is read-only, there's also no
1549 * point in complaining.
1550 */
1551 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1552 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1553 return 0;
1554 }
2547bb41 1555
03cfe0d5
LP
1556 r = userns_lchown(where, 0, 0);
1557 if (r < 0)
1558 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1559
2547bb41
LP
1560 return 0;
1561}
1562
4d9f07b4 1563static int setup_volatile_state(const char *directory) {
03cfe0d5
LP
1564 _cleanup_free_ char *buf = NULL;
1565 const char *p, *options;
4d9f07b4
LP
1566 int r;
1567
1568 assert(directory);
1569
1570 if (arg_volatile != VOLATILE_STATE)
1571 return 0;
1572
1573 /* --volatile=state means we simply overmount /var
1574 with a tmpfs, and the rest read-only. */
1575
1576 r = bind_remount_recursive(directory, true);
f647962d
MS
1577 if (r < 0)
1578 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4 1579
03cfe0d5 1580 p = prefix_roota(directory, "/var");
79d80fc1 1581 r = mkdir(p, 0755);
4a62c710
MS
1582 if (r < 0 && errno != EEXIST)
1583 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1584
03cfe0d5
LP
1585 options = "mode=755";
1586 r = tmpfs_patch_options(options, &buf);
1587 if (r < 0)
1588 return log_oom();
1589 if (r > 0)
1590 options = buf;
1591
1592 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
4a62c710 1593 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1594
1595 return 0;
1596}
1597
1598static int setup_volatile(const char *directory) {
1599 bool tmpfs_mounted = false, bind_mounted = false;
1600 char template[] = "/tmp/nspawn-volatile-XXXXXX";
03cfe0d5
LP
1601 _cleanup_free_ char *buf = NULL;
1602 const char *f, *t, *options;
4d9f07b4
LP
1603 int r;
1604
1605 assert(directory);
1606
1607 if (arg_volatile != VOLATILE_YES)
1608 return 0;
1609
1610 /* --volatile=yes means we mount a tmpfs to the root dir, and
1611 the original /usr to use inside it, and that read-only. */
1612
4a62c710
MS
1613 if (!mkdtemp(template))
1614 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4 1615
03cfe0d5
LP
1616 options = "mode=755";
1617 r = tmpfs_patch_options(options, &buf);
1618 if (r < 0)
1619 return log_oom();
1620 if (r > 0)
1621 options = buf;
1622
1623 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1624 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1625 goto fail;
1626 }
1627
1628 tmpfs_mounted = true;
1629
03cfe0d5
LP
1630 f = prefix_roota(directory, "/usr");
1631 t = prefix_roota(template, "/usr");
4d9f07b4 1632
79d80fc1
TG
1633 r = mkdir(t, 0755);
1634 if (r < 0 && errno != EEXIST) {
03cfe0d5 1635 r = log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1636 goto fail;
1637 }
1638
4543768d 1639 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
03cfe0d5 1640 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1641 goto fail;
1642 }
1643
1644 bind_mounted = true;
1645
1646 r = bind_remount_recursive(t, true);
1647 if (r < 0) {
da927ba9 1648 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1649 goto fail;
1650 }
1651
1652 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
03cfe0d5 1653 r = log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1654 goto fail;
1655 }
1656
03cfe0d5 1657 (void) rmdir(template);
4d9f07b4
LP
1658
1659 return 0;
1660
1661fail:
1662 if (bind_mounted)
03cfe0d5
LP
1663 (void) umount(t);
1664
4d9f07b4 1665 if (tmpfs_mounted)
03cfe0d5
LP
1666 (void) umount(template);
1667 (void) rmdir(template);
4d9f07b4
LP
1668 return r;
1669}
1670
9f24adc2 1671static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1672 assert(s);
9f24adc2
LP
1673
1674 snprintf(s, 37,
1675 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1676 SD_ID128_FORMAT_VAL(id));
1677
1678 return s;
1679}
1680
04bc4a3f 1681static int setup_boot_id(const char *dest) {
03cfe0d5 1682 const char *from, *to;
39883f62 1683 sd_id128_t rnd = {};
04bc4a3f
LP
1684 char as_uuid[37];
1685 int r;
1686
eb91eb18
LP
1687 if (arg_share_system)
1688 return 0;
1689
04bc4a3f
LP
1690 /* Generate a new randomized boot ID, so that each boot-up of
1691 * the container gets a new one */
1692
03cfe0d5
LP
1693 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1694 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1695
1696 r = sd_id128_randomize(&rnd);
f647962d
MS
1697 if (r < 0)
1698 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1699
9f24adc2 1700 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1701
4c1fc3e4 1702 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1703 if (r < 0)
1704 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1705
03cfe0d5
LP
1706 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1707 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1708 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1709 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1710
1711 unlink(from);
04bc4a3f
LP
1712 return r;
1713}
1714
e58a1277 1715static int copy_devnodes(const char *dest) {
88213476
LP
1716
1717 static const char devnodes[] =
1718 "null\0"
1719 "zero\0"
1720 "full\0"
1721 "random\0"
1722 "urandom\0"
85614d66
TG
1723 "tty\0"
1724 "net/tun\0";
88213476
LP
1725
1726 const char *d;
e58a1277 1727 int r = 0;
7fd1b19b 1728 _cleanup_umask_ mode_t u;
a258bf26
LP
1729
1730 assert(dest);
124640f1
LP
1731
1732 u = umask(0000);
88213476 1733
03cfe0d5
LP
1734 /* Create /dev/net, so that we can create /dev/net/tun in it */
1735 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1736 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1737
88213476 1738 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1739 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1740 struct stat st;
88213476 1741
7f112f50 1742 from = strappend("/dev/", d);
03cfe0d5 1743 to = prefix_root(dest, from);
88213476
LP
1744
1745 if (stat(from, &st) < 0) {
1746
4a62c710
MS
1747 if (errno != ENOENT)
1748 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1749
a258bf26 1750 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1751
03cfe0d5 1752 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1753 return -EIO;
a258bf26 1754
85614d66 1755 } else {
81f5049b
AC
1756 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1757 if (errno != EPERM)
1758 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1759
1760 /* Some systems abusively restrict mknod but
1761 * allow bind mounts. */
1762 r = touch(to);
1763 if (r < 0)
1764 return log_error_errno(r, "touch (%s) failed: %m", to);
1765 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1766 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1767 }
6278cf60 1768
03cfe0d5
LP
1769 r = userns_lchown(to, 0, 0);
1770 if (r < 0)
1771 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1772 }
88213476
LP
1773 }
1774
e58a1277
LP
1775 return r;
1776}
88213476 1777
03cfe0d5
LP
1778static int setup_pts(const char *dest) {
1779 _cleanup_free_ char *options = NULL;
1780 const char *p;
1781
1782#ifdef HAVE_SELINUX
1783 if (arg_selinux_apifs_context)
1784 (void) asprintf(&options,
3dce8915 1785 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1786 arg_uid_shift + TTY_GID,
1787 arg_selinux_apifs_context);
1788 else
1789#endif
1790 (void) asprintf(&options,
3dce8915 1791 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1792 arg_uid_shift + TTY_GID);
f2d88580 1793
03cfe0d5 1794 if (!options)
f2d88580
LP
1795 return log_oom();
1796
03cfe0d5 1797 /* Mount /dev/pts itself */
cc9fce65 1798 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1799 if (mkdir(p, 0755) < 0)
1800 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1801 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1802 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1803 if (userns_lchown(p, 0, 0) < 0)
1804 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1805
1806 /* Create /dev/ptmx symlink */
1807 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1808 if (symlink("pts/ptmx", p) < 0)
1809 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
03cfe0d5
LP
1810 if (userns_lchown(p, 0, 0) < 0)
1811 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
f2d88580 1812
03cfe0d5
LP
1813 /* And fix /dev/pts/ptmx ownership */
1814 p = prefix_roota(dest, "/dev/pts/ptmx");
1815 if (userns_lchown(p, 0, 0) < 0)
1816 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1817
f2d88580
LP
1818 return 0;
1819}
1820
e58a1277 1821static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1822 _cleanup_umask_ mode_t u;
1823 const char *to;
e58a1277 1824 int r;
e58a1277
LP
1825
1826 assert(dest);
1827 assert(console);
1828
1829 u = umask(0000);
1830
03cfe0d5 1831 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1832 if (r < 0)
1833 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1834
a258bf26
LP
1835 /* We need to bind mount the right tty to /dev/console since
1836 * ptys can only exist on pts file systems. To have something
81f5049b 1837 * to bind mount things on we create a empty regular file. */
a258bf26 1838
03cfe0d5 1839 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1840 r = touch(to);
1841 if (r < 0)
1842 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1843
4543768d 1844 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1845 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1846
25ea79fe 1847 return 0;
e58a1277
LP
1848}
1849
1850static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1851 const char *from, *to;
7fd1b19b 1852 _cleanup_umask_ mode_t u;
03cfe0d5 1853 int fd, k;
e58a1277
LP
1854 union {
1855 struct cmsghdr cmsghdr;
1856 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1857 } control = {};
1858 struct msghdr mh = {
1859 .msg_control = &control,
1860 .msg_controllen = sizeof(control),
1861 };
e58a1277
LP
1862 struct cmsghdr *cmsg;
1863
e58a1277 1864 assert(kmsg_socket >= 0);
a258bf26 1865
e58a1277 1866 u = umask(0000);
a258bf26 1867
03cfe0d5 1868 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1869 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1870 * on the reading side behave very similar to /proc/kmsg,
1871 * their writing side behaves differently from /dev/kmsg in
1872 * that writing blocks when nothing is reading. In order to
1873 * avoid any problems with containers deadlocking due to this
1874 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1875 from = prefix_roota(dest, "/run/kmsg");
1876 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1877
4a62c710 1878 if (mkfifo(from, 0600) < 0)
03cfe0d5 1879 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1880 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1881 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1882
1883 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1884 if (fd < 0)
1885 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1886
e58a1277
LP
1887 cmsg = CMSG_FIRSTHDR(&mh);
1888 cmsg->cmsg_level = SOL_SOCKET;
1889 cmsg->cmsg_type = SCM_RIGHTS;
1890 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1891 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1892
1893 mh.msg_controllen = cmsg->cmsg_len;
1894
1895 /* Store away the fd in the socket, so that it stays open as
1896 * long as we run the child */
6d0b55c2 1897 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1898 safe_close(fd);
e58a1277 1899
4a62c710
MS
1900 if (k < 0)
1901 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1902
03cfe0d5
LP
1903 /* And now make the FIFO unavailable as /run/kmsg... */
1904 (void) unlink(from);
1905
25ea79fe 1906 return 0;
88213476
LP
1907}
1908
6d0b55c2
LP
1909static int send_rtnl(int send_fd) {
1910 union {
1911 struct cmsghdr cmsghdr;
1912 uint8_t buf[CMSG_SPACE(sizeof(int))];
1913 } control = {};
1914 struct msghdr mh = {
1915 .msg_control = &control,
1916 .msg_controllen = sizeof(control),
1917 };
1918 struct cmsghdr *cmsg;
1919 _cleanup_close_ int fd = -1;
1920 ssize_t k;
1921
1922 assert(send_fd >= 0);
1923
1924 if (!arg_expose_ports)
1925 return 0;
1926
1927 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1928 if (fd < 0)
03cfe0d5 1929 return log_error_errno(errno, "Failed to allocate container netlink: %m");
6d0b55c2
LP
1930
1931 cmsg = CMSG_FIRSTHDR(&mh);
1932 cmsg->cmsg_level = SOL_SOCKET;
1933 cmsg->cmsg_type = SCM_RIGHTS;
1934 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1935 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1936
1937 mh.msg_controllen = cmsg->cmsg_len;
1938
1939 /* Store away the fd in the socket, so that it stays open as
1940 * long as we run the child */
1941 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1942 if (k < 0)
1943 return log_error_errno(errno, "Failed to send netlink fd: %m");
1944
1945 return 0;
1946}
1947
1948static int flush_ports(union in_addr_union *exposed) {
1949 ExposePort *p;
1950 int r, af = AF_INET;
1951
1952 assert(exposed);
1953
1954 if (!arg_expose_ports)
1955 return 0;
1956
1957 if (in_addr_is_null(af, exposed))
1958 return 0;
1959
1960 log_debug("Lost IP address.");
1961
1962 LIST_FOREACH(ports, p, arg_expose_ports) {
1963 r = fw_add_local_dnat(false,
1964 af,
1965 p->protocol,
1966 NULL,
1967 NULL, 0,
1968 NULL, 0,
1969 p->host_port,
1970 exposed,
1971 p->container_port,
1972 NULL);
1973 if (r < 0)
1974 log_warning_errno(r, "Failed to modify firewall: %m");
1975 }
1976
1977 *exposed = IN_ADDR_NULL;
1978 return 0;
1979}
1980
1c4baffc 1981static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
6d0b55c2
LP
1982 _cleanup_free_ struct local_address *addresses = NULL;
1983 _cleanup_free_ char *pretty = NULL;
1984 union in_addr_union new_exposed;
1985 ExposePort *p;
1986 bool add;
1987 int af = AF_INET, r;
1988
1989 assert(exposed);
1990
1991 /* Invoked each time an address is added or removed inside the
1992 * container */
1993
1994 if (!arg_expose_ports)
1995 return 0;
1996
1997 r = local_addresses(rtnl, 0, af, &addresses);
1998 if (r < 0)
1999 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2000
2001 add = r > 0 &&
2002 addresses[0].family == af &&
2003 addresses[0].scope < RT_SCOPE_LINK;
2004
2005 if (!add)
2006 return flush_ports(exposed);
2007
2008 new_exposed = addresses[0].address;
2009 if (in_addr_equal(af, exposed, &new_exposed))
2010 return 0;
2011
2012 in_addr_to_string(af, &new_exposed, &pretty);
2013 log_debug("New container IP is %s.", strna(pretty));
2014
2015 LIST_FOREACH(ports, p, arg_expose_ports) {
2016
2017 r = fw_add_local_dnat(true,
2018 af,
2019 p->protocol,
2020 NULL,
2021 NULL, 0,
2022 NULL, 0,
2023 p->host_port,
2024 &new_exposed,
2025 p->container_port,
2026 in_addr_is_null(af, exposed) ? NULL : exposed);
2027 if (r < 0)
2028 log_warning_errno(r, "Failed to modify firewall: %m");
2029 }
2030
2031 *exposed = new_exposed;
2032 return 0;
2033}
2034
1c4baffc 2035static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2036 union in_addr_union *exposed = userdata;
2037
2038 assert(rtnl);
2039 assert(m);
2040 assert(exposed);
2041
2042 expose_ports(rtnl, exposed);
2043 return 0;
2044}
2045
1c4baffc 2046static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
6d0b55c2
LP
2047 union {
2048 struct cmsghdr cmsghdr;
2049 uint8_t buf[CMSG_SPACE(sizeof(int))];
2050 } control = {};
2051 struct msghdr mh = {
2052 .msg_control = &control,
2053 .msg_controllen = sizeof(control),
2054 };
2055 struct cmsghdr *cmsg;
1c4baffc 2056 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
6d0b55c2
LP
2057 int fd, r;
2058 ssize_t k;
2059
2060 assert(event);
2061 assert(recv_fd >= 0);
2062 assert(ret);
2063
2064 if (!arg_expose_ports)
2065 return 0;
2066
2067 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2068 if (k < 0)
2069 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2070
2071 cmsg = CMSG_FIRSTHDR(&mh);
2072 assert(cmsg->cmsg_level == SOL_SOCKET);
2073 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 2074 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
2075 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2076
1c4baffc 2077 r = sd_netlink_open_fd(&rtnl, fd);
6d0b55c2
LP
2078 if (r < 0) {
2079 safe_close(fd);
2080 return log_error_errno(r, "Failed to create rtnl object: %m");
2081 }
2082
1c4baffc 2083 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
6d0b55c2
LP
2084 if (r < 0)
2085 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2086
1c4baffc 2087 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
6d0b55c2
LP
2088 if (r < 0)
2089 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2090
1c4baffc 2091 r = sd_netlink_attach_event(rtnl, event, 0);
6d0b55c2
LP
2092 if (r < 0)
2093 return log_error_errno(r, "Failed to add to even loop: %m");
2094
2095 *ret = rtnl;
2096 rtnl = NULL;
2097
2098 return 0;
2099}
2100
3a74cea5 2101static int setup_hostname(void) {
3a74cea5 2102
eb91eb18
LP
2103 if (arg_share_system)
2104 return 0;
2105
605f81a8 2106 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 2107 return -errno;
3a74cea5 2108
7027ff61 2109 return 0;
3a74cea5
LP
2110}
2111
57fb9fb5 2112static int setup_journal(const char *directory) {
4d680aee 2113 sd_id128_t machine_id, this_id;
03cfe0d5
LP
2114 _cleanup_free_ char *b = NULL, *d = NULL;
2115 const char *etc_machine_id, *p, *q;
27407a01 2116 char *id;
57fb9fb5
LP
2117 int r;
2118
df9a75e4
LP
2119 /* Don't link journals in ephemeral mode */
2120 if (arg_ephemeral)
2121 return 0;
2122
03cfe0d5 2123 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 2124
03cfe0d5 2125 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
2126 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2127 return 0;
f647962d 2128 else if (r < 0)
03cfe0d5 2129 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 2130
27407a01
ZJS
2131 id = strstrip(b);
2132 if (isempty(id) && arg_link_journal == LINK_AUTO)
2133 return 0;
57fb9fb5 2134
27407a01
ZJS
2135 /* Verify validity */
2136 r = sd_id128_from_string(id, &machine_id);
f647962d 2137 if (r < 0)
03cfe0d5 2138 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 2139
4d680aee 2140 r = sd_id128_get_machine(&this_id);
f647962d
MS
2141 if (r < 0)
2142 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
2143
2144 if (sd_id128_equal(machine_id, this_id)) {
2145 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2146 "Host and machine ids are equal (%s): refusing to link journals", id);
2147 if (arg_link_journal == LINK_AUTO)
2148 return 0;
df9a75e4 2149 return -EEXIST;
4d680aee
ZJS
2150 }
2151
2152 if (arg_link_journal == LINK_NO)
2153 return 0;
2154
03cfe0d5
LP
2155 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2156 if (r < 0)
2157 return log_error_errno(r, "Failed to create /var: %m");
2158
2159 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2160 if (r < 0)
2161 return log_error_errno(r, "Failed to create /var/log: %m");
2162
2163 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2164 if (r < 0)
2165 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2166
2167 p = strjoina("/var/log/journal/", id);
2168 q = prefix_roota(directory, p);
27407a01 2169
e26d6ce5 2170 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
2171 if (arg_link_journal != LINK_AUTO) {
2172 log_error("%s: already a mount point, refusing to use for journal", p);
2173 return -EEXIST;
2174 }
2175
2176 return 0;
57fb9fb5
LP
2177 }
2178
e26d6ce5 2179 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 2180 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
2181 log_error("%s: already a mount point, refusing to use for journal", q);
2182 return -EEXIST;
57fb9fb5
LP
2183 }
2184
27407a01 2185 return 0;
57fb9fb5
LP
2186 }
2187
2188 r = readlink_and_make_absolute(p, &d);
2189 if (r >= 0) {
2190 if ((arg_link_journal == LINK_GUEST ||
2191 arg_link_journal == LINK_AUTO) &&
2192 path_equal(d, q)) {
2193
03cfe0d5 2194 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2195 if (r < 0)
56f64d95 2196 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2197 return 0;
57fb9fb5
LP
2198 }
2199
4a62c710
MS
2200 if (unlink(p) < 0)
2201 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2202 } else if (r == -EINVAL) {
2203
2204 if (arg_link_journal == LINK_GUEST &&
2205 rmdir(p) < 0) {
2206
27407a01
ZJS
2207 if (errno == ENOTDIR) {
2208 log_error("%s already exists and is neither a symlink nor a directory", p);
2209 return r;
2210 } else {
56f64d95 2211 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 2212 return -errno;
57fb9fb5 2213 }
57fb9fb5
LP
2214 }
2215 } else if (r != -ENOENT) {
56f64d95 2216 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 2217 return r;
57fb9fb5
LP
2218 }
2219
2220 if (arg_link_journal == LINK_GUEST) {
2221
2222 if (symlink(q, p) < 0) {
574edc90 2223 if (arg_link_journal_try) {
56f64d95 2224 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
2225 return 0;
2226 } else {
56f64d95 2227 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
2228 return -errno;
2229 }
57fb9fb5
LP
2230 }
2231
03cfe0d5 2232 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2233 if (r < 0)
56f64d95 2234 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2235 return 0;
57fb9fb5
LP
2236 }
2237
2238 if (arg_link_journal == LINK_HOST) {
574edc90
MP
2239 /* don't create parents here -- if the host doesn't have
2240 * permanent journal set up, don't force it here */
2241 r = mkdir(p, 0755);
57fb9fb5 2242 if (r < 0) {
574edc90 2243 if (arg_link_journal_try) {
56f64d95 2244 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
2245 return 0;
2246 } else {
56f64d95 2247 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
2248 return r;
2249 }
57fb9fb5
LP
2250 }
2251
27407a01
ZJS
2252 } else if (access(p, F_OK) < 0)
2253 return 0;
57fb9fb5 2254
cdb2b9d0
LP
2255 if (dir_is_empty(q) == 0)
2256 log_warning("%s is not empty, proceeding anyway.", q);
2257
03cfe0d5 2258 r = userns_mkdir(directory, p, 0755, 0, 0);
57fb9fb5 2259 if (r < 0) {
56f64d95 2260 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 2261 return r;
57fb9fb5
LP
2262 }
2263
4543768d 2264 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 2265 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2266
27407a01 2267 return 0;
57fb9fb5
LP
2268}
2269
88213476 2270static int drop_capabilities(void) {
5076f0cc 2271 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
2272}
2273
5aa4bb6b 2274static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 2275 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
03976f7b 2276 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
2277 int r;
2278
eb91eb18
LP
2279 if (!arg_register)
2280 return 0;
2281
1c03020c 2282 r = sd_bus_default_system(&bus);
f647962d
MS
2283 if (r < 0)
2284 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 2285
89f7c846
LP
2286 if (arg_keep_unit) {
2287 r = sd_bus_call_method(
2288 bus,
2289 "org.freedesktop.machine1",
2290 "/org/freedesktop/machine1",
2291 "org.freedesktop.machine1.Manager",
5aa4bb6b 2292 "RegisterMachineWithNetwork",
89f7c846
LP
2293 &error,
2294 NULL,
5aa4bb6b 2295 "sayssusai",
89f7c846
LP
2296 arg_machine,
2297 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2298 "nspawn",
2299 "container",
2300 (uint32_t) pid,
5aa4bb6b
LP
2301 strempty(arg_directory),
2302 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 2303 } else {
9457ac5b 2304 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 2305 char **i;
ce5b3ad4 2306 unsigned j;
9457ac5b
LP
2307
2308 r = sd_bus_message_new_method_call(
89f7c846 2309 bus,
9457ac5b 2310 &m,
89f7c846
LP
2311 "org.freedesktop.machine1",
2312 "/org/freedesktop/machine1",
2313 "org.freedesktop.machine1.Manager",
5aa4bb6b 2314 "CreateMachineWithNetwork");
f647962d 2315 if (r < 0)
f36933fe 2316 return bus_log_create_error(r);
9457ac5b
LP
2317
2318 r = sd_bus_message_append(
2319 m,
5aa4bb6b 2320 "sayssusai",
89f7c846
LP
2321 arg_machine,
2322 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2323 "nspawn",
2324 "container",
2325 (uint32_t) pid,
5aa4bb6b
LP
2326 strempty(arg_directory),
2327 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 2328 if (r < 0)
f36933fe 2329 return bus_log_create_error(r);
9457ac5b
LP
2330
2331 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 2332 if (r < 0)
f36933fe 2333 return bus_log_create_error(r);
9457ac5b
LP
2334
2335 if (!isempty(arg_slice)) {
2336 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 2337 if (r < 0)
f36933fe 2338 return bus_log_create_error(r);
9457ac5b
LP
2339 }
2340
2341 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 2342 if (r < 0)
f36933fe 2343 return bus_log_create_error(r);
9457ac5b 2344
773ce3d8
LP
2345 /* If you make changes here, also make sure to update
2346 * systemd-nspawn@.service, to keep the device
2347 * policies in sync regardless if we are run with or
2348 * without the --keep-unit switch. */
63cc4c31 2349 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
2350 /* Allow the container to
2351 * access and create the API
2352 * device nodes, so that
2353 * PrivateDevices= in the
2354 * container can work
2355 * fine */
2356 "/dev/null", "rwm",
2357 "/dev/zero", "rwm",
2358 "/dev/full", "rwm",
2359 "/dev/random", "rwm",
2360 "/dev/urandom", "rwm",
2361 "/dev/tty", "rwm",
864e1706 2362 "/dev/net/tun", "rwm",
9457ac5b
LP
2363 /* Allow the container
2364 * access to ptys. However,
2365 * do not permit the
2366 * container to ever create
2367 * these device nodes. */
2368 "/dev/pts/ptmx", "rw",
63cc4c31 2369 "char-pts", "rw");
f647962d 2370 if (r < 0)
27023c0e
LP
2371 return bus_log_create_error(r);
2372
ce5b3ad4
SJ
2373 for (j = 0; j < arg_n_custom_mounts; j++) {
2374 CustomMount *cm = &arg_custom_mounts[j];
2375
2376 if (cm->type != CUSTOM_MOUNT_BIND)
2377 continue;
2378
2379 r = is_device_node(cm->source);
2380 if (r < 0)
2381 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2382
2383 if (r) {
2384 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2385 cm->source, cm->read_only ? "r" : "rw");
2386 if (r < 0)
2387 return log_error_errno(r, "Failed to append message arguments: %m");
2388 }
2389 }
2390
27023c0e
LP
2391 if (arg_kill_signal != 0) {
2392 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2393 if (r < 0)
2394 return bus_log_create_error(r);
2395
2396 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2397 if (r < 0)
2398 return bus_log_create_error(r);
2399 }
9457ac5b 2400
f36933fe
LP
2401 STRV_FOREACH(i, arg_property) {
2402 r = sd_bus_message_open_container(m, 'r', "sv");
2403 if (r < 0)
2404 return bus_log_create_error(r);
2405
2406 r = bus_append_unit_property_assignment(m, *i);
2407 if (r < 0)
2408 return r;
2409
2410 r = sd_bus_message_close_container(m);
2411 if (r < 0)
2412 return bus_log_create_error(r);
2413 }
2414
9457ac5b 2415 r = sd_bus_message_close_container(m);
f647962d 2416 if (r < 0)
f36933fe 2417 return bus_log_create_error(r);
9457ac5b
LP
2418
2419 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
2420 }
2421
9444b1f2 2422 if (r < 0) {
1f0cd86b
LP
2423 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2424 return r;
2425 }
2426
2427 return 0;
2428}
2429
2430static int terminate_machine(pid_t pid) {
2431 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2432 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
03976f7b 2433 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
2434 const char *path;
2435 int r;
2436
eb91eb18
LP
2437 if (!arg_register)
2438 return 0;
2439
1a2399e5
LP
2440 /* If we are reusing the unit, then just exit, systemd will do
2441 * the right thing when we exit. */
2442 if (arg_keep_unit)
2443 return 0;
2444
76b54375 2445 r = sd_bus_default_system(&bus);
f647962d
MS
2446 if (r < 0)
2447 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
2448
2449 r = sd_bus_call_method(
2450 bus,
2451 "org.freedesktop.machine1",
2452 "/org/freedesktop/machine1",
2453 "org.freedesktop.machine1.Manager",
2454 "GetMachineByPID",
2455 &error,
2456 &reply,
2457 "u",
2458 (uint32_t) pid);
2459 if (r < 0) {
2460 /* Note that the machine might already have been
2461 * cleaned up automatically, hence don't consider it a
2462 * failure if we cannot get the machine object. */
2463 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2464 return 0;
2465 }
2466
2467 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2468 if (r < 0)
2469 return bus_log_parse_error(r);
9444b1f2 2470
1f0cd86b
LP
2471 r = sd_bus_call_method(
2472 bus,
2473 "org.freedesktop.machine1",
2474 path,
2475 "org.freedesktop.machine1.Machine",
2476 "Terminate",
2477 &error,
2478 NULL,
2479 NULL);
2480 if (r < 0) {
2481 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2482 return 0;
2483 }
2484
9444b1f2
LP
2485 return 0;
2486}
2487
db999e0f
LP
2488static int reset_audit_loginuid(void) {
2489 _cleanup_free_ char *p = NULL;
2490 int r;
2491
2492 if (arg_share_system)
2493 return 0;
2494
2495 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2496 if (r == -ENOENT)
db999e0f 2497 return 0;
f647962d
MS
2498 if (r < 0)
2499 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2500
2501 /* Already reset? */
2502 if (streq(p, "4294967295"))
2503 return 0;
2504
ad118bda 2505 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 2506 if (r < 0) {
10a87006
LP
2507 log_error_errno(r,
2508 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2509 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2510 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2511 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2512 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2513
db999e0f 2514 sleep(5);
77b6e194 2515 }
db999e0f
LP
2516
2517 return 0;
77b6e194
LP
2518}
2519
4f758c23
LP
/* Fixed 128bit keys passed as hash_key to generate_mac(). A separate key is used for the
 * host side of the veth link, the container side of the veth link, and for MACVLAN
 * interfaces. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2523
a90e2305 2524static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2525 uint8_t result[8];
2526 size_t l, sz;
a90e2305
LP
2527 uint8_t *v, *i;
2528 int r;
01dde061
TG
2529
2530 l = strlen(arg_machine);
2531 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2532 if (idx > 0)
2533 sz += sizeof(idx);
a90e2305 2534
01dde061
TG
2535 v = alloca(sz);
2536
2537 /* fetch some persistent data unique to the host */
2538 r = sd_id128_get_machine((sd_id128_t*) v);
2539 if (r < 0)
2540 return r;
2541
2542 /* combine with some data unique (on this host) to this
2543 * container instance */
a90e2305
LP
2544 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2545 if (idx > 0) {
2546 idx = htole64(idx);
2547 memcpy(i, &idx, sizeof(idx));
2548 }
01dde061
TG
2549
2550 /* Let's hash the host machine ID plus the container name. We
2551 * use a fixed, but originally randomly created hash key here. */
4f758c23 2552 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2553
2554 assert_cc(ETH_ALEN <= sizeof(result));
2555 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2556
2557 /* see eth_random_addr in the kernel */
2558 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2559 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2560
2561 return 0;
2562}
2563
5aa4bb6b 2564static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1c4baffc
TG
2565 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2566 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4f758c23 2567 struct ether_addr mac_host, mac_container;
5aa4bb6b 2568 int r, i;
69c79d3c
LP
2569
2570 if (!arg_private_network)
2571 return 0;
2572
2573 if (!arg_network_veth)
2574 return 0;
2575
08af0da2
LP
2576 /* Use two different interface name prefixes depending whether
2577 * we are in bridge mode or not. */
c00524c9 2578 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2579 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2580
e867ceb6
LP
2581 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2582 if (r < 0)
2583 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2584
e867ceb6
LP
2585 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2586 if (r < 0)
2587 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2588
1c4baffc 2589 r = sd_netlink_open(&rtnl);
f647962d
MS
2590 if (r < 0)
2591 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2592
151b9b96 2593 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2594 if (r < 0)
2595 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2596
1c4baffc 2597 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2598 if (r < 0)
2599 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2600
1c4baffc 2601 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2602 if (r < 0)
2603 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2604
1c4baffc 2605 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2606 if (r < 0)
2607 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2608
1c4baffc 2609 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2610 if (r < 0)
2611 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2612
1c4baffc 2613 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2614 if (r < 0)
2615 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2616
1c4baffc 2617 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2618 if (r < 0)
2619 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2620
1c4baffc 2621 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2622 if (r < 0)
2623 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2624
1c4baffc 2625 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2626 if (r < 0)
2627 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c 2628
1c4baffc 2629 r = sd_netlink_message_close_container(m);
f647962d
MS
2630 if (r < 0)
2631 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2632
1c4baffc 2633 r = sd_netlink_message_close_container(m);
f647962d
MS
2634 if (r < 0)
2635 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2636
1c4baffc 2637 r = sd_netlink_message_close_container(m);
f647962d
MS
2638 if (r < 0)
2639 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2640
1c4baffc 2641 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d 2642 if (r < 0)
637aa8a3 2643 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
69c79d3c 2644
5aa4bb6b 2645 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2646 if (i <= 0)
2647 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2648
2649 *ifi = i;
2650
69c79d3c
LP
2651 return 0;
2652}
2653
5aa4bb6b 2654static int setup_bridge(const char veth_name[], int *ifi) {
1c4baffc
TG
2655 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2656 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
ab046dde
TG
2657 int r, bridge;
2658
2659 if (!arg_private_network)
2660 return 0;
2661
2662 if (!arg_network_veth)
2663 return 0;
2664
2665 if (!arg_network_bridge)
2666 return 0;
2667
2668 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2669 if (bridge <= 0)
2670 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2671
5aa4bb6b
LP
2672 *ifi = bridge;
2673
1c4baffc 2674 r = sd_netlink_open(&rtnl);
f647962d
MS
2675 if (r < 0)
2676 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2677
151b9b96 2678 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2679 if (r < 0)
2680 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2681
039dd4af 2682 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2683 if (r < 0)
2684 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2685
1c4baffc 2686 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2687 if (r < 0)
2688 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde 2689
1c4baffc 2690 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2691 if (r < 0)
2692 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde 2693
1c4baffc 2694 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2695 if (r < 0)
2696 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2697
2698 return 0;
2699}
2700
c74e630d
LP
2701static int parse_interface(struct udev *udev, const char *name) {
2702 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2703 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2704 int ifi;
2705
2706 ifi = (int) if_nametoindex(name);
4a62c710
MS
2707 if (ifi <= 0)
2708 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2709
2710 sprintf(ifi_str, "n%i", ifi);
2711 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2712 if (!d)
2713 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2714
2715 if (udev_device_get_is_initialized(d) <= 0) {
2716 log_error("Network interface %s is not initialized yet.", name);
2717 return -EBUSY;
2718 }
2719
2720 return ifi;
2721}
2722
69c79d3c 2723static int move_network_interfaces(pid_t pid) {
7e227024 2724 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2725 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
aa28aefe
LP
2726 char **i;
2727 int r;
2728
2729 if (!arg_private_network)
2730 return 0;
2731
2732 if (strv_isempty(arg_network_interfaces))
2733 return 0;
2734
1c4baffc 2735 r = sd_netlink_open(&rtnl);
f647962d
MS
2736 if (r < 0)
2737 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2738
7e227024
LP
2739 udev = udev_new();
2740 if (!udev) {
2741 log_error("Failed to connect to udev.");
2742 return -ENOMEM;
2743 }
2744
aa28aefe 2745 STRV_FOREACH(i, arg_network_interfaces) {
1c4baffc 2746 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
b88eb17a 2747 int ifi;
aa28aefe 2748
c74e630d
LP
2749 ifi = parse_interface(udev, *i);
2750 if (ifi < 0)
2751 return ifi;
2752
3125b3ef 2753 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2754 if (r < 0)
2755 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2756
1c4baffc 2757 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2758 if (r < 0)
2759 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2760
1c4baffc 2761 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2762 if (r < 0)
2763 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2764 }
7e227024 2765
c74e630d
LP
2766 return 0;
2767}
2768
2769static int setup_macvlan(pid_t pid) {
2770 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2771 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
e867ceb6 2772 unsigned idx = 0;
c74e630d
LP
2773 char **i;
2774 int r;
2775
2776 if (!arg_private_network)
2777 return 0;
2778
2779 if (strv_isempty(arg_network_macvlan))
2780 return 0;
2781
1c4baffc 2782 r = sd_netlink_open(&rtnl);
f647962d
MS
2783 if (r < 0)
2784 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2785
2786 udev = udev_new();
2787 if (!udev) {
2788 log_error("Failed to connect to udev.");
2789 return -ENOMEM;
2790 }
2791
2792 STRV_FOREACH(i, arg_network_macvlan) {
1c4baffc 2793 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
c74e630d 2794 _cleanup_free_ char *n = NULL;
e867ceb6 2795 struct ether_addr mac;
c74e630d
LP
2796 int ifi;
2797
2798 ifi = parse_interface(udev, *i);
2799 if (ifi < 0)
2800 return ifi;
2801
e867ceb6
LP
2802 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2803 if (r < 0)
2804 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2805
c74e630d 2806 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2807 if (r < 0)
2808 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2809
1c4baffc 2810 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2811 if (r < 0)
2812 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2813
2814 n = strappend("mv-", *i);
2815 if (!n)
2816 return log_oom();
2817
2818 strshorten(n, IFNAMSIZ-1);
2819
1c4baffc 2820 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2821 if (r < 0)
2822 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2823
1c4baffc 2824 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
e867ceb6
LP
2825 if (r < 0)
2826 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2827
1c4baffc 2828 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2829 if (r < 0)
2830 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d 2831
1c4baffc 2832 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2833 if (r < 0)
2834 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2835
1c4baffc 2836 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2837 if (r < 0)
2838 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2839
1c4baffc 2840 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2841 if (r < 0)
2842 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d 2843
1c4baffc 2844 r = sd_netlink_message_close_container(m);
f647962d
MS
2845 if (r < 0)
2846 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d 2847
1c4baffc 2848 r = sd_netlink_message_close_container(m);
f647962d
MS
2849 if (r < 0)
2850 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe 2851
1c4baffc 2852 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2853 if (r < 0)
2854 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2855 }
2856
2857 return 0;
2858}
2859
4bbfe7ad
TG
2860static int setup_ipvlan(pid_t pid) {
2861 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2862 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4bbfe7ad
TG
2863 char **i;
2864 int r;
2865
2866 if (!arg_private_network)
2867 return 0;
2868
2869 if (strv_isempty(arg_network_ipvlan))
2870 return 0;
2871
1c4baffc 2872 r = sd_netlink_open(&rtnl);
4bbfe7ad
TG
2873 if (r < 0)
2874 return log_error_errno(r, "Failed to connect to netlink: %m");
2875
2876 udev = udev_new();
2877 if (!udev) {
2878 log_error("Failed to connect to udev.");
2879 return -ENOMEM;
2880 }
2881
2882 STRV_FOREACH(i, arg_network_ipvlan) {
1c4baffc 2883 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
4bbfe7ad
TG
2884 _cleanup_free_ char *n = NULL;
2885 int ifi;
2886
2887 ifi = parse_interface(udev, *i);
2888 if (ifi < 0)
2889 return ifi;
2890
2891 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2892 if (r < 0)
2893 return log_error_errno(r, "Failed to allocate netlink message: %m");
2894
1c4baffc 2895 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
4bbfe7ad
TG
2896 if (r < 0)
2897 return log_error_errno(r, "Failed to add netlink interface index: %m");
2898
2899 n = strappend("iv-", *i);
2900 if (!n)
2901 return log_oom();
2902
2903 strshorten(n, IFNAMSIZ-1);
2904
1c4baffc 2905 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
4bbfe7ad
TG
2906 if (r < 0)
2907 return log_error_errno(r, "Failed to add netlink interface name: %m");
2908
1c4baffc 2909 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
4bbfe7ad
TG
2910 if (r < 0)
2911 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2912
1c4baffc 2913 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
4bbfe7ad
TG
2914 if (r < 0)
2915 return log_error_errno(r, "Failed to open netlink container: %m");
2916
1c4baffc 2917 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
4bbfe7ad
TG
2918 if (r < 0)
2919 return log_error_errno(r, "Failed to open netlink container: %m");
2920
1c4baffc 2921 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
4bbfe7ad
TG
2922 if (r < 0)
2923 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2924
1c4baffc 2925 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
2926 if (r < 0)
2927 return log_error_errno(r, "Failed to close netlink container: %m");
2928
1c4baffc 2929 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
2930 if (r < 0)
2931 return log_error_errno(r, "Failed to close netlink container: %m");
2932
1c4baffc 2933 r = sd_netlink_call(rtnl, m, 0, NULL);
4bbfe7ad
TG
2934 if (r < 0)
2935 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2936 }
2937
2938 return 0;
2939}
2940
/* Installs the container's seccomp filter: denies a fixed list of
 * syscalls (unless the capability guarding them is retained in
 * arg_retain) and makes creation of NETLINK_AUDIT sockets fail.
 *
 * Returns 0 on success, or when built without seccomp support, and the
 * failing libseccomp call's return value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int r;

        /* Default action: allow everything not explicitly filtered below */
        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(blacklist); idx++) {

                /* The guarding capability is retained: leave the syscall alone */
                if ((arg_retain & (1ULL << blacklist[idx].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[idx].syscall_num, 0);

                /* -EFAULT means the syscall is unknown here, which is not an error for us */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work inside containers and most of the audit
         * userspace fails there. Refuse creation of audit sockets so
         * that socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
         * EAFNOSUPPORT, which audit userspace takes as "audit is
         * disabled in the kernel". */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                /* Not fatal: run without the filter */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3035
785890ac
LP
3036static int setup_propagate(const char *root) {
3037 const char *p, *q;
3038
3039 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3040 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 3041 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
3042 (void) mkdir_p(p, 0600);
3043
03cfe0d5
LP
3044 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3045 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3046
3047 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3048 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3049
3050 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3051 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 3052
03cfe0d5 3053 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
3054 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3055 return log_error_errno(errno, "Failed to install propagation bind mount.");
3056
3057 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3058 return log_error_errno(errno, "Failed to make propagation mount read-only");
3059
3060 return 0;
3061}
3062
1b9e5b12
LP
3063static int setup_image(char **device_path, int *loop_nr) {
3064 struct loop_info64 info = {
3065 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3066 };
3067 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3068 _cleanup_free_ char* loopdev = NULL;
3069 struct stat st;
3070 int r, nr;
3071
3072 assert(device_path);
3073 assert(loop_nr);
ec16945e 3074 assert(arg_image);
1b9e5b12
LP
3075
3076 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
3077 if (fd < 0)
3078 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 3079
4a62c710
MS
3080 if (fstat(fd, &st) < 0)
3081 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
3082
3083 if (S_ISBLK(st.st_mode)) {
3084 char *p;
3085
3086 p = strdup(arg_image);
3087 if (!p)
3088 return log_oom();
3089
3090 *device_path = p;
3091
3092 *loop_nr = -1;
3093
3094 r = fd;
3095 fd = -1;
3096
3097 return r;
3098 }
3099
3100 if (!S_ISREG(st.st_mode)) {
56f64d95 3101 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
3102 return -EINVAL;
3103 }
3104
3105 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
3106 if (control < 0)
3107 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
3108
3109 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
3110 if (nr < 0)
3111 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
3112
3113 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3114 return log_oom();
3115
3116 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
3117 if (loop < 0)
3118 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 3119
4a62c710
MS
3120 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3121 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
3122
3123 if (arg_read_only)
3124 info.lo_flags |= LO_FLAGS_READ_ONLY;
3125
4a62c710
MS
3126 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3127 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
3128
3129 *device_path = loopdev;
3130 loopdev = NULL;
3131
3132 *loop_nr = nr;
3133
3134 r = loop;
3135 loop = -1;
3136
3137 return r;
3138}
3139
ada4799a
LP
/* Explanatory text that is appended to the error messages logged when
 * the partition table of a disk image cannot be used. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
3146
1b9e5b12
LP
/* Analyzes the partition table of the block device 'fd' and picks the
 * partitions to mount as root, /home and /srv.
 *
 * On GPT images partitions are selected by type UUID (the lowest
 * partition number wins per type); if no dedicated root partition is
 * found a single generic Linux partition is used as root. On MBR images
 * the single bootable partition of type 0x83 is used as root. More than
 * one generic candidate is an error.
 *
 * On success 0 is returned, *root_device (and optionally *home_device,
 * *srv_device) receive newly allocated device node paths owned by the
 * caller, the matching *_rw flags tell whether the partition may be
 * mounted writable, and *secondary is true if the secondary
 * architecture's root partition was chosen. On failure a negative error
 * value is returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel knows as many partitions as blkid found.
         * On leaving this loop via break, 'e' holds the matching
         * enumeration. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The kernel
                 * list is expected to have one entry more than that. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as the generic
                                 * candidate (not as 'root'), so that a
                                 * second bootable Linux partition is
                                 * detected and refused below instead of
                                 * silently replacing the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3519
/* Probes the file system type of the block device 'what' and mounts it
 * on 'where', or on 'where' + 'directory' if 'directory' is non-NULL.
 * The mount is read-only if 'rw' is false or arg_read_only is set.
 *
 * Returns 0 on success, a negative error value otherwise. LUKS volumes
 * are refused with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A return
                 * value of -1 is a real probing error and is handled
                 * by the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3583
727fd4fd
LP
3584static int mount_devices(
3585 const char *where,
3586 const char *root_device, bool root_device_rw,
3587 const char *home_device, bool home_device_rw,
3588 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
3589 int r;
3590
3591 assert(where);
3592
3593 if (root_device) {
727fd4fd 3594 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
3595 if (r < 0)
3596 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
3597 }
3598
3599 if (home_device) {
727fd4fd 3600 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
3601 if (r < 0)
3602 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
3603 }
3604
3605 if (srv_device) {
727fd4fd 3606 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
3607 if (r < 0)
3608 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
3609 }
3610
3611 return 0;
3612}
3613
3614static void loop_remove(int nr, int *image_fd) {
3615 _cleanup_close_ int control = -1;
e8c8ddcc 3616 int r;
1b9e5b12
LP
3617
3618 if (nr < 0)
3619 return;
3620
3621 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3622 r = ioctl(*image_fd, LOOP_CLR_FD);
3623 if (r < 0)
5e4074aa 3624 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3625 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3626 }
3627
3628 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3629 if (control < 0) {
56f64d95 3630 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3631 return;
e8c8ddcc 3632 }
1b9e5b12 3633
e8c8ddcc
TG
3634 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3635 if (r < 0)
5e4074aa 3636 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3637}
3638
0cb9fbcd
LP
3639static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3640 int pipe_fds[2];
3641 pid_t pid;
3642
3643 assert(database);
3644 assert(key);
3645 assert(rpid);
3646
4a62c710
MS
3647 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3648 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3649
3650 pid = fork();
4a62c710
MS
3651 if (pid < 0)
3652 return log_error_errno(errno, "Failed to fork getent child: %m");
3653 else if (pid == 0) {
0cb9fbcd
LP
3654 int nullfd;
3655 char *empty_env = NULL;
3656
3657 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3658 _exit(EXIT_FAILURE);
3659
3660 if (pipe_fds[0] > 2)
03e334a1 3661 safe_close(pipe_fds[0]);
0cb9fbcd 3662 if (pipe_fds[1] > 2)
03e334a1 3663 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3664
3665 nullfd = open("/dev/null", O_RDWR);
3666 if (nullfd < 0)
3667 _exit(EXIT_FAILURE);
3668
3669 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3670 _exit(EXIT_FAILURE);
3671
3672 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3673 _exit(EXIT_FAILURE);
3674
3675 if (nullfd > 2)
03e334a1 3676 safe_close(nullfd);
0cb9fbcd 3677
ce30c8dc
LP
3678 (void) reset_all_signal_handlers();
3679 (void) reset_signal_mask();
0cb9fbcd
LP
3680 close_all_fds(NULL, 0);
3681
4de82926
MM
3682 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3683 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3684 _exit(EXIT_FAILURE);
3685 }
3686
03e334a1 3687 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3688
3689 *rpid = pid;
3690
3691 return pipe_fds[0];
3692}
3693
3694static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3695 char line[LINE_MAX], *x, *u, *g, *h;
3696 const char *word, *state;
0cb9fbcd
LP
3697 _cleanup_free_ uid_t *uids = NULL;
3698 _cleanup_free_ char *home = NULL;
3699 _cleanup_fclose_ FILE *f = NULL;
3700 _cleanup_close_ int fd = -1;
3701 unsigned n_uids = 0;
70f539ca 3702 size_t sz = 0, l;
0cb9fbcd
LP
3703 uid_t uid;
3704 gid_t gid;
3705 pid_t pid;
3706 int r;
3707
3708 assert(_home);
3709
3710 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3711 /* Reset everything fully to 0, just in case */
3712
03cfe0d5
LP
3713 r = reset_uid_gid();
3714 if (r < 0)
3715 return log_error_errno(r, "Failed to become root: %m");
0cb9fbcd
LP
3716
3717 *_home = NULL;
3718 return 0;
3719 }
3720
3721 /* First, get user credentials */
3722 fd = spawn_getent("passwd", arg_user, &pid);
3723 if (fd < 0)
3724 return fd;
3725
3726 f = fdopen(fd, "r");
3727 if (!f)
3728 return log_oom();
3729 fd = -1;
3730
3731 if (!fgets(line, sizeof(line), f)) {
3732
3733 if (!ferror(f)) {
3734 log_error("Failed to resolve user %s.", arg_user);
3735 return -ESRCH;
3736 }
3737
56f64d95 3738 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3739 return -errno;
3740 }
3741
3742 truncate_nl(line);
3743
820d3acf 3744 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3745
3746 x = strchr(line, ':');
3747 if (!x) {
3748 log_error("/etc/passwd entry has invalid user field.");
3749 return -EIO;
3750 }
3751
3752 u = strchr(x+1, ':');
3753 if (!u) {
3754 log_error("/etc/passwd entry has invalid password field.");
3755 return -EIO;
3756 }
3757
3758 u++;
3759 g = strchr(u, ':');
3760 if (!g) {
3761 log_error("/etc/passwd entry has invalid UID field.");
3762 return -EIO;
3763 }
3764
3765 *g = 0;
3766 g++;
3767 x = strchr(g, ':');
3768 if (!x) {
3769 log_error("/etc/passwd entry has invalid GID field.");
3770 return -EIO;
3771 }
3772
3773 *x = 0;
3774 h = strchr(x+1, ':');
3775 if (!h) {
3776 log_error("/etc/passwd entry has invalid GECOS field.");
3777 return -EIO;
3778 }
3779
3780 h++;
3781 x = strchr(h, ':');
3782 if (!x) {
3783 log_error("/etc/passwd entry has invalid home directory field.");
3784 return -EIO;
3785 }
3786
3787 *x = 0;
3788
3789 r = parse_uid(u, &uid);
3790 if (r < 0) {
3791 log_error("Failed to parse UID of user.");
3792 return -EIO;
3793 }
3794
3795 r = parse_gid(g, &gid);
3796 if (r < 0) {
3797 log_error("Failed to parse GID of user.");
3798 return -EIO;
3799 }
3800
3801 home = strdup(h);
3802 if (!home)
3803 return log_oom();
3804
3805 /* Second, get group memberships */
3806 fd = spawn_getent("initgroups", arg_user, &pid);
3807 if (fd < 0)
3808 return fd;
3809
3810 fclose(f);
3811 f = fdopen(fd, "r");
3812 if (!f)
3813 return log_oom();
3814 fd = -1;
3815
3816 if (!fgets(line, sizeof(line), f)) {
3817 if (!ferror(f)) {
3818 log_error("Failed to resolve user %s.", arg_user);
3819 return -ESRCH;
3820 }
3821
56f64d95 3822 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3823 return -errno;
3824 }
3825
3826 truncate_nl(line);
3827
820d3acf 3828 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3829
3830 /* Skip over the username and subsequent separator whitespace */
3831 x = line;
3832 x += strcspn(x, WHITESPACE);
3833 x += strspn(x, WHITESPACE);
3834
a2a5291b 3835 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3836 char c[l+1];
3837
a2a5291b 3838 memcpy(c, word, l);
0cb9fbcd
LP
3839 c[l] = 0;
3840
3841 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3842 return log_oom();
3843
3844 r = parse_uid(c, &uids[n_uids++]);
3845 if (r < 0) {
3846 log_error("Failed to parse group data from getent.");
3847 return -EIO;
3848 }
3849 }
3850
3851 r = mkdir_parents(home, 0775);
f647962d
MS
3852 if (r < 0)
3853 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3854
3855 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3856 if (r < 0 && r != -EEXIST)
3857 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd 3858
03cfe0d5
LP
3859 (void) fchown(STDIN_FILENO, uid, gid);
3860 (void) fchown(STDOUT_FILENO, uid, gid);
3861 (void) fchown(STDERR_FILENO, uid, gid);
0cb9fbcd 3862
4a62c710
MS
3863 if (setgroups(n_uids, uids) < 0)
3864 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3865
4a62c710
MS
3866 if (setresgid(gid, gid, gid) < 0)
3867 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3868
4a62c710
MS
3869 if (setresuid(uid, uid, uid) < 0)
3870 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3871
3872 if (_home) {
3873 *_home = home;
3874 home = NULL;
3875 }
3876
3877 return 0;
3878}
3879
113cea80 3880/*
6d416b9c
LS
3881 * Return values:
3882 * < 0 : wait_for_terminate() failed to get the state of the
3883 * container, the container was terminated by a signal, or
3884 * failed for an unknown reason. No change is made to the
3885 * container argument.
3886 * > 0 : The program executed in the container terminated with an
3887 * error. The exit code of the program executed in the
919699ec
LP
3888 * container is returned. The container argument has been set
3889 * to CONTAINER_TERMINATED.
6d416b9c
LS
3890 * 0 : The container is being rebooted, has been shut down or exited
3891 * successfully. The container argument has been set to either
3892 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3893 *
6d416b9c
LS
3894 * That is, success is indicated by a return value of zero, and an
3895 * error is indicated by a non-zero value.
113cea80
DH
3896 */
3897static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3898 siginfo_t status;
919699ec 3899 int r;
113cea80
DH
3900
3901 r = wait_for_terminate(pid, &status);
f647962d
MS
3902 if (r < 0)
3903 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3904
3905 switch (status.si_code) {
fddbb89c 3906
113cea80 3907 case CLD_EXITED:
919699ec
LP
3908 if (status.si_status == 0) {
3909 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3910
fddbb89c 3911 } else
919699ec 3912 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3913
919699ec
LP
3914 *container = CONTAINER_TERMINATED;
3915 return status.si_status;
113cea80
DH
3916
3917 case CLD_KILLED:
3918 if (status.si_status == SIGINT) {
113cea80 3919
919699ec 3920 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3921 *container = CONTAINER_TERMINATED;
919699ec
LP
3922 return 0;
3923
113cea80 3924 } else if (status.si_status == SIGHUP) {
113cea80 3925
919699ec 3926 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3927 *container = CONTAINER_REBOOTED;
919699ec 3928 return 0;
113cea80 3929 }
919699ec 3930
113cea80
DH
3931 /* CLD_KILLED fallthrough */
3932
3933 case CLD_DUMPED:
fddbb89c 3934 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3935 return -EIO;
113cea80
DH
3936
3937 default:
fddbb89c 3938 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3939 return -EIO;
113cea80
DH
3940 }
3941
3942 return r;
3943}
3944
e866af3a
DH
/* Handler that takes a signal number and intentionally does nothing. */
static void nop_handler(int sig) {}
3946
023fb90b
LP
3947static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3948 pid_t pid;
3949
3950 pid = PTR_TO_UINT32(userdata);
3951 if (pid > 0) {
c6c8f6e2 3952 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
3953 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3954 sd_event_source_set_userdata(s, NULL);
3955 return 0;
3956 }
3957 }
3958
3959 sd_event_exit(sd_event_source_get_event(s), 0);
3960 return 0;
3961}
3962
ec16945e 3963static int determine_names(void) {
1b9cebf6 3964 int r;
ec16945e
LP
3965
3966 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3967 if (arg_machine) {
3968 _cleanup_(image_unrefp) Image *i = NULL;
3969
3970 r = image_find(arg_machine, &i);
3971 if (r < 0)
3972 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3973 else if (r == 0) {
3974 log_error("No image for machine '%s': %m", arg_machine);
3975 return -ENOENT;
3976 }
3977
aceac2f0 3978 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3979 r = set_sanitized_path(&arg_image, i->path);
3980 else
3981 r = set_sanitized_path(&arg_directory, i->path);
3982 if (r < 0)
3983 return log_error_errno(r, "Invalid image directory: %m");
3984
aee327b8
LP
3985 if (!arg_ephemeral)
3986 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 3987 } else
ec16945e
LP
3988 arg_directory = get_current_dir_name();
3989
1b9cebf6
LP
3990 if (!arg_directory && !arg_machine) {
3991 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3992 return -EINVAL;
3993 }
3994 }
3995
3996 if (!arg_machine) {
b9ba4dab
LP
3997 if (arg_directory && path_equal(arg_directory, "/"))
3998 arg_machine = gethostname_malloc();
3999 else
4000 arg_machine = strdup(basename(arg_image ?: arg_directory));
4001
ec16945e
LP
4002 if (!arg_machine)
4003 return log_oom();
4004
ae691c1d 4005 hostname_cleanup(arg_machine);
ec16945e
LP
4006 if (!machine_name_is_valid(arg_machine)) {
4007 log_error("Failed to determine machine name automatically, please use -M.");
4008 return -EINVAL;
4009 }
b9ba4dab
LP
4010
4011 if (arg_ephemeral) {
4012 char *b;
4013
4014 /* Add a random suffix when this is an
4015 * ephemeral machine, so that we can run many
4016 * instances at once without manually having
4017 * to specify -M each time. */
4018
4019 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4020 return log_oom();
4021
4022 free(arg_machine);
4023 arg_machine = b;
4024 }
ec16945e
LP
4025 }
4026
4027 return 0;
4028}
4029
03cfe0d5 4030static int determine_uid_shift(const char *directory) {
6dac160c
LP
4031 int r;
4032
03cfe0d5
LP
4033 if (!arg_userns) {
4034 arg_uid_shift = 0;
6dac160c 4035 return 0;
03cfe0d5 4036 }
6dac160c
LP
4037
4038 if (arg_uid_shift == UID_INVALID) {
4039 struct stat st;
4040
03cfe0d5 4041 r = stat(directory, &st);
6dac160c 4042 if (r < 0)
03cfe0d5 4043 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
4044
4045 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4046
4047 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 4048 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
4049 return -EINVAL;
4050 }
4051
4052 arg_uid_range = UINT32_C(0x10000);
4053 }
4054
4055 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4056 log_error("UID base too high for UID range.");
4057 return -EINVAL;
4058 }
4059
4060 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4061 return 0;
4062}
4063
03cfe0d5
LP
4064static int inner_child(
4065 Barrier *barrier,
4066 const char *directory,
4067 bool secondary,
4068 int kmsg_socket,
4069 int rtnl_socket,
4070 FDSet *fds,
4071 int argc,
4072 char *argv[]) {
69c79d3c 4073
03cfe0d5
LP
4074 _cleanup_free_ char *home = NULL;
4075 unsigned n_env = 2;
4076 const char *envp[] = {
4077 "PATH=" DEFAULT_PATH_SPLIT_USR,
4078 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4079 NULL, /* TERM */
4080 NULL, /* HOME */
4081 NULL, /* USER */
4082 NULL, /* LOGNAME */
4083 NULL, /* container_uuid */
4084 NULL, /* LISTEN_FDS */
4085 NULL, /* LISTEN_PID */
4086 NULL
4087 };
88213476 4088
2371271c 4089 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 4090 int r;
88213476 4091
03cfe0d5
LP
4092 assert(barrier);
4093 assert(directory);
4094 assert(kmsg_socket >= 0);
88213476 4095
03cfe0d5
LP
4096 if (arg_userns) {
4097 /* Tell the parent, that it now can write the UID map. */
4098 (void) barrier_place(barrier); /* #1 */
7027ff61 4099
03cfe0d5
LP
4100 /* Wait until the parent wrote the UID map */
4101 if (!barrier_place_and_sync(barrier)) { /* #2 */
4102 log_error("Parent died too early");
4103 return -ESRCH;
4104 }
88213476
LP
4105 }
4106
03cfe0d5
LP
4107 r = mount_all(NULL, true);
4108 if (r < 0)
4109 return r;
4110
4111 /* Wait until we are cgroup-ified, so that we
4112 * can mount the right cgroup path writable */
4113 if (!barrier_place_and_sync(barrier)) { /* #3 */
4114 log_error("Parent died too early");
4115 return -ESRCH;
88213476
LP
4116 }
4117
03cfe0d5
LP
4118 r = mount_systemd_cgroup_writable("");
4119 if (r < 0)
4120 return r;
ec16945e 4121
03cfe0d5
LP
4122 r = reset_uid_gid();
4123 if (r < 0)
4124 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 4125
03cfe0d5
LP
4126 r = setup_boot_id(NULL);
4127 if (r < 0)
4128 return r;
ec16945e 4129
03cfe0d5
LP
4130 r = setup_kmsg(NULL, kmsg_socket);
4131 if (r < 0)
4132 return r;
4133 kmsg_socket = safe_close(kmsg_socket);
ec16945e 4134
03cfe0d5 4135 umask(0022);
30535c16 4136
03cfe0d5
LP
4137 if (setsid() < 0)
4138 return log_error_errno(errno, "setsid() failed: %m");
4139
4140 if (arg_private_network)
4141 loopback_setup();
4142
4143 r = send_rtnl(rtnl_socket);
4144 if (r < 0)
4145 return r;
4146 rtnl_socket = safe_close(rtnl_socket);
4147
4148 if (drop_capabilities() < 0)
4149 return log_error_errno(errno, "drop_capabilities() failed: %m");
4150
4151 setup_hostname();
4152
050f7277 4153 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
4154 if (personality(arg_personality) < 0)
4155 return log_error_errno(errno, "personality() failed: %m");
4156 } else if (secondary) {
4157 if (personality(PER_LINUX32) < 0)
4158 return log_error_errno(errno, "personality() failed: %m");
4159 }
4160
4161#ifdef HAVE_SELINUX
4162 if (arg_selinux_context)
4163 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4164 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4165#endif
4166
4167 r = change_uid_gid(&home);
4168 if (r < 0)
4169 return r;
4170
4171 envp[n_env] = strv_find_prefix(environ, "TERM=");
4172 if (envp[n_env])
4173 n_env ++;
4174
4175 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4176 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4177 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4178 return log_oom();
4179
4180 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4181 char as_uuid[37];
4182
4183 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4184 return log_oom();
4185 }
4186
4187 if (fdset_size(fds) > 0) {
4188 r = fdset_cloexec(fds, false);
4189 if (r < 0)
4190 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4191
4192 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4193 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4194 return log_oom();
4195 }
4196
2371271c
TG
4197 env_use = strv_env_merge(2, envp, arg_setenv);
4198 if (!env_use)
4199 return log_oom();
03cfe0d5
LP
4200
4201 /* Let the parent know that we are ready and
4202 * wait until the parent is ready with the
4203 * setup, too... */
4204 if (!barrier_place_and_sync(barrier)) { /* #4 */
4205 log_error("Parent died too early");
4206 return -ESRCH;
4207 }
4208
4209 /* Now, explicitly close the log, so that we
4210 * then can close all remaining fds. Closing
4211 * the log explicitly first has the benefit
4212 * that the logging subsystem knows about it,
4213 * and is thus ready to be reopened should we
4214 * need it again. Note that the other fds
4215 * closed here are at least the locking and
4216 * barrier fds. */
4217 log_close();
4218 (void) fdset_close_others(fds);
4219
4220 if (arg_boot) {
4221 char **a;
4222 size_t m;
4223
4224 /* Automatically search for the init system */
4225
4226 m = 1 + argc - optind;
4227 a = newa(char*, m + 1);
4228 memcpy(a + 1, argv + optind, m * sizeof(char*));
4229
4230 a[0] = (char*) "/usr/lib/systemd/systemd";
4231 execve(a[0], a, env_use);
4232
4233 a[0] = (char*) "/lib/systemd/systemd";
4234 execve(a[0], a, env_use);
4235
4236 a[0] = (char*) "/sbin/init";
4237 execve(a[0], a, env_use);
4238 } else if (argc > optind)
4239 execvpe(argv[optind], argv + optind, env_use);
4240 else {
4241 chdir(home ? home : "/root");
4242 execle("/bin/bash", "-bash", NULL, env_use);
4243 execle("/bin/sh", "-sh", NULL, env_use);
4244 }
4245
4246 (void) log_open();
4247 return log_error_errno(errno, "execv() failed: %m");
4248}
4249
4250static int outer_child(
4251 Barrier *barrier,
4252 const char *directory,
4253 const char *console,
4254 const char *root_device, bool root_device_rw,
4255 const char *home_device, bool home_device_rw,
4256 const char *srv_device, bool srv_device_rw,
4257 bool interactive,
4258 bool secondary,
4259 int pid_socket,
4260 int kmsg_socket,
4261 int rtnl_socket,
825d5287 4262 int uid_shift_socket,
03cfe0d5
LP
4263 FDSet *fds,
4264 int argc,
4265 char *argv[]) {
4266
4267 pid_t pid;
4268 ssize_t l;
4269 int r;
4270
4271 assert(barrier);
4272 assert(directory);
4273 assert(console);
4274 assert(pid_socket >= 0);
4275 assert(kmsg_socket >= 0);
4276
4277 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4278 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4279
4280 if (interactive) {
4281 close_nointr(STDIN_FILENO);
4282 close_nointr(STDOUT_FILENO);
4283 close_nointr(STDERR_FILENO);
4284
4285 r = open_terminal(console, O_RDWR);
4286 if (r != STDIN_FILENO) {
4287 if (r >= 0) {
4288 safe_close(r);
4289 r = -EINVAL;
4290 }
4291
4292 return log_error_errno(r, "Failed to open console: %m");
4293 }
4294
4295 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4296 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4297 return log_error_errno(errno, "Failed to duplicate console: %m");
4298 }
4299
4300 r = reset_audit_loginuid();
4301 if (r < 0)
4302 return r;
4303
4304 /* Mark everything as slave, so that we still
4305 * receive mounts from the real root, but don't
4306 * propagate mounts to the real root. */
4307 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4308 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4309
4310 r = mount_devices(directory,
4311 root_device, root_device_rw,
4312 home_device, home_device_rw,
4313 srv_device, srv_device_rw);
4314 if (r < 0)
4315 return r;
4316
391567f4
LP
4317 r = determine_uid_shift(directory);
4318 if (r < 0)
4319 return r;
4320
825d5287
RM
4321 if (arg_userns) {
4322 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4323 if (l < 0)
4324 return log_error_errno(errno, "Failed to send UID shift: %m");
4325 if (l != sizeof(arg_uid_shift)) {
4326 log_error("Short write while sending UID shift.");
4327 return -EIO;
4328 }
4329 }
4330
03cfe0d5
LP
4331 /* Turn directory into bind mount */
4332 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4333 return log_error_errno(errno, "Failed to make bind mount: %m");
4334
03cfe0d5
LP
4335 r = setup_volatile(directory);
4336 if (r < 0)
4337 return r;
4338
03cfe0d5
LP
4339 r = setup_volatile_state(directory);
4340 if (r < 0)
4341 return r;
4342
03cfe0d5
LP
4343 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4344 if (r < 0)
4345 return r;
4346
03cfe0d5
LP
4347 if (arg_read_only) {
4348 r = bind_remount_recursive(directory, true);
4349 if (r < 0)
4350 return log_error_errno(r, "Failed to make tree read-only: %m");
4351 }
4352
03cfe0d5
LP
4353 r = mount_all(directory, false);
4354 if (r < 0)
4355 return r;
4356
4357 if (copy_devnodes(directory) < 0)
4358 return r;
4359
4360 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4361
4362 if (setup_pts(directory) < 0)
4363 return r;
4364
4365 r = setup_propagate(directory);
4366 if (r < 0)
4367 return r;
4368
4369 r = setup_dev_console(directory, console);
4370 if (r < 0)
4371 return r;
4372
4373 r = setup_seccomp();
4374 if (r < 0)
4375 return r;
4376
4377 r = setup_timezone(directory);
4378 if (r < 0)
4379 return r;
4380
4381 r = setup_resolv_conf(directory);
4382 if (r < 0)
4383 return r;
4384
4385 r = setup_journal(directory);
4386 if (r < 0)
4387 return r;
4388
4389 r = mount_custom(directory);
4390 if (r < 0)
4391 return r;
4392
4393 r = mount_cgroup(directory);
4394 if (r < 0)
4395 return r;
4396
4397 r = mount_move_root(directory);
4398 if (r < 0)
4399 return log_error_errno(r, "Failed to move root directory: %m");
4400
4401 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4402 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4403 (arg_private_network ? CLONE_NEWNET : 0) |
4404 (arg_userns ? CLONE_NEWUSER : 0),
4405 NULL);
4406 if (pid < 0)
4407 return log_error_errno(errno, "Failed to fork inner child: %m");
4408
4409 if (pid == 0) {
4410 pid_socket = safe_close(pid_socket);
825d5287 4411 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
4412
4413 /* The inner child has all namespaces that are
4414 * requested, so that we all are owned by the user if
4415 * user namespaces are turned on. */
4416
4417 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4418 if (r < 0)
4419 _exit(EXIT_FAILURE);
4420
4421 _exit(EXIT_SUCCESS);
4422 }
4423
4424 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4425 if (l < 0)
4426 return log_error_errno(errno, "Failed to send PID: %m");
4427 if (l != sizeof(pid)) {
4428 log_error("Short write while sending PID.");
4429 return -EIO;
4430 }
4431
4432 pid_socket = safe_close(pid_socket);
4433
4434 return 0;
4435}
4436
4437static int setup_uid_map(pid_t pid) {
4438 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4439 int r;
4440
4441 assert(pid > 1);
4442
4443 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4444 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 4445 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
4446 if (r < 0)
4447 return log_error_errno(r, "Failed to write UID map: %m");
4448
4449 /* We always assign the same UID and GID ranges */
4450 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 4451 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
4452 if (r < 0)
4453 return log_error_errno(r, "Failed to write GID map: %m");
4454
4455 return 0;
4456}
4457
4458static int chown_cgroup(pid_t pid) {
4459 _cleanup_free_ char *path = NULL, *fs = NULL;
4460 _cleanup_close_ int fd = -1;
4461 const char *fn;
4462 int r;
4463
4464 r = cg_pid_get_path(NULL, pid, &path);
4465 if (r < 0)
4466 return log_error_errno(r, "Failed to get container cgroup path: %m");
4467
4468 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4469 if (r < 0)
4470 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4471
4472 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4473 if (fd < 0)
4474 return log_error_errno(errno, "Failed to open %s: %m", fs);
4475
4476 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4477 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4478 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4479
4480 return 0;
4481}
4482
4483int main(int argc, char *argv[]) {
4484
4485 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4486 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4487 _cleanup_close_ int master = -1, image_fd = -1;
4488 _cleanup_fdset_free_ FDSet *fds = NULL;
4489 int r, n_fd_passed, loop_nr = -1;
4490 char veth_name[IFNAMSIZ];
4491 bool secondary = false, remove_subvol = false;
72c0a2c2 4492 sigset_t mask_chld;
03cfe0d5
LP
4493 pid_t pid = 0;
4494 int ret = EXIT_SUCCESS;
4495 union in_addr_union exposed = {};
4496 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4497 bool interactive;
4498
4499 log_parse_environment();
4500 log_open();
4501
4502 r = parse_argv(argc, argv);
4503 if (r <= 0)
4504 goto finish;
4505
4506 r = determine_names();
4507 if (r < 0)
4508 goto finish;
4509
4510 if (geteuid() != 0) {
4511 log_error("Need to be root.");
4512 r = -EPERM;
4513 goto finish;
4514 }
4515
4516 n_fd_passed = sd_listen_fds(false);
4517 if (n_fd_passed > 0) {
4518 r = fdset_new_listen_fds(&fds, false);
4519 if (r < 0) {
4520 log_error_errno(r, "Failed to collect file descriptors: %m");
4521 goto finish;
4522 }
4523 }
4524
4525 if (arg_directory) {
4526 assert(!arg_image);
4527
4528 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4529 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4530 r = -EINVAL;
4531 goto finish;
4532 }
4533
4534 if (arg_ephemeral) {
4535 _cleanup_free_ char *np = NULL;
4536
4537 /* If the specified path is a mount point we
4538 * generate the new snapshot immediately
4539 * inside it under a random name. However if
4540 * the specified is not a mount point we
4541 * create the new snapshot in the parent
4542 * directory, just next to it. */
e26d6ce5 4543 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
4544 if (r < 0) {
4545 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4546 goto finish;
4547 }
4548 if (r > 0)
770b5ce4 4549 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4550 else
770b5ce4 4551 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
4552 if (r < 0) {
4553 log_error_errno(r, "Failed to generate name for snapshot: %m");
4554 goto finish;
4555 }
4556
4557 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4558 if (r < 0) {
4559 log_error_errno(r, "Failed to lock %s: %m", np);
4560 goto finish;
4561 }
4562
4563 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4564 if (r < 0) {
4565 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4566 goto finish;
ec16945e
LP
4567 }
4568
4569 free(arg_directory);
4570 arg_directory = np;
8a16a7b4 4571 np = NULL;
ec16945e
LP
4572
4573 remove_subvol = true;
30535c16
LP
4574
4575 } else {
4576 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4577 if (r == -EBUSY) {
4578 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4579 goto finish;
4580 }
4581 if (r < 0) {
4582 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4583 return r;
4584 }
4585
4586 if (arg_template) {
f70a17f8 4587 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
30535c16
LP
4588 if (r == -EEXIST) {
4589 if (!arg_quiet)
4590 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4591 } else if (r < 0) {
83521414 4592 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4593 goto finish;
4594 } else {
4595 if (!arg_quiet)
4596 log_info("Populated %s from template %s.", arg_directory, arg_template);
4597 }
4598 }
ec16945e
LP
4599 }
4600
1b9e5b12
LP
4601 if (arg_boot) {
4602 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 4603 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 4604 r = -EINVAL;
1b9e5b12
LP
4605 goto finish;
4606 }
4607 } else {
4608 const char *p;
4609
63c372cb 4610 p = strjoina(arg_directory,
1b9e5b12
LP
4611 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4612 if (access(p, F_OK) < 0) {
4613 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 4614 r = -EINVAL;
1b9e5b12 4615 goto finish;
1b9e5b12
LP
4616 }
4617 }
ec16945e 4618
6b9132a9 4619 } else {
1b9e5b12 4620 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 4621
ec16945e
LP
4622 assert(arg_image);
4623 assert(!arg_template);
4624
30535c16
LP
4625 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4626 if (r == -EBUSY) {
4627 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4628 goto finish;
4629 }
4630 if (r < 0) {
4631 r = log_error_errno(r, "Failed to create image lock: %m");
4632 goto finish;
4633 }
4634
1b9e5b12 4635 if (!mkdtemp(template)) {
56f64d95 4636 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 4637 r = -errno;
6b9132a9 4638 goto finish;
1b9e5b12 4639 }
6b9132a9 4640
1b9e5b12
LP
4641 arg_directory = strdup(template);
4642 if (!arg_directory) {
4643 r = log_oom();
4644 goto finish;
6b9132a9 4645 }
88213476 4646
1b9e5b12
LP
4647 image_fd = setup_image(&device_path, &loop_nr);
4648 if (image_fd < 0) {
4649 r = image_fd;
842f3b0f
LP
4650 goto finish;
4651 }
1b9e5b12 4652
4d9f07b4
LP
4653 r = dissect_image(image_fd,
4654 &root_device, &root_device_rw,
4655 &home_device, &home_device_rw,
4656 &srv_device, &srv_device_rw,
4657 &secondary);
1b9e5b12
LP
4658 if (r < 0)
4659 goto finish;
842f3b0f 4660 }
842f3b0f 4661
5a8af538
LP
4662 r = custom_mounts_prepare();
4663 if (r < 0)
4664 goto finish;
4665
03cfe0d5
LP
4666 interactive =
4667 isatty(STDIN_FILENO) > 0 &&
4668 isatty(STDOUT_FILENO) > 0;
9c857b9d 4669
db7feb7e
LP
4670 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4671 if (master < 0) {
ec16945e 4672 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4673 goto finish;
4674 }
4675
611b312b
LP
4676 r = ptsname_malloc(master, &console);
4677 if (r < 0) {
4678 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
4679 goto finish;
4680 }
4681
a258bf26 4682 if (unlockpt(master) < 0) {
ec16945e 4683 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4684 goto finish;
4685 }
4686
9c857b9d
LP
4687 if (!arg_quiet)
4688 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4689 arg_machine, arg_image ?: arg_directory);
4690
72c0a2c2 4691 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4692
023fb90b
LP
4693 assert_se(sigemptyset(&mask_chld) == 0);
4694 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4695
03cfe0d5
LP
4696 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4697 r = log_error_errno(errno, "Failed to become subreaper: %m");
4698 goto finish;
4699 }
4700
d87be9b0 4701 for (;;) {
825d5287
RM
4702 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4703 uid_shift_socket_pair[2] = { -1, -1 };
113cea80 4704 ContainerStatus container_status;
7566e267 4705 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 4706 static const struct sigaction sa = {
e866af3a
DH
4707 .sa_handler = nop_handler,
4708 .sa_flags = SA_NOCLDSTOP,
4709 };
03cfe0d5
LP
4710 int ifi = 0;
4711 ssize_t l;
dbb60d69
LP
4712 _cleanup_event_unref_ sd_event *event = NULL;
4713 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4714 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4715 char last_char = 0;
e866af3a 4716
7566e267 4717 r = barrier_create(&barrier);
a2da110b 4718 if (r < 0) {
da927ba9 4719 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
4720 goto finish;
4721 }
4722
6d0b55c2
LP
4723 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4724 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4725 goto finish;
4726 }
4727
4728 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4729 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4730 goto finish;
4731 }
4732
03cfe0d5
LP
4733 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4734 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4735 goto finish;
4736 }
4737
825d5287
RM
4738 if (arg_userns)
4739 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4740 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4741 goto finish;
4742 }
4743
e866af3a
DH
4744 /* Child can be killed before execv(), so handle SIGCHLD
4745 * in order to interrupt parent's blocking calls and
4746 * give it a chance to call wait() and terminate. */
4747 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4748 if (r < 0) {
ec16945e 4749 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
4750 goto finish;
4751 }
4752
e866af3a
DH
4753 r = sigaction(SIGCHLD, &sa, NULL);
4754 if (r < 0) {
ec16945e 4755 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
4756 goto finish;
4757 }
4758
03cfe0d5 4759 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
4760 if (pid < 0) {
4761 if (errno == EINVAL)
ec16945e 4762 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 4763 else
ec16945e 4764 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 4765
d87be9b0
LP
4766 goto finish;
4767 }
a258bf26 4768
d87be9b0 4769 if (pid == 0) {
03cfe0d5 4770 /* The outer child only has a file system namespace. */
a2da110b
DH
4771 barrier_set_role(&barrier, BARRIER_CHILD);
4772
03e334a1 4773 master = safe_close(master);
a258bf26 4774
03e334a1 4775 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 4776 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 4777 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 4778 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 4779
ce30c8dc
LP
4780 (void) reset_all_signal_handlers();
4781 (void) reset_signal_mask();
f5c1b9ee 4782
03cfe0d5
LP
4783 r = outer_child(&barrier,
4784 arg_directory,
4785 console,
4786 root_device, root_device_rw,
4787 home_device, home_device_rw,
4788 srv_device, srv_device_rw,
4789 interactive,
4790 secondary,
4791 pid_socket_pair[1],
4792 kmsg_socket_pair[1],
4793 rtnl_socket_pair[1],
825d5287 4794 uid_shift_socket_pair[1],
03cfe0d5
LP
4795 fds,
4796 argc, argv);
0cb9fbcd 4797 if (r < 0)
a2da110b 4798 _exit(EXIT_FAILURE);
d87be9b0 4799
03cfe0d5 4800 _exit(EXIT_SUCCESS);
da5b3bad 4801 }
88213476 4802
a2da110b 4803 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 4804
842f3b0f
LP
4805 fdset_free(fds);
4806 fds = NULL;
4807
6d0b55c2
LP
4808 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4809 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 4810 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
6d0b55c2 4811
03cfe0d5
LP
4812 /* Wait for the outer child. */
4813 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4814 if (r < 0)
4815 goto finish;
4816 if (r != 0) {
4817 r = -EIO;
4818 goto finish;
4819 }
4820 pid = 0;
6dac160c 4821
03cfe0d5
LP
4822 /* And now retrieve the PID of the inner child. */
4823 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4824 if (l < 0) {
4825 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4826 goto finish;
4827 }
4828 if (l != sizeof(pid)) {
4829 log_error("Short read while reading inner child PID: %m");
4830 r = EIO;
4831 goto finish;
4832 }
354bfd2b 4833
03cfe0d5 4834 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 4835
03cfe0d5
LP
4836 if (arg_userns) {
4837 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4838 log_error("Child died too early.");
4839 r = -ESRCH;
840295fc 4840 goto finish;
03cfe0d5 4841 }
ab046dde 4842
825d5287
RM
4843 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4844 if (l < 0) {
4845 r = log_error_errno(errno, "Failed to read UID shift: %m");
4846 goto finish;
4847 }
4848 if (l != sizeof(arg_uid_shift)) {
4849 log_error("Short read while reading UID shift: %m");
4850 r = EIO;
4851 goto finish;
4852 }
4853
03cfe0d5 4854 r = setup_uid_map(pid);
840295fc
LP
4855 if (r < 0)
4856 goto finish;
ab046dde 4857
03cfe0d5
LP
4858 (void) barrier_place(&barrier); /* #2 */
4859 }
c74e630d 4860
03cfe0d5
LP
4861 r = move_network_interfaces(pid);
4862 if (r < 0)
4863 goto finish;
4bbfe7ad 4864
03cfe0d5
LP
4865 r = setup_veth(pid, veth_name, &ifi);
4866 if (r < 0)
4867 goto finish;
5aa4bb6b 4868
03cfe0d5
LP
4869 r = setup_bridge(veth_name, &ifi);
4870 if (r < 0)
4871 goto finish;
6dac160c 4872
03cfe0d5
LP
4873 r = setup_macvlan(pid);
4874 if (r < 0)
4875 goto finish;
6dac160c 4876
03cfe0d5
LP
4877 r = setup_ipvlan(pid);
4878 if (r < 0)
4879 goto finish;
6dac160c 4880
03cfe0d5
LP
4881 r = register_machine(pid, ifi);
4882 if (r < 0)
4883 goto finish;
6dac160c 4884
03cfe0d5
LP
4885 r = chown_cgroup(pid);
4886 if (r < 0)
4887 goto finish;
6dac160c 4888
03cfe0d5
LP
4889 /* Notify the child that the parent is ready with all
4890 * its setup (including cgroup-ification), and that
4891 * the child can now hand over control to the code to
4892 * run inside the container. */
4893 (void) barrier_place(&barrier); /* #3 */
6dac160c 4894
03cfe0d5
LP
4895 /* Block SIGCHLD here, before notifying child.
4896 * process_pty() will handle it with the other signals. */
4897 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 4898
03cfe0d5
LP
4899 /* Reset signal to default */
4900 r = default_signals(SIGCHLD, -1);
4901 if (r < 0) {
4902 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4903 goto finish;
4904 }
e866af3a 4905
03cfe0d5
LP
4906 /* Let the child know that we are ready and wait that the child is completely ready now. */
4907 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4908 log_error("Client died too early.");
4909 r = -ESRCH;
4910 goto finish;
4911 }
b12afc8c 4912
03cfe0d5
LP
4913 sd_notifyf(false,
4914 "READY=1\n"
4915 "STATUS=Container running.\n"
4916 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4917
03cfe0d5
LP
4918 r = sd_event_new(&event);
4919 if (r < 0) {
4920 log_error_errno(r, "Failed to get default event source: %m");
4921 goto finish;
4922 }
88213476 4923
03cfe0d5
LP
4924 if (arg_kill_signal > 0) {
4925 /* Try to kill the init system on SIGINT or SIGTERM */
4926 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4927 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4928 } else {
4929 /* Immediately exit */
4930 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4931 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4932 }
023fb90b 4933
03cfe0d5
LP
4934 /* simply exit on sigchld */
4935 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4936
03cfe0d5
LP
4937 if (arg_expose_ports) {
4938 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4939 if (r < 0)
4940 goto finish;
023fb90b 4941
03cfe0d5
LP
4942 (void) expose_ports(rtnl, &exposed);
4943 }
023fb90b 4944
03cfe0d5 4945 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4946
03cfe0d5
LP
4947 r = pty_forward_new(event, master, true, !interactive, &forward);
4948 if (r < 0) {
4949 log_error_errno(r, "Failed to create PTY forwarder: %m");
4950 goto finish;
4951 }
023fb90b 4952
03cfe0d5
LP
4953 r = sd_event_loop(event);
4954 if (r < 0) {
4955 log_error_errno(r, "Failed to run event loop: %m");
4956 goto finish;
4957 }
6d0b55c2 4958
03cfe0d5 4959 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 4960
03cfe0d5 4961 forward = pty_forward_free(forward);
6d0b55c2 4962
03cfe0d5
LP
4963 if (!arg_quiet && last_char != '\n')
4964 putc('\n', stdout);
04d39279 4965
03cfe0d5
LP
4966 /* Kill if it is not dead yet anyway */
4967 terminate_machine(pid);
1f0cd86b 4968
840295fc 4969 /* Normally redundant, but better safe than sorry */
04d39279 4970 kill(pid, SIGKILL);
a258bf26 4971
113cea80 4972 r = wait_for_container(pid, &container_status);
04d39279
LP
4973 pid = 0;
4974
ec16945e 4975 if (r < 0)
ce9f1527
LP
4976 /* We failed to wait for the container, or the
4977 * container exited abnormally */
ec16945e
LP
4978 goto finish;
4979 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4980 /* The container exited with a non-zero
4981 * status, or with zero status and no reboot
4982 * was requested. */
ec16945e 4983 ret = r;
d87be9b0 4984 break;
ec16945e 4985 }
88213476 4986
113cea80 4987 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4988
4989 if (arg_keep_unit) {
4990 /* Special handling if we are running as a
4991 * service: instead of simply restarting the
4992 * machine we want to restart the entire
4993 * service, so let's inform systemd about this
4994 * with the special exit code 133. The service
4995 * file uses RestartForceExitStatus=133 so
4996 * that this results in a full nspawn
4997 * restart. This is necessary since we might
4998 * have cgroup parameters set we want to have
4999 * flushed out. */
ec16945e
LP
5000 ret = 133;
5001 r = 0;
ce38dbc8
LP
5002 break;
5003 }
6d0b55c2
LP
5004
5005 flush_ports(&exposed);
d87be9b0 5006 }
88213476
LP
5007
5008finish:
af4ec430
LP
5009 sd_notify(false,
5010 "STOPPING=1\n"
5011 "STATUS=Terminating...");
5012
9444b1f2
LP
5013 if (pid > 0)
5014 kill(pid, SIGKILL);
88213476 5015
503546da
LP
5016 /* Try to flush whatever is still queued in the pty */
5017 if (master >= 0)
5018 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5019
03cfe0d5
LP
5020 loop_remove(loop_nr, &image_fd);
5021
ec16945e
LP
5022 if (remove_subvol && arg_directory) {
5023 int k;
5024
d9e2daaf 5025 k = btrfs_subvol_remove(arg_directory, true);
ec16945e
LP
5026 if (k < 0)
5027 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5028 }
5029
785890ac
LP
5030 if (arg_machine) {
5031 const char *p;
5032
63c372cb 5033 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5034 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5035 }
5036
04d391da 5037 free(arg_directory);
ec16945e
LP
5038 free(arg_template);
5039 free(arg_image);
7027ff61 5040 free(arg_machine);
c74e630d
LP
5041 free(arg_user);
5042 strv_free(arg_setenv);
5043 strv_free(arg_network_interfaces);
5044 strv_free(arg_network_macvlan);
4bbfe7ad 5045 strv_free(arg_network_ipvlan);
5a8af538 5046 custom_mount_free_all();
88213476 5047
6d0b55c2
LP
5048 flush_ports(&exposed);
5049
5050 while (arg_expose_ports) {
5051 ExposePort *p = arg_expose_ports;
5052 LIST_REMOVE(ports, arg_expose_ports, p);
5053 free(p);
5054 }
5055
ec16945e 5056 return r < 0 ? EXIT_FAILURE : ret;
88213476 5057}