]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: don't try to extract quotes from option string, glibc doesn't do that either
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
1f0cd86b
LP
55#include "sd-daemon.h"
56#include "sd-bus.h"
57#include "sd-id128.h"
1c4baffc 58#include "sd-netlink.h"
958b66ea 59#include "random-util.h"
88213476
LP
60#include "log.h"
61#include "util.h"
49e942b2 62#include "mkdir.h"
c6878637 63#include "rm-rf.h"
6b2d0e85 64#include "macro.h"
94d82985 65#include "missing.h"
04d391da 66#include "cgroup-util.h"
a258bf26 67#include "strv.h"
9eb977db 68#include "path-util.h"
a41fe3a2 69#include "loopback-setup.h"
4fc9982c 70#include "dev-setup.h"
842f3b0f 71#include "fdset.h"
acbeb427 72#include "build.h"
a5c32cff 73#include "fileio.h"
40ca29a1 74#include "bus-util.h"
1f0cd86b 75#include "bus-error.h"
4ba93280 76#include "ptyfwd.h"
f4889f65 77#include "env-util.h"
1c4baffc 78#include "netlink-util.h"
7e227024 79#include "udev-util.h"
1b9e5b12
LP
80#include "blkid-util.h"
81#include "gpt.h"
01dde061 82#include "siphash24.h"
849958d1 83#include "copy.h"
3577de7a 84#include "base-filesystem.h"
a2da110b 85#include "barrier.h"
023fb90b 86#include "event-util.h"
f01ae826 87#include "capability.h"
2822da4f 88#include "cap-list.h"
ec16945e 89#include "btrfs-util.h"
1b9cebf6 90#include "machine-image.h"
6d0b55c2
LP
91#include "list.h"
92#include "in-addr-util.h"
12c2884c 93#include "firewall-util.h"
6d0b55c2 94#include "local-addresses.h"
6482f626 95#include "formats-util.h"
0b452006 96#include "process-util.h"
288a74cc 97#include "terminal-util.h"
958b66ea 98#include "hostname-util.h"
24882e06 99#include "signal-util.h"
f2d88580 100
e9642be2
LP
101#ifdef HAVE_SECCOMP
102#include "seccomp-util.h"
103#endif
104
6d0b55c2
LP
/* One port forwarding requested with -p/--port=. parse_argv() fills in the
 * fields and prepends each entry to the arg_expose_ports list; duplicates are
 * detected there by comparing (protocol, host_port). */
105typedef struct ExposePort {
106 int protocol; /* IPPROTO_TCP or IPPROTO_UDP, as chosen by parse_argv() */
107 uint16_t host_port; /* port on the host side; parse_argv() rejects 0 */
108 uint16_t container_port; /* port inside the container; parse_argv() rejects 0 */
109 LIST_FIELDS(struct ExposePort, ports); /* list linkage, iterated with LIST_FOREACH(ports, ...) */
110} ExposePort;
111
113cea80
DH
/* NOTE(review): not referenced in the visible part of the file; presumably
 * reports whether the container ended for good or asked to be restarted --
 * confirm against the code that produces it. */
112typedef enum ContainerStatus {
113 CONTAINER_TERMINATED,
114 CONTAINER_REBOOTED
115} ContainerStatus;
116
57fb9fb5
LP
/* Journal linking mode selected with --link-journal= (or -j) and stored in
 * arg_link_journal. The "try-" variants of the option map to LINK_GUEST or
 * LINK_HOST with arg_link_journal_try set in addition. */
117typedef enum LinkJournal {
118 LINK_NO, /* --link-journal=no */
119 LINK_AUTO, /* --link-journal=auto; also the initial value of arg_link_journal */
120 LINK_HOST, /* --link-journal=host / try-host */
121 LINK_GUEST /* --link-journal=guest / try-guest / -j */
122} LinkJournal;
88213476 123
4d9f07b4
LP
/* Mode selected with --volatile[=MODE] and stored in arg_volatile. */
124typedef enum Volatile {
125 VOLATILE_NO, /* initial value; also chosen by a false boolean argument */
126 VOLATILE_YES, /* --volatile without argument, or with a true boolean argument */
127 VOLATILE_STATE, /* --volatile=state */
128} Volatile;
129
5a8af538
LP
/* Kind of a user-requested mount. The numeric order matters:
 * custom_mount_compare() uses it as tie-breaker for entries that share the
 * same destination. */
130typedef enum CustomMountType {
131 CUSTOM_MOUNT_BIND, /* created for --bind= and --bind-ro= */
132 CUSTOM_MOUNT_TMPFS, /* created for --tmpfs= */
133 CUSTOM_MOUNT_OVERLAY, /* created for --overlay= and --overlay-ro= */
134} CustomMountType;
135
/* One entry of the arg_custom_mounts array. All pointer members are owned by
 * the entry and released in custom_mount_free_all(). Which members are set
 * depends on the type (see parse_argv()). */
136typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only; /* set for the -ro variants of --bind= and --overlay= */
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination; /* mount point; custom_mounts_prepare() sorts entries by this path */
141 char *options; /* option string given on the command line (bind and tmpfs mounts) */
142 char *work_dir; /* generated by custom_mounts_prepare() for writable overlay mounts; removed from disk when freed */
143 char **lower; /* string vector of lower directories (overlay mounts only) */
144} CustomMount;
145
/* Command line configuration. These variables hold the defaults below until
 * parse_argv() overwrites them from the options given by the user. Variables
 * declared "const char *" (arg_selinux_context, arg_selinux_apifs_context,
 * arg_slice, arg_network_bridge) point straight into argv[] and are not
 * owned; the plain "char *" ones hold allocated copies. */
88213476 146static char *arg_directory = NULL;
ec16945e 147static char *arg_template = NULL;
687d0825 148static char *arg_user = NULL;
9444b1f2 149static sd_id128_t arg_uuid = {};
7027ff61 150static char *arg_machine = NULL;
c74e630d
LP
151static const char *arg_selinux_context = NULL;
152static const char *arg_selinux_apifs_context = NULL;
9444b1f2 153static const char *arg_slice = NULL;
ff01d048 154static bool arg_private_network = false;
bc2f673e 155static bool arg_read_only = false;
0f0dbc46 156static bool arg_boot = false;
ec16945e 157static bool arg_ephemeral = false;
57fb9fb5 158static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 159static bool arg_link_journal_try = false;
5076f0cc
LP
/* Bit mask of capabilities (bit number == CAP_* value) kept by default.
 * parse_argv() ORs in everything named by --capability=, adds CAP_NET_ADMIN
 * when private networking is enabled, and finally clears everything named by
 * --drop-capability=. */
160static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 182 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
5a8af538
LP
/* Dynamically grown array of user-requested mounts and its element count;
 * see custom_mount_add() and custom_mount_free_all(). */
187static CustomMount *arg_custom_mounts = NULL;
188static unsigned arg_n_custom_mounts = 0;
f4889f65 189static char **arg_setenv = NULL;
284c0b91 190static bool arg_quiet = false;
8a96d94e 191static bool arg_share_system = false;
eb91eb18 192static bool arg_register = true;
89f7c846 193static bool arg_keep_unit = false;
aa28aefe 194static char **arg_network_interfaces = NULL;
c74e630d 195static char **arg_network_macvlan = NULL;
4bbfe7ad 196static char **arg_network_ipvlan = NULL;
69c79d3c 197static bool arg_network_veth = false;
c74e630d 198static const char *arg_network_bridge = NULL;
050f7277 199static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 200static char *arg_image = NULL;
4d9f07b4 201static Volatile arg_volatile = VOLATILE_NO;
6d0b55c2 202static ExposePort *arg_expose_ports = NULL;
f36933fe 203static char **arg_property = NULL;
6dac160c
LP
/* arg_uid_shift stays UID_INVALID unless --private-users= names an explicit
 * base UID; arg_userns records that --private-users was given at all. */
204static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205static bool arg_userns = false;
c6c8f6e2 206static int arg_kill_signal = 0;
88213476 207
601185b4 208static void help(void) {
88213476
LP
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
211 " -h --help Show this help\n"
212 " --version Print version string\n"
69c79d3c 213 " -q --quiet Do not show status information\n"
1b9e5b12 214 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 222 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 223 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 224 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 225 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
69c79d3c
LP
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
c74e630d
LP
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
4bbfe7ad
TG
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
0dfaa006 238 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 239 " and container\n"
ab046dde 240 " --network-bridge=INTERFACE\n"
32457153 241 " Add a virtual ethernet connection between host\n"
ab046dde
TG
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
6d0b55c2 244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 245 " Expose a container IP port on the host\n"
82adf6af
LP
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
a8828ed9
DW
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 259 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
260 " --bind=PATH[:PATH[:OPTIONS]]\n"
261 " Bind mount a file or directory from the host into\n"
a8828ed9 262 " the container\n"
5e5bfa6e
EY
263 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
264 " Similar, but creates a read-only bind mount\n"
06c17c39 265 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
266 " --overlay=PATH[:PATH...]:PATH\n"
267 " Create an overlay mount from the host to \n"
268 " the container\n"
269 " --overlay-ro=PATH[:PATH...]:PATH\n"
270 " Similar, but creates a read-only overlay mount\n"
284c0b91 271 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 272 " --share-system Share system namespaces with host\n"
eb91eb18 273 " --register=BOOLEAN Register container as machine\n"
89f7c846 274 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 275 " the service unit nspawn is running in\n"
6d0b55c2
LP
276 " --volatile[=MODE] Run the system in volatile mode\n"
277 , program_invocation_short_name);
88213476
LP
278}
279
5a8af538
LP
280static CustomMount* custom_mount_add(CustomMountType t) {
281 CustomMount *c, *ret;
282
283 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
284 if (!c)
285 return NULL;
286
287 arg_custom_mounts = c;
288 ret = arg_custom_mounts + arg_n_custom_mounts;
289 arg_n_custom_mounts++;
290
291 *ret = (CustomMount) { .type = t };
292
293 return ret;
294}
295
296static void custom_mount_free_all(void) {
297 unsigned i;
298
299 for (i = 0; i < arg_n_custom_mounts; i++) {
300 CustomMount *m = &arg_custom_mounts[i];
301
302 free(m->source);
303 free(m->destination);
304 free(m->options);
305
306 if (m->work_dir) {
307 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
308 free(m->work_dir);
309 }
310
311 strv_free(m->lower);
312 }
313
97b11eed 314 arg_custom_mounts = mfree(arg_custom_mounts);
5a8af538
LP
315 arg_n_custom_mounts = 0;
316}
317
318static int custom_mount_compare(const void *a, const void *b) {
319 const CustomMount *x = a, *y = b;
320 int r;
321
322 r = path_compare(x->destination, y->destination);
323 if (r != 0)
324 return r;
325
326 if (x->type < y->type)
327 return -1;
328 if (x->type > y->type)
329 return 1;
330
331 return 0;
332}
333
334static int custom_mounts_prepare(void) {
335 unsigned i;
336 int r;
337
338 /* Ensure the mounts are applied prefix first. */
339 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
340
341 /* Allocate working directories for the overlay file systems that need it */
342 for (i = 0; i < arg_n_custom_mounts; i++) {
343 CustomMount *m = &arg_custom_mounts[i];
344
825d5287
RM
345 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
346 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
347 return -EINVAL;
348 }
349
5a8af538
LP
350 if (m->type != CUSTOM_MOUNT_OVERLAY)
351 continue;
352
353 if (m->work_dir)
354 continue;
355
356 if (m->read_only)
357 continue;
358
14bcf25c 359 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
360 if (r < 0)
361 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
362 }
363
364 return 0;
365}
366
ec16945e
LP
/* Replaces the string in *b with a cleaned-up version of path.
 *
 * The path is resolved with canonicalize_file_name(). If that fails with
 * ENOENT, path_make_absolute_cwd() is used instead; any other failure is
 * returned as -errno. The result is passed through path_kill_slashes() and
 * the previous value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM if path_make_absolute_cwd() fails. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* The path does not exist (yet), fall back to the non-resolving helper. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
387
88213476
LP
388static int parse_argv(int argc, char *argv[]) {
389
a41fe3a2 390 enum {
acbeb427
ZJS
391 ARG_VERSION = 0x100,
392 ARG_PRIVATE_NETWORK,
bc2f673e 393 ARG_UUID,
5076f0cc 394 ARG_READ_ONLY,
57fb9fb5 395 ARG_CAPABILITY,
420c7379 396 ARG_DROP_CAPABILITY,
17fe0523
LP
397 ARG_LINK_JOURNAL,
398 ARG_BIND,
f4889f65 399 ARG_BIND_RO,
06c17c39 400 ARG_TMPFS,
5a8af538
LP
401 ARG_OVERLAY,
402 ARG_OVERLAY_RO,
f4889f65 403 ARG_SETENV,
eb91eb18 404 ARG_SHARE_SYSTEM,
89f7c846 405 ARG_REGISTER,
aa28aefe 406 ARG_KEEP_UNIT,
69c79d3c 407 ARG_NETWORK_INTERFACE,
c74e630d 408 ARG_NETWORK_MACVLAN,
4bbfe7ad 409 ARG_NETWORK_IPVLAN,
ab046dde 410 ARG_NETWORK_BRIDGE,
6afc95b7 411 ARG_PERSONALITY,
4d9f07b4 412 ARG_VOLATILE,
ec16945e 413 ARG_TEMPLATE,
f36933fe 414 ARG_PROPERTY,
6dac160c 415 ARG_PRIVATE_USERS,
c6c8f6e2 416 ARG_KILL_SIGNAL,
a41fe3a2
LP
417 };
418
88213476 419 static const struct option options[] = {
aa28aefe
LP
420 { "help", no_argument, NULL, 'h' },
421 { "version", no_argument, NULL, ARG_VERSION },
422 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
423 { "template", required_argument, NULL, ARG_TEMPLATE },
424 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
425 { "user", required_argument, NULL, 'u' },
426 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
427 { "boot", no_argument, NULL, 'b' },
428 { "uuid", required_argument, NULL, ARG_UUID },
429 { "read-only", no_argument, NULL, ARG_READ_ONLY },
430 { "capability", required_argument, NULL, ARG_CAPABILITY },
431 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
432 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
433 { "bind", required_argument, NULL, ARG_BIND },
434 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 435 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
436 { "overlay", required_argument, NULL, ARG_OVERLAY },
437 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
438 { "machine", required_argument, NULL, 'M' },
439 { "slice", required_argument, NULL, 'S' },
440 { "setenv", required_argument, NULL, ARG_SETENV },
441 { "selinux-context", required_argument, NULL, 'Z' },
442 { "selinux-apifs-context", required_argument, NULL, 'L' },
443 { "quiet", no_argument, NULL, 'q' },
444 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
445 { "register", required_argument, NULL, ARG_REGISTER },
446 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
447 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 448 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 449 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 450 { "network-veth", no_argument, NULL, 'n' },
ab046dde 451 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 452 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 453 { "image", required_argument, NULL, 'i' },
4d9f07b4 454 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 455 { "port", required_argument, NULL, 'p' },
f36933fe 456 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 457 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 458 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
eb9da376 459 {}
88213476
LP
460 };
461
9444b1f2 462 int c, r;
a42c8b54 463 uint64_t plus = 0, minus = 0;
88213476
LP
464
465 assert(argc >= 0);
466 assert(argv);
467
0dfaa006 468 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
469
470 switch (c) {
471
472 case 'h':
601185b4
ZJS
473 help();
474 return 0;
88213476 475
acbeb427
ZJS
476 case ARG_VERSION:
477 puts(PACKAGE_STRING);
478 puts(SYSTEMD_FEATURES);
479 return 0;
480
88213476 481 case 'D':
ec16945e
LP
482 r = set_sanitized_path(&arg_directory, optarg);
483 if (r < 0)
484 return log_error_errno(r, "Invalid root directory: %m");
485
486 break;
487
488 case ARG_TEMPLATE:
489 r = set_sanitized_path(&arg_template, optarg);
490 if (r < 0)
491 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
492
493 break;
494
1b9e5b12 495 case 'i':
ec16945e
LP
496 r = set_sanitized_path(&arg_image, optarg);
497 if (r < 0)
498 return log_error_errno(r, "Invalid image path: %m");
499
500 break;
501
502 case 'x':
503 arg_ephemeral = true;
1b9e5b12
LP
504 break;
505
687d0825 506 case 'u':
2fc09a9c
DM
507 r = free_and_strdup(&arg_user, optarg);
508 if (r < 0)
7027ff61 509 return log_oom();
687d0825
MV
510
511 break;
512
ab046dde 513 case ARG_NETWORK_BRIDGE:
c74e630d 514 arg_network_bridge = optarg;
ab046dde
TG
515
516 /* fall through */
517
0dfaa006 518 case 'n':
69c79d3c
LP
519 arg_network_veth = true;
520 arg_private_network = true;
521 break;
522
aa28aefe 523 case ARG_NETWORK_INTERFACE:
c74e630d
LP
524 if (strv_extend(&arg_network_interfaces, optarg) < 0)
525 return log_oom();
526
527 arg_private_network = true;
528 break;
529
530 case ARG_NETWORK_MACVLAN:
531 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
532 return log_oom();
533
4bbfe7ad
TG
534 arg_private_network = true;
535 break;
536
537 case ARG_NETWORK_IPVLAN:
538 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
539 return log_oom();
540
aa28aefe
LP
541 /* fall through */
542
ff01d048
LP
543 case ARG_PRIVATE_NETWORK:
544 arg_private_network = true;
a41fe3a2
LP
545 break;
546
0f0dbc46
LP
547 case 'b':
548 arg_boot = true;
549 break;
550
144f0fc0 551 case ARG_UUID:
9444b1f2
LP
552 r = sd_id128_from_string(optarg, &arg_uuid);
553 if (r < 0) {
aa96c6cb 554 log_error("Invalid UUID: %s", optarg);
9444b1f2 555 return r;
aa96c6cb 556 }
9444b1f2 557 break;
aa96c6cb 558
9444b1f2 559 case 'S':
c74e630d 560 arg_slice = optarg;
144f0fc0
LP
561 break;
562
7027ff61 563 case 'M':
c1521918 564 if (isempty(optarg))
97b11eed 565 arg_machine = mfree(arg_machine);
c1521918 566 else {
0c3c4284 567 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
568 log_error("Invalid machine name: %s", optarg);
569 return -EINVAL;
570 }
7027ff61 571
0c3c4284
LP
572 r = free_and_strdup(&arg_machine, optarg);
573 if (r < 0)
eb91eb18
LP
574 return log_oom();
575
576 break;
577 }
7027ff61 578
82adf6af
LP
579 case 'Z':
580 arg_selinux_context = optarg;
a8828ed9
DW
581 break;
582
82adf6af
LP
583 case 'L':
584 arg_selinux_apifs_context = optarg;
a8828ed9
DW
585 break;
586
bc2f673e
LP
587 case ARG_READ_ONLY:
588 arg_read_only = true;
589 break;
590
420c7379
LP
591 case ARG_CAPABILITY:
592 case ARG_DROP_CAPABILITY: {
a2a5291b 593 const char *state, *word;
5076f0cc
LP
594 size_t length;
595
596 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 597 _cleanup_free_ char *t;
5076f0cc
LP
598
599 t = strndup(word, length);
0d0f0c50
SL
600 if (!t)
601 return log_oom();
5076f0cc 602
39ed67d1
LP
603 if (streq(t, "all")) {
604 if (c == ARG_CAPABILITY)
a42c8b54 605 plus = (uint64_t) -1;
39ed67d1 606 else
a42c8b54 607 minus = (uint64_t) -1;
39ed67d1 608 } else {
2822da4f
LP
609 int cap;
610
611 cap = capability_from_name(t);
612 if (cap < 0) {
39ed67d1
LP
613 log_error("Failed to parse capability %s.", t);
614 return -EINVAL;
615 }
616
617 if (c == ARG_CAPABILITY)
a42c8b54 618 plus |= 1ULL << (uint64_t) cap;
39ed67d1 619 else
a42c8b54 620 minus |= 1ULL << (uint64_t) cap;
5076f0cc 621 }
5076f0cc
LP
622 }
623
624 break;
625 }
626
57fb9fb5
LP
627 case 'j':
628 arg_link_journal = LINK_GUEST;
574edc90 629 arg_link_journal_try = true;
57fb9fb5
LP
630 break;
631
632 case ARG_LINK_JOURNAL:
53e438e3 633 if (streq(optarg, "auto")) {
57fb9fb5 634 arg_link_journal = LINK_AUTO;
53e438e3
LP
635 arg_link_journal_try = false;
636 } else if (streq(optarg, "no")) {
57fb9fb5 637 arg_link_journal = LINK_NO;
53e438e3
LP
638 arg_link_journal_try = false;
639 } else if (streq(optarg, "guest")) {
57fb9fb5 640 arg_link_journal = LINK_GUEST;
53e438e3
LP
641 arg_link_journal_try = false;
642 } else if (streq(optarg, "host")) {
57fb9fb5 643 arg_link_journal = LINK_HOST;
53e438e3
LP
644 arg_link_journal_try = false;
645 } else if (streq(optarg, "try-guest")) {
574edc90
MP
646 arg_link_journal = LINK_GUEST;
647 arg_link_journal_try = true;
648 } else if (streq(optarg, "try-host")) {
649 arg_link_journal = LINK_HOST;
650 arg_link_journal_try = true;
651 } else {
57fb9fb5
LP
652 log_error("Failed to parse link journal mode %s", optarg);
653 return -EINVAL;
654 }
655
656 break;
657
17fe0523
LP
658 case ARG_BIND:
659 case ARG_BIND_RO: {
e4a5d9ed 660 const char *current = optarg;
5e5bfa6e 661 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
5a8af538 662 CustomMount *m;
17fe0523 663
5e5bfa6e 664 r = extract_many_words(&current, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, &opts, NULL);
e4a5d9ed
RM
665 switch (r) {
666 case 1:
667 destination = strdup(source);
668 case 2:
5e5bfa6e 669 case 3:
e4a5d9ed
RM
670 break;
671 case -ENOMEM:
672 return log_oom();
673 default:
674 log_error("Invalid bind mount specification: %s", optarg);
675 return -EINVAL;
17fe0523
LP
676 }
677
5a8af538 678 if (!source || !destination)
17fe0523
LP
679 return log_oom();
680
5a8af538 681 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
17fe0523
LP
682 log_error("Invalid bind mount specification: %s", optarg);
683 return -EINVAL;
684 }
685
5a8af538
LP
686 m = custom_mount_add(CUSTOM_MOUNT_BIND);
687 if (!m)
b3451bed 688 return log_oom();
17fe0523 689
5a8af538
LP
690 m->source = source;
691 m->destination = destination;
692 m->read_only = c == ARG_BIND_RO;
5e5bfa6e 693 m->options = opts;
5a8af538 694
5e5bfa6e 695 source = destination = opts = NULL;
17fe0523
LP
696
697 break;
698 }
699
06c17c39 700 case ARG_TMPFS: {
6330ee10 701 const char *current = optarg;
5a8af538
LP
702 _cleanup_free_ char *path = NULL, *opts = NULL;
703 CustomMount *m;
06c17c39 704
6330ee10
RM
705 r = extract_first_word(&current, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
706 if (r == -ENOMEM)
707 return log_oom();
708 else if (r < 0) {
709 log_error("Invalid tmpfs specification: %s", optarg);
710 return r;
06c17c39 711 }
6330ee10
RM
712 if (r)
713 opts = strdup(current);
714 else
715 opts = strdup("mode=0755");
06c17c39 716
5a8af538 717 if (!path || !opts)
06c17c39
LP
718 return log_oom();
719
5a8af538 720 if (!path_is_absolute(path)) {
06c17c39
LP
721 log_error("Invalid tmpfs specification: %s", optarg);
722 return -EINVAL;
723 }
724
5a8af538
LP
725 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
726 if (!m)
06c17c39
LP
727 return log_oom();
728
5a8af538
LP
729 m->destination = path;
730 m->options = opts;
06c17c39 731
5a8af538
LP
732 path = opts = NULL;
733
734 break;
735 }
736
737 case ARG_OVERLAY:
738 case ARG_OVERLAY_RO: {
739 _cleanup_free_ char *upper = NULL, *destination = NULL;
740 _cleanup_strv_free_ char **lower = NULL;
741 CustomMount *m;
742 unsigned n = 0;
743 char **i;
744
62f9f39a
RM
745 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
746 if (r == -ENOMEM)
06c17c39 747 return log_oom();
62f9f39a
RM
748 else if (r < 0) {
749 log_error("Invalid overlay specification: %s", optarg);
750 return r;
751 }
06c17c39 752
5a8af538
LP
753 STRV_FOREACH(i, lower) {
754 if (!path_is_absolute(*i)) {
755 log_error("Overlay path %s is not absolute.", *i);
756 return -EINVAL;
757 }
758
759 n++;
760 }
761
762 if (n < 2) {
763 log_error("--overlay= needs at least two colon-separated directories specified.");
764 return -EINVAL;
765 }
766
767 if (n == 2) {
768 /* If two parameters are specified,
769 * the first one is the lower, the
770 * second one the upper directory. And
af86c440
ZJS
771 * we'll also define the destination
772 * mount point the same as the upper. */
5a8af538
LP
773 upper = lower[1];
774 lower[1] = NULL;
775
776 destination = strdup(upper);
777 if (!destination)
778 return log_oom();
779
780 } else {
781 upper = lower[n - 2];
782 destination = lower[n - 1];
783 lower[n - 2] = NULL;
784 }
785
786 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
787 if (!m)
788 return log_oom();
789
790 m->destination = destination;
791 m->source = upper;
792 m->lower = lower;
793 m->read_only = c == ARG_OVERLAY_RO;
794
795 upper = destination = NULL;
796 lower = NULL;
06c17c39
LP
797
798 break;
799 }
800
f4889f65
LP
801 case ARG_SETENV: {
802 char **n;
803
804 if (!env_assignment_is_valid(optarg)) {
805 log_error("Environment variable assignment '%s' is not valid.", optarg);
806 return -EINVAL;
807 }
808
809 n = strv_env_set(arg_setenv, optarg);
810 if (!n)
811 return log_oom();
812
813 strv_free(arg_setenv);
814 arg_setenv = n;
815 break;
816 }
817
284c0b91
LP
818 case 'q':
819 arg_quiet = true;
820 break;
821
8a96d94e
LP
822 case ARG_SHARE_SYSTEM:
823 arg_share_system = true;
824 break;
825
eb91eb18
LP
826 case ARG_REGISTER:
827 r = parse_boolean(optarg);
828 if (r < 0) {
829 log_error("Failed to parse --register= argument: %s", optarg);
830 return r;
831 }
832
833 arg_register = r;
834 break;
835
89f7c846
LP
836 case ARG_KEEP_UNIT:
837 arg_keep_unit = true;
838 break;
839
6afc95b7
LP
840 case ARG_PERSONALITY:
841
ac45f971 842 arg_personality = personality_from_string(optarg);
050f7277 843 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
844 log_error("Unknown or unsupported personality '%s'.", optarg);
845 return -EINVAL;
846 }
847
848 break;
849
4d9f07b4
LP
850 case ARG_VOLATILE:
851
852 if (!optarg)
853 arg_volatile = VOLATILE_YES;
854 else {
855 r = parse_boolean(optarg);
856 if (r < 0) {
857 if (streq(optarg, "state"))
858 arg_volatile = VOLATILE_STATE;
859 else {
860 log_error("Failed to parse --volatile= argument: %s", optarg);
861 return r;
862 }
863 } else
864 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
865 }
866
867 break;
868
6d0b55c2
LP
869 case 'p': {
870 const char *split, *e;
871 uint16_t container_port, host_port;
872 int protocol;
873 ExposePort *p;
874
875 if ((e = startswith(optarg, "tcp:")))
876 protocol = IPPROTO_TCP;
877 else if ((e = startswith(optarg, "udp:")))
878 protocol = IPPROTO_UDP;
879 else {
880 e = optarg;
881 protocol = IPPROTO_TCP;
882 }
883
884 split = strchr(e, ':');
885 if (split) {
886 char v[split - e + 1];
887
888 memcpy(v, e, split - e);
889 v[split - e] = 0;
890
891 r = safe_atou16(v, &host_port);
892 if (r < 0 || host_port <= 0) {
893 log_error("Failed to parse host port: %s", optarg);
894 return -EINVAL;
895 }
896
897 r = safe_atou16(split + 1, &container_port);
898 } else {
899 r = safe_atou16(e, &container_port);
900 host_port = container_port;
901 }
902
903 if (r < 0 || container_port <= 0) {
904 log_error("Failed to parse host port: %s", optarg);
905 return -EINVAL;
906 }
907
908 LIST_FOREACH(ports, p, arg_expose_ports) {
909 if (p->protocol == protocol && p->host_port == host_port) {
910 log_error("Duplicate port specification: %s", optarg);
911 return -EINVAL;
912 }
913 }
914
915 p = new(ExposePort, 1);
916 if (!p)
917 return log_oom();
918
919 p->protocol = protocol;
920 p->host_port = host_port;
921 p->container_port = container_port;
922
923 LIST_PREPEND(ports, arg_expose_ports, p);
924
925 break;
926 }
927
f36933fe
LP
928 case ARG_PROPERTY:
929 if (strv_extend(&arg_property, optarg) < 0)
930 return log_oom();
931
932 break;
933
6dac160c
LP
934 case ARG_PRIVATE_USERS:
935 if (optarg) {
936 _cleanup_free_ char *buffer = NULL;
937 const char *range, *shift;
938
939 range = strchr(optarg, ':');
940 if (range) {
941 buffer = strndup(optarg, range - optarg);
942 if (!buffer)
943 return log_oom();
944 shift = buffer;
945
946 range++;
947 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
948 log_error("Failed to parse UID range: %s", range);
949 return -EINVAL;
950 }
951 } else
952 shift = optarg;
953
954 if (parse_uid(shift, &arg_uid_shift) < 0) {
955 log_error("Failed to parse UID: %s", optarg);
956 return -EINVAL;
957 }
958 }
959
960 arg_userns = true;
961 break;
962
c6c8f6e2
LP
963 case ARG_KILL_SIGNAL:
964 arg_kill_signal = signal_from_string_try_harder(optarg);
965 if (arg_kill_signal < 0) {
966 log_error("Cannot parse signal: %s", optarg);
967 return -EINVAL;
968 }
969
970 break;
971
88213476
LP
972 case '?':
973 return -EINVAL;
974
975 default:
eb9da376 976 assert_not_reached("Unhandled option");
88213476 977 }
88213476 978
eb91eb18
LP
979 if (arg_share_system)
980 arg_register = false;
981
982 if (arg_boot && arg_share_system) {
983 log_error("--boot and --share-system may not be combined.");
984 return -EINVAL;
985 }
986
89f7c846
LP
987 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
988 log_error("--keep-unit may not be used when invoked from a user session.");
989 return -EINVAL;
990 }
991
1b9e5b12
LP
992 if (arg_directory && arg_image) {
993 log_error("--directory= and --image= may not be combined.");
994 return -EINVAL;
995 }
996
ec16945e
LP
997 if (arg_template && arg_image) {
998 log_error("--template= and --image= may not be combined.");
999 return -EINVAL;
1000 }
1001
1002 if (arg_template && !(arg_directory || arg_machine)) {
1003 log_error("--template= needs --directory= or --machine=.");
1004 return -EINVAL;
1005 }
1006
1007 if (arg_ephemeral && arg_template) {
1008 log_error("--ephemeral and --template= may not be combined.");
1009 return -EINVAL;
1010 }
1011
1012 if (arg_ephemeral && arg_image) {
1013 log_error("--ephemeral and --image= may not be combined.");
1014 return -EINVAL;
1015 }
1016
df9a75e4
LP
1017 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1018 log_error("--ephemeral and --link-journal= may not be combined.");
1019 return -EINVAL;
1020 }
1021
4d9f07b4
LP
1022 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1023 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1024 return -EINVAL;
1025 }
1026
6d0b55c2
LP
1027 if (arg_expose_ports && !arg_private_network) {
1028 log_error("Cannot use --port= without private networking.");
1029 return -EINVAL;
1030 }
1031
b774fb7f
DH
1032 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1033 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1034
a42c8b54
LP
1035 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1036
c6c8f6e2
LP
1037 if (arg_boot && arg_kill_signal <= 0)
1038 arg_kill_signal = SIGRTMIN+3;
1039
88213476
LP
1040 return 1;
1041}
1042
03cfe0d5
LP
1043static int tmpfs_patch_options(const char *options, char **ret) {
1044 char *buf = NULL;
1045
1046 if (arg_userns && arg_uid_shift != 0) {
825d5287 1047 assert(arg_uid_shift != UID_INVALID);
03cfe0d5
LP
1048
1049 if (options)
f001a835 1050 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
03cfe0d5 1051 else
f001a835 1052 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
03cfe0d5
LP
1053 if (!buf)
1054 return -ENOMEM;
1055
1056 options = buf;
1057 }
1058
1059#ifdef HAVE_SELINUX
1060 if (arg_selinux_apifs_context) {
1061 char *t;
1062
1063 if (options)
1064 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1065 else
1066 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1067 if (!t) {
1068 free(buf);
1069 return -ENOMEM;
1070 }
1071
1072 free(buf);
1073 buf = t;
1074 }
1075#endif
1076
1077 *ret = buf;
1078 return !!buf;
1079}
1080
1081static int mount_all(const char *dest, bool userns) {
88213476
LP
1082
1083 typedef struct MountPoint {
1084 const char *what;
1085 const char *where;
1086 const char *type;
1087 const char *options;
1088 unsigned long flags;
3bd66c05 1089 bool fatal;
03cfe0d5 1090 bool userns;
88213476
LP
1091 } MountPoint;
1092
1093 static const MountPoint mount_table[] = {
3c59d4f2
RM
1094 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1095 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1096 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1097 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1098 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1099 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1100 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1101 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1102 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
9b634ea5 1103#ifdef HAVE_SELINUX
3c59d4f2
RM
1104 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1105 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
9b634ea5 1106#endif
88213476
LP
1107 };
1108
1109 unsigned k;
03cfe0d5 1110 int r;
88213476
LP
1111
1112 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
d15d65a0 1113 _cleanup_free_ char *where = NULL, *options = NULL;
d002827b 1114 const char *o;
88213476 1115
03cfe0d5
LP
1116 if (userns != mount_table[k].userns)
1117 continue;
1118
1119 where = prefix_root(dest, mount_table[k].where);
17fe0523
LP
1120 if (!where)
1121 return log_oom();
88213476 1122
e26d6ce5 1123 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
03cfe0d5
LP
1124 if (r < 0 && r != -ENOENT)
1125 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
88213476 1126
9c1c7f71 1127 /* Skip this entry if it is not a remount. */
03cfe0d5 1128 if (mount_table[k].what && r > 0)
014a9c77
LP
1129 continue;
1130
03cfe0d5
LP
1131 r = mkdir_p(where, 0755);
1132 if (r < 0) {
1133 if (mount_table[k].fatal)
1134 return log_error_errno(r, "Failed to create directory %s: %m", where);
79d80fc1 1135
03cfe0d5 1136 log_warning_errno(r, "Failed to create directory %s: %m", where);
79d80fc1
TG
1137 continue;
1138 }
88213476 1139
03cfe0d5
LP
1140 o = mount_table[k].options;
1141 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1142 r = tmpfs_patch_options(o, &options);
1143 if (r < 0)
6dac160c 1144 return log_oom();
03cfe0d5
LP
1145 if (r > 0)
1146 o = options;
6dac160c 1147 }
a8828ed9 1148
88213476
LP
1149 if (mount(mount_table[k].what,
1150 where,
1151 mount_table[k].type,
1152 mount_table[k].flags,
79d80fc1 1153 o) < 0) {
88213476 1154
03cfe0d5
LP
1155 if (mount_table[k].fatal)
1156 return log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 1157
03cfe0d5 1158 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
88213476 1159 }
88213476
LP
1160 }
1161
03cfe0d5 1162 return 0;
e58a1277 1163}
f8440af5 1164
5e5bfa6e
EY
1165static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
1166 const char *p = options;
1167 unsigned long flags = *mount_flags;
1168 char *opts = NULL;
1169
1170 assert(options);
1171
1172 for (;;) {
1173 _cleanup_free_ char *word = NULL;
a19222e1 1174 int r = extract_first_word(&p, &word, ",", 0);
5e5bfa6e
EY
1175 if (r < 0)
1176 return log_error_errno(r, "Failed to extract mount option: %m");
1177 if (r == 0)
1178 break;
1179
1180 if (streq(word, "rbind"))
1181 flags |= MS_REC;
1182 else if (streq(word, "norbind"))
1183 flags &= ~MS_REC;
1184 else {
1185 log_error("Invalid bind mount option: %s", word);
1186 return -EINVAL;
1187 }
1188 }
1189
1190 *mount_flags = flags;
1191 /* in the future mount_opts will hold string options for mount(2) */
1192 *mount_opts = opts;
1193
1194 return 0;
1195}
1196
5a8af538
LP
1197static int mount_bind(const char *dest, CustomMount *m) {
1198 struct stat source_st, dest_st;
03cfe0d5 1199 const char *where;
5e5bfa6e
EY
1200 unsigned long mount_flags = MS_BIND | MS_REC;
1201 _cleanup_free_ char *mount_opts = NULL;
5a8af538 1202 int r;
17fe0523 1203
5a8af538 1204 assert(m);
d2421337 1205
5e5bfa6e
EY
1206 if (m->options) {
1207 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
1208 if (r < 0)
1209 return r;
1210 }
1211
5a8af538
LP
1212 if (stat(m->source, &source_st) < 0)
1213 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
17fe0523 1214
03cfe0d5 1215 where = prefix_roota(dest, m->destination);
06c17c39 1216
03cfe0d5 1217 if (stat(where, &dest_st) >= 0) {
5a8af538
LP
1218 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1219 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1220 return -EINVAL;
2ed4e5e0 1221 }
06c17c39 1222
5a8af538
LP
1223 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1224 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1225 return -EINVAL;
d2421337 1226 }
17fe0523 1227
5a8af538
LP
1228 } else if (errno == ENOENT) {
1229 r = mkdir_parents_label(where, 0755);
1230 if (r < 0)
1231 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1232 } else {
1233 log_error_errno(errno, "Failed to stat %s: %m", where);
1234 return -errno;
1235 }
17fe0523 1236
5a8af538
LP
1237 /* Create the mount point. Any non-directory file can be
1238 * mounted on any non-directory file (regular, fifo, socket,
1239 * char, block).
1240 */
1241 if (S_ISDIR(source_st.st_mode))
1242 r = mkdir_label(where, 0755);
1243 else
1244 r = touch(where);
1245 if (r < 0 && r != -EEXIST)
1246 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1247
5e5bfa6e 1248 if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
5a8af538
LP
1249 return log_error_errno(errno, "mount(%s) failed: %m", where);
1250
1251 if (m->read_only) {
1252 r = bind_remount_recursive(where, true);
1253 if (r < 0)
1254 return log_error_errno(r, "Read-only bind mount failed: %m");
1255 }
1256
1257 return 0;
1258}
1259
1260static int mount_tmpfs(const char *dest, CustomMount *m) {
03cfe0d5
LP
1261 const char *where, *options;
1262 _cleanup_free_ char *buf = NULL;
5a8af538
LP
1263 int r;
1264
1265 assert(dest);
1266 assert(m);
1267
03cfe0d5 1268 where = prefix_roota(dest, m->destination);
5a8af538 1269
03cfe0d5 1270 r = mkdir_p_label(where, 0755);
5a8af538
LP
1271 if (r < 0 && r != -EEXIST)
1272 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1273
03cfe0d5
LP
1274 r = tmpfs_patch_options(m->options, &buf);
1275 if (r < 0)
1276 return log_oom();
1277 options = r > 0 ? buf : m->options;
1278
1279 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
5a8af538
LP
1280 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1281
1282 return 0;
1283}
1284
872d0dbd
RM
1285static char *joined_and_escaped_lower_dirs(char * const *lower) {
1286 _cleanup_strv_free_ char **sv = NULL;
1287
1288 sv = strv_copy(lower);
1289 if (!sv)
1290 return NULL;
1291
1292 strv_reverse(sv);
1293
1294 if (!strv_shell_escape(sv, ",:"))
1295 return NULL;
1296
1297 return strv_join(sv, ":");
1298}
1299
5a8af538
LP
1300static int mount_overlay(const char *dest, CustomMount *m) {
1301 _cleanup_free_ char *lower = NULL;
03cfe0d5 1302 const char *where, *options;
5a8af538
LP
1303 int r;
1304
1305 assert(dest);
1306 assert(m);
1307
03cfe0d5 1308 where = prefix_roota(dest, m->destination);
5a8af538
LP
1309
1310 r = mkdir_label(where, 0755);
1311 if (r < 0 && r != -EEXIST)
1312 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1313
1314 (void) mkdir_p_label(m->source, 0755);
1315
872d0dbd 1316 lower = joined_and_escaped_lower_dirs(m->lower);
5a8af538
LP
1317 if (!lower)
1318 return log_oom();
1319
872d0dbd
RM
1320 if (m->read_only) {
1321 _cleanup_free_ char *escaped_source = NULL;
1322
1323 escaped_source = shell_escape(m->source, ",:");
1324 if (!escaped_source)
1325 return log_oom();
1326
1327 options = strjoina("lowerdir=", escaped_source, ":", lower);
1328 } else {
1329 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
1330
5a8af538
LP
1331 assert(m->work_dir);
1332 (void) mkdir_label(m->work_dir, 0700);
1333
872d0dbd
RM
1334 escaped_source = shell_escape(m->source, ",:");
1335 if (!escaped_source)
1336 return log_oom();
1337 escaped_work_dir = shell_escape(m->work_dir, ",:");
1338 if (!escaped_work_dir)
1339 return log_oom();
1340
1341 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
5a8af538
LP
1342 }
1343
1344 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1345 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1346
1347 return 0;
1348}
1349
1350static int mount_custom(const char *dest) {
1351 unsigned i;
1352 int r;
1353
1354 assert(dest);
1355
1356 for (i = 0; i < arg_n_custom_mounts; i++) {
1357 CustomMount *m = &arg_custom_mounts[i];
1358
1359 switch (m->type) {
1360
1361 case CUSTOM_MOUNT_BIND:
1362 r = mount_bind(dest, m);
1363 break;
1364
1365 case CUSTOM_MOUNT_TMPFS:
1366 r = mount_tmpfs(dest, m);
1367 break;
1368
1369 case CUSTOM_MOUNT_OVERLAY:
1370 r = mount_overlay(dest, m);
1371 break;
1372
1373 default:
1374 assert_not_reached("Unknown custom mount type");
17fe0523 1375 }
5a8af538
LP
1376
1377 if (r < 0)
1378 return r;
17fe0523
LP
1379 }
1380
1381 return 0;
1382}
1383
b12afc8c
LP
1384static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1385 char *to;
1386 int r;
1387
63c372cb 1388 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
b12afc8c 1389
e26d6ce5 1390 r = path_is_mount_point(to, 0);
da00518b 1391 if (r < 0 && r != -ENOENT)
b12afc8c
LP
1392 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1393 if (r > 0)
1394 return 0;
1395
1396 mkdir_p(to, 0755);
1397
c0534580
LP
1398 /* The superblock mount options of the mount point need to be
1399 * identical to the hosts', and hence writable... */
1400 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
b12afc8c
LP
1401 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1402
c0534580
LP
1403 /* ... hence let's only make the bind mount read-only, not the
1404 * superblock. */
1405 if (read_only) {
1406 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1407 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1408 }
b12afc8c
LP
1409 return 1;
1410}
1411
1412static int mount_cgroup(const char *dest) {
1413 _cleanup_set_free_free_ Set *controllers = NULL;
03cfe0d5 1414 const char *cgroup_root;
b12afc8c
LP
1415 int r;
1416
1417 controllers = set_new(&string_hash_ops);
1418 if (!controllers)
1419 return log_oom();
1420
1421 r = cg_kernel_controllers(controllers);
1422 if (r < 0)
1423 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1424
b12afc8c
LP
1425 for (;;) {
1426 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1427
1428 controller = set_steal_first(controllers);
1429 if (!controller)
1430 break;
1431
03cfe0d5 1432 origin = prefix_root("/sys/fs/cgroup/", controller);
b12afc8c
LP
1433 if (!origin)
1434 return log_oom();
1435
1436 r = readlink_malloc(origin, &combined);
1437 if (r == -EINVAL) {
1438 /* Not a symbolic link, but directly a single cgroup hierarchy */
1439
1440 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1441 if (r < 0)
1442 return r;
1443
1444 } else if (r < 0)
1445 return log_error_errno(r, "Failed to read link %s: %m", origin);
1446 else {
1447 _cleanup_free_ char *target = NULL;
1448
03cfe0d5 1449 target = prefix_root(dest, origin);
b12afc8c
LP
1450 if (!target)
1451 return log_oom();
1452
1453 /* A symbolic link, a combination of controllers in one hierarchy */
1454
1455 if (!filename_is_valid(combined)) {
1456 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1457 continue;
1458 }
1459
1460 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1461 if (r < 0)
1462 return r;
1463
875e1014
ILG
1464 r = symlink_idempotent(combined, target);
1465 if (r == -EINVAL) {
1466 log_error("Invalid existing symlink for combined hierarchy");
1467 return r;
1468 }
1469 if (r < 0)
1470 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1471 }
1472 }
1473
c0534580 1474 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
b12afc8c
LP
1475 if (r < 0)
1476 return r;
1477
03cfe0d5
LP
1478 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1479 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1480 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1481
1482 return 0;
1483}
1484
1485static int mount_systemd_cgroup_writable(const char *dest) {
1486 _cleanup_free_ char *own_cgroup_path = NULL;
1487 const char *systemd_root, *systemd_own;
1488 int r;
1489
1490 assert(dest);
1491
1492 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1493 if (r < 0)
1494 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1495
b12afc8c 1496 /* Make our own cgroup a (writable) bind mount */
63c372cb 1497 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
b12afc8c
LP
1498 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1499 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1500
1501 /* And then remount the systemd cgroup root read-only */
03cfe0d5 1502 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
b12afc8c
LP
1503 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1504 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1505
03cfe0d5
LP
1506 return 0;
1507}
1508
1509static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1510 assert(p);
1511
1512 if (!arg_userns)
1513 return 0;
1514
1515 if (uid == UID_INVALID && gid == GID_INVALID)
1516 return 0;
1517
1518 if (uid != UID_INVALID) {
1519 uid += arg_uid_shift;
1520
1521 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1522 return -EOVERFLOW;
1523 }
1524
1525 if (gid != GID_INVALID) {
1526 gid += (gid_t) arg_uid_shift;
1527
1528 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1529 return -EOVERFLOW;
1530 }
1531
1532 if (lchown(p, uid, gid) < 0)
1533 return -errno;
b12afc8c
LP
1534
1535 return 0;
1536}
1537
03cfe0d5
LP
/* Creates the directory 'path' below 'root' with the given mode and, if it
 * was newly created, hands it to userns_lchown() for ownership adjustment.
 * An already existing directory is left alone and counts as success (0).
 * Other mkdir() failures are returned as -errno. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1550
e58a1277 1551static int setup_timezone(const char *dest) {
03cfe0d5
LP
1552 _cleanup_free_ char *p = NULL, *q = NULL;
1553 const char *where, *check, *what;
d4036145
LP
1554 char *z, *y;
1555 int r;
f8440af5 1556
e58a1277
LP
1557 assert(dest);
1558
1559 /* Fix the timezone, if possible */
d4036145
LP
1560 r = readlink_malloc("/etc/localtime", &p);
1561 if (r < 0) {
1562 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1563 return 0;
1564 }
1565
1566 z = path_startswith(p, "../usr/share/zoneinfo/");
1567 if (!z)
1568 z = path_startswith(p, "/usr/share/zoneinfo/");
1569 if (!z) {
1570 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1571 return 0;
1572 }
1573
03cfe0d5 1574 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1575 r = readlink_malloc(where, &q);
1576 if (r >= 0) {
1577 y = path_startswith(q, "../usr/share/zoneinfo/");
1578 if (!y)
1579 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1580
d4036145
LP
1581 /* Already pointing to the right place? Then do nothing .. */
1582 if (y && streq(y, z))
1583 return 0;
1584 }
1585
03cfe0d5
LP
1586 check = strjoina("/usr/share/zoneinfo/", z);
1587 check = prefix_root(dest, check);
1588 if (laccess(check, F_OK) < 0) {
d4036145
LP
1589 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1590 return 0;
1591 }
68fb0892 1592
79d80fc1
TG
1593 r = unlink(where);
1594 if (r < 0 && errno != ENOENT) {
56f64d95 1595 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1596 return 0;
1597 }
4d9f07b4 1598
03cfe0d5 1599 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1600 if (symlink(what, where) < 0) {
56f64d95 1601 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1602 return 0;
1603 }
e58a1277 1604
03cfe0d5
LP
1605 r = userns_lchown(where, 0, 0);
1606 if (r < 0)
1607 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1608
e58a1277 1609 return 0;
88213476
LP
1610}
1611
2547bb41 1612static int setup_resolv_conf(const char *dest) {
03cfe0d5 1613 const char *where = NULL;
79d80fc1 1614 int r;
2547bb41
LP
1615
1616 assert(dest);
1617
1618 if (arg_private_network)
1619 return 0;
1620
1621 /* Fix resolv.conf, if possible */
03cfe0d5 1622 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1623
f2068bcc 1624 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1625 if (r < 0) {
68a313c5
LP
1626 /* If the file already exists as symlink, let's
1627 * suppress the warning, under the assumption that
1628 * resolved or something similar runs inside and the
1629 * symlink points there.
1630 *
1631 * If the disk image is read-only, there's also no
1632 * point in complaining.
1633 */
1634 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1635 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1636 return 0;
1637 }
2547bb41 1638
03cfe0d5
LP
1639 r = userns_lchown(where, 0, 0);
1640 if (r < 0)
1641 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1642
2547bb41
LP
1643 return 0;
1644}
1645
4d9f07b4 1646static int setup_volatile_state(const char *directory) {
03cfe0d5
LP
1647 _cleanup_free_ char *buf = NULL;
1648 const char *p, *options;
4d9f07b4
LP
1649 int r;
1650
1651 assert(directory);
1652
1653 if (arg_volatile != VOLATILE_STATE)
1654 return 0;
1655
1656 /* --volatile=state means we simply overmount /var
1657 with a tmpfs, and the rest read-only. */
1658
1659 r = bind_remount_recursive(directory, true);
f647962d
MS
1660 if (r < 0)
1661 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4 1662
03cfe0d5 1663 p = prefix_roota(directory, "/var");
79d80fc1 1664 r = mkdir(p, 0755);
4a62c710
MS
1665 if (r < 0 && errno != EEXIST)
1666 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1667
03cfe0d5
LP
1668 options = "mode=755";
1669 r = tmpfs_patch_options(options, &buf);
1670 if (r < 0)
1671 return log_oom();
1672 if (r > 0)
1673 options = buf;
1674
1675 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
4a62c710 1676 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1677
1678 return 0;
1679}
1680
1681static int setup_volatile(const char *directory) {
1682 bool tmpfs_mounted = false, bind_mounted = false;
1683 char template[] = "/tmp/nspawn-volatile-XXXXXX";
03cfe0d5
LP
1684 _cleanup_free_ char *buf = NULL;
1685 const char *f, *t, *options;
4d9f07b4
LP
1686 int r;
1687
1688 assert(directory);
1689
1690 if (arg_volatile != VOLATILE_YES)
1691 return 0;
1692
1693 /* --volatile=yes means we mount a tmpfs to the root dir, and
1694 the original /usr to use inside it, and that read-only. */
1695
4a62c710
MS
1696 if (!mkdtemp(template))
1697 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4 1698
03cfe0d5
LP
1699 options = "mode=755";
1700 r = tmpfs_patch_options(options, &buf);
1701 if (r < 0)
1702 return log_oom();
1703 if (r > 0)
1704 options = buf;
1705
1706 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1707 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1708 goto fail;
1709 }
1710
1711 tmpfs_mounted = true;
1712
03cfe0d5
LP
1713 f = prefix_roota(directory, "/usr");
1714 t = prefix_roota(template, "/usr");
4d9f07b4 1715
79d80fc1
TG
1716 r = mkdir(t, 0755);
1717 if (r < 0 && errno != EEXIST) {
03cfe0d5 1718 r = log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1719 goto fail;
1720 }
1721
4543768d 1722 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
03cfe0d5 1723 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1724 goto fail;
1725 }
1726
1727 bind_mounted = true;
1728
1729 r = bind_remount_recursive(t, true);
1730 if (r < 0) {
da927ba9 1731 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1732 goto fail;
1733 }
1734
1735 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
03cfe0d5 1736 r = log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1737 goto fail;
1738 }
1739
03cfe0d5 1740 (void) rmdir(template);
4d9f07b4
LP
1741
1742 return 0;
1743
1744fail:
1745 if (bind_mounted)
03cfe0d5
LP
1746 (void) umount(t);
1747
4d9f07b4 1748 if (tmpfs_mounted)
03cfe0d5
LP
1749 (void) umount(template);
1750 (void) rmdir(template);
4d9f07b4
LP
1751 return r;
1752}
1753
9f24adc2 1754static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1755 assert(s);
9f24adc2
LP
1756
1757 snprintf(s, 37,
1758 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1759 SD_ID128_FORMAT_VAL(id));
1760
1761 return s;
1762}
1763
04bc4a3f 1764static int setup_boot_id(const char *dest) {
03cfe0d5 1765 const char *from, *to;
39883f62 1766 sd_id128_t rnd = {};
04bc4a3f
LP
1767 char as_uuid[37];
1768 int r;
1769
eb91eb18
LP
1770 if (arg_share_system)
1771 return 0;
1772
04bc4a3f
LP
1773 /* Generate a new randomized boot ID, so that each boot-up of
1774 * the container gets a new one */
1775
03cfe0d5
LP
1776 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1777 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1778
1779 r = sd_id128_randomize(&rnd);
f647962d
MS
1780 if (r < 0)
1781 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1782
9f24adc2 1783 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1784
4c1fc3e4 1785 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1786 if (r < 0)
1787 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1788
03cfe0d5
LP
1789 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1790 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1791 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1792 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1793
1794 unlink(from);
04bc4a3f
LP
1795 return r;
1796}
1797
e58a1277 1798static int copy_devnodes(const char *dest) {
88213476
LP
1799
1800 static const char devnodes[] =
1801 "null\0"
1802 "zero\0"
1803 "full\0"
1804 "random\0"
1805 "urandom\0"
85614d66
TG
1806 "tty\0"
1807 "net/tun\0";
88213476
LP
1808
1809 const char *d;
e58a1277 1810 int r = 0;
7fd1b19b 1811 _cleanup_umask_ mode_t u;
a258bf26
LP
1812
1813 assert(dest);
124640f1
LP
1814
1815 u = umask(0000);
88213476 1816
03cfe0d5
LP
1817 /* Create /dev/net, so that we can create /dev/net/tun in it */
1818 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1819 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1820
88213476 1821 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1822 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1823 struct stat st;
88213476 1824
7f112f50 1825 from = strappend("/dev/", d);
03cfe0d5 1826 to = prefix_root(dest, from);
88213476
LP
1827
1828 if (stat(from, &st) < 0) {
1829
4a62c710
MS
1830 if (errno != ENOENT)
1831 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1832
a258bf26 1833 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1834
03cfe0d5 1835 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1836 return -EIO;
a258bf26 1837
85614d66 1838 } else {
81f5049b
AC
1839 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1840 if (errno != EPERM)
1841 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1842
1843 /* Some systems abusively restrict mknod but
1844 * allow bind mounts. */
1845 r = touch(to);
1846 if (r < 0)
1847 return log_error_errno(r, "touch (%s) failed: %m", to);
1848 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1849 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1850 }
6278cf60 1851
03cfe0d5
LP
1852 r = userns_lchown(to, 0, 0);
1853 if (r < 0)
1854 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1855 }
88213476
LP
1856 }
1857
e58a1277
LP
1858 return r;
1859}
88213476 1860
03cfe0d5
LP
1861static int setup_pts(const char *dest) {
1862 _cleanup_free_ char *options = NULL;
1863 const char *p;
1864
1865#ifdef HAVE_SELINUX
1866 if (arg_selinux_apifs_context)
1867 (void) asprintf(&options,
3dce8915 1868 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1869 arg_uid_shift + TTY_GID,
1870 arg_selinux_apifs_context);
1871 else
1872#endif
1873 (void) asprintf(&options,
3dce8915 1874 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1875 arg_uid_shift + TTY_GID);
f2d88580 1876
03cfe0d5 1877 if (!options)
f2d88580
LP
1878 return log_oom();
1879
03cfe0d5 1880 /* Mount /dev/pts itself */
cc9fce65 1881 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1882 if (mkdir(p, 0755) < 0)
1883 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1884 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1885 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1886 if (userns_lchown(p, 0, 0) < 0)
1887 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1888
1889 /* Create /dev/ptmx symlink */
1890 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1891 if (symlink("pts/ptmx", p) < 0)
1892 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
03cfe0d5
LP
1893 if (userns_lchown(p, 0, 0) < 0)
1894 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
f2d88580 1895
03cfe0d5
LP
1896 /* And fix /dev/pts/ptmx ownership */
1897 p = prefix_roota(dest, "/dev/pts/ptmx");
1898 if (userns_lchown(p, 0, 0) < 0)
1899 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1900
f2d88580
LP
1901 return 0;
1902}
1903
e58a1277 1904static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1905 _cleanup_umask_ mode_t u;
1906 const char *to;
e58a1277 1907 int r;
e58a1277
LP
1908
1909 assert(dest);
1910 assert(console);
1911
1912 u = umask(0000);
1913
03cfe0d5 1914 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1915 if (r < 0)
1916 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1917
a258bf26
LP
1918 /* We need to bind mount the right tty to /dev/console since
1919 * ptys can only exist on pts file systems. To have something
81f5049b 1920 * to bind mount things on we create a empty regular file. */
a258bf26 1921
03cfe0d5 1922 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1923 r = touch(to);
1924 if (r < 0)
1925 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1926
4543768d 1927 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1928 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1929
25ea79fe 1930 return 0;
e58a1277
LP
1931}
1932
1933static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1934 const char *from, *to;
7fd1b19b 1935 _cleanup_umask_ mode_t u;
03cfe0d5 1936 int fd, k;
e58a1277
LP
1937 union {
1938 struct cmsghdr cmsghdr;
1939 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1940 } control = {};
1941 struct msghdr mh = {
1942 .msg_control = &control,
1943 .msg_controllen = sizeof(control),
1944 };
e58a1277
LP
1945 struct cmsghdr *cmsg;
1946
e58a1277 1947 assert(kmsg_socket >= 0);
a258bf26 1948
e58a1277 1949 u = umask(0000);
a258bf26 1950
03cfe0d5 1951 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1952 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1953 * on the reading side behave very similar to /proc/kmsg,
1954 * their writing side behaves differently from /dev/kmsg in
1955 * that writing blocks when nothing is reading. In order to
1956 * avoid any problems with containers deadlocking due to this
1957 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1958 from = prefix_roota(dest, "/run/kmsg");
1959 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1960
4a62c710 1961 if (mkfifo(from, 0600) < 0)
03cfe0d5 1962 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1963 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1964 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1965
1966 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1967 if (fd < 0)
1968 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1969
e58a1277
LP
1970 cmsg = CMSG_FIRSTHDR(&mh);
1971 cmsg->cmsg_level = SOL_SOCKET;
1972 cmsg->cmsg_type = SCM_RIGHTS;
1973 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1974 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1975
1976 mh.msg_controllen = cmsg->cmsg_len;
1977
1978 /* Store away the fd in the socket, so that it stays open as
1979 * long as we run the child */
6d0b55c2 1980 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1981 safe_close(fd);
e58a1277 1982
4a62c710
MS
1983 if (k < 0)
1984 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1985
03cfe0d5
LP
1986 /* And now make the FIFO unavailable as /run/kmsg... */
1987 (void) unlink(from);
1988
25ea79fe 1989 return 0;
88213476
LP
1990}
1991
6d0b55c2
LP
1992static int send_rtnl(int send_fd) {
1993 union {
1994 struct cmsghdr cmsghdr;
1995 uint8_t buf[CMSG_SPACE(sizeof(int))];
1996 } control = {};
1997 struct msghdr mh = {
1998 .msg_control = &control,
1999 .msg_controllen = sizeof(control),
2000 };
2001 struct cmsghdr *cmsg;
2002 _cleanup_close_ int fd = -1;
2003 ssize_t k;
2004
2005 assert(send_fd >= 0);
2006
2007 if (!arg_expose_ports)
2008 return 0;
2009
2010 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
2011 if (fd < 0)
03cfe0d5 2012 return log_error_errno(errno, "Failed to allocate container netlink: %m");
6d0b55c2
LP
2013
2014 cmsg = CMSG_FIRSTHDR(&mh);
2015 cmsg->cmsg_level = SOL_SOCKET;
2016 cmsg->cmsg_type = SCM_RIGHTS;
2017 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
2018 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
2019
2020 mh.msg_controllen = cmsg->cmsg_len;
2021
2022 /* Store away the fd in the socket, so that it stays open as
2023 * long as we run the child */
2024 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
2025 if (k < 0)
2026 return log_error_errno(errno, "Failed to send netlink fd: %m");
2027
2028 return 0;
2029}
2030
2031static int flush_ports(union in_addr_union *exposed) {
2032 ExposePort *p;
2033 int r, af = AF_INET;
2034
2035 assert(exposed);
2036
2037 if (!arg_expose_ports)
2038 return 0;
2039
2040 if (in_addr_is_null(af, exposed))
2041 return 0;
2042
2043 log_debug("Lost IP address.");
2044
2045 LIST_FOREACH(ports, p, arg_expose_ports) {
2046 r = fw_add_local_dnat(false,
2047 af,
2048 p->protocol,
2049 NULL,
2050 NULL, 0,
2051 NULL, 0,
2052 p->host_port,
2053 exposed,
2054 p->container_port,
2055 NULL);
2056 if (r < 0)
2057 log_warning_errno(r, "Failed to modify firewall: %m");
2058 }
2059
2060 *exposed = IN_ADDR_NULL;
2061 return 0;
2062}
2063
1c4baffc 2064static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
6d0b55c2
LP
2065 _cleanup_free_ struct local_address *addresses = NULL;
2066 _cleanup_free_ char *pretty = NULL;
2067 union in_addr_union new_exposed;
2068 ExposePort *p;
2069 bool add;
2070 int af = AF_INET, r;
2071
2072 assert(exposed);
2073
2074 /* Invoked each time an address is added or removed inside the
2075 * container */
2076
2077 if (!arg_expose_ports)
2078 return 0;
2079
2080 r = local_addresses(rtnl, 0, af, &addresses);
2081 if (r < 0)
2082 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2083
2084 add = r > 0 &&
2085 addresses[0].family == af &&
2086 addresses[0].scope < RT_SCOPE_LINK;
2087
2088 if (!add)
2089 return flush_ports(exposed);
2090
2091 new_exposed = addresses[0].address;
2092 if (in_addr_equal(af, exposed, &new_exposed))
2093 return 0;
2094
2095 in_addr_to_string(af, &new_exposed, &pretty);
2096 log_debug("New container IP is %s.", strna(pretty));
2097
2098 LIST_FOREACH(ports, p, arg_expose_ports) {
2099
2100 r = fw_add_local_dnat(true,
2101 af,
2102 p->protocol,
2103 NULL,
2104 NULL, 0,
2105 NULL, 0,
2106 p->host_port,
2107 &new_exposed,
2108 p->container_port,
2109 in_addr_is_null(af, exposed) ? NULL : exposed);
2110 if (r < 0)
2111 log_warning_errno(r, "Failed to modify firewall: %m");
2112 }
2113
2114 *exposed = new_exposed;
2115 return 0;
2116}
2117
1c4baffc 2118static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2119 union in_addr_union *exposed = userdata;
2120
2121 assert(rtnl);
2122 assert(m);
2123 assert(exposed);
2124
2125 expose_ports(rtnl, exposed);
2126 return 0;
2127}
2128
1c4baffc 2129static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
6d0b55c2
LP
2130 union {
2131 struct cmsghdr cmsghdr;
2132 uint8_t buf[CMSG_SPACE(sizeof(int))];
2133 } control = {};
2134 struct msghdr mh = {
2135 .msg_control = &control,
2136 .msg_controllen = sizeof(control),
2137 };
2138 struct cmsghdr *cmsg;
1c4baffc 2139 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
6d0b55c2
LP
2140 int fd, r;
2141 ssize_t k;
2142
2143 assert(event);
2144 assert(recv_fd >= 0);
2145 assert(ret);
2146
2147 if (!arg_expose_ports)
2148 return 0;
2149
2150 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2151 if (k < 0)
2152 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2153
2154 cmsg = CMSG_FIRSTHDR(&mh);
2155 assert(cmsg->cmsg_level == SOL_SOCKET);
2156 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 2157 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
2158 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2159
1c4baffc 2160 r = sd_netlink_open_fd(&rtnl, fd);
6d0b55c2
LP
2161 if (r < 0) {
2162 safe_close(fd);
2163 return log_error_errno(r, "Failed to create rtnl object: %m");
2164 }
2165
1c4baffc 2166 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
6d0b55c2
LP
2167 if (r < 0)
2168 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2169
1c4baffc 2170 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
6d0b55c2
LP
2171 if (r < 0)
2172 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2173
1c4baffc 2174 r = sd_netlink_attach_event(rtnl, event, 0);
6d0b55c2
LP
2175 if (r < 0)
2176 return log_error_errno(r, "Failed to add to even loop: %m");
2177
2178 *ret = rtnl;
2179 rtnl = NULL;
2180
2181 return 0;
2182}
2183
3a74cea5 2184static int setup_hostname(void) {
3a74cea5 2185
eb91eb18
LP
2186 if (arg_share_system)
2187 return 0;
2188
605f81a8 2189 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 2190 return -errno;
3a74cea5 2191
7027ff61 2192 return 0;
3a74cea5
LP
2193}
2194
57fb9fb5 2195static int setup_journal(const char *directory) {
4d680aee 2196 sd_id128_t machine_id, this_id;
03cfe0d5
LP
2197 _cleanup_free_ char *b = NULL, *d = NULL;
2198 const char *etc_machine_id, *p, *q;
27407a01 2199 char *id;
57fb9fb5
LP
2200 int r;
2201
df9a75e4
LP
2202 /* Don't link journals in ephemeral mode */
2203 if (arg_ephemeral)
2204 return 0;
2205
03cfe0d5 2206 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 2207
03cfe0d5 2208 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
2209 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2210 return 0;
f647962d 2211 else if (r < 0)
03cfe0d5 2212 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 2213
27407a01
ZJS
2214 id = strstrip(b);
2215 if (isempty(id) && arg_link_journal == LINK_AUTO)
2216 return 0;
57fb9fb5 2217
27407a01
ZJS
2218 /* Verify validity */
2219 r = sd_id128_from_string(id, &machine_id);
f647962d 2220 if (r < 0)
03cfe0d5 2221 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 2222
4d680aee 2223 r = sd_id128_get_machine(&this_id);
f647962d
MS
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
2226
2227 if (sd_id128_equal(machine_id, this_id)) {
2228 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2229 "Host and machine ids are equal (%s): refusing to link journals", id);
2230 if (arg_link_journal == LINK_AUTO)
2231 return 0;
df9a75e4 2232 return -EEXIST;
4d680aee
ZJS
2233 }
2234
2235 if (arg_link_journal == LINK_NO)
2236 return 0;
2237
03cfe0d5
LP
2238 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2239 if (r < 0)
2240 return log_error_errno(r, "Failed to create /var: %m");
2241
2242 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2243 if (r < 0)
2244 return log_error_errno(r, "Failed to create /var/log: %m");
2245
2246 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2247 if (r < 0)
2248 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2249
2250 p = strjoina("/var/log/journal/", id);
2251 q = prefix_roota(directory, p);
27407a01 2252
e26d6ce5 2253 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
2254 if (arg_link_journal != LINK_AUTO) {
2255 log_error("%s: already a mount point, refusing to use for journal", p);
2256 return -EEXIST;
2257 }
2258
2259 return 0;
57fb9fb5
LP
2260 }
2261
e26d6ce5 2262 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 2263 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
2264 log_error("%s: already a mount point, refusing to use for journal", q);
2265 return -EEXIST;
57fb9fb5
LP
2266 }
2267
27407a01 2268 return 0;
57fb9fb5
LP
2269 }
2270
2271 r = readlink_and_make_absolute(p, &d);
2272 if (r >= 0) {
2273 if ((arg_link_journal == LINK_GUEST ||
2274 arg_link_journal == LINK_AUTO) &&
2275 path_equal(d, q)) {
2276
03cfe0d5 2277 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2278 if (r < 0)
56f64d95 2279 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2280 return 0;
57fb9fb5
LP
2281 }
2282
4a62c710
MS
2283 if (unlink(p) < 0)
2284 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2285 } else if (r == -EINVAL) {
2286
2287 if (arg_link_journal == LINK_GUEST &&
2288 rmdir(p) < 0) {
2289
27407a01
ZJS
2290 if (errno == ENOTDIR) {
2291 log_error("%s already exists and is neither a symlink nor a directory", p);
2292 return r;
2293 } else {
56f64d95 2294 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 2295 return -errno;
57fb9fb5 2296 }
57fb9fb5
LP
2297 }
2298 } else if (r != -ENOENT) {
56f64d95 2299 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 2300 return r;
57fb9fb5
LP
2301 }
2302
2303 if (arg_link_journal == LINK_GUEST) {
2304
2305 if (symlink(q, p) < 0) {
574edc90 2306 if (arg_link_journal_try) {
56f64d95 2307 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
2308 return 0;
2309 } else {
56f64d95 2310 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
2311 return -errno;
2312 }
57fb9fb5
LP
2313 }
2314
03cfe0d5 2315 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2316 if (r < 0)
56f64d95 2317 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2318 return 0;
57fb9fb5
LP
2319 }
2320
2321 if (arg_link_journal == LINK_HOST) {
574edc90
MP
2322 /* don't create parents here -- if the host doesn't have
2323 * permanent journal set up, don't force it here */
2324 r = mkdir(p, 0755);
57fb9fb5 2325 if (r < 0) {
574edc90 2326 if (arg_link_journal_try) {
56f64d95 2327 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
2328 return 0;
2329 } else {
56f64d95 2330 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
2331 return r;
2332 }
57fb9fb5
LP
2333 }
2334
27407a01
ZJS
2335 } else if (access(p, F_OK) < 0)
2336 return 0;
57fb9fb5 2337
cdb2b9d0
LP
2338 if (dir_is_empty(q) == 0)
2339 log_warning("%s is not empty, proceeding anyway.", q);
2340
03cfe0d5 2341 r = userns_mkdir(directory, p, 0755, 0, 0);
57fb9fb5 2342 if (r < 0) {
56f64d95 2343 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 2344 return r;
57fb9fb5
LP
2345 }
2346
4543768d 2347 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 2348 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2349
27407a01 2350 return 0;
57fb9fb5
LP
2351}
2352
88213476 2353static int drop_capabilities(void) {
5076f0cc 2354 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
2355}
2356
5aa4bb6b 2357static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 2358 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
03976f7b 2359 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
2360 int r;
2361
eb91eb18
LP
2362 if (!arg_register)
2363 return 0;
2364
1c03020c 2365 r = sd_bus_default_system(&bus);
f647962d
MS
2366 if (r < 0)
2367 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 2368
89f7c846
LP
2369 if (arg_keep_unit) {
2370 r = sd_bus_call_method(
2371 bus,
2372 "org.freedesktop.machine1",
2373 "/org/freedesktop/machine1",
2374 "org.freedesktop.machine1.Manager",
5aa4bb6b 2375 "RegisterMachineWithNetwork",
89f7c846
LP
2376 &error,
2377 NULL,
5aa4bb6b 2378 "sayssusai",
89f7c846
LP
2379 arg_machine,
2380 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2381 "nspawn",
2382 "container",
2383 (uint32_t) pid,
5aa4bb6b
LP
2384 strempty(arg_directory),
2385 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 2386 } else {
9457ac5b 2387 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 2388 char **i;
ce5b3ad4 2389 unsigned j;
9457ac5b
LP
2390
2391 r = sd_bus_message_new_method_call(
89f7c846 2392 bus,
9457ac5b 2393 &m,
89f7c846
LP
2394 "org.freedesktop.machine1",
2395 "/org/freedesktop/machine1",
2396 "org.freedesktop.machine1.Manager",
5aa4bb6b 2397 "CreateMachineWithNetwork");
f647962d 2398 if (r < 0)
f36933fe 2399 return bus_log_create_error(r);
9457ac5b
LP
2400
2401 r = sd_bus_message_append(
2402 m,
5aa4bb6b 2403 "sayssusai",
89f7c846
LP
2404 arg_machine,
2405 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2406 "nspawn",
2407 "container",
2408 (uint32_t) pid,
5aa4bb6b
LP
2409 strempty(arg_directory),
2410 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 2411 if (r < 0)
f36933fe 2412 return bus_log_create_error(r);
9457ac5b
LP
2413
2414 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 2415 if (r < 0)
f36933fe 2416 return bus_log_create_error(r);
9457ac5b
LP
2417
2418 if (!isempty(arg_slice)) {
2419 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 2420 if (r < 0)
f36933fe 2421 return bus_log_create_error(r);
9457ac5b
LP
2422 }
2423
2424 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 2425 if (r < 0)
f36933fe 2426 return bus_log_create_error(r);
9457ac5b 2427
773ce3d8
LP
2428 /* If you make changes here, also make sure to update
2429 * systemd-nspawn@.service, to keep the device
2430 * policies in sync regardless if we are run with or
2431 * without the --keep-unit switch. */
63cc4c31 2432 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
2433 /* Allow the container to
2434 * access and create the API
2435 * device nodes, so that
2436 * PrivateDevices= in the
2437 * container can work
2438 * fine */
2439 "/dev/null", "rwm",
2440 "/dev/zero", "rwm",
2441 "/dev/full", "rwm",
2442 "/dev/random", "rwm",
2443 "/dev/urandom", "rwm",
2444 "/dev/tty", "rwm",
864e1706 2445 "/dev/net/tun", "rwm",
9457ac5b
LP
2446 /* Allow the container
2447 * access to ptys. However,
2448 * do not permit the
2449 * container to ever create
2450 * these device nodes. */
2451 "/dev/pts/ptmx", "rw",
63cc4c31 2452 "char-pts", "rw");
f647962d 2453 if (r < 0)
27023c0e
LP
2454 return bus_log_create_error(r);
2455
ce5b3ad4
SJ
2456 for (j = 0; j < arg_n_custom_mounts; j++) {
2457 CustomMount *cm = &arg_custom_mounts[j];
2458
2459 if (cm->type != CUSTOM_MOUNT_BIND)
2460 continue;
2461
2462 r = is_device_node(cm->source);
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2465
2466 if (r) {
2467 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2468 cm->source, cm->read_only ? "r" : "rw");
2469 if (r < 0)
2470 return log_error_errno(r, "Failed to append message arguments: %m");
2471 }
2472 }
2473
27023c0e
LP
2474 if (arg_kill_signal != 0) {
2475 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2476 if (r < 0)
2477 return bus_log_create_error(r);
2478
2479 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2480 if (r < 0)
2481 return bus_log_create_error(r);
2482 }
9457ac5b 2483
f36933fe
LP
2484 STRV_FOREACH(i, arg_property) {
2485 r = sd_bus_message_open_container(m, 'r', "sv");
2486 if (r < 0)
2487 return bus_log_create_error(r);
2488
2489 r = bus_append_unit_property_assignment(m, *i);
2490 if (r < 0)
2491 return r;
2492
2493 r = sd_bus_message_close_container(m);
2494 if (r < 0)
2495 return bus_log_create_error(r);
2496 }
2497
9457ac5b 2498 r = sd_bus_message_close_container(m);
f647962d 2499 if (r < 0)
f36933fe 2500 return bus_log_create_error(r);
9457ac5b
LP
2501
2502 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
2503 }
2504
9444b1f2 2505 if (r < 0) {
1f0cd86b
LP
2506 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2507 return r;
2508 }
2509
2510 return 0;
2511}
2512
2513static int terminate_machine(pid_t pid) {
2514 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2515 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
03976f7b 2516 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
2517 const char *path;
2518 int r;
2519
eb91eb18
LP
2520 if (!arg_register)
2521 return 0;
2522
1a2399e5
LP
2523 /* If we are reusing the unit, then just exit, systemd will do
2524 * the right thing when we exit. */
2525 if (arg_keep_unit)
2526 return 0;
2527
76b54375 2528 r = sd_bus_default_system(&bus);
f647962d
MS
2529 if (r < 0)
2530 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
2531
2532 r = sd_bus_call_method(
2533 bus,
2534 "org.freedesktop.machine1",
2535 "/org/freedesktop/machine1",
2536 "org.freedesktop.machine1.Manager",
2537 "GetMachineByPID",
2538 &error,
2539 &reply,
2540 "u",
2541 (uint32_t) pid);
2542 if (r < 0) {
2543 /* Note that the machine might already have been
2544 * cleaned up automatically, hence don't consider it a
2545 * failure if we cannot get the machine object. */
2546 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2547 return 0;
2548 }
2549
2550 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2551 if (r < 0)
2552 return bus_log_parse_error(r);
9444b1f2 2553
1f0cd86b
LP
2554 r = sd_bus_call_method(
2555 bus,
2556 "org.freedesktop.machine1",
2557 path,
2558 "org.freedesktop.machine1.Machine",
2559 "Terminate",
2560 &error,
2561 NULL,
2562 NULL);
2563 if (r < 0) {
2564 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2565 return 0;
2566 }
2567
9444b1f2
LP
2568 return 0;
2569}
2570
db999e0f
LP
2571static int reset_audit_loginuid(void) {
2572 _cleanup_free_ char *p = NULL;
2573 int r;
2574
2575 if (arg_share_system)
2576 return 0;
2577
2578 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2579 if (r == -ENOENT)
db999e0f 2580 return 0;
f647962d
MS
2581 if (r < 0)
2582 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2583
2584 /* Already reset? */
2585 if (streq(p, "4294967295"))
2586 return 0;
2587
ad118bda 2588 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 2589 if (r < 0) {
10a87006
LP
2590 log_error_errno(r,
2591 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2592 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2593 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2594 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2595 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2596
db999e0f 2597 sleep(5);
77b6e194 2598 }
db999e0f
LP
2599
2600 return 0;
77b6e194
LP
2601}
2602
4f758c23
LP
2603#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2604#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 2605#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2606
a90e2305 2607static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2608 uint8_t result[8];
2609 size_t l, sz;
a90e2305
LP
2610 uint8_t *v, *i;
2611 int r;
01dde061
TG
2612
2613 l = strlen(arg_machine);
2614 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2615 if (idx > 0)
2616 sz += sizeof(idx);
a90e2305 2617
01dde061
TG
2618 v = alloca(sz);
2619
2620 /* fetch some persistent data unique to the host */
2621 r = sd_id128_get_machine((sd_id128_t*) v);
2622 if (r < 0)
2623 return r;
2624
2625 /* combine with some data unique (on this host) to this
2626 * container instance */
a90e2305
LP
2627 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2628 if (idx > 0) {
2629 idx = htole64(idx);
2630 memcpy(i, &idx, sizeof(idx));
2631 }
01dde061
TG
2632
2633 /* Let's hash the host machine ID plus the container name. We
2634 * use a fixed, but originally randomly created hash key here. */
4f758c23 2635 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2636
2637 assert_cc(ETH_ALEN <= sizeof(result));
2638 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2639
2640 /* see eth_random_addr in the kernel */
2641 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2642 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2643
2644 return 0;
2645}
2646
5aa4bb6b 2647static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1c4baffc
TG
2648 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2649 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4f758c23 2650 struct ether_addr mac_host, mac_container;
5aa4bb6b 2651 int r, i;
69c79d3c
LP
2652
2653 if (!arg_private_network)
2654 return 0;
2655
2656 if (!arg_network_veth)
2657 return 0;
2658
08af0da2
LP
2659 /* Use two different interface name prefixes depending whether
2660 * we are in bridge mode or not. */
c00524c9 2661 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2662 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2663
e867ceb6
LP
2664 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2665 if (r < 0)
2666 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2667
e867ceb6
LP
2668 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2669 if (r < 0)
2670 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2671
1c4baffc 2672 r = sd_netlink_open(&rtnl);
f647962d
MS
2673 if (r < 0)
2674 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2675
151b9b96 2676 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2677 if (r < 0)
2678 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2679
1c4baffc 2680 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2681 if (r < 0)
2682 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2683
1c4baffc 2684 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2685 if (r < 0)
2686 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2687
1c4baffc 2688 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2689 if (r < 0)
2690 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2691
1c4baffc 2692 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2693 if (r < 0)
2694 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2695
1c4baffc 2696 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2697 if (r < 0)
2698 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2699
1c4baffc 2700 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2701 if (r < 0)
2702 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2703
1c4baffc 2704 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2705 if (r < 0)
2706 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2707
1c4baffc 2708 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2709 if (r < 0)
2710 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c 2711
1c4baffc 2712 r = sd_netlink_message_close_container(m);
f647962d
MS
2713 if (r < 0)
2714 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2715
1c4baffc 2716 r = sd_netlink_message_close_container(m);
f647962d
MS
2717 if (r < 0)
2718 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2719
1c4baffc 2720 r = sd_netlink_message_close_container(m);
f647962d
MS
2721 if (r < 0)
2722 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2723
1c4baffc 2724 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d 2725 if (r < 0)
637aa8a3 2726 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
69c79d3c 2727
5aa4bb6b 2728 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2729 if (i <= 0)
2730 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2731
2732 *ifi = i;
2733
69c79d3c
LP
2734 return 0;
2735}
2736
5aa4bb6b 2737static int setup_bridge(const char veth_name[], int *ifi) {
1c4baffc
TG
2738 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2739 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
ab046dde
TG
2740 int r, bridge;
2741
2742 if (!arg_private_network)
2743 return 0;
2744
2745 if (!arg_network_veth)
2746 return 0;
2747
2748 if (!arg_network_bridge)
2749 return 0;
2750
2751 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2752 if (bridge <= 0)
2753 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2754
5aa4bb6b
LP
2755 *ifi = bridge;
2756
1c4baffc 2757 r = sd_netlink_open(&rtnl);
f647962d
MS
2758 if (r < 0)
2759 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2760
151b9b96 2761 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2762 if (r < 0)
2763 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2764
039dd4af 2765 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2766 if (r < 0)
2767 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2768
1c4baffc 2769 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2770 if (r < 0)
2771 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde 2772
1c4baffc 2773 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2774 if (r < 0)
2775 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde 2776
1c4baffc 2777 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2778 if (r < 0)
2779 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2780
2781 return 0;
2782}
2783
c74e630d
LP
2784static int parse_interface(struct udev *udev, const char *name) {
2785 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2786 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2787 int ifi;
2788
2789 ifi = (int) if_nametoindex(name);
4a62c710
MS
2790 if (ifi <= 0)
2791 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2792
2793 sprintf(ifi_str, "n%i", ifi);
2794 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2795 if (!d)
2796 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2797
2798 if (udev_device_get_is_initialized(d) <= 0) {
2799 log_error("Network interface %s is not initialized yet.", name);
2800 return -EBUSY;
2801 }
2802
2803 return ifi;
2804}
2805
69c79d3c 2806static int move_network_interfaces(pid_t pid) {
7e227024 2807 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2808 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
aa28aefe
LP
2809 char **i;
2810 int r;
2811
2812 if (!arg_private_network)
2813 return 0;
2814
2815 if (strv_isempty(arg_network_interfaces))
2816 return 0;
2817
1c4baffc 2818 r = sd_netlink_open(&rtnl);
f647962d
MS
2819 if (r < 0)
2820 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2821
7e227024
LP
2822 udev = udev_new();
2823 if (!udev) {
2824 log_error("Failed to connect to udev.");
2825 return -ENOMEM;
2826 }
2827
aa28aefe 2828 STRV_FOREACH(i, arg_network_interfaces) {
1c4baffc 2829 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
b88eb17a 2830 int ifi;
aa28aefe 2831
c74e630d
LP
2832 ifi = parse_interface(udev, *i);
2833 if (ifi < 0)
2834 return ifi;
2835
3125b3ef 2836 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2837 if (r < 0)
2838 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2839
1c4baffc 2840 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2841 if (r < 0)
2842 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2843
1c4baffc 2844 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2845 if (r < 0)
2846 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2847 }
7e227024 2848
c74e630d
LP
2849 return 0;
2850}
2851
2852static int setup_macvlan(pid_t pid) {
2853 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2854 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
e867ceb6 2855 unsigned idx = 0;
c74e630d
LP
2856 char **i;
2857 int r;
2858
2859 if (!arg_private_network)
2860 return 0;
2861
2862 if (strv_isempty(arg_network_macvlan))
2863 return 0;
2864
1c4baffc 2865 r = sd_netlink_open(&rtnl);
f647962d
MS
2866 if (r < 0)
2867 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2868
2869 udev = udev_new();
2870 if (!udev) {
2871 log_error("Failed to connect to udev.");
2872 return -ENOMEM;
2873 }
2874
2875 STRV_FOREACH(i, arg_network_macvlan) {
1c4baffc 2876 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
c74e630d 2877 _cleanup_free_ char *n = NULL;
e867ceb6 2878 struct ether_addr mac;
c74e630d
LP
2879 int ifi;
2880
2881 ifi = parse_interface(udev, *i);
2882 if (ifi < 0)
2883 return ifi;
2884
e867ceb6
LP
2885 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2886 if (r < 0)
2887 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2888
c74e630d 2889 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2890 if (r < 0)
2891 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2892
1c4baffc 2893 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2894 if (r < 0)
2895 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2896
2897 n = strappend("mv-", *i);
2898 if (!n)
2899 return log_oom();
2900
2901 strshorten(n, IFNAMSIZ-1);
2902
1c4baffc 2903 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2904 if (r < 0)
2905 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2906
1c4baffc 2907 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
e867ceb6
LP
2908 if (r < 0)
2909 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2910
1c4baffc 2911 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2912 if (r < 0)
2913 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d 2914
1c4baffc 2915 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2916 if (r < 0)
2917 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2918
1c4baffc 2919 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2920 if (r < 0)
2921 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2922
1c4baffc 2923 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2924 if (r < 0)
2925 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d 2926
1c4baffc 2927 r = sd_netlink_message_close_container(m);
f647962d
MS
2928 if (r < 0)
2929 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d 2930
1c4baffc 2931 r = sd_netlink_message_close_container(m);
f647962d
MS
2932 if (r < 0)
2933 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe 2934
1c4baffc 2935 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2936 if (r < 0)
2937 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2938 }
2939
2940 return 0;
2941}
2942
4bbfe7ad
TG
2943static int setup_ipvlan(pid_t pid) {
2944 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2945 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4bbfe7ad
TG
2946 char **i;
2947 int r;
2948
2949 if (!arg_private_network)
2950 return 0;
2951
2952 if (strv_isempty(arg_network_ipvlan))
2953 return 0;
2954
1c4baffc 2955 r = sd_netlink_open(&rtnl);
4bbfe7ad
TG
2956 if (r < 0)
2957 return log_error_errno(r, "Failed to connect to netlink: %m");
2958
2959 udev = udev_new();
2960 if (!udev) {
2961 log_error("Failed to connect to udev.");
2962 return -ENOMEM;
2963 }
2964
2965 STRV_FOREACH(i, arg_network_ipvlan) {
1c4baffc 2966 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
4bbfe7ad
TG
2967 _cleanup_free_ char *n = NULL;
2968 int ifi;
2969
2970 ifi = parse_interface(udev, *i);
2971 if (ifi < 0)
2972 return ifi;
2973
2974 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2975 if (r < 0)
2976 return log_error_errno(r, "Failed to allocate netlink message: %m");
2977
1c4baffc 2978 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
4bbfe7ad
TG
2979 if (r < 0)
2980 return log_error_errno(r, "Failed to add netlink interface index: %m");
2981
2982 n = strappend("iv-", *i);
2983 if (!n)
2984 return log_oom();
2985
2986 strshorten(n, IFNAMSIZ-1);
2987
1c4baffc 2988 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
4bbfe7ad
TG
2989 if (r < 0)
2990 return log_error_errno(r, "Failed to add netlink interface name: %m");
2991
1c4baffc 2992 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
4bbfe7ad
TG
2993 if (r < 0)
2994 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2995
1c4baffc 2996 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
4bbfe7ad
TG
2997 if (r < 0)
2998 return log_error_errno(r, "Failed to open netlink container: %m");
2999
1c4baffc 3000 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
4bbfe7ad
TG
3001 if (r < 0)
3002 return log_error_errno(r, "Failed to open netlink container: %m");
3003
1c4baffc 3004 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
4bbfe7ad
TG
3005 if (r < 0)
3006 return log_error_errno(r, "Failed to add ipvlan mode: %m");
3007
1c4baffc 3008 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
3009 if (r < 0)
3010 return log_error_errno(r, "Failed to close netlink container: %m");
3011
1c4baffc 3012 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
3013 if (r < 0)
3014 return log_error_errno(r, "Failed to close netlink container: %m");
3015
1c4baffc 3016 r = sd_netlink_call(rtnl, m, 0, NULL);
4bbfe7ad
TG
3017 if (r < 0)
3018 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
3019 }
3020
3021 return 0;
3022}
3023
/* Installs the seccomp filter used for the container payload.
 *
 * The filter allows everything by default and then:
 *   - makes each syscall in the table below fail with EPERM, unless the
 *     capability guarding it is part of arg_retain;
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op returning 0. Otherwise returns 0 on
 * success (or if loading the filter fails with -EINVAL), and a negative
 * errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(blacklist); idx++) {

                /* The guarding capability is retained, hence leave the syscall alone */
                if (arg_retain & (1ULL << blacklist[idx].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[idx].syscall_num, 0);

                /* -EFAULT means the syscall is unknown, which is not treated as an error */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers: much of the userspace audit
         * hookup fails when run inside one. We do not care and simply
         * prevent audit sockets from being created.
         *
         * The rule below makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail
         * with EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3118
785890ac
LP
3119static int setup_propagate(const char *root) {
3120 const char *p, *q;
3121
3122 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3123 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 3124 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
3125 (void) mkdir_p(p, 0600);
3126
03cfe0d5
LP
3127 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3128 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3129
3130 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3131 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3132
3133 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3134 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 3135
03cfe0d5 3136 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
3137 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3138 return log_error_errno(errno, "Failed to install propagation bind mount.");
3139
3140 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3141 return log_error_errno(errno, "Failed to make propagation mount read-only");
3142
3143 return 0;
3144}
3145
1b9e5b12
LP
3146static int setup_image(char **device_path, int *loop_nr) {
3147 struct loop_info64 info = {
3148 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3149 };
3150 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3151 _cleanup_free_ char* loopdev = NULL;
3152 struct stat st;
3153 int r, nr;
3154
3155 assert(device_path);
3156 assert(loop_nr);
ec16945e 3157 assert(arg_image);
1b9e5b12
LP
3158
3159 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
3160 if (fd < 0)
3161 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 3162
4a62c710
MS
3163 if (fstat(fd, &st) < 0)
3164 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
3165
3166 if (S_ISBLK(st.st_mode)) {
3167 char *p;
3168
3169 p = strdup(arg_image);
3170 if (!p)
3171 return log_oom();
3172
3173 *device_path = p;
3174
3175 *loop_nr = -1;
3176
3177 r = fd;
3178 fd = -1;
3179
3180 return r;
3181 }
3182
3183 if (!S_ISREG(st.st_mode)) {
56f64d95 3184 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
3185 return -EINVAL;
3186 }
3187
3188 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
3189 if (control < 0)
3190 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
3191
3192 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
3193 if (nr < 0)
3194 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
3195
3196 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3197 return log_oom();
3198
3199 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
3200 if (loop < 0)
3201 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 3202
4a62c710
MS
3203 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3204 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
3205
3206 if (arg_read_only)
3207 info.lo_flags |= LO_FLAGS_READ_ONLY;
3208
4a62c710
MS
3209 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3210 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
3211
3212 *device_path = loopdev;
3213 loopdev = NULL;
3214
3215 *loop_nr = nr;
3216
3217 r = loop;
3218 loop = -1;
3219
3220 return r;
3221}
3222
ada4799a
LP
/* Explanatory text appended to the error messages about unusable partition
 * tables. It lists the disk image layouts that are accepted. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
3229
1b9e5b12
LP
/* Analyzes the partition table of the block device referenced by `fd` and
 * picks the partitions to mount for the container.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. Partitions of type
 * GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY (the latter two
 * only if defined at build time) are recognized; if several of the same type
 * exist, the one with the lowest partition number wins. Partitions of type
 * GPT_LINUX_GENERIC are candidates for the root file system, too.
 *
 * MBR: partitions of type 0x83 whose flags equal 0x80 (bootable) are
 * candidates for the root file system.
 *
 * The root partition is chosen in this order: native root, secondary root,
 * generic candidate. A generic candidate is only accepted if it is the only
 * one.
 *
 * On success returns 0 and sets:
 *   *root_device, *root_device_rw, *secondary   always;
 *   *home_device, *home_device_rw               only if a home partition was found;
 *   *srv_device,  *srv_device_rw                only if a srv partition was found.
 * The device strings are newly allocated and owned by the caller.
 *
 * On failure returns a negative errno-style value. If compiled without
 * HAVE_BLKID, always fails with -EOPNOTSUPP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel exposes as many partition devices as blkid
         * found in the partition table, giving up after 10 attempts. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The kernel list
                 * is expected to hold one entry more than blkid's. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Among several, prefer the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Must be stored in "generic", not "root":
                                 * otherwise the check above never triggers
                                 * and, among multiple bootable partitions,
                                 * the last one silently wins. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3602
/* Mounts the block device `what` on `where`, or on `where` with `directory`
 * appended if `directory` is non-NULL. The file system type is determined
 * with a blkid superblock probe.
 *
 * The mount is done with MS_NODEV, and additionally with MS_RDONLY if `rw`
 * is false or arg_read_only is set. LUKS volumes are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure. If compiled
 * without HAVE_BLKID, always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally requested read-only mode overrides the per-device setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was detected and
                 * -2 if the result is ambivalent. In both cases the file
                 * system type is unknown. -1 denotes a real probing error
                 * and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3666
727fd4fd
LP
3667static int mount_devices(
3668 const char *where,
3669 const char *root_device, bool root_device_rw,
3670 const char *home_device, bool home_device_rw,
3671 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
3672 int r;
3673
3674 assert(where);
3675
3676 if (root_device) {
727fd4fd 3677 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
3678 if (r < 0)
3679 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
3680 }
3681
3682 if (home_device) {
727fd4fd 3683 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
3684 if (r < 0)
3685 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
3686 }
3687
3688 if (srv_device) {
727fd4fd 3689 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
3690 if (r < 0)
3691 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
3692 }
3693
3694 return 0;
3695}
3696
3697static void loop_remove(int nr, int *image_fd) {
3698 _cleanup_close_ int control = -1;
e8c8ddcc 3699 int r;
1b9e5b12
LP
3700
3701 if (nr < 0)
3702 return;
3703
3704 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3705 r = ioctl(*image_fd, LOOP_CLR_FD);
3706 if (r < 0)
5e4074aa 3707 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3708 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3709 }
3710
3711 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3712 if (control < 0) {
56f64d95 3713 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3714 return;
e8c8ddcc 3715 }
1b9e5b12 3716
e8c8ddcc
TG
3717 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3718 if (r < 0)
5e4074aa 3719 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3720}
3721
0cb9fbcd
LP
3722static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3723 int pipe_fds[2];
3724 pid_t pid;
3725
3726 assert(database);
3727 assert(key);
3728 assert(rpid);
3729
4a62c710
MS
3730 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3731 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3732
3733 pid = fork();
4a62c710
MS
3734 if (pid < 0)
3735 return log_error_errno(errno, "Failed to fork getent child: %m");
3736 else if (pid == 0) {
0cb9fbcd
LP
3737 int nullfd;
3738 char *empty_env = NULL;
3739
3740 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3741 _exit(EXIT_FAILURE);
3742
3743 if (pipe_fds[0] > 2)
03e334a1 3744 safe_close(pipe_fds[0]);
0cb9fbcd 3745 if (pipe_fds[1] > 2)
03e334a1 3746 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3747
3748 nullfd = open("/dev/null", O_RDWR);
3749 if (nullfd < 0)
3750 _exit(EXIT_FAILURE);
3751
3752 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3753 _exit(EXIT_FAILURE);
3754
3755 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3756 _exit(EXIT_FAILURE);
3757
3758 if (nullfd > 2)
03e334a1 3759 safe_close(nullfd);
0cb9fbcd 3760
ce30c8dc
LP
3761 (void) reset_all_signal_handlers();
3762 (void) reset_signal_mask();
0cb9fbcd
LP
3763 close_all_fds(NULL, 0);
3764
4de82926
MM
3765 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3766 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3767 _exit(EXIT_FAILURE);
3768 }
3769
03e334a1 3770 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3771
3772 *rpid = pid;
3773
3774 return pipe_fds[0];
3775}
3776
3777static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3778 char line[LINE_MAX], *x, *u, *g, *h;
3779 const char *word, *state;
0cb9fbcd
LP
3780 _cleanup_free_ uid_t *uids = NULL;
3781 _cleanup_free_ char *home = NULL;
3782 _cleanup_fclose_ FILE *f = NULL;
3783 _cleanup_close_ int fd = -1;
3784 unsigned n_uids = 0;
70f539ca 3785 size_t sz = 0, l;
0cb9fbcd
LP
3786 uid_t uid;
3787 gid_t gid;
3788 pid_t pid;
3789 int r;
3790
3791 assert(_home);
3792
3793 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3794 /* Reset everything fully to 0, just in case */
3795
03cfe0d5
LP
3796 r = reset_uid_gid();
3797 if (r < 0)
3798 return log_error_errno(r, "Failed to become root: %m");
0cb9fbcd
LP
3799
3800 *_home = NULL;
3801 return 0;
3802 }
3803
3804 /* First, get user credentials */
3805 fd = spawn_getent("passwd", arg_user, &pid);
3806 if (fd < 0)
3807 return fd;
3808
3809 f = fdopen(fd, "r");
3810 if (!f)
3811 return log_oom();
3812 fd = -1;
3813
3814 if (!fgets(line, sizeof(line), f)) {
3815
3816 if (!ferror(f)) {
3817 log_error("Failed to resolve user %s.", arg_user);
3818 return -ESRCH;
3819 }
3820
56f64d95 3821 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3822 return -errno;
3823 }
3824
3825 truncate_nl(line);
3826
820d3acf 3827 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3828
3829 x = strchr(line, ':');
3830 if (!x) {
3831 log_error("/etc/passwd entry has invalid user field.");
3832 return -EIO;
3833 }
3834
3835 u = strchr(x+1, ':');
3836 if (!u) {
3837 log_error("/etc/passwd entry has invalid password field.");
3838 return -EIO;
3839 }
3840
3841 u++;
3842 g = strchr(u, ':');
3843 if (!g) {
3844 log_error("/etc/passwd entry has invalid UID field.");
3845 return -EIO;
3846 }
3847
3848 *g = 0;
3849 g++;
3850 x = strchr(g, ':');
3851 if (!x) {
3852 log_error("/etc/passwd entry has invalid GID field.");
3853 return -EIO;
3854 }
3855
3856 *x = 0;
3857 h = strchr(x+1, ':');
3858 if (!h) {
3859 log_error("/etc/passwd entry has invalid GECOS field.");
3860 return -EIO;
3861 }
3862
3863 h++;
3864 x = strchr(h, ':');
3865 if (!x) {
3866 log_error("/etc/passwd entry has invalid home directory field.");
3867 return -EIO;
3868 }
3869
3870 *x = 0;
3871
3872 r = parse_uid(u, &uid);
3873 if (r < 0) {
3874 log_error("Failed to parse UID of user.");
3875 return -EIO;
3876 }
3877
3878 r = parse_gid(g, &gid);
3879 if (r < 0) {
3880 log_error("Failed to parse GID of user.");
3881 return -EIO;
3882 }
3883
3884 home = strdup(h);
3885 if (!home)
3886 return log_oom();
3887
3888 /* Second, get group memberships */
3889 fd = spawn_getent("initgroups", arg_user, &pid);
3890 if (fd < 0)
3891 return fd;
3892
3893 fclose(f);
3894 f = fdopen(fd, "r");
3895 if (!f)
3896 return log_oom();
3897 fd = -1;
3898
3899 if (!fgets(line, sizeof(line), f)) {
3900 if (!ferror(f)) {
3901 log_error("Failed to resolve user %s.", arg_user);
3902 return -ESRCH;
3903 }
3904
56f64d95 3905 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3906 return -errno;
3907 }
3908
3909 truncate_nl(line);
3910
820d3acf 3911 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3912
3913 /* Skip over the username and subsequent separator whitespace */
3914 x = line;
3915 x += strcspn(x, WHITESPACE);
3916 x += strspn(x, WHITESPACE);
3917
a2a5291b 3918 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3919 char c[l+1];
3920
a2a5291b 3921 memcpy(c, word, l);
0cb9fbcd
LP
3922 c[l] = 0;
3923
3924 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3925 return log_oom();
3926
3927 r = parse_uid(c, &uids[n_uids++]);
3928 if (r < 0) {
3929 log_error("Failed to parse group data from getent.");
3930 return -EIO;
3931 }
3932 }
3933
3934 r = mkdir_parents(home, 0775);
f647962d
MS
3935 if (r < 0)
3936 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3937
3938 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3939 if (r < 0 && r != -EEXIST)
3940 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd 3941
03cfe0d5
LP
3942 (void) fchown(STDIN_FILENO, uid, gid);
3943 (void) fchown(STDOUT_FILENO, uid, gid);
3944 (void) fchown(STDERR_FILENO, uid, gid);
0cb9fbcd 3945
4a62c710
MS
3946 if (setgroups(n_uids, uids) < 0)
3947 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3948
4a62c710
MS
3949 if (setresgid(gid, gid, gid) < 0)
3950 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3951
4a62c710
MS
3952 if (setresuid(uid, uid, uid) < 0)
3953 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3954
3955 if (_home) {
3956 *_home = home;
3957 home = NULL;
3958 }
3959
3960 return 0;
3961}
3962
113cea80 3963/*
6d416b9c
LS
3964 * Return values:
3965 * < 0 : wait_for_terminate() failed to get the state of the
3966 * container, the container was terminated by a signal, or
3967 * failed for an unknown reason. No change is made to the
3968 * container argument.
3969 * > 0 : The program executed in the container terminated with an
3970 * error. The exit code of the program executed in the
919699ec
LP
3971 * container is returned. The container argument has been set
3972 * to CONTAINER_TERMINATED.
6d416b9c
LS
3973 * 0 : The container is being rebooted, has been shut down or exited
3974 * successfully. The container argument has been set to either
3975 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3976 *
6d416b9c
LS
3977 * That is, success is indicated by a return value of zero, and an
3978 * error is indicated by a non-zero value.
113cea80
DH
3979 */
3980static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3981 siginfo_t status;
919699ec 3982 int r;
113cea80
DH
3983
3984 r = wait_for_terminate(pid, &status);
f647962d
MS
3985 if (r < 0)
3986 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3987
3988 switch (status.si_code) {
fddbb89c 3989
113cea80 3990 case CLD_EXITED:
919699ec
LP
3991 if (status.si_status == 0) {
3992 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3993
fddbb89c 3994 } else
919699ec 3995 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3996
919699ec
LP
3997 *container = CONTAINER_TERMINATED;
3998 return status.si_status;
113cea80
DH
3999
4000 case CLD_KILLED:
4001 if (status.si_status == SIGINT) {
113cea80 4002
919699ec 4003 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 4004 *container = CONTAINER_TERMINATED;
919699ec
LP
4005 return 0;
4006
113cea80 4007 } else if (status.si_status == SIGHUP) {
113cea80 4008
919699ec 4009 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 4010 *container = CONTAINER_REBOOTED;
919699ec 4011 return 0;
113cea80 4012 }
919699ec 4013
113cea80
DH
4014 /* CLD_KILLED fallthrough */
4015
4016 case CLD_DUMPED:
fddbb89c 4017 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 4018 return -EIO;
113cea80
DH
4019
4020 default:
fddbb89c 4021 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 4022 return -EIO;
113cea80
DH
4023 }
4024
4025 return r;
4026}
4027
e866af3a
DH
/* Handler with the signature of a signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
}
4029
023fb90b
LP
4030static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
4031 pid_t pid;
4032
4033 pid = PTR_TO_UINT32(userdata);
4034 if (pid > 0) {
c6c8f6e2 4035 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
4036 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
4037 sd_event_source_set_userdata(s, NULL);
4038 return 0;
4039 }
4040 }
4041
4042 sd_event_exit(sd_event_source_get_event(s), 0);
4043 return 0;
4044}
4045
ec16945e 4046static int determine_names(void) {
1b9cebf6 4047 int r;
ec16945e 4048
c1521918
LP
4049 if (arg_template && !arg_directory && arg_machine) {
4050
4051 /* If --template= was specified then we should not
4052 * search for a machine, but instead create a new one
4053 * in /var/lib/machine. */
4054
4055 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
4056 if (!arg_directory)
4057 return log_oom();
4058 }
4059
ec16945e 4060 if (!arg_image && !arg_directory) {
1b9cebf6
LP
4061 if (arg_machine) {
4062 _cleanup_(image_unrefp) Image *i = NULL;
4063
4064 r = image_find(arg_machine, &i);
4065 if (r < 0)
4066 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
4067 else if (r == 0) {
4068 log_error("No image for machine '%s': %m", arg_machine);
4069 return -ENOENT;
4070 }
4071
aceac2f0 4072 if (i->type == IMAGE_RAW)
1b9cebf6
LP
4073 r = set_sanitized_path(&arg_image, i->path);
4074 else
4075 r = set_sanitized_path(&arg_directory, i->path);
4076 if (r < 0)
4077 return log_error_errno(r, "Invalid image directory: %m");
4078
aee327b8
LP
4079 if (!arg_ephemeral)
4080 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 4081 } else
ec16945e
LP
4082 arg_directory = get_current_dir_name();
4083
1b9cebf6
LP
4084 if (!arg_directory && !arg_machine) {
4085 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
4086 return -EINVAL;
4087 }
4088 }
4089
4090 if (!arg_machine) {
b9ba4dab
LP
4091 if (arg_directory && path_equal(arg_directory, "/"))
4092 arg_machine = gethostname_malloc();
4093 else
4094 arg_machine = strdup(basename(arg_image ?: arg_directory));
4095
ec16945e
LP
4096 if (!arg_machine)
4097 return log_oom();
4098
ae691c1d 4099 hostname_cleanup(arg_machine);
ec16945e
LP
4100 if (!machine_name_is_valid(arg_machine)) {
4101 log_error("Failed to determine machine name automatically, please use -M.");
4102 return -EINVAL;
4103 }
b9ba4dab
LP
4104
4105 if (arg_ephemeral) {
4106 char *b;
4107
4108 /* Add a random suffix when this is an
4109 * ephemeral machine, so that we can run many
4110 * instances at once without manually having
4111 * to specify -M each time. */
4112
4113 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4114 return log_oom();
4115
4116 free(arg_machine);
4117 arg_machine = b;
4118 }
ec16945e
LP
4119 }
4120
4121 return 0;
4122}
4123
03cfe0d5 4124static int determine_uid_shift(const char *directory) {
6dac160c
LP
4125 int r;
4126
03cfe0d5
LP
4127 if (!arg_userns) {
4128 arg_uid_shift = 0;
6dac160c 4129 return 0;
03cfe0d5 4130 }
6dac160c
LP
4131
4132 if (arg_uid_shift == UID_INVALID) {
4133 struct stat st;
4134
03cfe0d5 4135 r = stat(directory, &st);
6dac160c 4136 if (r < 0)
03cfe0d5 4137 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
4138
4139 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4140
4141 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 4142 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
4143 return -EINVAL;
4144 }
4145
4146 arg_uid_range = UINT32_C(0x10000);
4147 }
4148
4149 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4150 log_error("UID base too high for UID range.");
4151 return -EINVAL;
4152 }
4153
4154 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4155 return 0;
4156}
4157
03cfe0d5
LP
4158static int inner_child(
4159 Barrier *barrier,
4160 const char *directory,
4161 bool secondary,
4162 int kmsg_socket,
4163 int rtnl_socket,
4164 FDSet *fds,
4165 int argc,
4166 char *argv[]) {
69c79d3c 4167
03cfe0d5
LP
4168 _cleanup_free_ char *home = NULL;
4169 unsigned n_env = 2;
4170 const char *envp[] = {
4171 "PATH=" DEFAULT_PATH_SPLIT_USR,
4172 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4173 NULL, /* TERM */
4174 NULL, /* HOME */
4175 NULL, /* USER */
4176 NULL, /* LOGNAME */
4177 NULL, /* container_uuid */
4178 NULL, /* LISTEN_FDS */
4179 NULL, /* LISTEN_PID */
4180 NULL
4181 };
88213476 4182
2371271c 4183 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 4184 int r;
88213476 4185
03cfe0d5
LP
4186 assert(barrier);
4187 assert(directory);
4188 assert(kmsg_socket >= 0);
88213476 4189
03cfe0d5
LP
4190 if (arg_userns) {
4191 /* Tell the parent, that it now can write the UID map. */
4192 (void) barrier_place(barrier); /* #1 */
7027ff61 4193
03cfe0d5
LP
4194 /* Wait until the parent wrote the UID map */
4195 if (!barrier_place_and_sync(barrier)) { /* #2 */
4196 log_error("Parent died too early");
4197 return -ESRCH;
4198 }
88213476
LP
4199 }
4200
03cfe0d5
LP
4201 r = mount_all(NULL, true);
4202 if (r < 0)
4203 return r;
4204
4205 /* Wait until we are cgroup-ified, so that we
4206 * can mount the right cgroup path writable */
4207 if (!barrier_place_and_sync(barrier)) { /* #3 */
4208 log_error("Parent died too early");
4209 return -ESRCH;
88213476
LP
4210 }
4211
03cfe0d5
LP
4212 r = mount_systemd_cgroup_writable("");
4213 if (r < 0)
4214 return r;
ec16945e 4215
03cfe0d5
LP
4216 r = reset_uid_gid();
4217 if (r < 0)
4218 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 4219
03cfe0d5
LP
4220 r = setup_boot_id(NULL);
4221 if (r < 0)
4222 return r;
ec16945e 4223
03cfe0d5
LP
4224 r = setup_kmsg(NULL, kmsg_socket);
4225 if (r < 0)
4226 return r;
4227 kmsg_socket = safe_close(kmsg_socket);
ec16945e 4228
03cfe0d5 4229 umask(0022);
30535c16 4230
03cfe0d5
LP
4231 if (setsid() < 0)
4232 return log_error_errno(errno, "setsid() failed: %m");
4233
4234 if (arg_private_network)
4235 loopback_setup();
4236
4237 r = send_rtnl(rtnl_socket);
4238 if (r < 0)
4239 return r;
4240 rtnl_socket = safe_close(rtnl_socket);
4241
4242 if (drop_capabilities() < 0)
4243 return log_error_errno(errno, "drop_capabilities() failed: %m");
4244
4245 setup_hostname();
4246
050f7277 4247 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
4248 if (personality(arg_personality) < 0)
4249 return log_error_errno(errno, "personality() failed: %m");
4250 } else if (secondary) {
4251 if (personality(PER_LINUX32) < 0)
4252 return log_error_errno(errno, "personality() failed: %m");
4253 }
4254
4255#ifdef HAVE_SELINUX
4256 if (arg_selinux_context)
4257 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4258 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4259#endif
4260
4261 r = change_uid_gid(&home);
4262 if (r < 0)
4263 return r;
4264
4265 envp[n_env] = strv_find_prefix(environ, "TERM=");
4266 if (envp[n_env])
4267 n_env ++;
4268
4269 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4270 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4271 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4272 return log_oom();
4273
4274 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4275 char as_uuid[37];
4276
4277 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4278 return log_oom();
4279 }
4280
4281 if (fdset_size(fds) > 0) {
4282 r = fdset_cloexec(fds, false);
4283 if (r < 0)
4284 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4285
4286 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4287 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4288 return log_oom();
4289 }
4290
2371271c
TG
4291 env_use = strv_env_merge(2, envp, arg_setenv);
4292 if (!env_use)
4293 return log_oom();
03cfe0d5
LP
4294
4295 /* Let the parent know that we are ready and
4296 * wait until the parent is ready with the
4297 * setup, too... */
4298 if (!barrier_place_and_sync(barrier)) { /* #4 */
4299 log_error("Parent died too early");
4300 return -ESRCH;
4301 }
4302
4303 /* Now, explicitly close the log, so that we
4304 * then can close all remaining fds. Closing
4305 * the log explicitly first has the benefit
4306 * that the logging subsystem knows about it,
4307 * and is thus ready to be reopened should we
4308 * need it again. Note that the other fds
4309 * closed here are at least the locking and
4310 * barrier fds. */
4311 log_close();
4312 (void) fdset_close_others(fds);
4313
4314 if (arg_boot) {
4315 char **a;
4316 size_t m;
4317
4318 /* Automatically search for the init system */
4319
4320 m = 1 + argc - optind;
4321 a = newa(char*, m + 1);
4322 memcpy(a + 1, argv + optind, m * sizeof(char*));
4323
4324 a[0] = (char*) "/usr/lib/systemd/systemd";
4325 execve(a[0], a, env_use);
4326
4327 a[0] = (char*) "/lib/systemd/systemd";
4328 execve(a[0], a, env_use);
4329
4330 a[0] = (char*) "/sbin/init";
4331 execve(a[0], a, env_use);
4332 } else if (argc > optind)
4333 execvpe(argv[optind], argv + optind, env_use);
4334 else {
4335 chdir(home ? home : "/root");
4336 execle("/bin/bash", "-bash", NULL, env_use);
4337 execle("/bin/sh", "-sh", NULL, env_use);
4338 }
4339
4340 (void) log_open();
4341 return log_error_errno(errno, "execv() failed: %m");
4342}
4343
4344static int outer_child(
4345 Barrier *barrier,
4346 const char *directory,
4347 const char *console,
4348 const char *root_device, bool root_device_rw,
4349 const char *home_device, bool home_device_rw,
4350 const char *srv_device, bool srv_device_rw,
4351 bool interactive,
4352 bool secondary,
4353 int pid_socket,
4354 int kmsg_socket,
4355 int rtnl_socket,
825d5287 4356 int uid_shift_socket,
03cfe0d5
LP
4357 FDSet *fds,
4358 int argc,
4359 char *argv[]) {
4360
4361 pid_t pid;
4362 ssize_t l;
4363 int r;
4364
4365 assert(barrier);
4366 assert(directory);
4367 assert(console);
4368 assert(pid_socket >= 0);
4369 assert(kmsg_socket >= 0);
4370
4371 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4372 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4373
4374 if (interactive) {
4375 close_nointr(STDIN_FILENO);
4376 close_nointr(STDOUT_FILENO);
4377 close_nointr(STDERR_FILENO);
4378
4379 r = open_terminal(console, O_RDWR);
4380 if (r != STDIN_FILENO) {
4381 if (r >= 0) {
4382 safe_close(r);
4383 r = -EINVAL;
4384 }
4385
4386 return log_error_errno(r, "Failed to open console: %m");
4387 }
4388
4389 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4390 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4391 return log_error_errno(errno, "Failed to duplicate console: %m");
4392 }
4393
4394 r = reset_audit_loginuid();
4395 if (r < 0)
4396 return r;
4397
4398 /* Mark everything as slave, so that we still
4399 * receive mounts from the real root, but don't
4400 * propagate mounts to the real root. */
4401 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4402 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4403
4404 r = mount_devices(directory,
4405 root_device, root_device_rw,
4406 home_device, home_device_rw,
4407 srv_device, srv_device_rw);
4408 if (r < 0)
4409 return r;
4410
391567f4
LP
4411 r = determine_uid_shift(directory);
4412 if (r < 0)
4413 return r;
4414
825d5287
RM
4415 if (arg_userns) {
4416 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4417 if (l < 0)
4418 return log_error_errno(errno, "Failed to send UID shift: %m");
4419 if (l != sizeof(arg_uid_shift)) {
4420 log_error("Short write while sending UID shift.");
4421 return -EIO;
4422 }
4423 }
4424
03cfe0d5
LP
4425 /* Turn directory into bind mount */
4426 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4427 return log_error_errno(errno, "Failed to make bind mount: %m");
4428
03cfe0d5
LP
4429 r = setup_volatile(directory);
4430 if (r < 0)
4431 return r;
4432
03cfe0d5
LP
4433 r = setup_volatile_state(directory);
4434 if (r < 0)
4435 return r;
4436
03cfe0d5
LP
4437 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4438 if (r < 0)
4439 return r;
4440
03cfe0d5
LP
4441 if (arg_read_only) {
4442 r = bind_remount_recursive(directory, true);
4443 if (r < 0)
4444 return log_error_errno(r, "Failed to make tree read-only: %m");
4445 }
4446
03cfe0d5
LP
4447 r = mount_all(directory, false);
4448 if (r < 0)
4449 return r;
4450
4451 if (copy_devnodes(directory) < 0)
4452 return r;
4453
4454 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4455
4456 if (setup_pts(directory) < 0)
4457 return r;
4458
4459 r = setup_propagate(directory);
4460 if (r < 0)
4461 return r;
4462
4463 r = setup_dev_console(directory, console);
4464 if (r < 0)
4465 return r;
4466
4467 r = setup_seccomp();
4468 if (r < 0)
4469 return r;
4470
4471 r = setup_timezone(directory);
4472 if (r < 0)
4473 return r;
4474
4475 r = setup_resolv_conf(directory);
4476 if (r < 0)
4477 return r;
4478
4479 r = setup_journal(directory);
4480 if (r < 0)
4481 return r;
4482
4483 r = mount_custom(directory);
4484 if (r < 0)
4485 return r;
4486
4487 r = mount_cgroup(directory);
4488 if (r < 0)
4489 return r;
4490
4491 r = mount_move_root(directory);
4492 if (r < 0)
4493 return log_error_errno(r, "Failed to move root directory: %m");
4494
4495 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4496 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4497 (arg_private_network ? CLONE_NEWNET : 0) |
4498 (arg_userns ? CLONE_NEWUSER : 0),
4499 NULL);
4500 if (pid < 0)
4501 return log_error_errno(errno, "Failed to fork inner child: %m");
4502
4503 if (pid == 0) {
4504 pid_socket = safe_close(pid_socket);
825d5287 4505 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
4506
4507 /* The inner child has all namespaces that are
4508 * requested, so that we all are owned by the user if
4509 * user namespaces are turned on. */
4510
4511 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4512 if (r < 0)
4513 _exit(EXIT_FAILURE);
4514
4515 _exit(EXIT_SUCCESS);
4516 }
4517
4518 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4519 if (l < 0)
4520 return log_error_errno(errno, "Failed to send PID: %m");
4521 if (l != sizeof(pid)) {
4522 log_error("Short write while sending PID.");
4523 return -EIO;
4524 }
4525
4526 pid_socket = safe_close(pid_socket);
4527
4528 return 0;
4529}
4530
4531static int setup_uid_map(pid_t pid) {
4532 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4533 int r;
4534
4535 assert(pid > 1);
4536
4537 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4538 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 4539 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
4540 if (r < 0)
4541 return log_error_errno(r, "Failed to write UID map: %m");
4542
4543 /* We always assign the same UID and GID ranges */
4544 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 4545 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
4546 if (r < 0)
4547 return log_error_errno(r, "Failed to write GID map: %m");
4548
4549 return 0;
4550}
4551
4552static int chown_cgroup(pid_t pid) {
4553 _cleanup_free_ char *path = NULL, *fs = NULL;
4554 _cleanup_close_ int fd = -1;
4555 const char *fn;
4556 int r;
4557
4558 r = cg_pid_get_path(NULL, pid, &path);
4559 if (r < 0)
4560 return log_error_errno(r, "Failed to get container cgroup path: %m");
4561
4562 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4563 if (r < 0)
4564 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4565
4566 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4567 if (fd < 0)
4568 return log_error_errno(errno, "Failed to open %s: %m", fs);
4569
4570 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4571 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4572 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4573
4574 return 0;
4575}
4576
4577int main(int argc, char *argv[]) {
4578
4579 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4580 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4581 _cleanup_close_ int master = -1, image_fd = -1;
4582 _cleanup_fdset_free_ FDSet *fds = NULL;
4583 int r, n_fd_passed, loop_nr = -1;
4584 char veth_name[IFNAMSIZ];
4585 bool secondary = false, remove_subvol = false;
72c0a2c2 4586 sigset_t mask_chld;
03cfe0d5
LP
4587 pid_t pid = 0;
4588 int ret = EXIT_SUCCESS;
4589 union in_addr_union exposed = {};
4590 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4591 bool interactive;
4592
4593 log_parse_environment();
4594 log_open();
4595
4596 r = parse_argv(argc, argv);
4597 if (r <= 0)
4598 goto finish;
4599
4600 r = determine_names();
4601 if (r < 0)
4602 goto finish;
4603
4604 if (geteuid() != 0) {
4605 log_error("Need to be root.");
4606 r = -EPERM;
4607 goto finish;
4608 }
4609
4610 n_fd_passed = sd_listen_fds(false);
4611 if (n_fd_passed > 0) {
4612 r = fdset_new_listen_fds(&fds, false);
4613 if (r < 0) {
4614 log_error_errno(r, "Failed to collect file descriptors: %m");
4615 goto finish;
4616 }
4617 }
4618
4619 if (arg_directory) {
4620 assert(!arg_image);
4621
4622 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4623 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4624 r = -EINVAL;
4625 goto finish;
4626 }
4627
4628 if (arg_ephemeral) {
4629 _cleanup_free_ char *np = NULL;
4630
4631 /* If the specified path is a mount point we
4632 * generate the new snapshot immediately
4633 * inside it under a random name. However if
4634 * the specified is not a mount point we
4635 * create the new snapshot in the parent
4636 * directory, just next to it. */
e26d6ce5 4637 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
4638 if (r < 0) {
4639 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4640 goto finish;
4641 }
4642 if (r > 0)
770b5ce4 4643 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4644 else
770b5ce4 4645 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
4646 if (r < 0) {
4647 log_error_errno(r, "Failed to generate name for snapshot: %m");
4648 goto finish;
4649 }
4650
4651 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4652 if (r < 0) {
4653 log_error_errno(r, "Failed to lock %s: %m", np);
4654 goto finish;
4655 }
4656
4657 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4658 if (r < 0) {
4659 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4660 goto finish;
ec16945e
LP
4661 }
4662
4663 free(arg_directory);
4664 arg_directory = np;
8a16a7b4 4665 np = NULL;
ec16945e
LP
4666
4667 remove_subvol = true;
30535c16
LP
4668
4669 } else {
4670 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4671 if (r == -EBUSY) {
4672 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4673 goto finish;
4674 }
4675 if (r < 0) {
4676 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4677 return r;
4678 }
4679
4680 if (arg_template) {
f70a17f8 4681 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
30535c16
LP
4682 if (r == -EEXIST) {
4683 if (!arg_quiet)
4684 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4685 } else if (r < 0) {
83521414 4686 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4687 goto finish;
4688 } else {
4689 if (!arg_quiet)
4690 log_info("Populated %s from template %s.", arg_directory, arg_template);
4691 }
4692 }
ec16945e
LP
4693 }
4694
1b9e5b12
LP
4695 if (arg_boot) {
4696 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 4697 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 4698 r = -EINVAL;
1b9e5b12
LP
4699 goto finish;
4700 }
4701 } else {
4702 const char *p;
4703
63c372cb 4704 p = strjoina(arg_directory,
1b9e5b12
LP
4705 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4706 if (access(p, F_OK) < 0) {
4707 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 4708 r = -EINVAL;
1b9e5b12 4709 goto finish;
1b9e5b12
LP
4710 }
4711 }
ec16945e 4712
6b9132a9 4713 } else {
1b9e5b12 4714 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 4715
ec16945e
LP
4716 assert(arg_image);
4717 assert(!arg_template);
4718
30535c16
LP
4719 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4720 if (r == -EBUSY) {
4721 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4722 goto finish;
4723 }
4724 if (r < 0) {
4725 r = log_error_errno(r, "Failed to create image lock: %m");
4726 goto finish;
4727 }
4728
1b9e5b12 4729 if (!mkdtemp(template)) {
56f64d95 4730 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 4731 r = -errno;
6b9132a9 4732 goto finish;
1b9e5b12 4733 }
6b9132a9 4734
1b9e5b12
LP
4735 arg_directory = strdup(template);
4736 if (!arg_directory) {
4737 r = log_oom();
4738 goto finish;
6b9132a9 4739 }
88213476 4740
1b9e5b12
LP
4741 image_fd = setup_image(&device_path, &loop_nr);
4742 if (image_fd < 0) {
4743 r = image_fd;
842f3b0f
LP
4744 goto finish;
4745 }
1b9e5b12 4746
4d9f07b4
LP
4747 r = dissect_image(image_fd,
4748 &root_device, &root_device_rw,
4749 &home_device, &home_device_rw,
4750 &srv_device, &srv_device_rw,
4751 &secondary);
1b9e5b12
LP
4752 if (r < 0)
4753 goto finish;
842f3b0f 4754 }
842f3b0f 4755
5a8af538
LP
4756 r = custom_mounts_prepare();
4757 if (r < 0)
4758 goto finish;
4759
03cfe0d5
LP
4760 interactive =
4761 isatty(STDIN_FILENO) > 0 &&
4762 isatty(STDOUT_FILENO) > 0;
9c857b9d 4763
db7feb7e
LP
4764 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4765 if (master < 0) {
ec16945e 4766 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4767 goto finish;
4768 }
4769
611b312b
LP
4770 r = ptsname_malloc(master, &console);
4771 if (r < 0) {
4772 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
4773 goto finish;
4774 }
4775
a258bf26 4776 if (unlockpt(master) < 0) {
ec16945e 4777 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4778 goto finish;
4779 }
4780
9c857b9d
LP
4781 if (!arg_quiet)
4782 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4783 arg_machine, arg_image ?: arg_directory);
4784
72c0a2c2 4785 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4786
023fb90b
LP
4787 assert_se(sigemptyset(&mask_chld) == 0);
4788 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4789
03cfe0d5
LP
4790 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4791 r = log_error_errno(errno, "Failed to become subreaper: %m");
4792 goto finish;
4793 }
4794
d87be9b0 4795 for (;;) {
825d5287
RM
4796 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4797 uid_shift_socket_pair[2] = { -1, -1 };
113cea80 4798 ContainerStatus container_status;
7566e267 4799 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 4800 static const struct sigaction sa = {
e866af3a
DH
4801 .sa_handler = nop_handler,
4802 .sa_flags = SA_NOCLDSTOP,
4803 };
03cfe0d5
LP
4804 int ifi = 0;
4805 ssize_t l;
dbb60d69
LP
4806 _cleanup_event_unref_ sd_event *event = NULL;
4807 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4808 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4809 char last_char = 0;
e866af3a 4810
7566e267 4811 r = barrier_create(&barrier);
a2da110b 4812 if (r < 0) {
da927ba9 4813 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
4814 goto finish;
4815 }
4816
6d0b55c2
LP
4817 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4818 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4819 goto finish;
4820 }
4821
4822 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4823 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4824 goto finish;
4825 }
4826
03cfe0d5
LP
4827 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4828 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4829 goto finish;
4830 }
4831
825d5287
RM
4832 if (arg_userns)
4833 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4834 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4835 goto finish;
4836 }
4837
e866af3a
DH
4838 /* Child can be killed before execv(), so handle SIGCHLD
4839 * in order to interrupt parent's blocking calls and
4840 * give it a chance to call wait() and terminate. */
4841 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4842 if (r < 0) {
ec16945e 4843 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
4844 goto finish;
4845 }
4846
e866af3a
DH
4847 r = sigaction(SIGCHLD, &sa, NULL);
4848 if (r < 0) {
ec16945e 4849 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
4850 goto finish;
4851 }
4852
03cfe0d5 4853 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
4854 if (pid < 0) {
4855 if (errno == EINVAL)
ec16945e 4856 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 4857 else
ec16945e 4858 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 4859
d87be9b0
LP
4860 goto finish;
4861 }
a258bf26 4862
d87be9b0 4863 if (pid == 0) {
03cfe0d5 4864 /* The outer child only has a file system namespace. */
a2da110b
DH
4865 barrier_set_role(&barrier, BARRIER_CHILD);
4866
03e334a1 4867 master = safe_close(master);
a258bf26 4868
03e334a1 4869 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 4870 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 4871 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 4872 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 4873
ce30c8dc
LP
4874 (void) reset_all_signal_handlers();
4875 (void) reset_signal_mask();
f5c1b9ee 4876
03cfe0d5
LP
4877 r = outer_child(&barrier,
4878 arg_directory,
4879 console,
4880 root_device, root_device_rw,
4881 home_device, home_device_rw,
4882 srv_device, srv_device_rw,
4883 interactive,
4884 secondary,
4885 pid_socket_pair[1],
4886 kmsg_socket_pair[1],
4887 rtnl_socket_pair[1],
825d5287 4888 uid_shift_socket_pair[1],
03cfe0d5
LP
4889 fds,
4890 argc, argv);
0cb9fbcd 4891 if (r < 0)
a2da110b 4892 _exit(EXIT_FAILURE);
d87be9b0 4893
03cfe0d5 4894 _exit(EXIT_SUCCESS);
da5b3bad 4895 }
88213476 4896
a2da110b 4897 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 4898
842f3b0f
LP
4899 fdset_free(fds);
4900 fds = NULL;
4901
6d0b55c2
LP
4902 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4903 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 4904 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
6d0b55c2 4905
03cfe0d5
LP
4906 /* Wait for the outer child. */
4907 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4908 if (r < 0)
4909 goto finish;
4910 if (r != 0) {
4911 r = -EIO;
4912 goto finish;
4913 }
4914 pid = 0;
6dac160c 4915
03cfe0d5
LP
4916 /* And now retrieve the PID of the inner child. */
4917 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4918 if (l < 0) {
4919 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4920 goto finish;
4921 }
4922 if (l != sizeof(pid)) {
4923 log_error("Short read while reading inner child PID: %m");
4924 r = EIO;
4925 goto finish;
4926 }
354bfd2b 4927
03cfe0d5 4928 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 4929
03cfe0d5
LP
4930 if (arg_userns) {
4931 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4932 log_error("Child died too early.");
4933 r = -ESRCH;
840295fc 4934 goto finish;
03cfe0d5 4935 }
ab046dde 4936
825d5287
RM
4937 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4938 if (l < 0) {
4939 r = log_error_errno(errno, "Failed to read UID shift: %m");
4940 goto finish;
4941 }
4942 if (l != sizeof(arg_uid_shift)) {
4943 log_error("Short read while reading UID shift: %m");
4944 r = EIO;
4945 goto finish;
4946 }
4947
03cfe0d5 4948 r = setup_uid_map(pid);
840295fc
LP
4949 if (r < 0)
4950 goto finish;
ab046dde 4951
03cfe0d5
LP
4952 (void) barrier_place(&barrier); /* #2 */
4953 }
c74e630d 4954
03cfe0d5
LP
4955 r = move_network_interfaces(pid);
4956 if (r < 0)
4957 goto finish;
4bbfe7ad 4958
03cfe0d5
LP
4959 r = setup_veth(pid, veth_name, &ifi);
4960 if (r < 0)
4961 goto finish;
5aa4bb6b 4962
03cfe0d5
LP
4963 r = setup_bridge(veth_name, &ifi);
4964 if (r < 0)
4965 goto finish;
6dac160c 4966
03cfe0d5
LP
4967 r = setup_macvlan(pid);
4968 if (r < 0)
4969 goto finish;
6dac160c 4970
03cfe0d5
LP
4971 r = setup_ipvlan(pid);
4972 if (r < 0)
4973 goto finish;
6dac160c 4974
03cfe0d5
LP
4975 r = register_machine(pid, ifi);
4976 if (r < 0)
4977 goto finish;
6dac160c 4978
03cfe0d5
LP
4979 r = chown_cgroup(pid);
4980 if (r < 0)
4981 goto finish;
6dac160c 4982
03cfe0d5
LP
4983 /* Notify the child that the parent is ready with all
4984 * its setup (including cgroup-ification), and that
4985 * the child can now hand over control to the code to
4986 * run inside the container. */
4987 (void) barrier_place(&barrier); /* #3 */
6dac160c 4988
03cfe0d5
LP
4989 /* Block SIGCHLD here, before notifying child.
4990 * process_pty() will handle it with the other signals. */
4991 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 4992
03cfe0d5
LP
4993 /* Reset signal to default */
4994 r = default_signals(SIGCHLD, -1);
4995 if (r < 0) {
4996 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4997 goto finish;
4998 }
e866af3a 4999
03cfe0d5
LP
5000 /* Let the child know that we are ready and wait that the child is completely ready now. */
5001 if (!barrier_place_and_sync(&barrier)) { /* #5 */
5002 log_error("Client died too early.");
5003 r = -ESRCH;
5004 goto finish;
5005 }
b12afc8c 5006
03cfe0d5
LP
5007 sd_notifyf(false,
5008 "READY=1\n"
5009 "STATUS=Container running.\n"
5010 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 5011
03cfe0d5
LP
5012 r = sd_event_new(&event);
5013 if (r < 0) {
5014 log_error_errno(r, "Failed to get default event source: %m");
5015 goto finish;
5016 }
88213476 5017
03cfe0d5
LP
5018 if (arg_kill_signal > 0) {
5019 /* Try to kill the init system on SIGINT or SIGTERM */
5020 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
5021 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
5022 } else {
5023 /* Immediately exit */
5024 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5025 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5026 }
023fb90b 5027
03cfe0d5
LP
5028 /* simply exit on sigchld */
5029 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 5030
03cfe0d5
LP
5031 if (arg_expose_ports) {
5032 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
5033 if (r < 0)
5034 goto finish;
023fb90b 5035
03cfe0d5
LP
5036 (void) expose_ports(rtnl, &exposed);
5037 }
023fb90b 5038
03cfe0d5 5039 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 5040
03cfe0d5
LP
5041 r = pty_forward_new(event, master, true, !interactive, &forward);
5042 if (r < 0) {
5043 log_error_errno(r, "Failed to create PTY forwarder: %m");
5044 goto finish;
5045 }
023fb90b 5046
03cfe0d5
LP
5047 r = sd_event_loop(event);
5048 if (r < 0) {
5049 log_error_errno(r, "Failed to run event loop: %m");
5050 goto finish;
5051 }
6d0b55c2 5052
03cfe0d5 5053 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 5054
03cfe0d5 5055 forward = pty_forward_free(forward);
6d0b55c2 5056
03cfe0d5
LP
5057 if (!arg_quiet && last_char != '\n')
5058 putc('\n', stdout);
04d39279 5059
03cfe0d5
LP
5060 /* Kill if it is not dead yet anyway */
5061 terminate_machine(pid);
1f0cd86b 5062
840295fc 5063 /* Normally redundant, but better safe than sorry */
04d39279 5064 kill(pid, SIGKILL);
a258bf26 5065
113cea80 5066 r = wait_for_container(pid, &container_status);
04d39279
LP
5067 pid = 0;
5068
ec16945e 5069 if (r < 0)
ce9f1527
LP
5070 /* We failed to wait for the container, or the
5071 * container exited abnormally */
ec16945e
LP
5072 goto finish;
5073 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
5074 /* The container exited with a non-zero
5075 * status, or with zero status and no reboot
5076 * was requested. */
ec16945e 5077 ret = r;
d87be9b0 5078 break;
ec16945e 5079 }
88213476 5080
113cea80 5081 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
5082
5083 if (arg_keep_unit) {
5084 /* Special handling if we are running as a
5085 * service: instead of simply restarting the
5086 * machine we want to restart the entire
5087 * service, so let's inform systemd about this
5088 * with the special exit code 133. The service
5089 * file uses RestartForceExitStatus=133 so
5090 * that this results in a full nspawn
5091 * restart. This is necessary since we might
5092 * have cgroup parameters set we want to have
5093 * flushed out. */
ec16945e
LP
5094 ret = 133;
5095 r = 0;
ce38dbc8
LP
5096 break;
5097 }
6d0b55c2
LP
5098
5099 flush_ports(&exposed);
d87be9b0 5100 }
88213476
LP
5101
5102finish:
af4ec430
LP
5103 sd_notify(false,
5104 "STOPPING=1\n"
5105 "STATUS=Terminating...");
5106
9444b1f2
LP
5107 if (pid > 0)
5108 kill(pid, SIGKILL);
88213476 5109
503546da
LP
5110 /* Try to flush whatever is still queued in the pty */
5111 if (master >= 0)
5112 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5113
03cfe0d5
LP
5114 loop_remove(loop_nr, &image_fd);
5115
ec16945e
LP
5116 if (remove_subvol && arg_directory) {
5117 int k;
5118
d9e2daaf 5119 k = btrfs_subvol_remove(arg_directory, true);
ec16945e
LP
5120 if (k < 0)
5121 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5122 }
5123
785890ac
LP
5124 if (arg_machine) {
5125 const char *p;
5126
63c372cb 5127 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5128 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5129 }
5130
04d391da 5131 free(arg_directory);
ec16945e
LP
5132 free(arg_template);
5133 free(arg_image);
7027ff61 5134 free(arg_machine);
c74e630d
LP
5135 free(arg_user);
5136 strv_free(arg_setenv);
5137 strv_free(arg_network_interfaces);
5138 strv_free(arg_network_macvlan);
4bbfe7ad 5139 strv_free(arg_network_ipvlan);
5a8af538 5140 custom_mount_free_all();
88213476 5141
6d0b55c2
LP
5142 flush_ports(&exposed);
5143
5144 while (arg_expose_ports) {
5145 ExposePort *p = arg_expose_ports;
5146 LIST_REMOVE(ports, arg_expose_ports, p);
5147 free(p);
5148 }
5149
ec16945e 5150 return r < 0 ? EXIT_FAILURE : ret;
88213476 5151}