]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: split out mount related functions into a new nspawn-mount.c file
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
1f0cd86b
LP
55#include "sd-daemon.h"
56#include "sd-bus.h"
57#include "sd-id128.h"
1c4baffc 58#include "sd-netlink.h"
958b66ea 59#include "random-util.h"
88213476
LP
60#include "log.h"
61#include "util.h"
49e942b2 62#include "mkdir.h"
c6878637 63#include "rm-rf.h"
6b2d0e85 64#include "macro.h"
94d82985 65#include "missing.h"
04d391da 66#include "cgroup-util.h"
a258bf26 67#include "strv.h"
9eb977db 68#include "path-util.h"
a41fe3a2 69#include "loopback-setup.h"
4fc9982c 70#include "dev-setup.h"
842f3b0f 71#include "fdset.h"
acbeb427 72#include "build.h"
a5c32cff 73#include "fileio.h"
40ca29a1 74#include "bus-util.h"
1f0cd86b 75#include "bus-error.h"
4ba93280 76#include "ptyfwd.h"
f4889f65 77#include "env-util.h"
1c4baffc 78#include "netlink-util.h"
7e227024 79#include "udev-util.h"
1b9e5b12
LP
80#include "blkid-util.h"
81#include "gpt.h"
01dde061 82#include "siphash24.h"
849958d1 83#include "copy.h"
3577de7a 84#include "base-filesystem.h"
a2da110b 85#include "barrier.h"
023fb90b 86#include "event-util.h"
f01ae826 87#include "capability.h"
2822da4f 88#include "cap-list.h"
ec16945e 89#include "btrfs-util.h"
1b9cebf6 90#include "machine-image.h"
6d0b55c2
LP
91#include "list.h"
92#include "in-addr-util.h"
12c2884c 93#include "firewall-util.h"
6d0b55c2 94#include "local-addresses.h"
6482f626 95#include "formats-util.h"
0b452006 96#include "process-util.h"
288a74cc 97#include "terminal-util.h"
958b66ea 98#include "hostname-util.h"
24882e06 99#include "signal-util.h"
f2d88580 100
e9642be2
LP
101#ifdef HAVE_SECCOMP
102#include "seccomp-util.h"
103#endif
104
f757855e
LP
105#include "nspawn.h"
106#include "nspawn-settings.h"
e83bebef 107#include "nspawn-mount.h"
6d0b55c2 108
113cea80
DH
/* How a container run ended. NOTE(review): the consumers of this enum are outside the visible
 * part of the file; presumably CONTAINER_REBOOTED asks the caller to start the container
 * again -- confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
113
57fb9fb5
LP
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; the built-in default */
        LINK_HOST,      /* --link-journal=host and try-host */
        LINK_GUEST      /* --link-journal=guest and try-guest, also -j */
} LinkJournal;
88213476
LP
120
121static char *arg_directory = NULL;
ec16945e 122static char *arg_template = NULL;
687d0825 123static char *arg_user = NULL;
9444b1f2 124static sd_id128_t arg_uuid = {};
7027ff61 125static char *arg_machine = NULL;
c74e630d
LP
126static const char *arg_selinux_context = NULL;
127static const char *arg_selinux_apifs_context = NULL;
9444b1f2 128static const char *arg_slice = NULL;
ff01d048 129static bool arg_private_network = false;
bc2f673e 130static bool arg_read_only = false;
0f0dbc46 131static bool arg_boot = false;
ec16945e 132static bool arg_ephemeral = false;
57fb9fb5 133static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 134static bool arg_link_journal_try = false;
5076f0cc
LP
135static uint64_t arg_retain =
136 (1ULL << CAP_CHOWN) |
137 (1ULL << CAP_DAC_OVERRIDE) |
138 (1ULL << CAP_DAC_READ_SEARCH) |
139 (1ULL << CAP_FOWNER) |
140 (1ULL << CAP_FSETID) |
141 (1ULL << CAP_IPC_OWNER) |
142 (1ULL << CAP_KILL) |
143 (1ULL << CAP_LEASE) |
144 (1ULL << CAP_LINUX_IMMUTABLE) |
145 (1ULL << CAP_NET_BIND_SERVICE) |
146 (1ULL << CAP_NET_BROADCAST) |
147 (1ULL << CAP_NET_RAW) |
148 (1ULL << CAP_SETGID) |
149 (1ULL << CAP_SETFCAP) |
150 (1ULL << CAP_SETPCAP) |
151 (1ULL << CAP_SETUID) |
152 (1ULL << CAP_SYS_ADMIN) |
153 (1ULL << CAP_SYS_CHROOT) |
154 (1ULL << CAP_SYS_NICE) |
155 (1ULL << CAP_SYS_PTRACE) |
156 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 157 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
158 (1ULL << CAP_SYS_BOOT) |
159 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
160 (1ULL << CAP_AUDIT_CONTROL) |
161 (1ULL << CAP_MKNOD);
5a8af538
LP
162static CustomMount *arg_custom_mounts = NULL;
163static unsigned arg_n_custom_mounts = 0;
f4889f65 164static char **arg_setenv = NULL;
284c0b91 165static bool arg_quiet = false;
8a96d94e 166static bool arg_share_system = false;
eb91eb18 167static bool arg_register = true;
89f7c846 168static bool arg_keep_unit = false;
aa28aefe 169static char **arg_network_interfaces = NULL;
c74e630d 170static char **arg_network_macvlan = NULL;
4bbfe7ad 171static char **arg_network_ipvlan = NULL;
69c79d3c 172static bool arg_network_veth = false;
f757855e 173static char *arg_network_bridge = NULL;
050f7277 174static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 175static char *arg_image = NULL;
f757855e 176static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 177static ExposePort *arg_expose_ports = NULL;
f36933fe 178static char **arg_property = NULL;
6dac160c
LP
179static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
180static bool arg_userns = false;
c6c8f6e2 181static int arg_kill_signal = 0;
efdb0237 182static bool arg_unified_cgroup_hierarchy = false;
f757855e
LP
183static SettingsMask arg_settings_mask = 0;
184static int arg_settings_trusted = -1;
185static char **arg_parameters = NULL;
88213476 186
601185b4 187static void help(void) {
88213476
LP
188 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
189 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
190 " -h --help Show this help\n"
191 " --version Print version string\n"
69c79d3c 192 " -q --quiet Do not show status information\n"
1b9e5b12 193 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
194 " --template=PATH Initialize root directory from template directory,\n"
195 " if missing\n"
196 " -x --ephemeral Run container with snapshot of root directory, and\n"
197 " remove it after exit\n"
198 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
199 " -b --boot Boot up full system (i.e. invoke init)\n"
200 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 201 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 202 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 203 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 204 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
205 " --private-users[=UIDBASE[:NUIDS]]\n"
206 " Run within user namespace\n"
69c79d3c
LP
207 " --private-network Disable network in container\n"
208 " --network-interface=INTERFACE\n"
209 " Assign an existing network interface to the\n"
210 " container\n"
c74e630d
LP
211 " --network-macvlan=INTERFACE\n"
212 " Create a macvlan network interface based on an\n"
213 " existing network interface to the container\n"
4bbfe7ad
TG
214 " --network-ipvlan=INTERFACE\n"
215 " Create a ipvlan network interface based on an\n"
216 " existing network interface to the container\n"
0dfaa006 217 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 218 " and container\n"
ab046dde 219 " --network-bridge=INTERFACE\n"
32457153 220 " Add a virtual ethernet connection between host\n"
ab046dde
TG
221 " and container and add it to an existing bridge on\n"
222 " the host\n"
6d0b55c2 223 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 224 " Expose a container IP port on the host\n"
82adf6af
LP
225 " -Z --selinux-context=SECLABEL\n"
226 " Set the SELinux security context to be used by\n"
227 " processes in the container\n"
228 " -L --selinux-apifs-context=SECLABEL\n"
229 " Set the SELinux security context to be used by\n"
230 " API/tmpfs file systems in the container\n"
a8828ed9
DW
231 " --capability=CAP In addition to the default, retain specified\n"
232 " capability\n"
233 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 234 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
235 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
236 " try-guest, try-host\n"
237 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 238 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
239 " --bind=PATH[:PATH[:OPTIONS]]\n"
240 " Bind mount a file or directory from the host into\n"
a8828ed9 241 " the container\n"
5e5bfa6e
EY
242 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
243 " Similar, but creates a read-only bind mount\n"
06c17c39 244 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
245 " --overlay=PATH[:PATH...]:PATH\n"
246 " Create an overlay mount from the host to \n"
247 " the container\n"
248 " --overlay-ro=PATH[:PATH...]:PATH\n"
249 " Similar, but creates a read-only overlay mount\n"
284c0b91 250 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 251 " --share-system Share system namespaces with host\n"
eb91eb18 252 " --register=BOOLEAN Register container as machine\n"
89f7c846 253 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 254 " the service unit nspawn is running in\n"
6d0b55c2 255 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 256 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 257 , program_invocation_short_name);
88213476
LP
258}
259
5a8af538
LP
260
261static int custom_mounts_prepare(void) {
262 unsigned i;
263 int r;
264
265 /* Ensure the mounts are applied prefix first. */
266 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
267
268 /* Allocate working directories for the overlay file systems that need it */
269 for (i = 0; i < arg_n_custom_mounts; i++) {
270 CustomMount *m = &arg_custom_mounts[i];
271
825d5287
RM
272 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
273 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
274 return -EINVAL;
275 }
276
5a8af538
LP
277 if (m->type != CUSTOM_MOUNT_OVERLAY)
278 continue;
279
280 if (m->work_dir)
281 continue;
282
283 if (m->read_only)
284 continue;
285
14bcf25c 286 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
287 if (r < 0)
288 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
289 }
290
291 return 0;
292}
293
ec16945e
LP
/* Replaces *b with a cleaned-up, newly allocated version of 'path'.
 *
 * The path is canonicalized with canonicalize_file_name(). If that fails with ENOENT, the path
 * is made absolute relative to the current working directory instead. Any other failure is
 * returned to the caller. The previous value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM if the absolute path cannot be allocated, or the negated errno
 * of the failed canonicalization. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* The path does not exist (yet), so settle for making it absolute */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
314
efdb0237
LP
315static int detect_unified_cgroup_hierarchy(void) {
316 const char *e;
317 int r;
318
319 /* Allow the user to control whether the unified hierarchy is used */
320 e = getenv("UNIFIED_CGROUP_HIERARCHY");
321 if (e) {
322 r = parse_boolean(e);
323 if (r < 0)
324 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
325
326 arg_unified_cgroup_hierarchy = r;
327 return 0;
328 }
329
330 /* Otherwise inherit the default from the host system */
331 r = cg_unified();
332 if (r < 0)
333 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
334
335 arg_unified_cgroup_hierarchy = r;
336 return 0;
337}
338
f757855e
LP
339int expose_port_parse(ExposePort **l, const char *s) {
340
341 const char *split, *e;
342 uint16_t container_port, host_port;
343 int protocol;
344 ExposePort *p;
345 int r;
346
347 if ((e = startswith(s, "tcp:")))
348 protocol = IPPROTO_TCP;
349 else if ((e = startswith(s, "udp:")))
350 protocol = IPPROTO_UDP;
351 else {
352 e = s;
353 protocol = IPPROTO_TCP;
354 }
355
356 split = strchr(e, ':');
357 if (split) {
358 char v[split - e + 1];
359
360 memcpy(v, e, split - e);
361 v[split - e] = 0;
362
363 r = safe_atou16(v, &host_port);
364 if (r < 0 || host_port <= 0)
365 return -EINVAL;
366
367 r = safe_atou16(split + 1, &container_port);
368 } else {
369 r = safe_atou16(e, &container_port);
370 host_port = container_port;
371 }
372
373 if (r < 0 || container_port <= 0)
374 return -EINVAL;
375
376 LIST_FOREACH(ports, p, arg_expose_ports)
377 if (p->protocol == protocol && p->host_port == host_port)
378 return -EEXIST;
379
380 p = new(ExposePort, 1);
381 if (!p)
382 return -ENOMEM;
383
384 p->protocol = protocol;
385 p->host_port = host_port;
386 p->container_port = container_port;
387
388 LIST_PREPEND(ports, *l, p);
389
390 return 0;
391}
392
88213476
LP
393static int parse_argv(int argc, char *argv[]) {
394
a41fe3a2 395 enum {
acbeb427
ZJS
396 ARG_VERSION = 0x100,
397 ARG_PRIVATE_NETWORK,
bc2f673e 398 ARG_UUID,
5076f0cc 399 ARG_READ_ONLY,
57fb9fb5 400 ARG_CAPABILITY,
420c7379 401 ARG_DROP_CAPABILITY,
17fe0523
LP
402 ARG_LINK_JOURNAL,
403 ARG_BIND,
f4889f65 404 ARG_BIND_RO,
06c17c39 405 ARG_TMPFS,
5a8af538
LP
406 ARG_OVERLAY,
407 ARG_OVERLAY_RO,
f4889f65 408 ARG_SETENV,
eb91eb18 409 ARG_SHARE_SYSTEM,
89f7c846 410 ARG_REGISTER,
aa28aefe 411 ARG_KEEP_UNIT,
69c79d3c 412 ARG_NETWORK_INTERFACE,
c74e630d 413 ARG_NETWORK_MACVLAN,
4bbfe7ad 414 ARG_NETWORK_IPVLAN,
ab046dde 415 ARG_NETWORK_BRIDGE,
6afc95b7 416 ARG_PERSONALITY,
4d9f07b4 417 ARG_VOLATILE,
ec16945e 418 ARG_TEMPLATE,
f36933fe 419 ARG_PROPERTY,
6dac160c 420 ARG_PRIVATE_USERS,
c6c8f6e2 421 ARG_KILL_SIGNAL,
f757855e 422 ARG_SETTINGS,
a41fe3a2
LP
423 };
424
88213476 425 static const struct option options[] = {
aa28aefe
LP
426 { "help", no_argument, NULL, 'h' },
427 { "version", no_argument, NULL, ARG_VERSION },
428 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
429 { "template", required_argument, NULL, ARG_TEMPLATE },
430 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
431 { "user", required_argument, NULL, 'u' },
432 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
433 { "boot", no_argument, NULL, 'b' },
434 { "uuid", required_argument, NULL, ARG_UUID },
435 { "read-only", no_argument, NULL, ARG_READ_ONLY },
436 { "capability", required_argument, NULL, ARG_CAPABILITY },
437 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
438 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
439 { "bind", required_argument, NULL, ARG_BIND },
440 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 441 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
442 { "overlay", required_argument, NULL, ARG_OVERLAY },
443 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
444 { "machine", required_argument, NULL, 'M' },
445 { "slice", required_argument, NULL, 'S' },
446 { "setenv", required_argument, NULL, ARG_SETENV },
447 { "selinux-context", required_argument, NULL, 'Z' },
448 { "selinux-apifs-context", required_argument, NULL, 'L' },
449 { "quiet", no_argument, NULL, 'q' },
450 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
451 { "register", required_argument, NULL, ARG_REGISTER },
452 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
453 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 454 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 455 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 456 { "network-veth", no_argument, NULL, 'n' },
ab046dde 457 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 458 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 459 { "image", required_argument, NULL, 'i' },
4d9f07b4 460 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 461 { "port", required_argument, NULL, 'p' },
f36933fe 462 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 463 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 464 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 465 { "settings", required_argument, NULL, ARG_SETTINGS },
eb9da376 466 {}
88213476
LP
467 };
468
9444b1f2 469 int c, r;
a42c8b54 470 uint64_t plus = 0, minus = 0;
f757855e 471 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
472
473 assert(argc >= 0);
474 assert(argv);
475
0dfaa006 476 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
477
478 switch (c) {
479
480 case 'h':
601185b4
ZJS
481 help();
482 return 0;
88213476 483
acbeb427
ZJS
484 case ARG_VERSION:
485 puts(PACKAGE_STRING);
486 puts(SYSTEMD_FEATURES);
487 return 0;
488
88213476 489 case 'D':
ec16945e
LP
490 r = set_sanitized_path(&arg_directory, optarg);
491 if (r < 0)
492 return log_error_errno(r, "Invalid root directory: %m");
493
494 break;
495
496 case ARG_TEMPLATE:
497 r = set_sanitized_path(&arg_template, optarg);
498 if (r < 0)
499 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
500
501 break;
502
1b9e5b12 503 case 'i':
ec16945e
LP
504 r = set_sanitized_path(&arg_image, optarg);
505 if (r < 0)
506 return log_error_errno(r, "Invalid image path: %m");
507
508 break;
509
510 case 'x':
511 arg_ephemeral = true;
1b9e5b12
LP
512 break;
513
687d0825 514 case 'u':
2fc09a9c
DM
515 r = free_and_strdup(&arg_user, optarg);
516 if (r < 0)
7027ff61 517 return log_oom();
687d0825 518
f757855e 519 arg_settings_mask |= SETTING_USER;
687d0825
MV
520 break;
521
ab046dde 522 case ARG_NETWORK_BRIDGE:
f757855e
LP
523 r = free_and_strdup(&arg_network_bridge, optarg);
524 if (r < 0)
525 return log_oom();
ab046dde
TG
526
527 /* fall through */
528
0dfaa006 529 case 'n':
69c79d3c
LP
530 arg_network_veth = true;
531 arg_private_network = true;
f757855e 532 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
533 break;
534
aa28aefe 535 case ARG_NETWORK_INTERFACE:
c74e630d
LP
536 if (strv_extend(&arg_network_interfaces, optarg) < 0)
537 return log_oom();
538
539 arg_private_network = true;
f757855e 540 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
541 break;
542
543 case ARG_NETWORK_MACVLAN:
544 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
545 return log_oom();
546
4bbfe7ad 547 arg_private_network = true;
f757855e 548 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
549 break;
550
551 case ARG_NETWORK_IPVLAN:
552 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
553 return log_oom();
554
aa28aefe
LP
555 /* fall through */
556
ff01d048
LP
557 case ARG_PRIVATE_NETWORK:
558 arg_private_network = true;
f757855e 559 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
560 break;
561
0f0dbc46
LP
562 case 'b':
563 arg_boot = true;
f757855e 564 arg_settings_mask |= SETTING_BOOT;
0f0dbc46
LP
565 break;
566
144f0fc0 567 case ARG_UUID:
9444b1f2
LP
568 r = sd_id128_from_string(optarg, &arg_uuid);
569 if (r < 0) {
aa96c6cb 570 log_error("Invalid UUID: %s", optarg);
9444b1f2 571 return r;
aa96c6cb 572 }
f757855e
LP
573
574 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 575 break;
aa96c6cb 576
9444b1f2 577 case 'S':
c74e630d 578 arg_slice = optarg;
144f0fc0
LP
579 break;
580
7027ff61 581 case 'M':
c1521918 582 if (isempty(optarg))
97b11eed 583 arg_machine = mfree(arg_machine);
c1521918 584 else {
0c3c4284 585 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
586 log_error("Invalid machine name: %s", optarg);
587 return -EINVAL;
588 }
7027ff61 589
0c3c4284
LP
590 r = free_and_strdup(&arg_machine, optarg);
591 if (r < 0)
eb91eb18
LP
592 return log_oom();
593
594 break;
595 }
7027ff61 596
82adf6af
LP
597 case 'Z':
598 arg_selinux_context = optarg;
a8828ed9
DW
599 break;
600
82adf6af
LP
601 case 'L':
602 arg_selinux_apifs_context = optarg;
a8828ed9
DW
603 break;
604
bc2f673e
LP
605 case ARG_READ_ONLY:
606 arg_read_only = true;
f757855e 607 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
608 break;
609
420c7379
LP
610 case ARG_CAPABILITY:
611 case ARG_DROP_CAPABILITY: {
a2a5291b 612 const char *state, *word;
5076f0cc
LP
613 size_t length;
614
615 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 616 _cleanup_free_ char *t;
5076f0cc
LP
617
618 t = strndup(word, length);
0d0f0c50
SL
619 if (!t)
620 return log_oom();
5076f0cc 621
39ed67d1
LP
622 if (streq(t, "all")) {
623 if (c == ARG_CAPABILITY)
a42c8b54 624 plus = (uint64_t) -1;
39ed67d1 625 else
a42c8b54 626 minus = (uint64_t) -1;
39ed67d1 627 } else {
2822da4f
LP
628 int cap;
629
630 cap = capability_from_name(t);
631 if (cap < 0) {
39ed67d1
LP
632 log_error("Failed to parse capability %s.", t);
633 return -EINVAL;
634 }
635
636 if (c == ARG_CAPABILITY)
a42c8b54 637 plus |= 1ULL << (uint64_t) cap;
39ed67d1 638 else
a42c8b54 639 minus |= 1ULL << (uint64_t) cap;
5076f0cc 640 }
5076f0cc
LP
641 }
642
f757855e 643 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
644 break;
645 }
646
57fb9fb5
LP
647 case 'j':
648 arg_link_journal = LINK_GUEST;
574edc90 649 arg_link_journal_try = true;
57fb9fb5
LP
650 break;
651
652 case ARG_LINK_JOURNAL:
53e438e3 653 if (streq(optarg, "auto")) {
57fb9fb5 654 arg_link_journal = LINK_AUTO;
53e438e3
LP
655 arg_link_journal_try = false;
656 } else if (streq(optarg, "no")) {
57fb9fb5 657 arg_link_journal = LINK_NO;
53e438e3
LP
658 arg_link_journal_try = false;
659 } else if (streq(optarg, "guest")) {
57fb9fb5 660 arg_link_journal = LINK_GUEST;
53e438e3
LP
661 arg_link_journal_try = false;
662 } else if (streq(optarg, "host")) {
57fb9fb5 663 arg_link_journal = LINK_HOST;
53e438e3
LP
664 arg_link_journal_try = false;
665 } else if (streq(optarg, "try-guest")) {
574edc90
MP
666 arg_link_journal = LINK_GUEST;
667 arg_link_journal_try = true;
668 } else if (streq(optarg, "try-host")) {
669 arg_link_journal = LINK_HOST;
670 arg_link_journal_try = true;
671 } else {
57fb9fb5
LP
672 log_error("Failed to parse link journal mode %s", optarg);
673 return -EINVAL;
674 }
675
676 break;
677
17fe0523 678 case ARG_BIND:
f757855e
LP
679 case ARG_BIND_RO:
680 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
681 if (r < 0)
682 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 683
f757855e 684 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 685 break;
06c17c39 686
f757855e
LP
687 case ARG_TMPFS:
688 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
689 if (r < 0)
690 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 691
f757855e 692 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 693 break;
5a8af538
LP
694
695 case ARG_OVERLAY:
696 case ARG_OVERLAY_RO: {
697 _cleanup_free_ char *upper = NULL, *destination = NULL;
698 _cleanup_strv_free_ char **lower = NULL;
699 CustomMount *m;
700 unsigned n = 0;
701 char **i;
702
62f9f39a
RM
703 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
704 if (r == -ENOMEM)
06c17c39 705 return log_oom();
62f9f39a
RM
706 else if (r < 0) {
707 log_error("Invalid overlay specification: %s", optarg);
708 return r;
709 }
06c17c39 710
5a8af538
LP
711 STRV_FOREACH(i, lower) {
712 if (!path_is_absolute(*i)) {
713 log_error("Overlay path %s is not absolute.", *i);
714 return -EINVAL;
715 }
716
717 n++;
718 }
719
720 if (n < 2) {
721 log_error("--overlay= needs at least two colon-separated directories specified.");
722 return -EINVAL;
723 }
724
725 if (n == 2) {
726 /* If two parameters are specified,
727 * the first one is the lower, the
728 * second one the upper directory. And
af86c440
ZJS
729 * we'll also define the destination
730 * mount point the same as the upper. */
5a8af538
LP
731 upper = lower[1];
732 lower[1] = NULL;
733
734 destination = strdup(upper);
735 if (!destination)
736 return log_oom();
737
738 } else {
739 upper = lower[n - 2];
740 destination = lower[n - 1];
741 lower[n - 2] = NULL;
742 }
743
f757855e 744 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
745 if (!m)
746 return log_oom();
747
748 m->destination = destination;
749 m->source = upper;
750 m->lower = lower;
751 m->read_only = c == ARG_OVERLAY_RO;
752
753 upper = destination = NULL;
754 lower = NULL;
06c17c39 755
f757855e 756 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
757 break;
758 }
759
f4889f65
LP
760 case ARG_SETENV: {
761 char **n;
762
763 if (!env_assignment_is_valid(optarg)) {
764 log_error("Environment variable assignment '%s' is not valid.", optarg);
765 return -EINVAL;
766 }
767
768 n = strv_env_set(arg_setenv, optarg);
769 if (!n)
770 return log_oom();
771
772 strv_free(arg_setenv);
773 arg_setenv = n;
f757855e
LP
774
775 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
776 break;
777 }
778
284c0b91
LP
779 case 'q':
780 arg_quiet = true;
781 break;
782
8a96d94e
LP
783 case ARG_SHARE_SYSTEM:
784 arg_share_system = true;
785 break;
786
eb91eb18
LP
787 case ARG_REGISTER:
788 r = parse_boolean(optarg);
789 if (r < 0) {
790 log_error("Failed to parse --register= argument: %s", optarg);
791 return r;
792 }
793
794 arg_register = r;
795 break;
796
89f7c846
LP
797 case ARG_KEEP_UNIT:
798 arg_keep_unit = true;
799 break;
800
6afc95b7
LP
801 case ARG_PERSONALITY:
802
ac45f971 803 arg_personality = personality_from_string(optarg);
050f7277 804 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
805 log_error("Unknown or unsupported personality '%s'.", optarg);
806 return -EINVAL;
807 }
808
f757855e 809 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
810 break;
811
4d9f07b4
LP
812 case ARG_VOLATILE:
813
814 if (!optarg)
f757855e 815 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 816 else {
f757855e 817 VolatileMode m;
4d9f07b4 818
f757855e
LP
819 m = volatile_mode_from_string(optarg);
820 if (m < 0) {
821 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 822 return -EINVAL;
f757855e
LP
823 } else
824 arg_volatile_mode = m;
6d0b55c2
LP
825 }
826
f757855e
LP
827 arg_settings_mask |= SETTING_VOLATILE_MODE;
828 break;
6d0b55c2 829
f757855e
LP
830 case 'p':
831 r = expose_port_parse(&arg_expose_ports, optarg);
832 if (r == -EEXIST)
833 return log_error_errno(r, "Duplicate port specification: %s", optarg);
834 if (r < 0)
835 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 836
f757855e 837 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 838 break;
6d0b55c2 839
f36933fe
LP
840 case ARG_PROPERTY:
841 if (strv_extend(&arg_property, optarg) < 0)
842 return log_oom();
843
844 break;
845
6dac160c
LP
846 case ARG_PRIVATE_USERS:
847 if (optarg) {
848 _cleanup_free_ char *buffer = NULL;
849 const char *range, *shift;
850
851 range = strchr(optarg, ':');
852 if (range) {
853 buffer = strndup(optarg, range - optarg);
854 if (!buffer)
855 return log_oom();
856 shift = buffer;
857
858 range++;
859 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
860 log_error("Failed to parse UID range: %s", range);
861 return -EINVAL;
862 }
863 } else
864 shift = optarg;
865
866 if (parse_uid(shift, &arg_uid_shift) < 0) {
867 log_error("Failed to parse UID: %s", optarg);
868 return -EINVAL;
869 }
870 }
871
872 arg_userns = true;
873 break;
874
c6c8f6e2
LP
875 case ARG_KILL_SIGNAL:
876 arg_kill_signal = signal_from_string_try_harder(optarg);
877 if (arg_kill_signal < 0) {
878 log_error("Cannot parse signal: %s", optarg);
879 return -EINVAL;
880 }
881
f757855e
LP
882 arg_settings_mask |= SETTING_KILL_SIGNAL;
883 break;
884
885 case ARG_SETTINGS:
886
887 /* no → do not read files
888 * yes → read files, do not override cmdline, trust only subset
889 * override → read files, override cmdline, trust only subset
890 * trusted → read files, do not override cmdline, trust all
891 */
892
893 r = parse_boolean(optarg);
894 if (r < 0) {
895 if (streq(optarg, "trusted")) {
896 mask_all_settings = false;
897 mask_no_settings = false;
898 arg_settings_trusted = true;
899
900 } else if (streq(optarg, "override")) {
901 mask_all_settings = false;
902 mask_no_settings = true;
903 arg_settings_trusted = -1;
904 } else
905 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
906 } else if (r > 0) {
907 /* yes */
908 mask_all_settings = false;
909 mask_no_settings = false;
910 arg_settings_trusted = -1;
911 } else {
912 /* no */
913 mask_all_settings = true;
914 mask_no_settings = false;
915 arg_settings_trusted = false;
916 }
917
c6c8f6e2
LP
918 break;
919
88213476
LP
920 case '?':
921 return -EINVAL;
922
923 default:
eb9da376 924 assert_not_reached("Unhandled option");
88213476 925 }
88213476 926
eb91eb18
LP
927 if (arg_share_system)
928 arg_register = false;
929
930 if (arg_boot && arg_share_system) {
931 log_error("--boot and --share-system may not be combined.");
932 return -EINVAL;
933 }
934
89f7c846
LP
935 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
936 log_error("--keep-unit may not be used when invoked from a user session.");
937 return -EINVAL;
938 }
939
1b9e5b12
LP
940 if (arg_directory && arg_image) {
941 log_error("--directory= and --image= may not be combined.");
942 return -EINVAL;
943 }
944
ec16945e
LP
945 if (arg_template && arg_image) {
946 log_error("--template= and --image= may not be combined.");
947 return -EINVAL;
948 }
949
950 if (arg_template && !(arg_directory || arg_machine)) {
951 log_error("--template= needs --directory= or --machine=.");
952 return -EINVAL;
953 }
954
955 if (arg_ephemeral && arg_template) {
956 log_error("--ephemeral and --template= may not be combined.");
957 return -EINVAL;
958 }
959
960 if (arg_ephemeral && arg_image) {
961 log_error("--ephemeral and --image= may not be combined.");
962 return -EINVAL;
963 }
964
df9a75e4
LP
965 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
966 log_error("--ephemeral and --link-journal= may not be combined.");
967 return -EINVAL;
968 }
969
f757855e
LP
970 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
971 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
972
973 if (argc > optind) {
974 arg_parameters = strv_copy(argv + optind);
975 if (!arg_parameters)
976 return log_oom();
977
978 arg_settings_mask |= SETTING_BOOT;
979 }
980
981 /* Load all settings from .nspawn files */
982 if (mask_no_settings)
983 arg_settings_mask = 0;
984
985 /* Don't load any settings from .nspawn files */
986 if (mask_all_settings)
987 arg_settings_mask = _SETTINGS_MASK_ALL;
988
989 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
990
991 r = detect_unified_cgroup_hierarchy();
992 if (r < 0)
993 return r;
994
995 return 1;
996}
997
998static int verify_arguments(void) {
999
1000 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1001 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1002 return -EINVAL;
1003 }
1004
6d0b55c2
LP
1005 if (arg_expose_ports && !arg_private_network) {
1006 log_error("Cannot use --port= without private networking.");
1007 return -EINVAL;
1008 }
1009
c6c8f6e2
LP
1010 if (arg_boot && arg_kill_signal <= 0)
1011 arg_kill_signal = SIGRTMIN+3;
1012
f757855e 1013 return 0;
88213476
LP
1014}
1015
03cfe0d5
LP
1016static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1017 assert(p);
1018
1019 if (!arg_userns)
1020 return 0;
1021
1022 if (uid == UID_INVALID && gid == GID_INVALID)
1023 return 0;
1024
1025 if (uid != UID_INVALID) {
1026 uid += arg_uid_shift;
1027
1028 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1029 return -EOVERFLOW;
1030 }
1031
1032 if (gid != GID_INVALID) {
1033 gid += (gid_t) arg_uid_shift;
1034
1035 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1036 return -EOVERFLOW;
1037 }
1038
1039 if (lchown(p, uid, gid) < 0)
1040 return -errno;
b12afc8c
LP
1041
1042 return 0;
1043}
1044
03cfe0d5
LP
/* Creates directory "path" below "root" and hands it to the (shifted)
 * uid/gid. An already existing directory is left untouched and counts as
 * success. Returns 0 on success, negative errno-style code otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        /* Somebody else created it already: do not touch ownership */
        return errno == EEXIST ? 0 : -errno;
}
1057
e58a1277 1058static int setup_timezone(const char *dest) {
03cfe0d5
LP
1059 _cleanup_free_ char *p = NULL, *q = NULL;
1060 const char *where, *check, *what;
d4036145
LP
1061 char *z, *y;
1062 int r;
f8440af5 1063
e58a1277
LP
1064 assert(dest);
1065
1066 /* Fix the timezone, if possible */
d4036145
LP
1067 r = readlink_malloc("/etc/localtime", &p);
1068 if (r < 0) {
1069 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1070 return 0;
1071 }
1072
1073 z = path_startswith(p, "../usr/share/zoneinfo/");
1074 if (!z)
1075 z = path_startswith(p, "/usr/share/zoneinfo/");
1076 if (!z) {
1077 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1078 return 0;
1079 }
1080
03cfe0d5 1081 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1082 r = readlink_malloc(where, &q);
1083 if (r >= 0) {
1084 y = path_startswith(q, "../usr/share/zoneinfo/");
1085 if (!y)
1086 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1087
d4036145
LP
1088 /* Already pointing to the right place? Then do nothing .. */
1089 if (y && streq(y, z))
1090 return 0;
1091 }
1092
03cfe0d5
LP
1093 check = strjoina("/usr/share/zoneinfo/", z);
1094 check = prefix_root(dest, check);
1095 if (laccess(check, F_OK) < 0) {
d4036145
LP
1096 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1097 return 0;
1098 }
68fb0892 1099
79d80fc1
TG
1100 r = unlink(where);
1101 if (r < 0 && errno != ENOENT) {
56f64d95 1102 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1103 return 0;
1104 }
4d9f07b4 1105
03cfe0d5 1106 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1107 if (symlink(what, where) < 0) {
56f64d95 1108 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1109 return 0;
1110 }
e58a1277 1111
03cfe0d5
LP
1112 r = userns_lchown(where, 0, 0);
1113 if (r < 0)
1114 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1115
e58a1277 1116 return 0;
88213476
LP
1117}
1118
2547bb41 1119static int setup_resolv_conf(const char *dest) {
03cfe0d5 1120 const char *where = NULL;
79d80fc1 1121 int r;
2547bb41
LP
1122
1123 assert(dest);
1124
1125 if (arg_private_network)
1126 return 0;
1127
1128 /* Fix resolv.conf, if possible */
03cfe0d5 1129 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1130
f2068bcc 1131 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1132 if (r < 0) {
68a313c5
LP
1133 /* If the file already exists as symlink, let's
1134 * suppress the warning, under the assumption that
1135 * resolved or something similar runs inside and the
1136 * symlink points there.
1137 *
1138 * If the disk image is read-only, there's also no
1139 * point in complaining.
1140 */
1141 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1142 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1143 return 0;
1144 }
2547bb41 1145
03cfe0d5
LP
1146 r = userns_lchown(where, 0, 0);
1147 if (r < 0)
1148 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1149
2547bb41
LP
1150 return 0;
1151}
1152
9f24adc2 1153static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1154 assert(s);
9f24adc2
LP
1155
1156 snprintf(s, 37,
1157 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1158 SD_ID128_FORMAT_VAL(id));
1159
1160 return s;
1161}
1162
04bc4a3f 1163static int setup_boot_id(const char *dest) {
03cfe0d5 1164 const char *from, *to;
39883f62 1165 sd_id128_t rnd = {};
04bc4a3f
LP
1166 char as_uuid[37];
1167 int r;
1168
eb91eb18
LP
1169 if (arg_share_system)
1170 return 0;
1171
04bc4a3f
LP
1172 /* Generate a new randomized boot ID, so that each boot-up of
1173 * the container gets a new one */
1174
03cfe0d5
LP
1175 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1176 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1177
1178 r = sd_id128_randomize(&rnd);
f647962d
MS
1179 if (r < 0)
1180 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1181
9f24adc2 1182 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1183
4c1fc3e4 1184 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1185 if (r < 0)
1186 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1187
03cfe0d5
LP
1188 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1189 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1190 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1191 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1192
1193 unlink(from);
04bc4a3f
LP
1194 return r;
1195}
1196
e58a1277 1197static int copy_devnodes(const char *dest) {
88213476
LP
1198
1199 static const char devnodes[] =
1200 "null\0"
1201 "zero\0"
1202 "full\0"
1203 "random\0"
1204 "urandom\0"
85614d66
TG
1205 "tty\0"
1206 "net/tun\0";
88213476
LP
1207
1208 const char *d;
e58a1277 1209 int r = 0;
7fd1b19b 1210 _cleanup_umask_ mode_t u;
a258bf26
LP
1211
1212 assert(dest);
124640f1
LP
1213
1214 u = umask(0000);
88213476 1215
03cfe0d5
LP
1216 /* Create /dev/net, so that we can create /dev/net/tun in it */
1217 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1218 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1219
88213476 1220 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 1221 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 1222 struct stat st;
88213476 1223
7f112f50 1224 from = strappend("/dev/", d);
03cfe0d5 1225 to = prefix_root(dest, from);
88213476
LP
1226
1227 if (stat(from, &st) < 0) {
1228
4a62c710
MS
1229 if (errno != ENOENT)
1230 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 1231
a258bf26 1232 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 1233
03cfe0d5 1234 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 1235 return -EIO;
a258bf26 1236
85614d66 1237 } else {
81f5049b
AC
1238 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1239 if (errno != EPERM)
1240 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1241
1242 /* Some systems abusively restrict mknod but
1243 * allow bind mounts. */
1244 r = touch(to);
1245 if (r < 0)
1246 return log_error_errno(r, "touch (%s) failed: %m", to);
1247 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1248 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1249 }
6278cf60 1250
03cfe0d5
LP
1251 r = userns_lchown(to, 0, 0);
1252 if (r < 0)
1253 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 1254 }
88213476
LP
1255 }
1256
e58a1277
LP
1257 return r;
1258}
88213476 1259
03cfe0d5
LP
1260static int setup_pts(const char *dest) {
1261 _cleanup_free_ char *options = NULL;
1262 const char *p;
1263
1264#ifdef HAVE_SELINUX
1265 if (arg_selinux_apifs_context)
1266 (void) asprintf(&options,
3dce8915 1267 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
1268 arg_uid_shift + TTY_GID,
1269 arg_selinux_apifs_context);
1270 else
1271#endif
1272 (void) asprintf(&options,
3dce8915 1273 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 1274 arg_uid_shift + TTY_GID);
f2d88580 1275
03cfe0d5 1276 if (!options)
f2d88580
LP
1277 return log_oom();
1278
03cfe0d5 1279 /* Mount /dev/pts itself */
cc9fce65 1280 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
1281 if (mkdir(p, 0755) < 0)
1282 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1283 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1284 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1285 if (userns_lchown(p, 0, 0) < 0)
1286 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1287
1288 /* Create /dev/ptmx symlink */
1289 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
1290 if (symlink("pts/ptmx", p) < 0)
1291 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
03cfe0d5
LP
1292 if (userns_lchown(p, 0, 0) < 0)
1293 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
f2d88580 1294
03cfe0d5
LP
1295 /* And fix /dev/pts/ptmx ownership */
1296 p = prefix_roota(dest, "/dev/pts/ptmx");
1297 if (userns_lchown(p, 0, 0) < 0)
1298 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 1299
f2d88580
LP
1300 return 0;
1301}
1302
e58a1277 1303static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
1304 _cleanup_umask_ mode_t u;
1305 const char *to;
e58a1277 1306 int r;
e58a1277
LP
1307
1308 assert(dest);
1309 assert(console);
1310
1311 u = umask(0000);
1312
03cfe0d5 1313 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
1314 if (r < 0)
1315 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 1316
a258bf26
LP
1317 /* We need to bind mount the right tty to /dev/console since
1318 * ptys can only exist on pts file systems. To have something
81f5049b 1319 * to bind mount things on we create a empty regular file. */
a258bf26 1320
03cfe0d5 1321 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
1322 r = touch(to);
1323 if (r < 0)
1324 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 1325
4543768d 1326 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1327 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 1328
25ea79fe 1329 return 0;
e58a1277
LP
1330}
1331
1332static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 1333 const char *from, *to;
7fd1b19b 1334 _cleanup_umask_ mode_t u;
03cfe0d5 1335 int fd, k;
e58a1277
LP
1336 union {
1337 struct cmsghdr cmsghdr;
1338 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
1339 } control = {};
1340 struct msghdr mh = {
1341 .msg_control = &control,
1342 .msg_controllen = sizeof(control),
1343 };
e58a1277
LP
1344 struct cmsghdr *cmsg;
1345
e58a1277 1346 assert(kmsg_socket >= 0);
a258bf26 1347
e58a1277 1348 u = umask(0000);
a258bf26 1349
03cfe0d5 1350 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
1351 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1352 * on the reading side behave very similar to /proc/kmsg,
1353 * their writing side behaves differently from /dev/kmsg in
1354 * that writing blocks when nothing is reading. In order to
1355 * avoid any problems with containers deadlocking due to this
1356 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
1357 from = prefix_roota(dest, "/run/kmsg");
1358 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 1359
4a62c710 1360 if (mkfifo(from, 0600) < 0)
03cfe0d5 1361 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 1362 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 1363 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
1364
1365 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
1366 if (fd < 0)
1367 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 1368
e58a1277
LP
1369 cmsg = CMSG_FIRSTHDR(&mh);
1370 cmsg->cmsg_level = SOL_SOCKET;
1371 cmsg->cmsg_type = SCM_RIGHTS;
1372 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1373 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1374
1375 mh.msg_controllen = cmsg->cmsg_len;
1376
1377 /* Store away the fd in the socket, so that it stays open as
1378 * long as we run the child */
6d0b55c2 1379 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 1380 safe_close(fd);
e58a1277 1381
4a62c710
MS
1382 if (k < 0)
1383 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 1384
03cfe0d5
LP
1385 /* And now make the FIFO unavailable as /run/kmsg... */
1386 (void) unlink(from);
1387
25ea79fe 1388 return 0;
88213476
LP
1389}
1390
6d0b55c2
LP
1391static int send_rtnl(int send_fd) {
1392 union {
1393 struct cmsghdr cmsghdr;
1394 uint8_t buf[CMSG_SPACE(sizeof(int))];
1395 } control = {};
1396 struct msghdr mh = {
1397 .msg_control = &control,
1398 .msg_controllen = sizeof(control),
1399 };
1400 struct cmsghdr *cmsg;
1401 _cleanup_close_ int fd = -1;
1402 ssize_t k;
1403
1404 assert(send_fd >= 0);
1405
1406 if (!arg_expose_ports)
1407 return 0;
1408
1409 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1410 if (fd < 0)
03cfe0d5 1411 return log_error_errno(errno, "Failed to allocate container netlink: %m");
6d0b55c2
LP
1412
1413 cmsg = CMSG_FIRSTHDR(&mh);
1414 cmsg->cmsg_level = SOL_SOCKET;
1415 cmsg->cmsg_type = SCM_RIGHTS;
1416 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1417 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1418
1419 mh.msg_controllen = cmsg->cmsg_len;
1420
1421 /* Store away the fd in the socket, so that it stays open as
1422 * long as we run the child */
1423 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1424 if (k < 0)
1425 return log_error_errno(errno, "Failed to send netlink fd: %m");
1426
1427 return 0;
1428}
1429
1430static int flush_ports(union in_addr_union *exposed) {
1431 ExposePort *p;
1432 int r, af = AF_INET;
1433
1434 assert(exposed);
1435
1436 if (!arg_expose_ports)
1437 return 0;
1438
1439 if (in_addr_is_null(af, exposed))
1440 return 0;
1441
1442 log_debug("Lost IP address.");
1443
1444 LIST_FOREACH(ports, p, arg_expose_ports) {
1445 r = fw_add_local_dnat(false,
1446 af,
1447 p->protocol,
1448 NULL,
1449 NULL, 0,
1450 NULL, 0,
1451 p->host_port,
1452 exposed,
1453 p->container_port,
1454 NULL);
1455 if (r < 0)
1456 log_warning_errno(r, "Failed to modify firewall: %m");
1457 }
1458
1459 *exposed = IN_ADDR_NULL;
1460 return 0;
1461}
1462
1c4baffc 1463static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
6d0b55c2
LP
1464 _cleanup_free_ struct local_address *addresses = NULL;
1465 _cleanup_free_ char *pretty = NULL;
1466 union in_addr_union new_exposed;
1467 ExposePort *p;
1468 bool add;
1469 int af = AF_INET, r;
1470
1471 assert(exposed);
1472
1473 /* Invoked each time an address is added or removed inside the
1474 * container */
1475
1476 if (!arg_expose_ports)
1477 return 0;
1478
1479 r = local_addresses(rtnl, 0, af, &addresses);
1480 if (r < 0)
1481 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1482
1483 add = r > 0 &&
1484 addresses[0].family == af &&
1485 addresses[0].scope < RT_SCOPE_LINK;
1486
1487 if (!add)
1488 return flush_ports(exposed);
1489
1490 new_exposed = addresses[0].address;
1491 if (in_addr_equal(af, exposed, &new_exposed))
1492 return 0;
1493
1494 in_addr_to_string(af, &new_exposed, &pretty);
1495 log_debug("New container IP is %s.", strna(pretty));
1496
1497 LIST_FOREACH(ports, p, arg_expose_ports) {
1498
1499 r = fw_add_local_dnat(true,
1500 af,
1501 p->protocol,
1502 NULL,
1503 NULL, 0,
1504 NULL, 0,
1505 p->host_port,
1506 &new_exposed,
1507 p->container_port,
1508 in_addr_is_null(af, exposed) ? NULL : exposed);
1509 if (r < 0)
1510 log_warning_errno(r, "Failed to modify firewall: %m");
1511 }
1512
1513 *exposed = new_exposed;
1514 return 0;
1515}
1516
f757855e
LP
1517void expose_port_free_all(ExposePort *p) {
1518
1519 while (p) {
1520 ExposePort *q = p;
1521 LIST_REMOVE(ports, p, q);
1522 free(q);
1523 }
1524}
1525
1c4baffc 1526static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
1527 union in_addr_union *exposed = userdata;
1528
1529 assert(rtnl);
1530 assert(m);
1531 assert(exposed);
1532
1533 expose_ports(rtnl, exposed);
1534 return 0;
1535}
1536
1c4baffc 1537static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
6d0b55c2
LP
1538 union {
1539 struct cmsghdr cmsghdr;
1540 uint8_t buf[CMSG_SPACE(sizeof(int))];
1541 } control = {};
1542 struct msghdr mh = {
1543 .msg_control = &control,
1544 .msg_controllen = sizeof(control),
1545 };
1546 struct cmsghdr *cmsg;
1c4baffc 1547 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
6d0b55c2
LP
1548 int fd, r;
1549 ssize_t k;
1550
1551 assert(event);
1552 assert(recv_fd >= 0);
1553 assert(ret);
1554
1555 if (!arg_expose_ports)
1556 return 0;
1557
1558 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1559 if (k < 0)
1560 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1561
1562 cmsg = CMSG_FIRSTHDR(&mh);
1563 assert(cmsg->cmsg_level == SOL_SOCKET);
1564 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 1565 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
1566 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1567
1c4baffc 1568 r = sd_netlink_open_fd(&rtnl, fd);
6d0b55c2
LP
1569 if (r < 0) {
1570 safe_close(fd);
1571 return log_error_errno(r, "Failed to create rtnl object: %m");
1572 }
1573
1c4baffc 1574 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
6d0b55c2
LP
1575 if (r < 0)
1576 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1577
1c4baffc 1578 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
6d0b55c2
LP
1579 if (r < 0)
1580 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1581
1c4baffc 1582 r = sd_netlink_attach_event(rtnl, event, 0);
6d0b55c2
LP
1583 if (r < 0)
1584 return log_error_errno(r, "Failed to add to even loop: %m");
1585
1586 *ret = rtnl;
1587 rtnl = NULL;
1588
1589 return 0;
1590}
1591
3a74cea5 1592static int setup_hostname(void) {
3a74cea5 1593
eb91eb18
LP
1594 if (arg_share_system)
1595 return 0;
1596
605f81a8 1597 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 1598 return -errno;
3a74cea5 1599
7027ff61 1600 return 0;
3a74cea5
LP
1601}
1602
57fb9fb5 1603static int setup_journal(const char *directory) {
4d680aee 1604 sd_id128_t machine_id, this_id;
03cfe0d5
LP
1605 _cleanup_free_ char *b = NULL, *d = NULL;
1606 const char *etc_machine_id, *p, *q;
27407a01 1607 char *id;
57fb9fb5
LP
1608 int r;
1609
df9a75e4
LP
1610 /* Don't link journals in ephemeral mode */
1611 if (arg_ephemeral)
1612 return 0;
1613
03cfe0d5 1614 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 1615
03cfe0d5 1616 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
1617 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1618 return 0;
f647962d 1619 else if (r < 0)
03cfe0d5 1620 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 1621
27407a01
ZJS
1622 id = strstrip(b);
1623 if (isempty(id) && arg_link_journal == LINK_AUTO)
1624 return 0;
57fb9fb5 1625
27407a01
ZJS
1626 /* Verify validity */
1627 r = sd_id128_from_string(id, &machine_id);
f647962d 1628 if (r < 0)
03cfe0d5 1629 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 1630
4d680aee 1631 r = sd_id128_get_machine(&this_id);
f647962d
MS
1632 if (r < 0)
1633 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
1634
1635 if (sd_id128_equal(machine_id, this_id)) {
1636 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1637 "Host and machine ids are equal (%s): refusing to link journals", id);
1638 if (arg_link_journal == LINK_AUTO)
1639 return 0;
df9a75e4 1640 return -EEXIST;
4d680aee
ZJS
1641 }
1642
1643 if (arg_link_journal == LINK_NO)
1644 return 0;
1645
03cfe0d5
LP
1646 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1647 if (r < 0)
1648 return log_error_errno(r, "Failed to create /var: %m");
1649
1650 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1651 if (r < 0)
1652 return log_error_errno(r, "Failed to create /var/log: %m");
1653
1654 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1655 if (r < 0)
1656 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1657
1658 p = strjoina("/var/log/journal/", id);
1659 q = prefix_roota(directory, p);
27407a01 1660
e26d6ce5 1661 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
1662 if (arg_link_journal != LINK_AUTO) {
1663 log_error("%s: already a mount point, refusing to use for journal", p);
1664 return -EEXIST;
1665 }
1666
1667 return 0;
57fb9fb5
LP
1668 }
1669
e26d6ce5 1670 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 1671 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
1672 log_error("%s: already a mount point, refusing to use for journal", q);
1673 return -EEXIST;
57fb9fb5
LP
1674 }
1675
27407a01 1676 return 0;
57fb9fb5
LP
1677 }
1678
1679 r = readlink_and_make_absolute(p, &d);
1680 if (r >= 0) {
1681 if ((arg_link_journal == LINK_GUEST ||
1682 arg_link_journal == LINK_AUTO) &&
1683 path_equal(d, q)) {
1684
03cfe0d5 1685 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1686 if (r < 0)
56f64d95 1687 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1688 return 0;
57fb9fb5
LP
1689 }
1690
4a62c710
MS
1691 if (unlink(p) < 0)
1692 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
1693 } else if (r == -EINVAL) {
1694
1695 if (arg_link_journal == LINK_GUEST &&
1696 rmdir(p) < 0) {
1697
27407a01
ZJS
1698 if (errno == ENOTDIR) {
1699 log_error("%s already exists and is neither a symlink nor a directory", p);
1700 return r;
1701 } else {
56f64d95 1702 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 1703 return -errno;
57fb9fb5 1704 }
57fb9fb5
LP
1705 }
1706 } else if (r != -ENOENT) {
56f64d95 1707 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 1708 return r;
57fb9fb5
LP
1709 }
1710
1711 if (arg_link_journal == LINK_GUEST) {
1712
1713 if (symlink(q, p) < 0) {
574edc90 1714 if (arg_link_journal_try) {
56f64d95 1715 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
1716 return 0;
1717 } else {
56f64d95 1718 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
1719 return -errno;
1720 }
57fb9fb5
LP
1721 }
1722
03cfe0d5 1723 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 1724 if (r < 0)
56f64d95 1725 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 1726 return 0;
57fb9fb5
LP
1727 }
1728
1729 if (arg_link_journal == LINK_HOST) {
574edc90
MP
1730 /* don't create parents here -- if the host doesn't have
1731 * permanent journal set up, don't force it here */
1732 r = mkdir(p, 0755);
57fb9fb5 1733 if (r < 0) {
574edc90 1734 if (arg_link_journal_try) {
56f64d95 1735 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
1736 return 0;
1737 } else {
56f64d95 1738 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
1739 return r;
1740 }
57fb9fb5
LP
1741 }
1742
27407a01
ZJS
1743 } else if (access(p, F_OK) < 0)
1744 return 0;
57fb9fb5 1745
cdb2b9d0
LP
1746 if (dir_is_empty(q) == 0)
1747 log_warning("%s is not empty, proceeding anyway.", q);
1748
03cfe0d5 1749 r = userns_mkdir(directory, p, 0755, 0, 0);
57fb9fb5 1750 if (r < 0) {
56f64d95 1751 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 1752 return r;
57fb9fb5
LP
1753 }
1754
4543768d 1755 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 1756 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 1757
27407a01 1758 return 0;
57fb9fb5
LP
1759}
1760
88213476 1761static int drop_capabilities(void) {
5076f0cc 1762 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
1763}
1764
5aa4bb6b 1765static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 1766 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
03976f7b 1767 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
1768 int r;
1769
eb91eb18
LP
1770 if (!arg_register)
1771 return 0;
1772
1c03020c 1773 r = sd_bus_default_system(&bus);
f647962d
MS
1774 if (r < 0)
1775 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 1776
89f7c846
LP
1777 if (arg_keep_unit) {
1778 r = sd_bus_call_method(
1779 bus,
1780 "org.freedesktop.machine1",
1781 "/org/freedesktop/machine1",
1782 "org.freedesktop.machine1.Manager",
5aa4bb6b 1783 "RegisterMachineWithNetwork",
89f7c846
LP
1784 &error,
1785 NULL,
5aa4bb6b 1786 "sayssusai",
89f7c846
LP
1787 arg_machine,
1788 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1789 "nspawn",
1790 "container",
1791 (uint32_t) pid,
5aa4bb6b
LP
1792 strempty(arg_directory),
1793 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 1794 } else {
9457ac5b 1795 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 1796 char **i;
ce5b3ad4 1797 unsigned j;
9457ac5b
LP
1798
1799 r = sd_bus_message_new_method_call(
89f7c846 1800 bus,
9457ac5b 1801 &m,
89f7c846
LP
1802 "org.freedesktop.machine1",
1803 "/org/freedesktop/machine1",
1804 "org.freedesktop.machine1.Manager",
5aa4bb6b 1805 "CreateMachineWithNetwork");
f647962d 1806 if (r < 0)
f36933fe 1807 return bus_log_create_error(r);
9457ac5b
LP
1808
1809 r = sd_bus_message_append(
1810 m,
5aa4bb6b 1811 "sayssusai",
89f7c846
LP
1812 arg_machine,
1813 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1814 "nspawn",
1815 "container",
1816 (uint32_t) pid,
5aa4bb6b
LP
1817 strempty(arg_directory),
1818 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 1819 if (r < 0)
f36933fe 1820 return bus_log_create_error(r);
9457ac5b
LP
1821
1822 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 1823 if (r < 0)
f36933fe 1824 return bus_log_create_error(r);
9457ac5b
LP
1825
1826 if (!isempty(arg_slice)) {
1827 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 1828 if (r < 0)
f36933fe 1829 return bus_log_create_error(r);
9457ac5b
LP
1830 }
1831
1832 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 1833 if (r < 0)
f36933fe 1834 return bus_log_create_error(r);
9457ac5b 1835
773ce3d8
LP
1836 /* If you make changes here, also make sure to update
1837 * systemd-nspawn@.service, to keep the device
1838 * policies in sync regardless if we are run with or
1839 * without the --keep-unit switch. */
63cc4c31 1840 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
1841 /* Allow the container to
1842 * access and create the API
1843 * device nodes, so that
1844 * PrivateDevices= in the
1845 * container can work
1846 * fine */
1847 "/dev/null", "rwm",
1848 "/dev/zero", "rwm",
1849 "/dev/full", "rwm",
1850 "/dev/random", "rwm",
1851 "/dev/urandom", "rwm",
1852 "/dev/tty", "rwm",
864e1706 1853 "/dev/net/tun", "rwm",
9457ac5b
LP
1854 /* Allow the container
1855 * access to ptys. However,
1856 * do not permit the
1857 * container to ever create
1858 * these device nodes. */
1859 "/dev/pts/ptmx", "rw",
63cc4c31 1860 "char-pts", "rw");
f647962d 1861 if (r < 0)
27023c0e
LP
1862 return bus_log_create_error(r);
1863
ce5b3ad4
SJ
1864 for (j = 0; j < arg_n_custom_mounts; j++) {
1865 CustomMount *cm = &arg_custom_mounts[j];
1866
1867 if (cm->type != CUSTOM_MOUNT_BIND)
1868 continue;
1869
1870 r = is_device_node(cm->source);
1871 if (r < 0)
1872 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
1873
1874 if (r) {
1875 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
1876 cm->source, cm->read_only ? "r" : "rw");
1877 if (r < 0)
1878 return log_error_errno(r, "Failed to append message arguments: %m");
1879 }
1880 }
1881
27023c0e
LP
1882 if (arg_kill_signal != 0) {
1883 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
1884 if (r < 0)
1885 return bus_log_create_error(r);
1886
1887 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
1888 if (r < 0)
1889 return bus_log_create_error(r);
1890 }
9457ac5b 1891
f36933fe
LP
1892 STRV_FOREACH(i, arg_property) {
1893 r = sd_bus_message_open_container(m, 'r', "sv");
1894 if (r < 0)
1895 return bus_log_create_error(r);
1896
1897 r = bus_append_unit_property_assignment(m, *i);
1898 if (r < 0)
1899 return r;
1900
1901 r = sd_bus_message_close_container(m);
1902 if (r < 0)
1903 return bus_log_create_error(r);
1904 }
1905
9457ac5b 1906 r = sd_bus_message_close_container(m);
f647962d 1907 if (r < 0)
f36933fe 1908 return bus_log_create_error(r);
9457ac5b
LP
1909
1910 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
1911 }
1912
9444b1f2 1913 if (r < 0) {
1f0cd86b
LP
1914 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1915 return r;
1916 }
1917
1918 return 0;
1919}
1920
1921static int terminate_machine(pid_t pid) {
1922 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1923 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
03976f7b 1924 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
1925 const char *path;
1926 int r;
1927
eb91eb18
LP
1928 if (!arg_register)
1929 return 0;
1930
1a2399e5
LP
1931 /* If we are reusing the unit, then just exit, systemd will do
1932 * the right thing when we exit. */
1933 if (arg_keep_unit)
1934 return 0;
1935
76b54375 1936 r = sd_bus_default_system(&bus);
f647962d
MS
1937 if (r < 0)
1938 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
1939
1940 r = sd_bus_call_method(
1941 bus,
1942 "org.freedesktop.machine1",
1943 "/org/freedesktop/machine1",
1944 "org.freedesktop.machine1.Manager",
1945 "GetMachineByPID",
1946 &error,
1947 &reply,
1948 "u",
1949 (uint32_t) pid);
1950 if (r < 0) {
1951 /* Note that the machine might already have been
1952 * cleaned up automatically, hence don't consider it a
1953 * failure if we cannot get the machine object. */
1954 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1955 return 0;
1956 }
1957
1958 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
1959 if (r < 0)
1960 return bus_log_parse_error(r);
9444b1f2 1961
1f0cd86b
LP
1962 r = sd_bus_call_method(
1963 bus,
1964 "org.freedesktop.machine1",
1965 path,
1966 "org.freedesktop.machine1.Machine",
1967 "Terminate",
1968 &error,
1969 NULL,
1970 NULL);
1971 if (r < 0) {
1972 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1973 return 0;
1974 }
1975
9444b1f2
LP
1976 return 0;
1977}
1978
db999e0f
LP
1979static int reset_audit_loginuid(void) {
1980 _cleanup_free_ char *p = NULL;
1981 int r;
1982
1983 if (arg_share_system)
1984 return 0;
1985
1986 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 1987 if (r == -ENOENT)
db999e0f 1988 return 0;
f647962d
MS
1989 if (r < 0)
1990 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
1991
1992 /* Already reset? */
1993 if (streq(p, "4294967295"))
1994 return 0;
1995
ad118bda 1996 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 1997 if (r < 0) {
10a87006
LP
1998 log_error_errno(r,
1999 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2000 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2001 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2002 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2003 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2004
db999e0f 2005 sleep(5);
77b6e194 2006 }
db999e0f
LP
2007
2008 return 0;
77b6e194
LP
2009}
2010
4f758c23
LP
/* Fixed 128-bit keys handed to generate_mac() as its hash_key argument.
 * A distinct key is used for the host side of the veth pair, for the
 * container side of the veth pair, and for MACVLAN interfaces. */
2011#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2012#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 2013#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2014
a90e2305 2015static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2016 uint8_t result[8];
2017 size_t l, sz;
a90e2305
LP
2018 uint8_t *v, *i;
2019 int r;
01dde061
TG
2020
2021 l = strlen(arg_machine);
2022 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2023 if (idx > 0)
2024 sz += sizeof(idx);
a90e2305 2025
01dde061
TG
2026 v = alloca(sz);
2027
2028 /* fetch some persistent data unique to the host */
2029 r = sd_id128_get_machine((sd_id128_t*) v);
2030 if (r < 0)
2031 return r;
2032
2033 /* combine with some data unique (on this host) to this
2034 * container instance */
a90e2305
LP
2035 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2036 if (idx > 0) {
2037 idx = htole64(idx);
2038 memcpy(i, &idx, sizeof(idx));
2039 }
01dde061
TG
2040
2041 /* Let's hash the host machine ID plus the container name. We
2042 * use a fixed, but originally randomly created hash key here. */
4f758c23 2043 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2044
2045 assert_cc(ETH_ALEN <= sizeof(result));
2046 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2047
2048 /* see eth_random_addr in the kernel */
2049 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2050 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2051
2052 return 0;
2053}
2054
5aa4bb6b 2055static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1c4baffc
TG
2056 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2057 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4f758c23 2058 struct ether_addr mac_host, mac_container;
5aa4bb6b 2059 int r, i;
69c79d3c
LP
2060
2061 if (!arg_private_network)
2062 return 0;
2063
2064 if (!arg_network_veth)
2065 return 0;
2066
08af0da2
LP
2067 /* Use two different interface name prefixes depending whether
2068 * we are in bridge mode or not. */
c00524c9 2069 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2070 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2071
e867ceb6
LP
2072 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2073 if (r < 0)
2074 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2075
e867ceb6
LP
2076 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2077 if (r < 0)
2078 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2079
1c4baffc 2080 r = sd_netlink_open(&rtnl);
f647962d
MS
2081 if (r < 0)
2082 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2083
151b9b96 2084 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2085 if (r < 0)
2086 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2087
1c4baffc 2088 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2089 if (r < 0)
2090 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2091
1c4baffc 2092 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2093 if (r < 0)
2094 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2095
1c4baffc 2096 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2097 if (r < 0)
2098 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2099
1c4baffc 2100 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2101 if (r < 0)
2102 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2103
1c4baffc 2104 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2105 if (r < 0)
2106 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2107
1c4baffc 2108 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2109 if (r < 0)
2110 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2111
1c4baffc 2112 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2113 if (r < 0)
2114 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2115
1c4baffc 2116 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2117 if (r < 0)
2118 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c 2119
1c4baffc 2120 r = sd_netlink_message_close_container(m);
f647962d
MS
2121 if (r < 0)
2122 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2123
1c4baffc 2124 r = sd_netlink_message_close_container(m);
f647962d
MS
2125 if (r < 0)
2126 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2127
1c4baffc 2128 r = sd_netlink_message_close_container(m);
f647962d
MS
2129 if (r < 0)
2130 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2131
1c4baffc 2132 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d 2133 if (r < 0)
637aa8a3 2134 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
69c79d3c 2135
5aa4bb6b 2136 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2137 if (i <= 0)
2138 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2139
2140 *ifi = i;
2141
69c79d3c
LP
2142 return 0;
2143}
2144
5aa4bb6b 2145static int setup_bridge(const char veth_name[], int *ifi) {
1c4baffc
TG
2146 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2147 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
ab046dde
TG
2148 int r, bridge;
2149
2150 if (!arg_private_network)
2151 return 0;
2152
2153 if (!arg_network_veth)
2154 return 0;
2155
2156 if (!arg_network_bridge)
2157 return 0;
2158
2159 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2160 if (bridge <= 0)
2161 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2162
5aa4bb6b
LP
2163 *ifi = bridge;
2164
1c4baffc 2165 r = sd_netlink_open(&rtnl);
f647962d
MS
2166 if (r < 0)
2167 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2168
151b9b96 2169 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2170 if (r < 0)
2171 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2172
039dd4af 2173 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2174 if (r < 0)
2175 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2176
1c4baffc 2177 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2178 if (r < 0)
2179 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde 2180
1c4baffc 2181 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2182 if (r < 0)
2183 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde 2184
1c4baffc 2185 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2186 if (r < 0)
2187 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2188
2189 return 0;
2190}
2191
c74e630d
LP
2192static int parse_interface(struct udev *udev, const char *name) {
2193 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2194 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2195 int ifi;
2196
2197 ifi = (int) if_nametoindex(name);
4a62c710
MS
2198 if (ifi <= 0)
2199 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2200
2201 sprintf(ifi_str, "n%i", ifi);
2202 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2203 if (!d)
2204 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
2205
2206 if (udev_device_get_is_initialized(d) <= 0) {
2207 log_error("Network interface %s is not initialized yet.", name);
2208 return -EBUSY;
2209 }
2210
2211 return ifi;
2212}
2213
69c79d3c 2214static int move_network_interfaces(pid_t pid) {
7e227024 2215 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2216 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
aa28aefe
LP
2217 char **i;
2218 int r;
2219
2220 if (!arg_private_network)
2221 return 0;
2222
2223 if (strv_isempty(arg_network_interfaces))
2224 return 0;
2225
1c4baffc 2226 r = sd_netlink_open(&rtnl);
f647962d
MS
2227 if (r < 0)
2228 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 2229
7e227024
LP
2230 udev = udev_new();
2231 if (!udev) {
2232 log_error("Failed to connect to udev.");
2233 return -ENOMEM;
2234 }
2235
aa28aefe 2236 STRV_FOREACH(i, arg_network_interfaces) {
1c4baffc 2237 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
b88eb17a 2238 int ifi;
aa28aefe 2239
c74e630d
LP
2240 ifi = parse_interface(udev, *i);
2241 if (ifi < 0)
2242 return ifi;
2243
3125b3ef 2244 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
2245 if (r < 0)
2246 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2247
1c4baffc 2248 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2249 if (r < 0)
2250 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 2251
1c4baffc 2252 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2253 if (r < 0)
2254 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 2255 }
7e227024 2256
c74e630d
LP
2257 return 0;
2258}
2259
2260static int setup_macvlan(pid_t pid) {
2261 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2262 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
e867ceb6 2263 unsigned idx = 0;
c74e630d
LP
2264 char **i;
2265 int r;
2266
2267 if (!arg_private_network)
2268 return 0;
2269
2270 if (strv_isempty(arg_network_macvlan))
2271 return 0;
2272
1c4baffc 2273 r = sd_netlink_open(&rtnl);
f647962d
MS
2274 if (r < 0)
2275 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
2276
2277 udev = udev_new();
2278 if (!udev) {
2279 log_error("Failed to connect to udev.");
2280 return -ENOMEM;
2281 }
2282
2283 STRV_FOREACH(i, arg_network_macvlan) {
1c4baffc 2284 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
c74e630d 2285 _cleanup_free_ char *n = NULL;
e867ceb6 2286 struct ether_addr mac;
c74e630d
LP
2287 int ifi;
2288
2289 ifi = parse_interface(udev, *i);
2290 if (ifi < 0)
2291 return ifi;
2292
e867ceb6
LP
2293 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2294 if (r < 0)
2295 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2296
c74e630d 2297 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2298 if (r < 0)
2299 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 2300
1c4baffc 2301 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
2302 if (r < 0)
2303 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
2304
2305 n = strappend("mv-", *i);
2306 if (!n)
2307 return log_oom();
2308
2309 strshorten(n, IFNAMSIZ-1);
2310
1c4baffc 2311 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
2312 if (r < 0)
2313 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 2314
1c4baffc 2315 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
e867ceb6
LP
2316 if (r < 0)
2317 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2318
1c4baffc 2319 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2320 if (r < 0)
2321 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d 2322
1c4baffc 2323 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2324 if (r < 0)
2325 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2326
1c4baffc 2327 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
2328 if (r < 0)
2329 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 2330
1c4baffc 2331 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
2332 if (r < 0)
2333 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d 2334
1c4baffc 2335 r = sd_netlink_message_close_container(m);
f647962d
MS
2336 if (r < 0)
2337 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d 2338
1c4baffc 2339 r = sd_netlink_message_close_container(m);
f647962d
MS
2340 if (r < 0)
2341 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe 2342
1c4baffc 2343 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2344 if (r < 0)
2345 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
2346 }
2347
2348 return 0;
2349}
2350
4bbfe7ad
TG
2351static int setup_ipvlan(pid_t pid) {
2352 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 2353 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4bbfe7ad
TG
2354 char **i;
2355 int r;
2356
2357 if (!arg_private_network)
2358 return 0;
2359
2360 if (strv_isempty(arg_network_ipvlan))
2361 return 0;
2362
1c4baffc 2363 r = sd_netlink_open(&rtnl);
4bbfe7ad
TG
2364 if (r < 0)
2365 return log_error_errno(r, "Failed to connect to netlink: %m");
2366
2367 udev = udev_new();
2368 if (!udev) {
2369 log_error("Failed to connect to udev.");
2370 return -ENOMEM;
2371 }
2372
2373 STRV_FOREACH(i, arg_network_ipvlan) {
1c4baffc 2374 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
4bbfe7ad
TG
2375 _cleanup_free_ char *n = NULL;
2376 int ifi;
2377
2378 ifi = parse_interface(udev, *i);
2379 if (ifi < 0)
2380 return ifi;
2381
2382 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2383 if (r < 0)
2384 return log_error_errno(r, "Failed to allocate netlink message: %m");
2385
1c4baffc 2386 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
4bbfe7ad
TG
2387 if (r < 0)
2388 return log_error_errno(r, "Failed to add netlink interface index: %m");
2389
2390 n = strappend("iv-", *i);
2391 if (!n)
2392 return log_oom();
2393
2394 strshorten(n, IFNAMSIZ-1);
2395
1c4baffc 2396 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
4bbfe7ad
TG
2397 if (r < 0)
2398 return log_error_errno(r, "Failed to add netlink interface name: %m");
2399
1c4baffc 2400 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
4bbfe7ad
TG
2401 if (r < 0)
2402 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2403
1c4baffc 2404 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
4bbfe7ad
TG
2405 if (r < 0)
2406 return log_error_errno(r, "Failed to open netlink container: %m");
2407
1c4baffc 2408 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
4bbfe7ad
TG
2409 if (r < 0)
2410 return log_error_errno(r, "Failed to open netlink container: %m");
2411
1c4baffc 2412 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
4bbfe7ad
TG
2413 if (r < 0)
2414 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2415
1c4baffc 2416 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
2417 if (r < 0)
2418 return log_error_errno(r, "Failed to close netlink container: %m");
2419
1c4baffc 2420 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
2421 if (r < 0)
2422 return log_error_errno(r, "Failed to close netlink container: %m");
2423
1c4baffc 2424 r = sd_netlink_call(rtnl, m, 0, NULL);
4bbfe7ad
TG
2425 if (r < 0)
2426 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2427 }
2428
2429 return 0;
2430}
2431
/* Installs the container's seccomp filter. The default action is
 * "allow"; two kinds of rules are added on top:
 *
 *  - each system call in the table below fails with EPERM, unless the
 *    capability listed next to it is retained in arg_retain;
 *  - socket(AF_NETLINK, *, NETLINK_AUDIT) fails with EAFNOSUPPORT.
 *
 * Returns 0 on success, and also when seccomp_load() reports -EINVAL
 * (taken to mean the kernel lacks CONFIG_SECCOMP). Returns a negative
 * errno on other failures. Compiled without HAVE_SECCOMP this is a
 * no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                uint64_t cap_bit = 1ULL << blacklist[k].capability;

                /* The container keeps this capability, so leave the
                 * system call alone. */
                if (arg_retain & cap_bit)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2526
785890ac
LP
2527static int setup_propagate(const char *root) {
2528 const char *p, *q;
2529
2530 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2531 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 2532 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
2533 (void) mkdir_p(p, 0600);
2534
03cfe0d5
LP
2535 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
2536 return log_error_errno(errno, "Failed to create /run/systemd: %m");
2537
2538 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
2539 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
2540
2541 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
2542 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 2543
03cfe0d5 2544 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
2545 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2546 return log_error_errno(errno, "Failed to install propagation bind mount.");
2547
2548 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2549 return log_error_errno(errno, "Failed to make propagation mount read-only");
2550
2551 return 0;
2552}
2553
1b9e5b12
LP
2554static int setup_image(char **device_path, int *loop_nr) {
2555 struct loop_info64 info = {
2556 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2557 };
2558 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2559 _cleanup_free_ char* loopdev = NULL;
2560 struct stat st;
2561 int r, nr;
2562
2563 assert(device_path);
2564 assert(loop_nr);
ec16945e 2565 assert(arg_image);
1b9e5b12
LP
2566
2567 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2568 if (fd < 0)
2569 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 2570
4a62c710
MS
2571 if (fstat(fd, &st) < 0)
2572 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
2573
2574 if (S_ISBLK(st.st_mode)) {
2575 char *p;
2576
2577 p = strdup(arg_image);
2578 if (!p)
2579 return log_oom();
2580
2581 *device_path = p;
2582
2583 *loop_nr = -1;
2584
2585 r = fd;
2586 fd = -1;
2587
2588 return r;
2589 }
2590
2591 if (!S_ISREG(st.st_mode)) {
56f64d95 2592 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
2593 return -EINVAL;
2594 }
2595
2596 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
2597 if (control < 0)
2598 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
2599
2600 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
2601 if (nr < 0)
2602 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
2603
2604 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2605 return log_oom();
2606
2607 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
2608 if (loop < 0)
2609 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 2610
4a62c710
MS
2611 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2612 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
2613
2614 if (arg_read_only)
2615 info.lo_flags |= LO_FLAGS_READ_ONLY;
2616
4a62c710
MS
2617 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2618 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
2619
2620 *device_path = loopdev;
2621 loopdev = NULL;
2622
2623 *loop_nr = nr;
2624
2625 r = loop;
2626 loop = -1;
2627
2628 return r;
2629}
2630
ada4799a
LP
/* Explanatory text that dissect_image() appends to its error messages
 * when the partition layout of the image cannot be used. */
2631#define PARTITION_TABLE_BLURB \
2632 "Note that the disk image needs to either contain only a single MBR partition of\n" \
4aab5d0c 2633 "type 0x83 that is marked bootable, or a single GPT partition of type " \
f6c51a81 2634 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
ada4799a
LP
2635 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
2636 "to be bootable with systemd-nspawn."
2637
1b9e5b12
LP
/* Analyzes the partition table of the block device open on fd and
 * picks the partitions to use for the container.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. Partitions are
 * classified by type UUID as home, srv, native root, secondary root or
 * generic Linux data. For home, srv and the two root types the
 * partition with the lowest partition number wins.
 *
 * MBR: only partitions whose flags equal 0x80 (bootable) and whose type
 * is 0x83 (Linux) are considered, and they are treated like generic
 * partitions.
 *
 * The root device is chosen in this order: native root, secondary root,
 * generic partition. A generic partition is accepted only if exactly
 * one candidate was found.
 *
 * Outputs are written only after a root device has been selected:
 *   *root_device / *root_device_rw  device node and writability of the
 *                                   root partition
 *   *secondary                      true if the secondary root was used
 *   *home_device / *home_device_rw  set only if a home partition exists
 *   *srv_device / *srv_device_rw    set only if a srv partition exists
 * The device strings are allocated and owned by the caller.
 *
 * Returns 0 on success, a negative errno on failure (-EINVAL if the
 * layout cannot be used, -EOPNOTSUPP when built without blkid). */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* Failure without errno set is treated as OOM */
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has enumerated as many partition
         * devices as blkid found in the partition table, giving up
         * after ten rounds. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev
                 * count is compared against m + 1; the whole-disk
                 * device itself shows up in the enumeration, too (it
                 * is skipped explicitly further down). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Must be recorded in "generic" (not
                                 * "root"), otherwise the check above
                                 * never sees a previous candidate and
                                 * images with several bootable Linux
                                 * partitions are not rejected. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3010
/*
 * Probe the file system type of the block device 'what' with libblkid and
 * mount it.
 *
 * what:      path of the device node to mount (must not be NULL).
 * where:     base mount point (must not be NULL).
 * directory: optional suffix appended to 'where' to form the final mount
 *            point; if NULL, 'where' itself is used.
 * rw:        whether to mount writable; forced to false when the global
 *            arg_read_only is set.
 *
 * Returns 0 on success and a negative errno-style value on failure.
 * LUKS containers are rejected with -EOPNOTSUPP. When compiled without
 * HAVE_BLKID the function always fails with -EOPNOTSUPP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not always set errno, hence clear it first so that
         * "no errno" can be told apart from a real error code. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -1 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the error code before logging, so that the value
                 * returned does not depend on what logging does to errno. */
                r = errno != 0 ? -errno : -EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3074
727fd4fd
LP
3075static int mount_devices(
3076 const char *where,
3077 const char *root_device, bool root_device_rw,
3078 const char *home_device, bool home_device_rw,
3079 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
3080 int r;
3081
3082 assert(where);
3083
3084 if (root_device) {
727fd4fd 3085 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
3086 if (r < 0)
3087 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
3088 }
3089
3090 if (home_device) {
727fd4fd 3091 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
3092 if (r < 0)
3093 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
3094 }
3095
3096 if (srv_device) {
727fd4fd 3097 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
3098 if (r < 0)
3099 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
3100 }
3101
3102 return 0;
3103}
3104
3105static void loop_remove(int nr, int *image_fd) {
3106 _cleanup_close_ int control = -1;
e8c8ddcc 3107 int r;
1b9e5b12
LP
3108
3109 if (nr < 0)
3110 return;
3111
3112 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3113 r = ioctl(*image_fd, LOOP_CLR_FD);
3114 if (r < 0)
5e4074aa 3115 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3116 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3117 }
3118
3119 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3120 if (control < 0) {
56f64d95 3121 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3122 return;
e8c8ddcc 3123 }
1b9e5b12 3124
e8c8ddcc
TG
3125 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3126 if (r < 0)
5e4074aa 3127 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3128}
3129
0cb9fbcd
LP
3130static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3131 int pipe_fds[2];
3132 pid_t pid;
3133
3134 assert(database);
3135 assert(key);
3136 assert(rpid);
3137
4a62c710
MS
3138 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3139 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3140
3141 pid = fork();
4a62c710
MS
3142 if (pid < 0)
3143 return log_error_errno(errno, "Failed to fork getent child: %m");
3144 else if (pid == 0) {
0cb9fbcd
LP
3145 int nullfd;
3146 char *empty_env = NULL;
3147
3148 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3149 _exit(EXIT_FAILURE);
3150
3151 if (pipe_fds[0] > 2)
03e334a1 3152 safe_close(pipe_fds[0]);
0cb9fbcd 3153 if (pipe_fds[1] > 2)
03e334a1 3154 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3155
3156 nullfd = open("/dev/null", O_RDWR);
3157 if (nullfd < 0)
3158 _exit(EXIT_FAILURE);
3159
3160 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3161 _exit(EXIT_FAILURE);
3162
3163 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3164 _exit(EXIT_FAILURE);
3165
3166 if (nullfd > 2)
03e334a1 3167 safe_close(nullfd);
0cb9fbcd 3168
ce30c8dc
LP
3169 (void) reset_all_signal_handlers();
3170 (void) reset_signal_mask();
0cb9fbcd
LP
3171 close_all_fds(NULL, 0);
3172
4de82926
MM
3173 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3174 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3175 _exit(EXIT_FAILURE);
3176 }
3177
03e334a1 3178 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3179
3180 *rpid = pid;
3181
3182 return pipe_fds[0];
3183}
3184
3185static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3186 char line[LINE_MAX], *x, *u, *g, *h;
3187 const char *word, *state;
0cb9fbcd
LP
3188 _cleanup_free_ uid_t *uids = NULL;
3189 _cleanup_free_ char *home = NULL;
3190 _cleanup_fclose_ FILE *f = NULL;
3191 _cleanup_close_ int fd = -1;
3192 unsigned n_uids = 0;
70f539ca 3193 size_t sz = 0, l;
0cb9fbcd
LP
3194 uid_t uid;
3195 gid_t gid;
3196 pid_t pid;
3197 int r;
3198
3199 assert(_home);
3200
3201 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3202 /* Reset everything fully to 0, just in case */
3203
03cfe0d5
LP
3204 r = reset_uid_gid();
3205 if (r < 0)
3206 return log_error_errno(r, "Failed to become root: %m");
0cb9fbcd
LP
3207
3208 *_home = NULL;
3209 return 0;
3210 }
3211
3212 /* First, get user credentials */
3213 fd = spawn_getent("passwd", arg_user, &pid);
3214 if (fd < 0)
3215 return fd;
3216
3217 f = fdopen(fd, "r");
3218 if (!f)
3219 return log_oom();
3220 fd = -1;
3221
3222 if (!fgets(line, sizeof(line), f)) {
3223
3224 if (!ferror(f)) {
3225 log_error("Failed to resolve user %s.", arg_user);
3226 return -ESRCH;
3227 }
3228
56f64d95 3229 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3230 return -errno;
3231 }
3232
3233 truncate_nl(line);
3234
820d3acf 3235 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
3236
3237 x = strchr(line, ':');
3238 if (!x) {
3239 log_error("/etc/passwd entry has invalid user field.");
3240 return -EIO;
3241 }
3242
3243 u = strchr(x+1, ':');
3244 if (!u) {
3245 log_error("/etc/passwd entry has invalid password field.");
3246 return -EIO;
3247 }
3248
3249 u++;
3250 g = strchr(u, ':');
3251 if (!g) {
3252 log_error("/etc/passwd entry has invalid UID field.");
3253 return -EIO;
3254 }
3255
3256 *g = 0;
3257 g++;
3258 x = strchr(g, ':');
3259 if (!x) {
3260 log_error("/etc/passwd entry has invalid GID field.");
3261 return -EIO;
3262 }
3263
3264 *x = 0;
3265 h = strchr(x+1, ':');
3266 if (!h) {
3267 log_error("/etc/passwd entry has invalid GECOS field.");
3268 return -EIO;
3269 }
3270
3271 h++;
3272 x = strchr(h, ':');
3273 if (!x) {
3274 log_error("/etc/passwd entry has invalid home directory field.");
3275 return -EIO;
3276 }
3277
3278 *x = 0;
3279
3280 r = parse_uid(u, &uid);
3281 if (r < 0) {
3282 log_error("Failed to parse UID of user.");
3283 return -EIO;
3284 }
3285
3286 r = parse_gid(g, &gid);
3287 if (r < 0) {
3288 log_error("Failed to parse GID of user.");
3289 return -EIO;
3290 }
3291
3292 home = strdup(h);
3293 if (!home)
3294 return log_oom();
3295
3296 /* Second, get group memberships */
3297 fd = spawn_getent("initgroups", arg_user, &pid);
3298 if (fd < 0)
3299 return fd;
3300
3301 fclose(f);
3302 f = fdopen(fd, "r");
3303 if (!f)
3304 return log_oom();
3305 fd = -1;
3306
3307 if (!fgets(line, sizeof(line), f)) {
3308 if (!ferror(f)) {
3309 log_error("Failed to resolve user %s.", arg_user);
3310 return -ESRCH;
3311 }
3312
56f64d95 3313 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
3314 return -errno;
3315 }
3316
3317 truncate_nl(line);
3318
820d3acf 3319 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
3320
3321 /* Skip over the username and subsequent separator whitespace */
3322 x = line;
3323 x += strcspn(x, WHITESPACE);
3324 x += strspn(x, WHITESPACE);
3325
a2a5291b 3326 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
3327 char c[l+1];
3328
a2a5291b 3329 memcpy(c, word, l);
0cb9fbcd
LP
3330 c[l] = 0;
3331
3332 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3333 return log_oom();
3334
3335 r = parse_uid(c, &uids[n_uids++]);
3336 if (r < 0) {
3337 log_error("Failed to parse group data from getent.");
3338 return -EIO;
3339 }
3340 }
3341
3342 r = mkdir_parents(home, 0775);
f647962d
MS
3343 if (r < 0)
3344 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
3345
3346 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
3347 if (r < 0 && r != -EEXIST)
3348 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd 3349
03cfe0d5
LP
3350 (void) fchown(STDIN_FILENO, uid, gid);
3351 (void) fchown(STDOUT_FILENO, uid, gid);
3352 (void) fchown(STDERR_FILENO, uid, gid);
0cb9fbcd 3353
4a62c710
MS
3354 if (setgroups(n_uids, uids) < 0)
3355 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 3356
4a62c710
MS
3357 if (setresgid(gid, gid, gid) < 0)
3358 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 3359
4a62c710
MS
3360 if (setresuid(uid, uid, uid) < 0)
3361 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
3362
3363 if (_home) {
3364 *_home = home;
3365 home = NULL;
3366 }
3367
3368 return 0;
3369}
3370
113cea80 3371/*
6d416b9c
LS
3372 * Return values:
3373 * < 0 : wait_for_terminate() failed to get the state of the
3374 * container, the container was terminated by a signal, or
3375 * failed for an unknown reason. No change is made to the
3376 * container argument.
3377 * > 0 : The program executed in the container terminated with an
3378 * error. The exit code of the program executed in the
919699ec
LP
3379 * container is returned. The container argument has been set
3380 * to CONTAINER_TERMINATED.
6d416b9c
LS
3381 * 0 : The container is being rebooted, has been shut down or exited
3382 * successfully. The container argument has been set to either
3383 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 3384 *
6d416b9c
LS
3385 * That is, success is indicated by a return value of zero, and an
3386 * error is indicated by a non-zero value.
113cea80
DH
3387 */
3388static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 3389 siginfo_t status;
919699ec 3390 int r;
113cea80
DH
3391
3392 r = wait_for_terminate(pid, &status);
f647962d
MS
3393 if (r < 0)
3394 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
3395
3396 switch (status.si_code) {
fddbb89c 3397
113cea80 3398 case CLD_EXITED:
919699ec
LP
3399 if (status.si_status == 0) {
3400 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 3401
fddbb89c 3402 } else
919699ec 3403 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 3404
919699ec
LP
3405 *container = CONTAINER_TERMINATED;
3406 return status.si_status;
113cea80
DH
3407
3408 case CLD_KILLED:
3409 if (status.si_status == SIGINT) {
113cea80 3410
919699ec 3411 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 3412 *container = CONTAINER_TERMINATED;
919699ec
LP
3413 return 0;
3414
113cea80 3415 } else if (status.si_status == SIGHUP) {
113cea80 3416
919699ec 3417 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 3418 *container = CONTAINER_REBOOTED;
919699ec 3419 return 0;
113cea80 3420 }
919699ec 3421
113cea80
DH
3422 /* CLD_KILLED fallthrough */
3423
3424 case CLD_DUMPED:
fddbb89c 3425 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 3426 return -EIO;
113cea80
DH
3427
3428 default:
fddbb89c 3429 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 3430 return -EIO;
113cea80
DH
3431 }
3432
3433 return r;
3434}
3435
e866af3a
DH
/* No-op function with a signal handler signature; the argument is ignored. */
static void nop_handler(int sig) {
}
3437
023fb90b
LP
3438static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3439 pid_t pid;
3440
3441 pid = PTR_TO_UINT32(userdata);
3442 if (pid > 0) {
c6c8f6e2 3443 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
3444 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3445 sd_event_source_set_userdata(s, NULL);
3446 return 0;
3447 }
3448 }
3449
3450 sd_event_exit(sd_event_source_get_event(s), 0);
3451 return 0;
3452}
3453
ec16945e 3454static int determine_names(void) {
1b9cebf6 3455 int r;
ec16945e 3456
c1521918
LP
3457 if (arg_template && !arg_directory && arg_machine) {
3458
3459 /* If --template= was specified then we should not
3460 * search for a machine, but instead create a new one
3461 * in /var/lib/machine. */
3462
3463 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
3464 if (!arg_directory)
3465 return log_oom();
3466 }
3467
ec16945e 3468 if (!arg_image && !arg_directory) {
1b9cebf6
LP
3469 if (arg_machine) {
3470 _cleanup_(image_unrefp) Image *i = NULL;
3471
3472 r = image_find(arg_machine, &i);
3473 if (r < 0)
3474 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3475 else if (r == 0) {
3476 log_error("No image for machine '%s': %m", arg_machine);
3477 return -ENOENT;
3478 }
3479
aceac2f0 3480 if (i->type == IMAGE_RAW)
1b9cebf6
LP
3481 r = set_sanitized_path(&arg_image, i->path);
3482 else
3483 r = set_sanitized_path(&arg_directory, i->path);
3484 if (r < 0)
3485 return log_error_errno(r, "Invalid image directory: %m");
3486
aee327b8
LP
3487 if (!arg_ephemeral)
3488 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 3489 } else
ec16945e
LP
3490 arg_directory = get_current_dir_name();
3491
1b9cebf6
LP
3492 if (!arg_directory && !arg_machine) {
3493 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
3494 return -EINVAL;
3495 }
3496 }
3497
3498 if (!arg_machine) {
b9ba4dab
LP
3499 if (arg_directory && path_equal(arg_directory, "/"))
3500 arg_machine = gethostname_malloc();
3501 else
3502 arg_machine = strdup(basename(arg_image ?: arg_directory));
3503
ec16945e
LP
3504 if (!arg_machine)
3505 return log_oom();
3506
ae691c1d 3507 hostname_cleanup(arg_machine);
ec16945e
LP
3508 if (!machine_name_is_valid(arg_machine)) {
3509 log_error("Failed to determine machine name automatically, please use -M.");
3510 return -EINVAL;
3511 }
b9ba4dab
LP
3512
3513 if (arg_ephemeral) {
3514 char *b;
3515
3516 /* Add a random suffix when this is an
3517 * ephemeral machine, so that we can run many
3518 * instances at once without manually having
3519 * to specify -M each time. */
3520
3521 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3522 return log_oom();
3523
3524 free(arg_machine);
3525 arg_machine = b;
3526 }
ec16945e
LP
3527 }
3528
3529 return 0;
3530}
3531
03cfe0d5 3532static int determine_uid_shift(const char *directory) {
6dac160c
LP
3533 int r;
3534
03cfe0d5
LP
3535 if (!arg_userns) {
3536 arg_uid_shift = 0;
6dac160c 3537 return 0;
03cfe0d5 3538 }
6dac160c
LP
3539
3540 if (arg_uid_shift == UID_INVALID) {
3541 struct stat st;
3542
03cfe0d5 3543 r = stat(directory, &st);
6dac160c 3544 if (r < 0)
03cfe0d5 3545 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
3546
3547 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3548
3549 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 3550 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
3551 return -EINVAL;
3552 }
3553
3554 arg_uid_range = UINT32_C(0x10000);
3555 }
3556
3557 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3558 log_error("UID base too high for UID range.");
3559 return -EINVAL;
3560 }
3561
3562 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3563 return 0;
3564}
3565
03cfe0d5
LP
3566static int inner_child(
3567 Barrier *barrier,
3568 const char *directory,
3569 bool secondary,
3570 int kmsg_socket,
3571 int rtnl_socket,
f757855e 3572 FDSet *fds) {
69c79d3c 3573
03cfe0d5
LP
3574 _cleanup_free_ char *home = NULL;
3575 unsigned n_env = 2;
3576 const char *envp[] = {
3577 "PATH=" DEFAULT_PATH_SPLIT_USR,
3578 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3579 NULL, /* TERM */
3580 NULL, /* HOME */
3581 NULL, /* USER */
3582 NULL, /* LOGNAME */
3583 NULL, /* container_uuid */
3584 NULL, /* LISTEN_FDS */
3585 NULL, /* LISTEN_PID */
3586 NULL
3587 };
88213476 3588
2371271c 3589 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 3590 int r;
88213476 3591
03cfe0d5
LP
3592 assert(barrier);
3593 assert(directory);
3594 assert(kmsg_socket >= 0);
88213476 3595
efdb0237
LP
3596 cg_unified_flush();
3597
03cfe0d5
LP
3598 if (arg_userns) {
3599 /* Tell the parent, that it now can write the UID map. */
3600 (void) barrier_place(barrier); /* #1 */
7027ff61 3601
03cfe0d5
LP
3602 /* Wait until the parent wrote the UID map */
3603 if (!barrier_place_and_sync(barrier)) { /* #2 */
3604 log_error("Parent died too early");
3605 return -ESRCH;
3606 }
88213476
LP
3607 }
3608
e83bebef 3609 r = mount_all(NULL, true, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
3610 if (r < 0)
3611 return r;
3612
3613 /* Wait until we are cgroup-ified, so that we
3614 * can mount the right cgroup path writable */
3615 if (!barrier_place_and_sync(barrier)) { /* #3 */
3616 log_error("Parent died too early");
3617 return -ESRCH;
88213476
LP
3618 }
3619
e83bebef 3620 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
03cfe0d5
LP
3621 if (r < 0)
3622 return r;
ec16945e 3623
03cfe0d5
LP
3624 r = reset_uid_gid();
3625 if (r < 0)
3626 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 3627
03cfe0d5
LP
3628 r = setup_boot_id(NULL);
3629 if (r < 0)
3630 return r;
ec16945e 3631
03cfe0d5
LP
3632 r = setup_kmsg(NULL, kmsg_socket);
3633 if (r < 0)
3634 return r;
3635 kmsg_socket = safe_close(kmsg_socket);
ec16945e 3636
03cfe0d5 3637 umask(0022);
30535c16 3638
03cfe0d5
LP
3639 if (setsid() < 0)
3640 return log_error_errno(errno, "setsid() failed: %m");
3641
3642 if (arg_private_network)
3643 loopback_setup();
3644
3645 r = send_rtnl(rtnl_socket);
3646 if (r < 0)
3647 return r;
3648 rtnl_socket = safe_close(rtnl_socket);
3649
3650 if (drop_capabilities() < 0)
3651 return log_error_errno(errno, "drop_capabilities() failed: %m");
3652
3653 setup_hostname();
3654
050f7277 3655 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
3656 if (personality(arg_personality) < 0)
3657 return log_error_errno(errno, "personality() failed: %m");
3658 } else if (secondary) {
3659 if (personality(PER_LINUX32) < 0)
3660 return log_error_errno(errno, "personality() failed: %m");
3661 }
3662
3663#ifdef HAVE_SELINUX
3664 if (arg_selinux_context)
3665 if (setexeccon((security_context_t) arg_selinux_context) < 0)
3666 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3667#endif
3668
3669 r = change_uid_gid(&home);
3670 if (r < 0)
3671 return r;
3672
3673 envp[n_env] = strv_find_prefix(environ, "TERM=");
3674 if (envp[n_env])
3675 n_env ++;
3676
3677 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3678 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3679 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
3680 return log_oom();
3681
3682 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3683 char as_uuid[37];
3684
3685 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
3686 return log_oom();
3687 }
3688
3689 if (fdset_size(fds) > 0) {
3690 r = fdset_cloexec(fds, false);
3691 if (r < 0)
3692 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3693
3694 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3695 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
3696 return log_oom();
3697 }
3698
2371271c
TG
3699 env_use = strv_env_merge(2, envp, arg_setenv);
3700 if (!env_use)
3701 return log_oom();
03cfe0d5
LP
3702
3703 /* Let the parent know that we are ready and
3704 * wait until the parent is ready with the
3705 * setup, too... */
3706 if (!barrier_place_and_sync(barrier)) { /* #4 */
3707 log_error("Parent died too early");
3708 return -ESRCH;
3709 }
3710
3711 /* Now, explicitly close the log, so that we
3712 * then can close all remaining fds. Closing
3713 * the log explicitly first has the benefit
3714 * that the logging subsystem knows about it,
3715 * and is thus ready to be reopened should we
3716 * need it again. Note that the other fds
3717 * closed here are at least the locking and
3718 * barrier fds. */
3719 log_close();
3720 (void) fdset_close_others(fds);
3721
3722 if (arg_boot) {
3723 char **a;
3724 size_t m;
3725
3726 /* Automatically search for the init system */
3727
f757855e 3728 m = 1 + strv_length(arg_parameters);
03cfe0d5 3729 a = newa(char*, m + 1);
f757855e
LP
3730 if (strv_isempty(arg_parameters))
3731 a[1] = NULL;
3732 else
3733 memcpy(a + 1, arg_parameters, m * sizeof(char*));
03cfe0d5
LP
3734
3735 a[0] = (char*) "/usr/lib/systemd/systemd";
3736 execve(a[0], a, env_use);
3737
3738 a[0] = (char*) "/lib/systemd/systemd";
3739 execve(a[0], a, env_use);
3740
3741 a[0] = (char*) "/sbin/init";
3742 execve(a[0], a, env_use);
f757855e
LP
3743 } else if (!strv_isempty(arg_parameters))
3744 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 3745 else {
f757855e 3746 chdir(home ?: "/root");
03cfe0d5
LP
3747 execle("/bin/bash", "-bash", NULL, env_use);
3748 execle("/bin/sh", "-sh", NULL, env_use);
3749 }
3750
3751 (void) log_open();
3752 return log_error_errno(errno, "execv() failed: %m");
3753}
3754
3755static int outer_child(
3756 Barrier *barrier,
3757 const char *directory,
3758 const char *console,
3759 const char *root_device, bool root_device_rw,
3760 const char *home_device, bool home_device_rw,
3761 const char *srv_device, bool srv_device_rw,
3762 bool interactive,
3763 bool secondary,
3764 int pid_socket,
3765 int kmsg_socket,
3766 int rtnl_socket,
825d5287 3767 int uid_shift_socket,
f757855e 3768 FDSet *fds) {
03cfe0d5
LP
3769
3770 pid_t pid;
3771 ssize_t l;
3772 int r;
3773
3774 assert(barrier);
3775 assert(directory);
3776 assert(console);
3777 assert(pid_socket >= 0);
3778 assert(kmsg_socket >= 0);
3779
efdb0237
LP
3780 cg_unified_flush();
3781
03cfe0d5
LP
3782 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3783 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3784
3785 if (interactive) {
3786 close_nointr(STDIN_FILENO);
3787 close_nointr(STDOUT_FILENO);
3788 close_nointr(STDERR_FILENO);
3789
3790 r = open_terminal(console, O_RDWR);
3791 if (r != STDIN_FILENO) {
3792 if (r >= 0) {
3793 safe_close(r);
3794 r = -EINVAL;
3795 }
3796
3797 return log_error_errno(r, "Failed to open console: %m");
3798 }
3799
3800 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3801 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
3802 return log_error_errno(errno, "Failed to duplicate console: %m");
3803 }
3804
3805 r = reset_audit_loginuid();
3806 if (r < 0)
3807 return r;
3808
3809 /* Mark everything as slave, so that we still
3810 * receive mounts from the real root, but don't
3811 * propagate mounts to the real root. */
3812 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
3813 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3814
3815 r = mount_devices(directory,
3816 root_device, root_device_rw,
3817 home_device, home_device_rw,
3818 srv_device, srv_device_rw);
3819 if (r < 0)
3820 return r;
3821
391567f4
LP
3822 r = determine_uid_shift(directory);
3823 if (r < 0)
3824 return r;
3825
825d5287
RM
3826 if (arg_userns) {
3827 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3828 if (l < 0)
3829 return log_error_errno(errno, "Failed to send UID shift: %m");
3830 if (l != sizeof(arg_uid_shift)) {
3831 log_error("Short write while sending UID shift.");
3832 return -EIO;
3833 }
3834 }
3835
03cfe0d5
LP
3836 /* Turn directory into bind mount */
3837 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
3838 return log_error_errno(errno, "Failed to make bind mount: %m");
3839
e83bebef 3840 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
3841 if (r < 0)
3842 return r;
3843
e83bebef 3844 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
03cfe0d5
LP
3845 if (r < 0)
3846 return r;
3847
03cfe0d5
LP
3848 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3849 if (r < 0)
3850 return r;
3851
03cfe0d5
LP
3852 if (arg_read_only) {
3853 r = bind_remount_recursive(directory, true);
3854 if (r < 0)
3855 return log_error_errno(r, "Failed to make tree read-only: %m");
3856 }
3857
e83bebef 3858 r = mount_all(directory, false, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
3859 if (r < 0)
3860 return r;
3861
3862 if (copy_devnodes(directory) < 0)
3863 return r;
3864
3865 dev_setup(directory, arg_uid_shift, arg_uid_shift);
3866
3867 if (setup_pts(directory) < 0)
3868 return r;
3869
3870 r = setup_propagate(directory);
3871 if (r < 0)
3872 return r;
3873
3874 r = setup_dev_console(directory, console);
3875 if (r < 0)
3876 return r;
3877
3878 r = setup_seccomp();
3879 if (r < 0)
3880 return r;
3881
3882 r = setup_timezone(directory);
3883 if (r < 0)
3884 return r;
3885
3886 r = setup_resolv_conf(directory);
3887 if (r < 0)
3888 return r;
3889
3890 r = setup_journal(directory);
3891 if (r < 0)
3892 return r;
3893
e83bebef 3894 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
3895 if (r < 0)
3896 return r;
3897
e83bebef 3898 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
03cfe0d5
LP
3899 if (r < 0)
3900 return r;
3901
3902 r = mount_move_root(directory);
3903 if (r < 0)
3904 return log_error_errno(r, "Failed to move root directory: %m");
3905
3906 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3907 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
3908 (arg_private_network ? CLONE_NEWNET : 0) |
3909 (arg_userns ? CLONE_NEWUSER : 0),
3910 NULL);
3911 if (pid < 0)
3912 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
3913 if (pid == 0) {
3914 pid_socket = safe_close(pid_socket);
825d5287 3915 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
3916
3917 /* The inner child has all namespaces that are
3918 * requested, so that we all are owned by the user if
3919 * user namespaces are turned on. */
3920
f757855e 3921 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
3922 if (r < 0)
3923 _exit(EXIT_FAILURE);
3924
3925 _exit(EXIT_SUCCESS);
3926 }
3927
3928 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
3929 if (l < 0)
3930 return log_error_errno(errno, "Failed to send PID: %m");
3931 if (l != sizeof(pid)) {
3932 log_error("Short write while sending PID.");
3933 return -EIO;
3934 }
3935
3936 pid_socket = safe_close(pid_socket);
3937
3938 return 0;
3939}
3940
3941static int setup_uid_map(pid_t pid) {
3942 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
3943 int r;
3944
3945 assert(pid > 1);
3946
3947 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
3948 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 3949 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3950 if (r < 0)
3951 return log_error_errno(r, "Failed to write UID map: %m");
3952
3953 /* We always assign the same UID and GID ranges */
3954 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 3955 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
3956 if (r < 0)
3957 return log_error_errno(r, "Failed to write GID map: %m");
3958
3959 return 0;
3960}
3961
3962static int chown_cgroup(pid_t pid) {
3963 _cleanup_free_ char *path = NULL, *fs = NULL;
3964 _cleanup_close_ int fd = -1;
3965 const char *fn;
3966 int r;
3967
3968 r = cg_pid_get_path(NULL, pid, &path);
3969 if (r < 0)
3970 return log_error_errno(r, "Failed to get container cgroup path: %m");
3971
3972 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
3973 if (r < 0)
3974 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
3975
3976 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
3977 if (fd < 0)
3978 return log_error_errno(errno, "Failed to open %s: %m", fs);
3979
efdb0237
LP
3980 FOREACH_STRING(fn,
3981 ".",
3982 "tasks",
3983 "notify_on_release",
3984 "cgroup.procs",
3985 "cgroup.clone_children",
3986 "cgroup.controllers",
3987 "cgroup.subtree_control",
3988 "cgroup.populated")
03cfe0d5 3989 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
efdb0237
LP
3990 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
3991 "Failed to chown() cgroup file %s, ignoring: %m", fn);
3992
3993 return 0;
3994}
3995
3996static int sync_cgroup(pid_t pid) {
3997 _cleanup_free_ char *cgroup = NULL;
3998 char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
3999 bool undo_mount = false;
4000 const char *fn;
4001 int unified, r;
4002
4003 unified = cg_unified();
4004 if (unified < 0)
4005 return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
4006
4007 if ((unified > 0) == arg_unified_cgroup_hierarchy)
4008 return 0;
4009
4010 /* When the host uses the legacy cgroup setup, but the
4011 * container shall use the unified hierarchy, let's make sure
4012 * we copy the path from the name=systemd hierarchy into the
4013 * unified hierarchy. Similar for the reverse situation. */
4014
4015 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
4016 if (r < 0)
4017 return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
4018
4019 /* In order to access the unified hierarchy we need to mount it */
4020 if (!mkdtemp(tree))
4021 return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
4022
4023 if (unified)
4024 r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
4025 else
4026 r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
4027 if (r < 0) {
4028 r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
4029 goto finish;
4030 }
4031
4032 undo_mount = true;
4033
4034 fn = strjoina(tree, cgroup, "/cgroup.procs");
4035 (void) mkdir_parents(fn, 0755);
4036
4037 sprintf(pid_string, PID_FMT, pid);
4038 r = write_string_file(fn, pid_string, 0);
4039 if (r < 0)
4040 log_error_errno(r, "Failed to move process: %m");
4041
4042finish:
4043 if (undo_mount)
4044 (void) umount(tree);
4045
4046 (void) rmdir(tree);
4047 return r;
4048}
4049
4050static int create_subcgroup(pid_t pid) {
4051 _cleanup_free_ char *cgroup = NULL;
4052 const char *child;
4053 int unified, r;
98e4d8d7 4054 CGroupMask supported;
efdb0237
LP
4055
4056 /* In the unified hierarchy inner nodes may only only contain
4057 * subgroups, but not processes. Hence, if we running in the
4058 * unified hierarchy and the container does the same, and we
4059 * did not create a scope unit for the container move us and
4060 * the container into two separate subcgroups. */
4061
4062 if (!arg_keep_unit)
4063 return 0;
4064
4065 if (!arg_unified_cgroup_hierarchy)
4066 return 0;
4067
4068 unified = cg_unified();
4069 if (unified < 0)
4070 return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
4071 if (unified == 0)
4072 return 0;
4073
98e4d8d7
LP
4074 r = cg_mask_supported(&supported);
4075 if (r < 0)
4076 return log_error_errno(r, "Failed to determine supported controllers: %m");
4077
efdb0237
LP
4078 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
4079 if (r < 0)
4080 return log_error_errno(r, "Failed to get our control group: %m");
4081
4082 child = strjoina(cgroup, "/payload");
4083 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
4084 if (r < 0)
4085 return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
4086
4087 child = strjoina(cgroup, "/supervisor");
4088 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
4089 if (r < 0)
4090 return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
03cfe0d5 4091
98e4d8d7
LP
4092 /* Try to enable as many controllers as possible for the new payload. */
4093 (void) cg_enable_everywhere(supported, supported, cgroup);
03cfe0d5
LP
4094 return 0;
4095}
4096
f757855e
LP
4097static int load_settings(void) {
4098 _cleanup_(settings_freep) Settings *settings = NULL;
4099 _cleanup_fclose_ FILE *f = NULL;
4100 _cleanup_free_ char *p = NULL;
4101 const char *fn, *i;
4102 int r;
4103
4104 /* If all settings are masked, there's no point in looking for
4105 * the settings file */
4106 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4107 return 0;
4108
4109 fn = strjoina(arg_machine, ".nspawn");
4110
4111 /* We first look in the admin's directories in /etc and /run */
4112 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4113 _cleanup_free_ char *j = NULL;
4114
4115 j = strjoin(i, "/", fn, NULL);
4116 if (!j)
4117 return log_oom();
4118
4119 f = fopen(j, "re");
4120 if (f) {
4121 p = j;
4122 j = NULL;
4123
4124 /* By default we trust configuration from /etc and /run */
4125 if (arg_settings_trusted < 0)
4126 arg_settings_trusted = true;
4127
4128 break;
4129 }
4130
4131 if (errno != ENOENT)
4132 return log_error_errno(errno, "Failed to open %s: %m", j);
4133 }
4134
4135 if (!f) {
4136 /* After that, let's look for a file next to the
4137 * actual image we shall boot. */
4138
4139 if (arg_image) {
4140 p = file_in_same_dir(arg_image, fn);
4141 if (!p)
4142 return log_oom();
4143 } else if (arg_directory) {
4144 p = file_in_same_dir(arg_directory, fn);
4145 if (!p)
4146 return log_oom();
4147 }
4148
4149 if (p) {
4150 f = fopen(p, "re");
4151 if (!f && errno != ENOENT)
4152 return log_error_errno(errno, "Failed to open %s: %m", p);
4153
4154 /* By default we do not trust configuration from /var/lib/machines */
4155 if (arg_settings_trusted < 0)
4156 arg_settings_trusted = false;
4157 }
4158 }
4159
4160 if (!f)
4161 return 0;
4162
4163 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4164
4165 r = settings_load(f, p, &settings);
4166 if (r < 0)
4167 return r;
4168
4169 /* Copy over bits from the settings, unless they have been
4170 * explicitly masked by command line switches. */
4171
4172 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
4173 settings->boot >= 0) {
4174 arg_boot = settings->boot;
4175
4176 strv_free(arg_parameters);
4177 arg_parameters = settings->parameters;
4178 settings->parameters = NULL;
4179 }
4180
4181 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4182 settings->environment) {
4183 strv_free(arg_setenv);
4184 arg_setenv = settings->environment;
4185 settings->environment = NULL;
4186 }
4187
4188 if ((arg_settings_mask & SETTING_USER) == 0 &&
4189 settings->user) {
4190 free(arg_user);
4191 arg_user = settings->user;
4192 settings->user = NULL;
4193 }
4194
4195 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4196
4197 if (!arg_settings_trusted && settings->capability != 0)
4198 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
4199 else
4200 arg_retain |= settings->capability;
4201
4202 arg_retain &= ~settings->drop_capability;
4203 }
4204
4205 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
4206 settings->kill_signal > 0)
4207 arg_kill_signal = settings->kill_signal;
4208
4209 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
4210 settings->personality != PERSONALITY_INVALID)
4211 arg_personality = settings->personality;
4212
4213 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
4214 !sd_id128_is_null(settings->machine_id)) {
4215
4216 if (!arg_settings_trusted)
4217 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
4218 else
4219 arg_uuid = settings->machine_id;
4220 }
4221
4222 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
4223 settings->read_only >= 0)
4224 arg_read_only = settings->read_only;
4225
4226 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
4227 settings->volatile_mode != _VOLATILE_MODE_INVALID)
4228 arg_volatile_mode = settings->volatile_mode;
4229
4230 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
4231 settings->n_custom_mounts > 0) {
4232
4233 if (!arg_settings_trusted)
4234 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
4235 else {
4236 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4237 arg_custom_mounts = settings->custom_mounts;
4238 arg_n_custom_mounts = settings->n_custom_mounts;
4239
4240 settings->custom_mounts = NULL;
4241 settings->n_custom_mounts = 0;
4242 }
4243 }
4244
4245 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
4246 (settings->private_network >= 0 ||
4247 settings->network_veth >= 0 ||
4248 settings->network_bridge ||
4249 settings->network_interfaces ||
4250 settings->network_macvlan ||
4251 settings->network_ipvlan)) {
4252
4253 if (!arg_settings_trusted)
4254 log_warning("Ignoring network settings, file %s is not trusted.", p);
4255 else {
4256 strv_free(arg_network_interfaces);
4257 arg_network_interfaces = settings->network_interfaces;
4258 settings->network_interfaces = NULL;
4259
4260 strv_free(arg_network_macvlan);
4261 arg_network_macvlan = settings->network_macvlan;
4262 settings->network_macvlan = NULL;
4263
4264 strv_free(arg_network_ipvlan);
4265 arg_network_ipvlan = settings->network_ipvlan;
4266 settings->network_ipvlan = NULL;
4267
4268 free(arg_network_bridge);
4269 arg_network_bridge = settings->network_bridge;
4270 settings->network_bridge = NULL;
4271
4272 arg_network_veth = settings->network_veth > 0 || settings->network_bridge;
4273
4274 arg_private_network = true; /* all these settings imply private networking */
4275 }
4276 }
4277
4278 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
4279 settings->expose_ports) {
4280
4281 if (!arg_settings_trusted)
4282 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
4283 else {
4284 expose_port_free_all(arg_expose_ports);
4285 arg_expose_ports = settings->expose_ports;
4286 settings->expose_ports = NULL;
4287 }
4288 }
4289
4290 return 0;
4291}
4292
03cfe0d5
LP
4293int main(int argc, char *argv[]) {
4294
4295 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4296 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4297 _cleanup_close_ int master = -1, image_fd = -1;
4298 _cleanup_fdset_free_ FDSet *fds = NULL;
4299 int r, n_fd_passed, loop_nr = -1;
4300 char veth_name[IFNAMSIZ];
4301 bool secondary = false, remove_subvol = false;
72c0a2c2 4302 sigset_t mask_chld;
03cfe0d5
LP
4303 pid_t pid = 0;
4304 int ret = EXIT_SUCCESS;
4305 union in_addr_union exposed = {};
4306 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4307 bool interactive;
4308
4309 log_parse_environment();
4310 log_open();
4311
4312 r = parse_argv(argc, argv);
4313 if (r <= 0)
4314 goto finish;
4315
03cfe0d5
LP
4316 if (geteuid() != 0) {
4317 log_error("Need to be root.");
4318 r = -EPERM;
4319 goto finish;
4320 }
f757855e
LP
4321 r = determine_names();
4322 if (r < 0)
4323 goto finish;
4324
4325 r = load_settings();
4326 if (r < 0)
4327 goto finish;
4328
4329 r = verify_arguments();
4330 if (r < 0)
4331 goto finish;
03cfe0d5
LP
4332
4333 n_fd_passed = sd_listen_fds(false);
4334 if (n_fd_passed > 0) {
4335 r = fdset_new_listen_fds(&fds, false);
4336 if (r < 0) {
4337 log_error_errno(r, "Failed to collect file descriptors: %m");
4338 goto finish;
4339 }
4340 }
4341
4342 if (arg_directory) {
4343 assert(!arg_image);
4344
4345 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4346 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4347 r = -EINVAL;
4348 goto finish;
4349 }
4350
4351 if (arg_ephemeral) {
4352 _cleanup_free_ char *np = NULL;
4353
4354 /* If the specified path is a mount point we
4355 * generate the new snapshot immediately
4356 * inside it under a random name. However if
4357 * the specified is not a mount point we
4358 * create the new snapshot in the parent
4359 * directory, just next to it. */
e26d6ce5 4360 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
4361 if (r < 0) {
4362 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4363 goto finish;
4364 }
4365 if (r > 0)
770b5ce4 4366 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 4367 else
770b5ce4 4368 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
4369 if (r < 0) {
4370 log_error_errno(r, "Failed to generate name for snapshot: %m");
4371 goto finish;
4372 }
4373
4374 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4375 if (r < 0) {
4376 log_error_errno(r, "Failed to lock %s: %m", np);
4377 goto finish;
4378 }
4379
4380 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4381 if (r < 0) {
4382 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4383 goto finish;
ec16945e
LP
4384 }
4385
4386 free(arg_directory);
4387 arg_directory = np;
8a16a7b4 4388 np = NULL;
ec16945e
LP
4389
4390 remove_subvol = true;
30535c16
LP
4391
4392 } else {
4393 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4394 if (r == -EBUSY) {
4395 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4396 goto finish;
4397 }
4398 if (r < 0) {
4399 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4400 return r;
4401 }
4402
4403 if (arg_template) {
f70a17f8 4404 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
30535c16
LP
4405 if (r == -EEXIST) {
4406 if (!arg_quiet)
4407 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4408 } else if (r < 0) {
83521414 4409 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
4410 goto finish;
4411 } else {
4412 if (!arg_quiet)
4413 log_info("Populated %s from template %s.", arg_directory, arg_template);
4414 }
4415 }
ec16945e
LP
4416 }
4417
1b9e5b12
LP
4418 if (arg_boot) {
4419 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 4420 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 4421 r = -EINVAL;
1b9e5b12
LP
4422 goto finish;
4423 }
4424 } else {
4425 const char *p;
4426
63c372cb 4427 p = strjoina(arg_directory,
1b9e5b12
LP
4428 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4429 if (access(p, F_OK) < 0) {
4430 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 4431 r = -EINVAL;
1b9e5b12 4432 goto finish;
1b9e5b12
LP
4433 }
4434 }
ec16945e 4435
6b9132a9 4436 } else {
1b9e5b12 4437 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 4438
ec16945e
LP
4439 assert(arg_image);
4440 assert(!arg_template);
4441
30535c16
LP
4442 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4443 if (r == -EBUSY) {
4444 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4445 goto finish;
4446 }
4447 if (r < 0) {
4448 r = log_error_errno(r, "Failed to create image lock: %m");
4449 goto finish;
4450 }
4451
1b9e5b12 4452 if (!mkdtemp(template)) {
56f64d95 4453 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 4454 r = -errno;
6b9132a9 4455 goto finish;
1b9e5b12 4456 }
6b9132a9 4457
1b9e5b12
LP
4458 arg_directory = strdup(template);
4459 if (!arg_directory) {
4460 r = log_oom();
4461 goto finish;
6b9132a9 4462 }
88213476 4463
1b9e5b12
LP
4464 image_fd = setup_image(&device_path, &loop_nr);
4465 if (image_fd < 0) {
4466 r = image_fd;
842f3b0f
LP
4467 goto finish;
4468 }
1b9e5b12 4469
4d9f07b4
LP
4470 r = dissect_image(image_fd,
4471 &root_device, &root_device_rw,
4472 &home_device, &home_device_rw,
4473 &srv_device, &srv_device_rw,
4474 &secondary);
1b9e5b12
LP
4475 if (r < 0)
4476 goto finish;
842f3b0f 4477 }
842f3b0f 4478
5a8af538
LP
4479 r = custom_mounts_prepare();
4480 if (r < 0)
4481 goto finish;
4482
03cfe0d5
LP
4483 interactive =
4484 isatty(STDIN_FILENO) > 0 &&
4485 isatty(STDOUT_FILENO) > 0;
9c857b9d 4486
db7feb7e
LP
4487 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4488 if (master < 0) {
ec16945e 4489 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
4490 goto finish;
4491 }
4492
611b312b
LP
4493 r = ptsname_malloc(master, &console);
4494 if (r < 0) {
4495 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
4496 goto finish;
4497 }
4498
a258bf26 4499 if (unlockpt(master) < 0) {
ec16945e 4500 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
4501 goto finish;
4502 }
4503
9c857b9d
LP
4504 if (!arg_quiet)
4505 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4506 arg_machine, arg_image ?: arg_directory);
4507
72c0a2c2 4508 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 4509
023fb90b
LP
4510 assert_se(sigemptyset(&mask_chld) == 0);
4511 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4512
03cfe0d5
LP
4513 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4514 r = log_error_errno(errno, "Failed to become subreaper: %m");
4515 goto finish;
4516 }
4517
d87be9b0 4518 for (;;) {
825d5287
RM
4519 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4520 uid_shift_socket_pair[2] = { -1, -1 };
113cea80 4521 ContainerStatus container_status;
7566e267 4522 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 4523 static const struct sigaction sa = {
e866af3a
DH
4524 .sa_handler = nop_handler,
4525 .sa_flags = SA_NOCLDSTOP,
4526 };
03cfe0d5
LP
4527 int ifi = 0;
4528 ssize_t l;
dbb60d69
LP
4529 _cleanup_event_unref_ sd_event *event = NULL;
4530 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4531 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4532 char last_char = 0;
e866af3a 4533
7566e267 4534 r = barrier_create(&barrier);
a2da110b 4535 if (r < 0) {
da927ba9 4536 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
4537 goto finish;
4538 }
4539
6d0b55c2
LP
4540 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4541 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4542 goto finish;
4543 }
4544
4545 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4546 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4547 goto finish;
4548 }
4549
03cfe0d5
LP
4550 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4551 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4552 goto finish;
4553 }
4554
825d5287
RM
4555 if (arg_userns)
4556 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4557 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4558 goto finish;
4559 }
4560
e866af3a
DH
4561 /* Child can be killed before execv(), so handle SIGCHLD
4562 * in order to interrupt parent's blocking calls and
4563 * give it a chance to call wait() and terminate. */
4564 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4565 if (r < 0) {
ec16945e 4566 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
4567 goto finish;
4568 }
4569
e866af3a
DH
4570 r = sigaction(SIGCHLD, &sa, NULL);
4571 if (r < 0) {
ec16945e 4572 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
4573 goto finish;
4574 }
4575
03cfe0d5 4576 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
4577 if (pid < 0) {
4578 if (errno == EINVAL)
ec16945e 4579 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 4580 else
ec16945e 4581 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 4582
d87be9b0
LP
4583 goto finish;
4584 }
a258bf26 4585
d87be9b0 4586 if (pid == 0) {
03cfe0d5 4587 /* The outer child only has a file system namespace. */
a2da110b
DH
4588 barrier_set_role(&barrier, BARRIER_CHILD);
4589
03e334a1 4590 master = safe_close(master);
a258bf26 4591
03e334a1 4592 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 4593 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 4594 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 4595 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 4596
ce30c8dc
LP
4597 (void) reset_all_signal_handlers();
4598 (void) reset_signal_mask();
f5c1b9ee 4599
03cfe0d5
LP
4600 r = outer_child(&barrier,
4601 arg_directory,
4602 console,
4603 root_device, root_device_rw,
4604 home_device, home_device_rw,
4605 srv_device, srv_device_rw,
4606 interactive,
4607 secondary,
4608 pid_socket_pair[1],
4609 kmsg_socket_pair[1],
4610 rtnl_socket_pair[1],
825d5287 4611 uid_shift_socket_pair[1],
f757855e 4612 fds);
0cb9fbcd 4613 if (r < 0)
a2da110b 4614 _exit(EXIT_FAILURE);
d87be9b0 4615
03cfe0d5 4616 _exit(EXIT_SUCCESS);
da5b3bad 4617 }
88213476 4618
a2da110b 4619 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 4620
842f3b0f
LP
4621 fdset_free(fds);
4622 fds = NULL;
4623
6d0b55c2
LP
4624 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4625 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 4626 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
6d0b55c2 4627
03cfe0d5
LP
4628 /* Wait for the outer child. */
4629 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4630 if (r < 0)
4631 goto finish;
4632 if (r != 0) {
4633 r = -EIO;
4634 goto finish;
4635 }
4636 pid = 0;
6dac160c 4637
03cfe0d5
LP
4638 /* And now retrieve the PID of the inner child. */
4639 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4640 if (l < 0) {
4641 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4642 goto finish;
4643 }
4644 if (l != sizeof(pid)) {
4645 log_error("Short read while reading inner child PID: %m");
4646 r = EIO;
4647 goto finish;
4648 }
354bfd2b 4649
03cfe0d5 4650 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 4651
03cfe0d5
LP
4652 if (arg_userns) {
4653 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4654 log_error("Child died too early.");
4655 r = -ESRCH;
840295fc 4656 goto finish;
03cfe0d5 4657 }
ab046dde 4658
825d5287
RM
4659 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4660 if (l < 0) {
4661 r = log_error_errno(errno, "Failed to read UID shift: %m");
4662 goto finish;
4663 }
4664 if (l != sizeof(arg_uid_shift)) {
4665 log_error("Short read while reading UID shift: %m");
4666 r = EIO;
4667 goto finish;
4668 }
4669
03cfe0d5 4670 r = setup_uid_map(pid);
840295fc
LP
4671 if (r < 0)
4672 goto finish;
ab046dde 4673
03cfe0d5
LP
4674 (void) barrier_place(&barrier); /* #2 */
4675 }
c74e630d 4676
03cfe0d5
LP
4677 r = move_network_interfaces(pid);
4678 if (r < 0)
4679 goto finish;
4bbfe7ad 4680
03cfe0d5
LP
4681 r = setup_veth(pid, veth_name, &ifi);
4682 if (r < 0)
4683 goto finish;
5aa4bb6b 4684
03cfe0d5
LP
4685 r = setup_bridge(veth_name, &ifi);
4686 if (r < 0)
4687 goto finish;
6dac160c 4688
03cfe0d5
LP
4689 r = setup_macvlan(pid);
4690 if (r < 0)
4691 goto finish;
6dac160c 4692
03cfe0d5
LP
4693 r = setup_ipvlan(pid);
4694 if (r < 0)
4695 goto finish;
6dac160c 4696
03cfe0d5
LP
4697 r = register_machine(pid, ifi);
4698 if (r < 0)
4699 goto finish;
6dac160c 4700
efdb0237
LP
4701 r = sync_cgroup(pid);
4702 if (r < 0)
4703 goto finish;
4704
4705 r = create_subcgroup(pid);
4706 if (r < 0)
4707 goto finish;
4708
03cfe0d5
LP
4709 r = chown_cgroup(pid);
4710 if (r < 0)
4711 goto finish;
6dac160c 4712
03cfe0d5
LP
4713 /* Notify the child that the parent is ready with all
4714 * its setup (including cgroup-ification), and that
4715 * the child can now hand over control to the code to
4716 * run inside the container. */
4717 (void) barrier_place(&barrier); /* #3 */
6dac160c 4718
03cfe0d5
LP
4719 /* Block SIGCHLD here, before notifying child.
4720 * process_pty() will handle it with the other signals. */
4721 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 4722
03cfe0d5
LP
4723 /* Reset signal to default */
4724 r = default_signals(SIGCHLD, -1);
4725 if (r < 0) {
4726 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4727 goto finish;
4728 }
e866af3a 4729
03cfe0d5
LP
4730 /* Let the child know that we are ready and wait that the child is completely ready now. */
4731 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4732 log_error("Client died too early.");
4733 r = -ESRCH;
4734 goto finish;
4735 }
b12afc8c 4736
03cfe0d5
LP
4737 sd_notifyf(false,
4738 "READY=1\n"
4739 "STATUS=Container running.\n"
4740 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 4741
03cfe0d5
LP
4742 r = sd_event_new(&event);
4743 if (r < 0) {
4744 log_error_errno(r, "Failed to get default event source: %m");
4745 goto finish;
4746 }
88213476 4747
03cfe0d5
LP
4748 if (arg_kill_signal > 0) {
4749 /* Try to kill the init system on SIGINT or SIGTERM */
4750 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4751 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4752 } else {
4753 /* Immediately exit */
4754 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4755 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4756 }
023fb90b 4757
03cfe0d5
LP
4758 /* simply exit on sigchld */
4759 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 4760
03cfe0d5
LP
4761 if (arg_expose_ports) {
4762 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4763 if (r < 0)
4764 goto finish;
023fb90b 4765
03cfe0d5
LP
4766 (void) expose_ports(rtnl, &exposed);
4767 }
023fb90b 4768
03cfe0d5 4769 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 4770
03cfe0d5
LP
4771 r = pty_forward_new(event, master, true, !interactive, &forward);
4772 if (r < 0) {
4773 log_error_errno(r, "Failed to create PTY forwarder: %m");
4774 goto finish;
4775 }
023fb90b 4776
03cfe0d5
LP
4777 r = sd_event_loop(event);
4778 if (r < 0) {
4779 log_error_errno(r, "Failed to run event loop: %m");
4780 goto finish;
4781 }
6d0b55c2 4782
03cfe0d5 4783 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 4784
03cfe0d5 4785 forward = pty_forward_free(forward);
6d0b55c2 4786
03cfe0d5
LP
4787 if (!arg_quiet && last_char != '\n')
4788 putc('\n', stdout);
04d39279 4789
03cfe0d5
LP
4790 /* Kill if it is not dead yet anyway */
4791 terminate_machine(pid);
1f0cd86b 4792
840295fc 4793 /* Normally redundant, but better safe than sorry */
04d39279 4794 kill(pid, SIGKILL);
a258bf26 4795
113cea80 4796 r = wait_for_container(pid, &container_status);
04d39279
LP
4797 pid = 0;
4798
ec16945e 4799 if (r < 0)
ce9f1527
LP
4800 /* We failed to wait for the container, or the
4801 * container exited abnormally */
ec16945e
LP
4802 goto finish;
4803 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
4804 /* The container exited with a non-zero
4805 * status, or with zero status and no reboot
4806 * was requested. */
ec16945e 4807 ret = r;
d87be9b0 4808 break;
ec16945e 4809 }
88213476 4810
113cea80 4811 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
4812
4813 if (arg_keep_unit) {
4814 /* Special handling if we are running as a
4815 * service: instead of simply restarting the
4816 * machine we want to restart the entire
4817 * service, so let's inform systemd about this
4818 * with the special exit code 133. The service
4819 * file uses RestartForceExitStatus=133 so
4820 * that this results in a full nspawn
4821 * restart. This is necessary since we might
4822 * have cgroup parameters set we want to have
4823 * flushed out. */
ec16945e
LP
4824 ret = 133;
4825 r = 0;
ce38dbc8
LP
4826 break;
4827 }
6d0b55c2
LP
4828
4829 flush_ports(&exposed);
d87be9b0 4830 }
88213476
LP
4831
4832finish:
af4ec430
LP
4833 sd_notify(false,
4834 "STOPPING=1\n"
4835 "STATUS=Terminating...");
4836
9444b1f2
LP
4837 if (pid > 0)
4838 kill(pid, SIGKILL);
88213476 4839
503546da
LP
4840 /* Try to flush whatever is still queued in the pty */
4841 if (master >= 0)
4842 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
4843
03cfe0d5
LP
4844 loop_remove(loop_nr, &image_fd);
4845
ec16945e
LP
4846 if (remove_subvol && arg_directory) {
4847 int k;
4848
d9e2daaf 4849 k = btrfs_subvol_remove(arg_directory, true);
ec16945e
LP
4850 if (k < 0)
4851 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4852 }
4853
785890ac
LP
4854 if (arg_machine) {
4855 const char *p;
4856
63c372cb 4857 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 4858 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
4859 }
4860
f757855e
LP
4861 flush_ports(&exposed);
4862
04d391da 4863 free(arg_directory);
ec16945e
LP
4864 free(arg_template);
4865 free(arg_image);
7027ff61 4866 free(arg_machine);
c74e630d
LP
4867 free(arg_user);
4868 strv_free(arg_setenv);
f757855e 4869 free(arg_network_bridge);
c74e630d
LP
4870 strv_free(arg_network_interfaces);
4871 strv_free(arg_network_macvlan);
4bbfe7ad 4872 strv_free(arg_network_ipvlan);
f757855e
LP
4873 strv_free(arg_parameters);
4874 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
4875 expose_port_free_all(arg_expose_ports);
6d0b55c2 4876
ec16945e 4877 return r < 0 ? EXIT_FAILURE : ret;
88213476 4878}