]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
nspawn: add new .nspawn files for container settings
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
88213476 26#include <sys/mount.h>
88213476
LP
27#include <stdlib.h>
28#include <string.h>
29#include <stdio.h>
30#include <errno.h>
31#include <sys/prctl.h>
88213476 32#include <getopt.h>
687d0825 33#include <grp.h>
5ed27dbd 34#include <linux/fs.h>
9537eab0 35#include <sys/socket.h>
aea38d80 36#include <linux/netlink.h>
aa28aefe 37#include <net/if.h>
69c79d3c 38#include <linux/veth.h>
6afc95b7 39#include <sys/personality.h>
1b9e5b12 40#include <linux/loop.h>
2fbe4296 41#include <sys/file.h>
aa28aefe 42
5d63309c 43#ifdef HAVE_SELINUX
a8828ed9
DW
44#include <selinux/selinux.h>
45#endif
88213476 46
24fb1112
LP
47#ifdef HAVE_SECCOMP
48#include <seccomp.h>
49#endif
50
1b9e5b12
LP
51#ifdef HAVE_BLKID
52#include <blkid/blkid.h>
53#endif
54
1f0cd86b
LP
55#include "sd-daemon.h"
56#include "sd-bus.h"
57#include "sd-id128.h"
1c4baffc 58#include "sd-netlink.h"
958b66ea 59#include "random-util.h"
88213476
LP
60#include "log.h"
61#include "util.h"
49e942b2 62#include "mkdir.h"
c6878637 63#include "rm-rf.h"
6b2d0e85 64#include "macro.h"
94d82985 65#include "missing.h"
04d391da 66#include "cgroup-util.h"
a258bf26 67#include "strv.h"
9eb977db 68#include "path-util.h"
a41fe3a2 69#include "loopback-setup.h"
4fc9982c 70#include "dev-setup.h"
842f3b0f 71#include "fdset.h"
acbeb427 72#include "build.h"
a5c32cff 73#include "fileio.h"
40ca29a1 74#include "bus-util.h"
1f0cd86b 75#include "bus-error.h"
4ba93280 76#include "ptyfwd.h"
f4889f65 77#include "env-util.h"
1c4baffc 78#include "netlink-util.h"
7e227024 79#include "udev-util.h"
1b9e5b12
LP
80#include "blkid-util.h"
81#include "gpt.h"
01dde061 82#include "siphash24.h"
849958d1 83#include "copy.h"
3577de7a 84#include "base-filesystem.h"
a2da110b 85#include "barrier.h"
023fb90b 86#include "event-util.h"
f01ae826 87#include "capability.h"
2822da4f 88#include "cap-list.h"
ec16945e 89#include "btrfs-util.h"
1b9cebf6 90#include "machine-image.h"
6d0b55c2
LP
91#include "list.h"
92#include "in-addr-util.h"
12c2884c 93#include "firewall-util.h"
6d0b55c2 94#include "local-addresses.h"
6482f626 95#include "formats-util.h"
0b452006 96#include "process-util.h"
288a74cc 97#include "terminal-util.h"
958b66ea 98#include "hostname-util.h"
24882e06 99#include "signal-util.h"
f2d88580 100
e9642be2
LP
101#ifdef HAVE_SECCOMP
102#include "seccomp-util.h"
103#endif
104
f757855e
LP
105#include "nspawn.h"
106#include "nspawn-settings.h"
6d0b55c2 107
113cea80
DH
/* How a container run ended. NOTE(review): meaning inferred from the
 * enumerator names only; the code consuming this enum is not part of
 * this excerpt. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
112
57fb9fb5
LP
/* Journal linking mode as selected by --link-journal= / -j. The "try-"
 * variants accepted on the command line map to LINK_GUEST/LINK_HOST and
 * are tracked separately in arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,
        LINK_AUTO,
        LINK_HOST,
        LINK_GUEST
} LinkJournal;
88213476
LP
119
120static char *arg_directory = NULL;
ec16945e 121static char *arg_template = NULL;
687d0825 122static char *arg_user = NULL;
9444b1f2 123static sd_id128_t arg_uuid = {};
7027ff61 124static char *arg_machine = NULL;
c74e630d
LP
125static const char *arg_selinux_context = NULL;
126static const char *arg_selinux_apifs_context = NULL;
9444b1f2 127static const char *arg_slice = NULL;
ff01d048 128static bool arg_private_network = false;
bc2f673e 129static bool arg_read_only = false;
0f0dbc46 130static bool arg_boot = false;
ec16945e 131static bool arg_ephemeral = false;
57fb9fb5 132static LinkJournal arg_link_journal = LINK_AUTO;
574edc90 133static bool arg_link_journal_try = false;
5076f0cc
LP
134static uint64_t arg_retain =
135 (1ULL << CAP_CHOWN) |
136 (1ULL << CAP_DAC_OVERRIDE) |
137 (1ULL << CAP_DAC_READ_SEARCH) |
138 (1ULL << CAP_FOWNER) |
139 (1ULL << CAP_FSETID) |
140 (1ULL << CAP_IPC_OWNER) |
141 (1ULL << CAP_KILL) |
142 (1ULL << CAP_LEASE) |
143 (1ULL << CAP_LINUX_IMMUTABLE) |
144 (1ULL << CAP_NET_BIND_SERVICE) |
145 (1ULL << CAP_NET_BROADCAST) |
146 (1ULL << CAP_NET_RAW) |
147 (1ULL << CAP_SETGID) |
148 (1ULL << CAP_SETFCAP) |
149 (1ULL << CAP_SETPCAP) |
150 (1ULL << CAP_SETUID) |
151 (1ULL << CAP_SYS_ADMIN) |
152 (1ULL << CAP_SYS_CHROOT) |
153 (1ULL << CAP_SYS_NICE) |
154 (1ULL << CAP_SYS_PTRACE) |
155 (1ULL << CAP_SYS_TTY_CONFIG) |
d87be9b0 156 (1ULL << CAP_SYS_RESOURCE) |
88d04e31
LP
157 (1ULL << CAP_SYS_BOOT) |
158 (1ULL << CAP_AUDIT_WRITE) |
7f112f50
LP
159 (1ULL << CAP_AUDIT_CONTROL) |
160 (1ULL << CAP_MKNOD);
5a8af538
LP
161static CustomMount *arg_custom_mounts = NULL;
162static unsigned arg_n_custom_mounts = 0;
f4889f65 163static char **arg_setenv = NULL;
284c0b91 164static bool arg_quiet = false;
8a96d94e 165static bool arg_share_system = false;
eb91eb18 166static bool arg_register = true;
89f7c846 167static bool arg_keep_unit = false;
aa28aefe 168static char **arg_network_interfaces = NULL;
c74e630d 169static char **arg_network_macvlan = NULL;
4bbfe7ad 170static char **arg_network_ipvlan = NULL;
69c79d3c 171static bool arg_network_veth = false;
f757855e 172static char *arg_network_bridge = NULL;
050f7277 173static unsigned long arg_personality = PERSONALITY_INVALID;
ec16945e 174static char *arg_image = NULL;
f757855e 175static VolatileMode arg_volatile_mode = VOLATILE_NO;
6d0b55c2 176static ExposePort *arg_expose_ports = NULL;
f36933fe 177static char **arg_property = NULL;
6dac160c
LP
178static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
179static bool arg_userns = false;
c6c8f6e2 180static int arg_kill_signal = 0;
efdb0237 181static bool arg_unified_cgroup_hierarchy = false;
f757855e
LP
182static SettingsMask arg_settings_mask = 0;
183static int arg_settings_trusted = -1;
184static char **arg_parameters = NULL;
88213476 185
601185b4 186static void help(void) {
88213476
LP
187 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
188 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
a8828ed9
DW
189 " -h --help Show this help\n"
190 " --version Print version string\n"
69c79d3c 191 " -q --quiet Do not show status information\n"
1b9e5b12 192 " -D --directory=PATH Root directory for the container\n"
ec16945e
LP
193 " --template=PATH Initialize root directory from template directory,\n"
194 " if missing\n"
195 " -x --ephemeral Run container with snapshot of root directory, and\n"
196 " remove it after exit\n"
197 " -i --image=PATH File system device or disk image for the container\n"
a8828ed9
DW
198 " -b --boot Boot up full system (i.e. invoke init)\n"
199 " -u --user=USER Run the command under specified user or uid\n"
a8828ed9 200 " -M --machine=NAME Set the machine name for the container\n"
69c79d3c 201 " --uuid=UUID Set a specific machine UUID for the container\n"
a8828ed9 202 " -S --slice=SLICE Place the container in the specified slice\n"
f36933fe 203 " --property=NAME=VALUE Set scope unit property\n"
03cfe0d5
LP
204 " --private-users[=UIDBASE[:NUIDS]]\n"
205 " Run within user namespace\n"
69c79d3c
LP
206 " --private-network Disable network in container\n"
207 " --network-interface=INTERFACE\n"
208 " Assign an existing network interface to the\n"
209 " container\n"
c74e630d
LP
210 " --network-macvlan=INTERFACE\n"
211 " Create a macvlan network interface based on an\n"
212 " existing network interface to the container\n"
4bbfe7ad
TG
213 " --network-ipvlan=INTERFACE\n"
214 " Create a ipvlan network interface based on an\n"
215 " existing network interface to the container\n"
0dfaa006 216 " -n --network-veth Add a virtual ethernet connection between host\n"
69c79d3c 217 " and container\n"
ab046dde 218 " --network-bridge=INTERFACE\n"
32457153 219 " Add a virtual ethernet connection between host\n"
ab046dde
TG
220 " and container and add it to an existing bridge on\n"
221 " the host\n"
6d0b55c2 222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
ab5e3a1b 223 " Expose a container IP port on the host\n"
82adf6af
LP
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
a8828ed9
DW
230 " --capability=CAP In addition to the default, retain specified\n"
231 " capability\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
c6c8f6e2 233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
574edc90
MP
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
69c79d3c 237 " --read-only Mount the root directory read-only\n"
5e5bfa6e
EY
238 " --bind=PATH[:PATH[:OPTIONS]]\n"
239 " Bind mount a file or directory from the host into\n"
a8828ed9 240 " the container\n"
5e5bfa6e
EY
241 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
242 " Similar, but creates a read-only bind mount\n"
06c17c39 243 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
5a8af538
LP
244 " --overlay=PATH[:PATH...]:PATH\n"
245 " Create an overlay mount from the host to \n"
246 " the container\n"
247 " --overlay-ro=PATH[:PATH...]:PATH\n"
248 " Similar, but creates a read-only overlay mount\n"
284c0b91 249 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
69c79d3c 250 " --share-system Share system namespaces with host\n"
eb91eb18 251 " --register=BOOLEAN Register container as machine\n"
89f7c846 252 " --keep-unit Do not register a scope for the machine, reuse\n"
4d9f07b4 253 " the service unit nspawn is running in\n"
6d0b55c2 254 " --volatile[=MODE] Run the system in volatile mode\n"
f757855e 255 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
6d0b55c2 256 , program_invocation_short_name);
88213476
LP
257}
258
f757855e 259static CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
5a8af538
LP
260 CustomMount *c, *ret;
261
f757855e
LP
262 assert(l);
263 assert(n);
264 assert(t >= 0);
265 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
266
267 c = realloc(*l, (*n + 1) * sizeof(CustomMount));
5a8af538
LP
268 if (!c)
269 return NULL;
270
f757855e
LP
271 *l = c;
272 ret = *l + *n;
273 (*n)++;
5a8af538
LP
274
275 *ret = (CustomMount) { .type = t };
276
277 return ret;
278}
279
f757855e 280void custom_mount_free_all(CustomMount *l, unsigned n) {
5a8af538
LP
281 unsigned i;
282
f757855e
LP
283 for (i = 0; i < n; i++) {
284 CustomMount *m = l + i;
5a8af538
LP
285
286 free(m->source);
287 free(m->destination);
288 free(m->options);
289
290 if (m->work_dir) {
291 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
292 free(m->work_dir);
293 }
294
295 strv_free(m->lower);
296 }
297
f757855e 298 free(l);
5a8af538
LP
299}
300
301static int custom_mount_compare(const void *a, const void *b) {
302 const CustomMount *x = a, *y = b;
303 int r;
304
305 r = path_compare(x->destination, y->destination);
306 if (r != 0)
307 return r;
308
309 if (x->type < y->type)
310 return -1;
311 if (x->type > y->type)
312 return 1;
313
314 return 0;
315}
316
317static int custom_mounts_prepare(void) {
318 unsigned i;
319 int r;
320
321 /* Ensure the mounts are applied prefix first. */
322 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
323
324 /* Allocate working directories for the overlay file systems that need it */
325 for (i = 0; i < arg_n_custom_mounts; i++) {
326 CustomMount *m = &arg_custom_mounts[i];
327
825d5287
RM
328 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
329 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
330 return -EINVAL;
331 }
332
5a8af538
LP
333 if (m->type != CUSTOM_MOUNT_OVERLAY)
334 continue;
335
336 if (m->work_dir)
337 continue;
338
339 if (m->read_only)
340 continue;
341
14bcf25c 342 r = tempfn_random(m->source, NULL, &m->work_dir);
5a8af538
LP
343 if (r < 0)
344 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
345 }
346
347 return 0;
348}
349
ec16945e
LP
/* Replaces the string *b with a cleaned-up version of path.
 *
 * The path is canonicalized with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), it is merely made absolute
 * relative to the current working directory instead. The result is passed
 * through path_kill_slashes() and stored in *b; the previous *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason
 * other than ENOENT, or -ENOMEM. On failure *b is left unchanged. */
static int set_sanitized_path(char **b, const char *path) {
        char *p;

        assert(b);
        assert(path);

        p = canonicalize_file_name(path);
        if (!p) {
                if (errno != ENOENT)
                        return -errno;

                /* Not existing (yet) is fine, fall back to a purely textual absolute path */
                p = path_make_absolute_cwd(path);
                if (!p)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(p);
        return 0;
}
370
efdb0237
LP
371static int detect_unified_cgroup_hierarchy(void) {
372 const char *e;
373 int r;
374
375 /* Allow the user to control whether the unified hierarchy is used */
376 e = getenv("UNIFIED_CGROUP_HIERARCHY");
377 if (e) {
378 r = parse_boolean(e);
379 if (r < 0)
380 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
381
382 arg_unified_cgroup_hierarchy = r;
383 return 0;
384 }
385
386 /* Otherwise inherit the default from the host system */
387 r = cg_unified();
388 if (r < 0)
389 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
390
391 arg_unified_cgroup_hierarchy = r;
392 return 0;
393}
394
f757855e
LP
395VolatileMode volatile_mode_from_string(const char *s) {
396 int b;
397
398 if (isempty(s))
399 return _VOLATILE_MODE_INVALID;
400
401 b = parse_boolean(s);
402 if (b > 0)
403 return VOLATILE_YES;
404 if (b == 0)
405 return VOLATILE_NO;
406
407 if (streq(s, "state"))
408 return VOLATILE_STATE;
409
410 return _VOLATILE_MODE_INVALID;
411}
412
413int expose_port_parse(ExposePort **l, const char *s) {
414
415 const char *split, *e;
416 uint16_t container_port, host_port;
417 int protocol;
418 ExposePort *p;
419 int r;
420
421 if ((e = startswith(s, "tcp:")))
422 protocol = IPPROTO_TCP;
423 else if ((e = startswith(s, "udp:")))
424 protocol = IPPROTO_UDP;
425 else {
426 e = s;
427 protocol = IPPROTO_TCP;
428 }
429
430 split = strchr(e, ':');
431 if (split) {
432 char v[split - e + 1];
433
434 memcpy(v, e, split - e);
435 v[split - e] = 0;
436
437 r = safe_atou16(v, &host_port);
438 if (r < 0 || host_port <= 0)
439 return -EINVAL;
440
441 r = safe_atou16(split + 1, &container_port);
442 } else {
443 r = safe_atou16(e, &container_port);
444 host_port = container_port;
445 }
446
447 if (r < 0 || container_port <= 0)
448 return -EINVAL;
449
450 LIST_FOREACH(ports, p, arg_expose_ports)
451 if (p->protocol == protocol && p->host_port == host_port)
452 return -EEXIST;
453
454 p = new(ExposePort, 1);
455 if (!p)
456 return -ENOMEM;
457
458 p->protocol = protocol;
459 p->host_port = host_port;
460 p->container_port = container_port;
461
462 LIST_PREPEND(ports, *l, p);
463
464 return 0;
465}
466
467int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
468 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
469 const char *p = s;
470 CustomMount *m;
471 int r;
472
473 assert(l);
474 assert(n);
475
476 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
477 if (r < 0)
478 return r;
479 if (r == 0)
480 return -EINVAL;
481
482 if (r == 1) {
483 destination = strdup(source);
484 if (!destination)
485 return -ENOMEM;
486 }
487
488 if (r == 2 && !isempty(p)) {
489 opts = strdup(p);
490 if (!opts)
491 return -ENOMEM;
492 }
493
494 if (!path_is_absolute(source))
495 return -EINVAL;
496
497 if (!path_is_absolute(destination))
498 return -EINVAL;
499
500 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
501 if (!m)
502 return log_oom();
503
504 m->source = source;
505 m->destination = destination;
506 m->read_only = read_only;
507 m->options = opts;
508
509 source = destination = opts = NULL;
510 return 0;
511}
512
513int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
514 _cleanup_free_ char *path = NULL, *opts = NULL;
515 const char *p = s;
516 CustomMount *m;
517 int r;
518
519 assert(l);
520 assert(n);
521 assert(s);
522
523 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
524 if (r < 0)
525 return r;
526 if (r == 0)
527 return -EINVAL;
528
529 if (isempty(p))
530 opts = strdup("mode=0755");
531 else
532 opts = strdup(p);
533 if (!opts)
534 return -ENOMEM;
535
536 if (!path_is_absolute(path))
537 return -EINVAL;
538
539 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
540 if (!m)
541 return -ENOMEM;
542
543 m->destination = path;
544 m->options = opts;
545
546 path = opts = NULL;
547 return 0;
548}
549
88213476
LP
550static int parse_argv(int argc, char *argv[]) {
551
a41fe3a2 552 enum {
acbeb427
ZJS
553 ARG_VERSION = 0x100,
554 ARG_PRIVATE_NETWORK,
bc2f673e 555 ARG_UUID,
5076f0cc 556 ARG_READ_ONLY,
57fb9fb5 557 ARG_CAPABILITY,
420c7379 558 ARG_DROP_CAPABILITY,
17fe0523
LP
559 ARG_LINK_JOURNAL,
560 ARG_BIND,
f4889f65 561 ARG_BIND_RO,
06c17c39 562 ARG_TMPFS,
5a8af538
LP
563 ARG_OVERLAY,
564 ARG_OVERLAY_RO,
f4889f65 565 ARG_SETENV,
eb91eb18 566 ARG_SHARE_SYSTEM,
89f7c846 567 ARG_REGISTER,
aa28aefe 568 ARG_KEEP_UNIT,
69c79d3c 569 ARG_NETWORK_INTERFACE,
c74e630d 570 ARG_NETWORK_MACVLAN,
4bbfe7ad 571 ARG_NETWORK_IPVLAN,
ab046dde 572 ARG_NETWORK_BRIDGE,
6afc95b7 573 ARG_PERSONALITY,
4d9f07b4 574 ARG_VOLATILE,
ec16945e 575 ARG_TEMPLATE,
f36933fe 576 ARG_PROPERTY,
6dac160c 577 ARG_PRIVATE_USERS,
c6c8f6e2 578 ARG_KILL_SIGNAL,
f757855e 579 ARG_SETTINGS,
a41fe3a2
LP
580 };
581
88213476 582 static const struct option options[] = {
aa28aefe
LP
583 { "help", no_argument, NULL, 'h' },
584 { "version", no_argument, NULL, ARG_VERSION },
585 { "directory", required_argument, NULL, 'D' },
ec16945e
LP
586 { "template", required_argument, NULL, ARG_TEMPLATE },
587 { "ephemeral", no_argument, NULL, 'x' },
aa28aefe
LP
588 { "user", required_argument, NULL, 'u' },
589 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
590 { "boot", no_argument, NULL, 'b' },
591 { "uuid", required_argument, NULL, ARG_UUID },
592 { "read-only", no_argument, NULL, ARG_READ_ONLY },
593 { "capability", required_argument, NULL, ARG_CAPABILITY },
594 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
595 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
596 { "bind", required_argument, NULL, ARG_BIND },
597 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
06c17c39 598 { "tmpfs", required_argument, NULL, ARG_TMPFS },
5a8af538
LP
599 { "overlay", required_argument, NULL, ARG_OVERLAY },
600 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
aa28aefe
LP
601 { "machine", required_argument, NULL, 'M' },
602 { "slice", required_argument, NULL, 'S' },
603 { "setenv", required_argument, NULL, ARG_SETENV },
604 { "selinux-context", required_argument, NULL, 'Z' },
605 { "selinux-apifs-context", required_argument, NULL, 'L' },
606 { "quiet", no_argument, NULL, 'q' },
607 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
608 { "register", required_argument, NULL, ARG_REGISTER },
609 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
610 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
c74e630d 611 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
4bbfe7ad 612 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
0dfaa006 613 { "network-veth", no_argument, NULL, 'n' },
ab046dde 614 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
6afc95b7 615 { "personality", required_argument, NULL, ARG_PERSONALITY },
1b9e5b12 616 { "image", required_argument, NULL, 'i' },
4d9f07b4 617 { "volatile", optional_argument, NULL, ARG_VOLATILE },
6d0b55c2 618 { "port", required_argument, NULL, 'p' },
f36933fe 619 { "property", required_argument, NULL, ARG_PROPERTY },
6dac160c 620 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
c6c8f6e2 621 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
f757855e 622 { "settings", required_argument, NULL, ARG_SETTINGS },
eb9da376 623 {}
88213476
LP
624 };
625
9444b1f2 626 int c, r;
a42c8b54 627 uint64_t plus = 0, minus = 0;
f757855e 628 bool mask_all_settings = false, mask_no_settings = false;
88213476
LP
629
630 assert(argc >= 0);
631 assert(argv);
632
0dfaa006 633 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
88213476
LP
634
635 switch (c) {
636
637 case 'h':
601185b4
ZJS
638 help();
639 return 0;
88213476 640
acbeb427
ZJS
641 case ARG_VERSION:
642 puts(PACKAGE_STRING);
643 puts(SYSTEMD_FEATURES);
644 return 0;
645
88213476 646 case 'D':
ec16945e
LP
647 r = set_sanitized_path(&arg_directory, optarg);
648 if (r < 0)
649 return log_error_errno(r, "Invalid root directory: %m");
650
651 break;
652
653 case ARG_TEMPLATE:
654 r = set_sanitized_path(&arg_template, optarg);
655 if (r < 0)
656 return log_error_errno(r, "Invalid template directory: %m");
88213476
LP
657
658 break;
659
1b9e5b12 660 case 'i':
ec16945e
LP
661 r = set_sanitized_path(&arg_image, optarg);
662 if (r < 0)
663 return log_error_errno(r, "Invalid image path: %m");
664
665 break;
666
667 case 'x':
668 arg_ephemeral = true;
1b9e5b12
LP
669 break;
670
687d0825 671 case 'u':
2fc09a9c
DM
672 r = free_and_strdup(&arg_user, optarg);
673 if (r < 0)
7027ff61 674 return log_oom();
687d0825 675
f757855e 676 arg_settings_mask |= SETTING_USER;
687d0825
MV
677 break;
678
ab046dde 679 case ARG_NETWORK_BRIDGE:
f757855e
LP
680 r = free_and_strdup(&arg_network_bridge, optarg);
681 if (r < 0)
682 return log_oom();
ab046dde
TG
683
684 /* fall through */
685
0dfaa006 686 case 'n':
69c79d3c
LP
687 arg_network_veth = true;
688 arg_private_network = true;
f757855e 689 arg_settings_mask |= SETTING_NETWORK;
69c79d3c
LP
690 break;
691
aa28aefe 692 case ARG_NETWORK_INTERFACE:
c74e630d
LP
693 if (strv_extend(&arg_network_interfaces, optarg) < 0)
694 return log_oom();
695
696 arg_private_network = true;
f757855e 697 arg_settings_mask |= SETTING_NETWORK;
c74e630d
LP
698 break;
699
700 case ARG_NETWORK_MACVLAN:
701 if (strv_extend(&arg_network_macvlan, optarg) < 0)
aa28aefe
LP
702 return log_oom();
703
4bbfe7ad 704 arg_private_network = true;
f757855e 705 arg_settings_mask |= SETTING_NETWORK;
4bbfe7ad
TG
706 break;
707
708 case ARG_NETWORK_IPVLAN:
709 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
710 return log_oom();
711
aa28aefe
LP
712 /* fall through */
713
ff01d048
LP
714 case ARG_PRIVATE_NETWORK:
715 arg_private_network = true;
f757855e 716 arg_settings_mask |= SETTING_NETWORK;
a41fe3a2
LP
717 break;
718
0f0dbc46
LP
719 case 'b':
720 arg_boot = true;
f757855e 721 arg_settings_mask |= SETTING_BOOT;
0f0dbc46
LP
722 break;
723
144f0fc0 724 case ARG_UUID:
9444b1f2
LP
725 r = sd_id128_from_string(optarg, &arg_uuid);
726 if (r < 0) {
aa96c6cb 727 log_error("Invalid UUID: %s", optarg);
9444b1f2 728 return r;
aa96c6cb 729 }
f757855e
LP
730
731 arg_settings_mask |= SETTING_MACHINE_ID;
9444b1f2 732 break;
aa96c6cb 733
9444b1f2 734 case 'S':
c74e630d 735 arg_slice = optarg;
144f0fc0
LP
736 break;
737
7027ff61 738 case 'M':
c1521918 739 if (isempty(optarg))
97b11eed 740 arg_machine = mfree(arg_machine);
c1521918 741 else {
0c3c4284 742 if (!machine_name_is_valid(optarg)) {
eb91eb18
LP
743 log_error("Invalid machine name: %s", optarg);
744 return -EINVAL;
745 }
7027ff61 746
0c3c4284
LP
747 r = free_and_strdup(&arg_machine, optarg);
748 if (r < 0)
eb91eb18
LP
749 return log_oom();
750
751 break;
752 }
7027ff61 753
82adf6af
LP
754 case 'Z':
755 arg_selinux_context = optarg;
a8828ed9
DW
756 break;
757
82adf6af
LP
758 case 'L':
759 arg_selinux_apifs_context = optarg;
a8828ed9
DW
760 break;
761
bc2f673e
LP
762 case ARG_READ_ONLY:
763 arg_read_only = true;
f757855e 764 arg_settings_mask |= SETTING_READ_ONLY;
bc2f673e
LP
765 break;
766
420c7379
LP
767 case ARG_CAPABILITY:
768 case ARG_DROP_CAPABILITY: {
a2a5291b 769 const char *state, *word;
5076f0cc
LP
770 size_t length;
771
772 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
39ed67d1 773 _cleanup_free_ char *t;
5076f0cc
LP
774
775 t = strndup(word, length);
0d0f0c50
SL
776 if (!t)
777 return log_oom();
5076f0cc 778
39ed67d1
LP
779 if (streq(t, "all")) {
780 if (c == ARG_CAPABILITY)
a42c8b54 781 plus = (uint64_t) -1;
39ed67d1 782 else
a42c8b54 783 minus = (uint64_t) -1;
39ed67d1 784 } else {
2822da4f
LP
785 int cap;
786
787 cap = capability_from_name(t);
788 if (cap < 0) {
39ed67d1
LP
789 log_error("Failed to parse capability %s.", t);
790 return -EINVAL;
791 }
792
793 if (c == ARG_CAPABILITY)
a42c8b54 794 plus |= 1ULL << (uint64_t) cap;
39ed67d1 795 else
a42c8b54 796 minus |= 1ULL << (uint64_t) cap;
5076f0cc 797 }
5076f0cc
LP
798 }
799
f757855e 800 arg_settings_mask |= SETTING_CAPABILITY;
5076f0cc
LP
801 break;
802 }
803
57fb9fb5
LP
804 case 'j':
805 arg_link_journal = LINK_GUEST;
574edc90 806 arg_link_journal_try = true;
57fb9fb5
LP
807 break;
808
809 case ARG_LINK_JOURNAL:
53e438e3 810 if (streq(optarg, "auto")) {
57fb9fb5 811 arg_link_journal = LINK_AUTO;
53e438e3
LP
812 arg_link_journal_try = false;
813 } else if (streq(optarg, "no")) {
57fb9fb5 814 arg_link_journal = LINK_NO;
53e438e3
LP
815 arg_link_journal_try = false;
816 } else if (streq(optarg, "guest")) {
57fb9fb5 817 arg_link_journal = LINK_GUEST;
53e438e3
LP
818 arg_link_journal_try = false;
819 } else if (streq(optarg, "host")) {
57fb9fb5 820 arg_link_journal = LINK_HOST;
53e438e3
LP
821 arg_link_journal_try = false;
822 } else if (streq(optarg, "try-guest")) {
574edc90
MP
823 arg_link_journal = LINK_GUEST;
824 arg_link_journal_try = true;
825 } else if (streq(optarg, "try-host")) {
826 arg_link_journal = LINK_HOST;
827 arg_link_journal_try = true;
828 } else {
57fb9fb5
LP
829 log_error("Failed to parse link journal mode %s", optarg);
830 return -EINVAL;
831 }
832
833 break;
834
17fe0523 835 case ARG_BIND:
f757855e
LP
836 case ARG_BIND_RO:
837 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
838 if (r < 0)
839 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
17fe0523 840
f757855e 841 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
17fe0523 842 break;
06c17c39 843
f757855e
LP
844 case ARG_TMPFS:
845 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
846 if (r < 0)
847 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
5a8af538 848
f757855e 849 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
5a8af538 850 break;
5a8af538
LP
851
852 case ARG_OVERLAY:
853 case ARG_OVERLAY_RO: {
854 _cleanup_free_ char *upper = NULL, *destination = NULL;
855 _cleanup_strv_free_ char **lower = NULL;
856 CustomMount *m;
857 unsigned n = 0;
858 char **i;
859
62f9f39a
RM
860 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
861 if (r == -ENOMEM)
06c17c39 862 return log_oom();
62f9f39a
RM
863 else if (r < 0) {
864 log_error("Invalid overlay specification: %s", optarg);
865 return r;
866 }
06c17c39 867
5a8af538
LP
868 STRV_FOREACH(i, lower) {
869 if (!path_is_absolute(*i)) {
870 log_error("Overlay path %s is not absolute.", *i);
871 return -EINVAL;
872 }
873
874 n++;
875 }
876
877 if (n < 2) {
878 log_error("--overlay= needs at least two colon-separated directories specified.");
879 return -EINVAL;
880 }
881
882 if (n == 2) {
883 /* If two parameters are specified,
884 * the first one is the lower, the
885 * second one the upper directory. And
af86c440
ZJS
886 * we'll also define the destination
887 * mount point the same as the upper. */
5a8af538
LP
888 upper = lower[1];
889 lower[1] = NULL;
890
891 destination = strdup(upper);
892 if (!destination)
893 return log_oom();
894
895 } else {
896 upper = lower[n - 2];
897 destination = lower[n - 1];
898 lower[n - 2] = NULL;
899 }
900
f757855e 901 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
5a8af538
LP
902 if (!m)
903 return log_oom();
904
905 m->destination = destination;
906 m->source = upper;
907 m->lower = lower;
908 m->read_only = c == ARG_OVERLAY_RO;
909
910 upper = destination = NULL;
911 lower = NULL;
06c17c39 912
f757855e 913 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
06c17c39
LP
914 break;
915 }
916
f4889f65
LP
917 case ARG_SETENV: {
918 char **n;
919
920 if (!env_assignment_is_valid(optarg)) {
921 log_error("Environment variable assignment '%s' is not valid.", optarg);
922 return -EINVAL;
923 }
924
925 n = strv_env_set(arg_setenv, optarg);
926 if (!n)
927 return log_oom();
928
929 strv_free(arg_setenv);
930 arg_setenv = n;
f757855e
LP
931
932 arg_settings_mask |= SETTING_ENVIRONMENT;
f4889f65
LP
933 break;
934 }
935
284c0b91
LP
936 case 'q':
937 arg_quiet = true;
938 break;
939
8a96d94e
LP
940 case ARG_SHARE_SYSTEM:
941 arg_share_system = true;
942 break;
943
eb91eb18
LP
944 case ARG_REGISTER:
945 r = parse_boolean(optarg);
946 if (r < 0) {
947 log_error("Failed to parse --register= argument: %s", optarg);
948 return r;
949 }
950
951 arg_register = r;
952 break;
953
89f7c846
LP
954 case ARG_KEEP_UNIT:
955 arg_keep_unit = true;
956 break;
957
6afc95b7
LP
958 case ARG_PERSONALITY:
959
ac45f971 960 arg_personality = personality_from_string(optarg);
050f7277 961 if (arg_personality == PERSONALITY_INVALID) {
6afc95b7
LP
962 log_error("Unknown or unsupported personality '%s'.", optarg);
963 return -EINVAL;
964 }
965
f757855e 966 arg_settings_mask |= SETTING_PERSONALITY;
6afc95b7
LP
967 break;
968
4d9f07b4
LP
969 case ARG_VOLATILE:
970
971 if (!optarg)
f757855e 972 arg_volatile_mode = VOLATILE_YES;
4d9f07b4 973 else {
f757855e 974 VolatileMode m;
4d9f07b4 975
f757855e
LP
976 m = volatile_mode_from_string(optarg);
977 if (m < 0) {
978 log_error("Failed to parse --volatile= argument: %s", optarg);
6d0b55c2 979 return -EINVAL;
f757855e
LP
980 } else
981 arg_volatile_mode = m;
6d0b55c2
LP
982 }
983
f757855e
LP
984 arg_settings_mask |= SETTING_VOLATILE_MODE;
985 break;
6d0b55c2 986
f757855e
LP
987 case 'p':
988 r = expose_port_parse(&arg_expose_ports, optarg);
989 if (r == -EEXIST)
990 return log_error_errno(r, "Duplicate port specification: %s", optarg);
991 if (r < 0)
992 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
6d0b55c2 993
f757855e 994 arg_settings_mask |= SETTING_EXPOSE_PORTS;
6d0b55c2 995 break;
6d0b55c2 996
f36933fe
LP
997 case ARG_PROPERTY:
998 if (strv_extend(&arg_property, optarg) < 0)
999 return log_oom();
1000
1001 break;
1002
6dac160c
LP
1003 case ARG_PRIVATE_USERS:
1004 if (optarg) {
1005 _cleanup_free_ char *buffer = NULL;
1006 const char *range, *shift;
1007
1008 range = strchr(optarg, ':');
1009 if (range) {
1010 buffer = strndup(optarg, range - optarg);
1011 if (!buffer)
1012 return log_oom();
1013 shift = buffer;
1014
1015 range++;
1016 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
1017 log_error("Failed to parse UID range: %s", range);
1018 return -EINVAL;
1019 }
1020 } else
1021 shift = optarg;
1022
1023 if (parse_uid(shift, &arg_uid_shift) < 0) {
1024 log_error("Failed to parse UID: %s", optarg);
1025 return -EINVAL;
1026 }
1027 }
1028
1029 arg_userns = true;
1030 break;
1031
c6c8f6e2
LP
1032 case ARG_KILL_SIGNAL:
1033 arg_kill_signal = signal_from_string_try_harder(optarg);
1034 if (arg_kill_signal < 0) {
1035 log_error("Cannot parse signal: %s", optarg);
1036 return -EINVAL;
1037 }
1038
f757855e
LP
1039 arg_settings_mask |= SETTING_KILL_SIGNAL;
1040 break;
1041
1042 case ARG_SETTINGS:
1043
1044 /* no → do not read files
1045 * yes → read files, do not override cmdline, trust only subset
1046 * override → read files, override cmdline, trust only subset
1047 * trusted → read files, do not override cmdline, trust all
1048 */
1049
1050 r = parse_boolean(optarg);
1051 if (r < 0) {
1052 if (streq(optarg, "trusted")) {
1053 mask_all_settings = false;
1054 mask_no_settings = false;
1055 arg_settings_trusted = true;
1056
1057 } else if (streq(optarg, "override")) {
1058 mask_all_settings = false;
1059 mask_no_settings = true;
1060 arg_settings_trusted = -1;
1061 } else
1062 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1063 } else if (r > 0) {
1064 /* yes */
1065 mask_all_settings = false;
1066 mask_no_settings = false;
1067 arg_settings_trusted = -1;
1068 } else {
1069 /* no */
1070 mask_all_settings = true;
1071 mask_no_settings = false;
1072 arg_settings_trusted = false;
1073 }
1074
c6c8f6e2
LP
1075 break;
1076
88213476
LP
1077 case '?':
1078 return -EINVAL;
1079
1080 default:
eb9da376 1081 assert_not_reached("Unhandled option");
88213476 1082 }
88213476 1083
eb91eb18
LP
1084 if (arg_share_system)
1085 arg_register = false;
1086
1087 if (arg_boot && arg_share_system) {
1088 log_error("--boot and --share-system may not be combined.");
1089 return -EINVAL;
1090 }
1091
89f7c846
LP
1092 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1093 log_error("--keep-unit may not be used when invoked from a user session.");
1094 return -EINVAL;
1095 }
1096
1b9e5b12
LP
1097 if (arg_directory && arg_image) {
1098 log_error("--directory= and --image= may not be combined.");
1099 return -EINVAL;
1100 }
1101
ec16945e
LP
1102 if (arg_template && arg_image) {
1103 log_error("--template= and --image= may not be combined.");
1104 return -EINVAL;
1105 }
1106
1107 if (arg_template && !(arg_directory || arg_machine)) {
1108 log_error("--template= needs --directory= or --machine=.");
1109 return -EINVAL;
1110 }
1111
1112 if (arg_ephemeral && arg_template) {
1113 log_error("--ephemeral and --template= may not be combined.");
1114 return -EINVAL;
1115 }
1116
1117 if (arg_ephemeral && arg_image) {
1118 log_error("--ephemeral and --image= may not be combined.");
1119 return -EINVAL;
1120 }
1121
df9a75e4
LP
1122 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1123 log_error("--ephemeral and --link-journal= may not be combined.");
1124 return -EINVAL;
1125 }
1126
f757855e
LP
1127 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1128 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1129
1130 if (argc > optind) {
1131 arg_parameters = strv_copy(argv + optind);
1132 if (!arg_parameters)
1133 return log_oom();
1134
1135 arg_settings_mask |= SETTING_BOOT;
1136 }
1137
1138 /* Load all settings from .nspawn files */
1139 if (mask_no_settings)
1140 arg_settings_mask = 0;
1141
1142 /* Don't load any settings from .nspawn files */
1143 if (mask_all_settings)
1144 arg_settings_mask = _SETTINGS_MASK_ALL;
1145
1146 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1147
1148 r = detect_unified_cgroup_hierarchy();
1149 if (r < 0)
1150 return r;
1151
1152 return 1;
1153}
1154
1155static int verify_arguments(void) {
1156
1157 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
4d9f07b4
LP
1158 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1159 return -EINVAL;
1160 }
1161
6d0b55c2
LP
1162 if (arg_expose_ports && !arg_private_network) {
1163 log_error("Cannot use --port= without private networking.");
1164 return -EINVAL;
1165 }
1166
c6c8f6e2
LP
1167 if (arg_boot && arg_kill_signal <= 0)
1168 arg_kill_signal = SIGRTMIN+3;
1169
f757855e 1170 return 0;
88213476
LP
1171}
1172
03cfe0d5
LP
1173static int tmpfs_patch_options(const char *options, char **ret) {
1174 char *buf = NULL;
1175
1176 if (arg_userns && arg_uid_shift != 0) {
825d5287 1177 assert(arg_uid_shift != UID_INVALID);
03cfe0d5
LP
1178
1179 if (options)
f001a835 1180 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
03cfe0d5 1181 else
f001a835 1182 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
03cfe0d5
LP
1183 if (!buf)
1184 return -ENOMEM;
1185
1186 options = buf;
1187 }
1188
1189#ifdef HAVE_SELINUX
1190 if (arg_selinux_apifs_context) {
1191 char *t;
1192
1193 if (options)
1194 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1195 else
1196 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1197 if (!t) {
1198 free(buf);
1199 return -ENOMEM;
1200 }
1201
1202 free(buf);
1203 buf = t;
1204 }
1205#endif
1206
1207 *ret = buf;
1208 return !!buf;
1209}
1210
1211static int mount_all(const char *dest, bool userns) {
88213476
LP
1212
1213 typedef struct MountPoint {
1214 const char *what;
1215 const char *where;
1216 const char *type;
1217 const char *options;
1218 unsigned long flags;
3bd66c05 1219 bool fatal;
03cfe0d5 1220 bool userns;
88213476
LP
1221 } MountPoint;
1222
1223 static const MountPoint mount_table[] = {
3c59d4f2
RM
1224 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1225 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1226 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1227 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
3c59d4f2
RM
1228 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1229 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1230 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1231 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
9b634ea5 1232#ifdef HAVE_SELINUX
3c59d4f2
RM
1233 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1234 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
9b634ea5 1235#endif
88213476
LP
1236 };
1237
1238 unsigned k;
03cfe0d5 1239 int r;
88213476
LP
1240
1241 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
d15d65a0 1242 _cleanup_free_ char *where = NULL, *options = NULL;
d002827b 1243 const char *o;
88213476 1244
03cfe0d5
LP
1245 if (userns != mount_table[k].userns)
1246 continue;
1247
1248 where = prefix_root(dest, mount_table[k].where);
17fe0523
LP
1249 if (!where)
1250 return log_oom();
88213476 1251
e26d6ce5 1252 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
03cfe0d5
LP
1253 if (r < 0 && r != -ENOENT)
1254 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
88213476 1255
9c1c7f71 1256 /* Skip this entry if it is not a remount. */
03cfe0d5 1257 if (mount_table[k].what && r > 0)
014a9c77
LP
1258 continue;
1259
03cfe0d5
LP
1260 r = mkdir_p(where, 0755);
1261 if (r < 0) {
1262 if (mount_table[k].fatal)
1263 return log_error_errno(r, "Failed to create directory %s: %m", where);
79d80fc1 1264
03cfe0d5 1265 log_warning_errno(r, "Failed to create directory %s: %m", where);
79d80fc1
TG
1266 continue;
1267 }
88213476 1268
03cfe0d5
LP
1269 o = mount_table[k].options;
1270 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1271 r = tmpfs_patch_options(o, &options);
1272 if (r < 0)
6dac160c 1273 return log_oom();
03cfe0d5
LP
1274 if (r > 0)
1275 o = options;
6dac160c 1276 }
a8828ed9 1277
88213476
LP
1278 if (mount(mount_table[k].what,
1279 where,
1280 mount_table[k].type,
1281 mount_table[k].flags,
79d80fc1 1282 o) < 0) {
88213476 1283
03cfe0d5
LP
1284 if (mount_table[k].fatal)
1285 return log_error_errno(errno, "mount(%s) failed: %m", where);
88213476 1286
03cfe0d5 1287 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
88213476 1288 }
88213476
LP
1289 }
1290
03cfe0d5 1291 return 0;
e58a1277 1292}
f8440af5 1293
5e5bfa6e
EY
1294static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
1295 const char *p = options;
1296 unsigned long flags = *mount_flags;
1297 char *opts = NULL;
1298
1299 assert(options);
1300
1301 for (;;) {
1302 _cleanup_free_ char *word = NULL;
a19222e1 1303 int r = extract_first_word(&p, &word, ",", 0);
5e5bfa6e
EY
1304 if (r < 0)
1305 return log_error_errno(r, "Failed to extract mount option: %m");
1306 if (r == 0)
1307 break;
1308
1309 if (streq(word, "rbind"))
1310 flags |= MS_REC;
1311 else if (streq(word, "norbind"))
1312 flags &= ~MS_REC;
1313 else {
1314 log_error("Invalid bind mount option: %s", word);
1315 return -EINVAL;
1316 }
1317 }
1318
1319 *mount_flags = flags;
1320 /* in the future mount_opts will hold string options for mount(2) */
1321 *mount_opts = opts;
1322
1323 return 0;
1324}
1325
5a8af538
LP
1326static int mount_bind(const char *dest, CustomMount *m) {
1327 struct stat source_st, dest_st;
03cfe0d5 1328 const char *where;
5e5bfa6e
EY
1329 unsigned long mount_flags = MS_BIND | MS_REC;
1330 _cleanup_free_ char *mount_opts = NULL;
5a8af538 1331 int r;
17fe0523 1332
5a8af538 1333 assert(m);
d2421337 1334
5e5bfa6e
EY
1335 if (m->options) {
1336 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
1337 if (r < 0)
1338 return r;
1339 }
1340
5a8af538
LP
1341 if (stat(m->source, &source_st) < 0)
1342 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
17fe0523 1343
03cfe0d5 1344 where = prefix_roota(dest, m->destination);
06c17c39 1345
03cfe0d5 1346 if (stat(where, &dest_st) >= 0) {
5a8af538
LP
1347 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1348 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1349 return -EINVAL;
2ed4e5e0 1350 }
06c17c39 1351
5a8af538
LP
1352 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1353 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1354 return -EINVAL;
d2421337 1355 }
17fe0523 1356
5a8af538
LP
1357 } else if (errno == ENOENT) {
1358 r = mkdir_parents_label(where, 0755);
1359 if (r < 0)
1360 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1361 } else {
1362 log_error_errno(errno, "Failed to stat %s: %m", where);
1363 return -errno;
1364 }
17fe0523 1365
5a8af538
LP
1366 /* Create the mount point. Any non-directory file can be
1367 * mounted on any non-directory file (regular, fifo, socket,
1368 * char, block).
1369 */
1370 if (S_ISDIR(source_st.st_mode))
1371 r = mkdir_label(where, 0755);
1372 else
1373 r = touch(where);
1374 if (r < 0 && r != -EEXIST)
1375 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1376
5e5bfa6e 1377 if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
5a8af538
LP
1378 return log_error_errno(errno, "mount(%s) failed: %m", where);
1379
1380 if (m->read_only) {
1381 r = bind_remount_recursive(where, true);
1382 if (r < 0)
1383 return log_error_errno(r, "Read-only bind mount failed: %m");
1384 }
1385
1386 return 0;
1387}
1388
1389static int mount_tmpfs(const char *dest, CustomMount *m) {
03cfe0d5
LP
1390 const char *where, *options;
1391 _cleanup_free_ char *buf = NULL;
5a8af538
LP
1392 int r;
1393
1394 assert(dest);
1395 assert(m);
1396
03cfe0d5 1397 where = prefix_roota(dest, m->destination);
5a8af538 1398
03cfe0d5 1399 r = mkdir_p_label(where, 0755);
5a8af538
LP
1400 if (r < 0 && r != -EEXIST)
1401 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1402
03cfe0d5
LP
1403 r = tmpfs_patch_options(m->options, &buf);
1404 if (r < 0)
1405 return log_oom();
1406 options = r > 0 ? buf : m->options;
1407
1408 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
5a8af538
LP
1409 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1410
1411 return 0;
1412}
1413
872d0dbd
RM
1414static char *joined_and_escaped_lower_dirs(char * const *lower) {
1415 _cleanup_strv_free_ char **sv = NULL;
1416
1417 sv = strv_copy(lower);
1418 if (!sv)
1419 return NULL;
1420
1421 strv_reverse(sv);
1422
1423 if (!strv_shell_escape(sv, ",:"))
1424 return NULL;
1425
1426 return strv_join(sv, ":");
1427}
1428
5a8af538
LP
1429static int mount_overlay(const char *dest, CustomMount *m) {
1430 _cleanup_free_ char *lower = NULL;
03cfe0d5 1431 const char *where, *options;
5a8af538
LP
1432 int r;
1433
1434 assert(dest);
1435 assert(m);
1436
03cfe0d5 1437 where = prefix_roota(dest, m->destination);
5a8af538
LP
1438
1439 r = mkdir_label(where, 0755);
1440 if (r < 0 && r != -EEXIST)
1441 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1442
1443 (void) mkdir_p_label(m->source, 0755);
1444
872d0dbd 1445 lower = joined_and_escaped_lower_dirs(m->lower);
5a8af538
LP
1446 if (!lower)
1447 return log_oom();
1448
872d0dbd
RM
1449 if (m->read_only) {
1450 _cleanup_free_ char *escaped_source = NULL;
1451
1452 escaped_source = shell_escape(m->source, ",:");
1453 if (!escaped_source)
1454 return log_oom();
1455
1456 options = strjoina("lowerdir=", escaped_source, ":", lower);
1457 } else {
1458 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
1459
5a8af538
LP
1460 assert(m->work_dir);
1461 (void) mkdir_label(m->work_dir, 0700);
1462
872d0dbd
RM
1463 escaped_source = shell_escape(m->source, ",:");
1464 if (!escaped_source)
1465 return log_oom();
1466 escaped_work_dir = shell_escape(m->work_dir, ",:");
1467 if (!escaped_work_dir)
1468 return log_oom();
1469
1470 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
5a8af538
LP
1471 }
1472
1473 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1474 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1475
1476 return 0;
1477}
1478
1479static int mount_custom(const char *dest) {
1480 unsigned i;
1481 int r;
1482
1483 assert(dest);
1484
1485 for (i = 0; i < arg_n_custom_mounts; i++) {
1486 CustomMount *m = &arg_custom_mounts[i];
1487
1488 switch (m->type) {
1489
1490 case CUSTOM_MOUNT_BIND:
1491 r = mount_bind(dest, m);
1492 break;
1493
1494 case CUSTOM_MOUNT_TMPFS:
1495 r = mount_tmpfs(dest, m);
1496 break;
1497
1498 case CUSTOM_MOUNT_OVERLAY:
1499 r = mount_overlay(dest, m);
1500 break;
1501
1502 default:
1503 assert_not_reached("Unknown custom mount type");
17fe0523 1504 }
5a8af538
LP
1505
1506 if (r < 0)
1507 return r;
17fe0523
LP
1508 }
1509
1510 return 0;
1511}
1512
efdb0237 1513static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
b12afc8c
LP
1514 char *to;
1515 int r;
1516
63c372cb 1517 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
b12afc8c 1518
e26d6ce5 1519 r = path_is_mount_point(to, 0);
da00518b 1520 if (r < 0 && r != -ENOENT)
b12afc8c
LP
1521 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1522 if (r > 0)
1523 return 0;
1524
1525 mkdir_p(to, 0755);
1526
c0534580
LP
1527 /* The superblock mount options of the mount point need to be
1528 * identical to the hosts', and hence writable... */
1529 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
b12afc8c
LP
1530 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1531
c0534580
LP
1532 /* ... hence let's only make the bind mount read-only, not the
1533 * superblock. */
1534 if (read_only) {
1535 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1536 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1537 }
b12afc8c
LP
1538 return 1;
1539}
1540
efdb0237 1541static int mount_legacy_cgroups(const char *dest) {
b12afc8c 1542 _cleanup_set_free_free_ Set *controllers = NULL;
03cfe0d5 1543 const char *cgroup_root;
b12afc8c
LP
1544 int r;
1545
efdb0237
LP
1546 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1547
1548 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
1549 r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
1550 if (r < 0)
1551 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1552 if (r == 0) {
1553 _cleanup_free_ char *options = NULL;
1554
1555 r = tmpfs_patch_options("mode=755", &options);
1556 if (r < 0)
1557 return log_oom();
1558
1559 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
1560 return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
1561 }
1562
1563 if (cg_unified() > 0)
1564 goto skip_controllers;
1565
b12afc8c
LP
1566 controllers = set_new(&string_hash_ops);
1567 if (!controllers)
1568 return log_oom();
1569
1570 r = cg_kernel_controllers(controllers);
1571 if (r < 0)
1572 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1573
b12afc8c
LP
1574 for (;;) {
1575 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1576
1577 controller = set_steal_first(controllers);
1578 if (!controller)
1579 break;
1580
03cfe0d5 1581 origin = prefix_root("/sys/fs/cgroup/", controller);
b12afc8c
LP
1582 if (!origin)
1583 return log_oom();
1584
1585 r = readlink_malloc(origin, &combined);
1586 if (r == -EINVAL) {
1587 /* Not a symbolic link, but directly a single cgroup hierarchy */
1588
efdb0237 1589 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
b12afc8c
LP
1590 if (r < 0)
1591 return r;
1592
1593 } else if (r < 0)
1594 return log_error_errno(r, "Failed to read link %s: %m", origin);
1595 else {
1596 _cleanup_free_ char *target = NULL;
1597
03cfe0d5 1598 target = prefix_root(dest, origin);
b12afc8c
LP
1599 if (!target)
1600 return log_oom();
1601
1602 /* A symbolic link, a combination of controllers in one hierarchy */
1603
1604 if (!filename_is_valid(combined)) {
1605 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1606 continue;
1607 }
1608
efdb0237 1609 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
b12afc8c
LP
1610 if (r < 0)
1611 return r;
1612
875e1014
ILG
1613 r = symlink_idempotent(combined, target);
1614 if (r == -EINVAL) {
1615 log_error("Invalid existing symlink for combined hierarchy");
1616 return r;
1617 }
1618 if (r < 0)
1619 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
b12afc8c
LP
1620 }
1621 }
1622
efdb0237
LP
1623skip_controllers:
1624 r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
b12afc8c
LP
1625 if (r < 0)
1626 return r;
1627
03cfe0d5
LP
1628 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1629 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1630
1631 return 0;
1632}
1633
efdb0237
LP
1634static int mount_unified_cgroups(const char *dest) {
1635 const char *p;
1636 int r;
1637
1638 assert(dest);
1639
1640 p = strjoina(dest, "/sys/fs/cgroup");
1641
1642 r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
1643 if (r < 0)
1644 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
1645 if (r > 0) {
1646 p = strjoina(dest, "/sys/fs/cgroup/cgroup.procs");
1647 if (access(p, F_OK) >= 0)
1648 return 0;
1649 if (errno != ENOENT)
1650 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
1651
1652 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
1653 return -EINVAL;
1654 }
1655
1656 if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
1657 return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
1658
1659 return 0;
1660}
1661
1662static int mount_cgroups(const char *dest) {
1663 if (arg_unified_cgroup_hierarchy)
1664 return mount_unified_cgroups(dest);
1665 else
1666 return mount_legacy_cgroups(dest);
1667}
1668
03cfe0d5
LP
1669static int mount_systemd_cgroup_writable(const char *dest) {
1670 _cleanup_free_ char *own_cgroup_path = NULL;
1671 const char *systemd_root, *systemd_own;
1672 int r;
1673
1674 assert(dest);
1675
1676 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1677 if (r < 0)
1678 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1679
efdb0237
LP
1680 /* If we are living in the top-level, then there's nothing to do... */
1681 if (path_equal(own_cgroup_path, "/"))
1682 return 0;
1683
1684 if (arg_unified_cgroup_hierarchy) {
1685 systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
1686 systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
1687 } else {
1688 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1689 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1690 }
1691
b12afc8c 1692 /* Make our own cgroup a (writable) bind mount */
b12afc8c
LP
1693 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1694 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1695
1696 /* And then remount the systemd cgroup root read-only */
b12afc8c
LP
1697 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1698 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1699
03cfe0d5
LP
1700 return 0;
1701}
1702
1703static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1704 assert(p);
1705
1706 if (!arg_userns)
1707 return 0;
1708
1709 if (uid == UID_INVALID && gid == GID_INVALID)
1710 return 0;
1711
1712 if (uid != UID_INVALID) {
1713 uid += arg_uid_shift;
1714
1715 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1716 return -EOVERFLOW;
1717 }
1718
1719 if (gid != GID_INVALID) {
1720 gid += (gid_t) arg_uid_shift;
1721
1722 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1723 return -EOVERFLOW;
1724 }
1725
1726 if (lchown(p, uid, gid) < 0)
1727 return -errno;
b12afc8c
LP
1728
1729 return 0;
1730}
1731
03cfe0d5
LP
/* Creates the directory "path" below "root" with the given mode and
 * hands it to userns_lchown() for ownership adjustment. An already
 * existing directory is left untouched and reported as success.
 * Returns 0 on success, negative errno-style value on failure. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1744
e58a1277 1745static int setup_timezone(const char *dest) {
03cfe0d5
LP
1746 _cleanup_free_ char *p = NULL, *q = NULL;
1747 const char *where, *check, *what;
d4036145
LP
1748 char *z, *y;
1749 int r;
f8440af5 1750
e58a1277
LP
1751 assert(dest);
1752
1753 /* Fix the timezone, if possible */
d4036145
LP
1754 r = readlink_malloc("/etc/localtime", &p);
1755 if (r < 0) {
1756 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1757 return 0;
1758 }
1759
1760 z = path_startswith(p, "../usr/share/zoneinfo/");
1761 if (!z)
1762 z = path_startswith(p, "/usr/share/zoneinfo/");
1763 if (!z) {
1764 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1765 return 0;
1766 }
1767
03cfe0d5 1768 where = prefix_roota(dest, "/etc/localtime");
d4036145
LP
1769 r = readlink_malloc(where, &q);
1770 if (r >= 0) {
1771 y = path_startswith(q, "../usr/share/zoneinfo/");
1772 if (!y)
1773 y = path_startswith(q, "/usr/share/zoneinfo/");
4d1c38b8 1774
d4036145
LP
1775 /* Already pointing to the right place? Then do nothing .. */
1776 if (y && streq(y, z))
1777 return 0;
1778 }
1779
03cfe0d5
LP
1780 check = strjoina("/usr/share/zoneinfo/", z);
1781 check = prefix_root(dest, check);
1782 if (laccess(check, F_OK) < 0) {
d4036145
LP
1783 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1784 return 0;
1785 }
68fb0892 1786
79d80fc1
TG
1787 r = unlink(where);
1788 if (r < 0 && errno != ENOENT) {
56f64d95 1789 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
79d80fc1
TG
1790 return 0;
1791 }
4d9f07b4 1792
03cfe0d5 1793 what = strjoina("../usr/share/zoneinfo/", z);
d4036145 1794 if (symlink(what, where) < 0) {
56f64d95 1795 log_error_errno(errno, "Failed to correct timezone of container: %m");
d4036145
LP
1796 return 0;
1797 }
e58a1277 1798
03cfe0d5
LP
1799 r = userns_lchown(where, 0, 0);
1800 if (r < 0)
1801 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1802
e58a1277 1803 return 0;
88213476
LP
1804}
1805
2547bb41 1806static int setup_resolv_conf(const char *dest) {
03cfe0d5 1807 const char *where = NULL;
79d80fc1 1808 int r;
2547bb41
LP
1809
1810 assert(dest);
1811
1812 if (arg_private_network)
1813 return 0;
1814
1815 /* Fix resolv.conf, if possible */
03cfe0d5 1816 where = prefix_roota(dest, "/etc/resolv.conf");
79d80fc1 1817
f2068bcc 1818 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
79d80fc1 1819 if (r < 0) {
68a313c5
LP
1820 /* If the file already exists as symlink, let's
1821 * suppress the warning, under the assumption that
1822 * resolved or something similar runs inside and the
1823 * symlink points there.
1824 *
1825 * If the disk image is read-only, there's also no
1826 * point in complaining.
1827 */
1828 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1829 "Failed to copy /etc/resolv.conf to %s: %m", where);
79d80fc1
TG
1830 return 0;
1831 }
2547bb41 1832
03cfe0d5
LP
1833 r = userns_lchown(where, 0, 0);
1834 if (r < 0)
1835 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1836
2547bb41
LP
1837 return 0;
1838}
1839
4d9f07b4 1840static int setup_volatile_state(const char *directory) {
03cfe0d5
LP
1841 _cleanup_free_ char *buf = NULL;
1842 const char *p, *options;
4d9f07b4
LP
1843 int r;
1844
1845 assert(directory);
1846
f757855e 1847 if (arg_volatile_mode != VOLATILE_STATE)
4d9f07b4
LP
1848 return 0;
1849
1850 /* --volatile=state means we simply overmount /var
1851 with a tmpfs, and the rest read-only. */
1852
1853 r = bind_remount_recursive(directory, true);
f647962d
MS
1854 if (r < 0)
1855 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
4d9f07b4 1856
03cfe0d5 1857 p = prefix_roota(directory, "/var");
79d80fc1 1858 r = mkdir(p, 0755);
4a62c710
MS
1859 if (r < 0 && errno != EEXIST)
1860 return log_error_errno(errno, "Failed to create %s: %m", directory);
4d9f07b4 1861
03cfe0d5
LP
1862 options = "mode=755";
1863 r = tmpfs_patch_options(options, &buf);
1864 if (r < 0)
1865 return log_oom();
1866 if (r > 0)
1867 options = buf;
1868
1869 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
4a62c710 1870 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
4d9f07b4
LP
1871
1872 return 0;
1873}
1874
1875static int setup_volatile(const char *directory) {
1876 bool tmpfs_mounted = false, bind_mounted = false;
1877 char template[] = "/tmp/nspawn-volatile-XXXXXX";
03cfe0d5
LP
1878 _cleanup_free_ char *buf = NULL;
1879 const char *f, *t, *options;
4d9f07b4
LP
1880 int r;
1881
1882 assert(directory);
1883
f757855e 1884 if (arg_volatile_mode != VOLATILE_YES)
4d9f07b4
LP
1885 return 0;
1886
1887 /* --volatile=yes means we mount a tmpfs to the root dir, and
1888 the original /usr to use inside it, and that read-only. */
1889
4a62c710
MS
1890 if (!mkdtemp(template))
1891 return log_error_errno(errno, "Failed to create temporary directory: %m");
4d9f07b4 1892
03cfe0d5
LP
1893 options = "mode=755";
1894 r = tmpfs_patch_options(options, &buf);
1895 if (r < 0)
1896 return log_oom();
1897 if (r > 0)
1898 options = buf;
1899
1900 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1901 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
4d9f07b4
LP
1902 goto fail;
1903 }
1904
1905 tmpfs_mounted = true;
1906
03cfe0d5
LP
1907 f = prefix_roota(directory, "/usr");
1908 t = prefix_roota(template, "/usr");
4d9f07b4 1909
79d80fc1
TG
1910 r = mkdir(t, 0755);
1911 if (r < 0 && errno != EEXIST) {
03cfe0d5 1912 r = log_error_errno(errno, "Failed to create %s: %m", t);
79d80fc1
TG
1913 goto fail;
1914 }
1915
4543768d 1916 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
03cfe0d5 1917 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
4d9f07b4
LP
1918 goto fail;
1919 }
1920
1921 bind_mounted = true;
1922
1923 r = bind_remount_recursive(t, true);
1924 if (r < 0) {
da927ba9 1925 log_error_errno(r, "Failed to remount %s read-only: %m", t);
4d9f07b4
LP
1926 goto fail;
1927 }
1928
1929 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
03cfe0d5 1930 r = log_error_errno(errno, "Failed to move root mount: %m");
4d9f07b4
LP
1931 goto fail;
1932 }
1933
03cfe0d5 1934 (void) rmdir(template);
4d9f07b4
LP
1935
1936 return 0;
1937
1938fail:
1939 if (bind_mounted)
03cfe0d5
LP
1940 (void) umount(t);
1941
4d9f07b4 1942 if (tmpfs_mounted)
03cfe0d5
LP
1943 (void) umount(template);
1944 (void) rmdir(template);
4d9f07b4
LP
1945 return r;
1946}
1947
9f24adc2 1948static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
03cfe0d5 1949 assert(s);
9f24adc2
LP
1950
1951 snprintf(s, 37,
1952 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1953 SD_ID128_FORMAT_VAL(id));
1954
1955 return s;
1956}
1957
04bc4a3f 1958static int setup_boot_id(const char *dest) {
03cfe0d5 1959 const char *from, *to;
39883f62 1960 sd_id128_t rnd = {};
04bc4a3f
LP
1961 char as_uuid[37];
1962 int r;
1963
eb91eb18
LP
1964 if (arg_share_system)
1965 return 0;
1966
04bc4a3f
LP
1967 /* Generate a new randomized boot ID, so that each boot-up of
1968 * the container gets a new one */
1969
03cfe0d5
LP
1970 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1971 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
04bc4a3f
LP
1972
1973 r = sd_id128_randomize(&rnd);
f647962d
MS
1974 if (r < 0)
1975 return log_error_errno(r, "Failed to generate random boot id: %m");
04bc4a3f 1976
9f24adc2 1977 id128_format_as_uuid(rnd, as_uuid);
04bc4a3f 1978
4c1fc3e4 1979 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
f647962d
MS
1980 if (r < 0)
1981 return log_error_errno(r, "Failed to write boot id: %m");
04bc4a3f 1982
03cfe0d5
LP
1983 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1984 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1985 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
56f64d95 1986 log_warning_errno(errno, "Failed to make boot id read-only: %m");
04bc4a3f
LP
1987
1988 unlink(from);
04bc4a3f
LP
1989 return r;
1990}
1991
e58a1277 1992static int copy_devnodes(const char *dest) {
88213476
LP
1993
1994 static const char devnodes[] =
1995 "null\0"
1996 "zero\0"
1997 "full\0"
1998 "random\0"
1999 "urandom\0"
85614d66
TG
2000 "tty\0"
2001 "net/tun\0";
88213476
LP
2002
2003 const char *d;
e58a1277 2004 int r = 0;
7fd1b19b 2005 _cleanup_umask_ mode_t u;
a258bf26
LP
2006
2007 assert(dest);
124640f1
LP
2008
2009 u = umask(0000);
88213476 2010
03cfe0d5
LP
2011 /* Create /dev/net, so that we can create /dev/net/tun in it */
2012 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2013 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2014
88213476 2015 NULSTR_FOREACH(d, devnodes) {
7fd1b19b 2016 _cleanup_free_ char *from = NULL, *to = NULL;
7f112f50 2017 struct stat st;
88213476 2018
7f112f50 2019 from = strappend("/dev/", d);
03cfe0d5 2020 to = prefix_root(dest, from);
88213476
LP
2021
2022 if (stat(from, &st) < 0) {
2023
4a62c710
MS
2024 if (errno != ENOENT)
2025 return log_error_errno(errno, "Failed to stat %s: %m", from);
88213476 2026
a258bf26 2027 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 2028
03cfe0d5 2029 log_error("%s is not a char or block device, cannot copy.", from);
7f112f50 2030 return -EIO;
a258bf26 2031
85614d66 2032 } else {
81f5049b
AC
2033 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2034 if (errno != EPERM)
2035 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2036
2037 /* Some systems abusively restrict mknod but
2038 * allow bind mounts. */
2039 r = touch(to);
2040 if (r < 0)
2041 return log_error_errno(r, "touch (%s) failed: %m", to);
2042 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
2043 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
2044 }
6278cf60 2045
03cfe0d5
LP
2046 r = userns_lchown(to, 0, 0);
2047 if (r < 0)
2048 return log_error_errno(r, "chown() of device node %s failed: %m", to);
88213476 2049 }
88213476
LP
2050 }
2051
e58a1277
LP
2052 return r;
2053}
88213476 2054
03cfe0d5
LP
2055static int setup_pts(const char *dest) {
2056 _cleanup_free_ char *options = NULL;
2057 const char *p;
2058
2059#ifdef HAVE_SELINUX
2060 if (arg_selinux_apifs_context)
2061 (void) asprintf(&options,
3dce8915 2062 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
03cfe0d5
LP
2063 arg_uid_shift + TTY_GID,
2064 arg_selinux_apifs_context);
2065 else
2066#endif
2067 (void) asprintf(&options,
3dce8915 2068 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
03cfe0d5 2069 arg_uid_shift + TTY_GID);
f2d88580 2070
03cfe0d5 2071 if (!options)
f2d88580
LP
2072 return log_oom();
2073
03cfe0d5 2074 /* Mount /dev/pts itself */
cc9fce65 2075 p = prefix_roota(dest, "/dev/pts");
03cfe0d5
LP
2076 if (mkdir(p, 0755) < 0)
2077 return log_error_errno(errno, "Failed to create /dev/pts: %m");
2078 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
2079 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
2080 if (userns_lchown(p, 0, 0) < 0)
2081 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
2082
2083 /* Create /dev/ptmx symlink */
2084 p = prefix_roota(dest, "/dev/ptmx");
4a62c710
MS
2085 if (symlink("pts/ptmx", p) < 0)
2086 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
03cfe0d5
LP
2087 if (userns_lchown(p, 0, 0) < 0)
2088 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
f2d88580 2089
03cfe0d5
LP
2090 /* And fix /dev/pts/ptmx ownership */
2091 p = prefix_roota(dest, "/dev/pts/ptmx");
2092 if (userns_lchown(p, 0, 0) < 0)
2093 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
6278cf60 2094
f2d88580
LP
2095 return 0;
2096}
2097
e58a1277 2098static int setup_dev_console(const char *dest, const char *console) {
eb0f0863
LP
2099 _cleanup_umask_ mode_t u;
2100 const char *to;
e58a1277 2101 int r;
e58a1277
LP
2102
2103 assert(dest);
2104 assert(console);
2105
2106 u = umask(0000);
2107
03cfe0d5 2108 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f647962d
MS
2109 if (r < 0)
2110 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
88213476 2111
a258bf26
LP
2112 /* We need to bind mount the right tty to /dev/console since
2113 * ptys can only exist on pts file systems. To have something
81f5049b 2114 * to bind mount things on we create a empty regular file. */
a258bf26 2115
03cfe0d5 2116 to = prefix_roota(dest, "/dev/console");
81f5049b
AC
2117 r = touch(to);
2118 if (r < 0)
2119 return log_error_errno(r, "touch() for /dev/console failed: %m");
a258bf26 2120
4543768d 2121 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
4a62c710 2122 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
a258bf26 2123
25ea79fe 2124 return 0;
e58a1277
LP
2125}
2126
2127static int setup_kmsg(const char *dest, int kmsg_socket) {
03cfe0d5 2128 const char *from, *to;
7fd1b19b 2129 _cleanup_umask_ mode_t u;
03cfe0d5 2130 int fd, k;
e58a1277
LP
2131 union {
2132 struct cmsghdr cmsghdr;
2133 uint8_t buf[CMSG_SPACE(sizeof(int))];
b92bea5d
ZJS
2134 } control = {};
2135 struct msghdr mh = {
2136 .msg_control = &control,
2137 .msg_controllen = sizeof(control),
2138 };
e58a1277
LP
2139 struct cmsghdr *cmsg;
2140
e58a1277 2141 assert(kmsg_socket >= 0);
a258bf26 2142
e58a1277 2143 u = umask(0000);
a258bf26 2144
03cfe0d5 2145 /* We create the kmsg FIFO as /run/kmsg, but immediately
f1e5dfe2
LP
2146 * delete it after bind mounting it to /proc/kmsg. While FIFOs
2147 * on the reading side behave very similar to /proc/kmsg,
2148 * their writing side behaves differently from /dev/kmsg in
2149 * that writing blocks when nothing is reading. In order to
2150 * avoid any problems with containers deadlocking due to this
2151 * we simply make /dev/kmsg unavailable to the container. */
03cfe0d5
LP
2152 from = prefix_roota(dest, "/run/kmsg");
2153 to = prefix_roota(dest, "/proc/kmsg");
e58a1277 2154
4a62c710 2155 if (mkfifo(from, 0600) < 0)
03cfe0d5 2156 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
4543768d 2157 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
4a62c710 2158 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
e58a1277
LP
2159
2160 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
4a62c710
MS
2161 if (fd < 0)
2162 return log_error_errno(errno, "Failed to open fifo: %m");
e58a1277 2163
e58a1277
LP
2164 cmsg = CMSG_FIRSTHDR(&mh);
2165 cmsg->cmsg_level = SOL_SOCKET;
2166 cmsg->cmsg_type = SCM_RIGHTS;
2167 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
2168 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
2169
2170 mh.msg_controllen = cmsg->cmsg_len;
2171
2172 /* Store away the fd in the socket, so that it stays open as
2173 * long as we run the child */
6d0b55c2 2174 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
03e334a1 2175 safe_close(fd);
e58a1277 2176
4a62c710
MS
2177 if (k < 0)
2178 return log_error_errno(errno, "Failed to send FIFO fd: %m");
a258bf26 2179
03cfe0d5
LP
2180 /* And now make the FIFO unavailable as /run/kmsg... */
2181 (void) unlink(from);
2182
25ea79fe 2183 return 0;
88213476
LP
2184}
2185
6d0b55c2
LP
2186static int send_rtnl(int send_fd) {
2187 union {
2188 struct cmsghdr cmsghdr;
2189 uint8_t buf[CMSG_SPACE(sizeof(int))];
2190 } control = {};
2191 struct msghdr mh = {
2192 .msg_control = &control,
2193 .msg_controllen = sizeof(control),
2194 };
2195 struct cmsghdr *cmsg;
2196 _cleanup_close_ int fd = -1;
2197 ssize_t k;
2198
2199 assert(send_fd >= 0);
2200
2201 if (!arg_expose_ports)
2202 return 0;
2203
2204 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
2205 if (fd < 0)
03cfe0d5 2206 return log_error_errno(errno, "Failed to allocate container netlink: %m");
6d0b55c2
LP
2207
2208 cmsg = CMSG_FIRSTHDR(&mh);
2209 cmsg->cmsg_level = SOL_SOCKET;
2210 cmsg->cmsg_type = SCM_RIGHTS;
2211 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
2212 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
2213
2214 mh.msg_controllen = cmsg->cmsg_len;
2215
2216 /* Store away the fd in the socket, so that it stays open as
2217 * long as we run the child */
2218 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
2219 if (k < 0)
2220 return log_error_errno(errno, "Failed to send netlink fd: %m");
2221
2222 return 0;
2223}
2224
2225static int flush_ports(union in_addr_union *exposed) {
2226 ExposePort *p;
2227 int r, af = AF_INET;
2228
2229 assert(exposed);
2230
2231 if (!arg_expose_ports)
2232 return 0;
2233
2234 if (in_addr_is_null(af, exposed))
2235 return 0;
2236
2237 log_debug("Lost IP address.");
2238
2239 LIST_FOREACH(ports, p, arg_expose_ports) {
2240 r = fw_add_local_dnat(false,
2241 af,
2242 p->protocol,
2243 NULL,
2244 NULL, 0,
2245 NULL, 0,
2246 p->host_port,
2247 exposed,
2248 p->container_port,
2249 NULL);
2250 if (r < 0)
2251 log_warning_errno(r, "Failed to modify firewall: %m");
2252 }
2253
2254 *exposed = IN_ADDR_NULL;
2255 return 0;
2256}
2257
1c4baffc 2258static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
6d0b55c2
LP
2259 _cleanup_free_ struct local_address *addresses = NULL;
2260 _cleanup_free_ char *pretty = NULL;
2261 union in_addr_union new_exposed;
2262 ExposePort *p;
2263 bool add;
2264 int af = AF_INET, r;
2265
2266 assert(exposed);
2267
2268 /* Invoked each time an address is added or removed inside the
2269 * container */
2270
2271 if (!arg_expose_ports)
2272 return 0;
2273
2274 r = local_addresses(rtnl, 0, af, &addresses);
2275 if (r < 0)
2276 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2277
2278 add = r > 0 &&
2279 addresses[0].family == af &&
2280 addresses[0].scope < RT_SCOPE_LINK;
2281
2282 if (!add)
2283 return flush_ports(exposed);
2284
2285 new_exposed = addresses[0].address;
2286 if (in_addr_equal(af, exposed, &new_exposed))
2287 return 0;
2288
2289 in_addr_to_string(af, &new_exposed, &pretty);
2290 log_debug("New container IP is %s.", strna(pretty));
2291
2292 LIST_FOREACH(ports, p, arg_expose_ports) {
2293
2294 r = fw_add_local_dnat(true,
2295 af,
2296 p->protocol,
2297 NULL,
2298 NULL, 0,
2299 NULL, 0,
2300 p->host_port,
2301 &new_exposed,
2302 p->container_port,
2303 in_addr_is_null(af, exposed) ? NULL : exposed);
2304 if (r < 0)
2305 log_warning_errno(r, "Failed to modify firewall: %m");
2306 }
2307
2308 *exposed = new_exposed;
2309 return 0;
2310}
2311
f757855e
LP
2312void expose_port_free_all(ExposePort *p) {
2313
2314 while (p) {
2315 ExposePort *q = p;
2316 LIST_REMOVE(ports, p, q);
2317 free(q);
2318 }
2319}
2320
1c4baffc 2321static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
6d0b55c2
LP
2322 union in_addr_union *exposed = userdata;
2323
2324 assert(rtnl);
2325 assert(m);
2326 assert(exposed);
2327
2328 expose_ports(rtnl, exposed);
2329 return 0;
2330}
2331
1c4baffc 2332static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
6d0b55c2
LP
2333 union {
2334 struct cmsghdr cmsghdr;
2335 uint8_t buf[CMSG_SPACE(sizeof(int))];
2336 } control = {};
2337 struct msghdr mh = {
2338 .msg_control = &control,
2339 .msg_controllen = sizeof(control),
2340 };
2341 struct cmsghdr *cmsg;
1c4baffc 2342 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
6d0b55c2
LP
2343 int fd, r;
2344 ssize_t k;
2345
2346 assert(event);
2347 assert(recv_fd >= 0);
2348 assert(ret);
2349
2350 if (!arg_expose_ports)
2351 return 0;
2352
2353 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2354 if (k < 0)
2355 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2356
2357 cmsg = CMSG_FIRSTHDR(&mh);
2358 assert(cmsg->cmsg_level == SOL_SOCKET);
2359 assert(cmsg->cmsg_type == SCM_RIGHTS);
657bdca9 2360 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
6d0b55c2
LP
2361 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2362
1c4baffc 2363 r = sd_netlink_open_fd(&rtnl, fd);
6d0b55c2
LP
2364 if (r < 0) {
2365 safe_close(fd);
2366 return log_error_errno(r, "Failed to create rtnl object: %m");
2367 }
2368
1c4baffc 2369 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
6d0b55c2
LP
2370 if (r < 0)
2371 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2372
1c4baffc 2373 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
6d0b55c2
LP
2374 if (r < 0)
2375 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2376
1c4baffc 2377 r = sd_netlink_attach_event(rtnl, event, 0);
6d0b55c2
LP
2378 if (r < 0)
2379 return log_error_errno(r, "Failed to add to even loop: %m");
2380
2381 *ret = rtnl;
2382 rtnl = NULL;
2383
2384 return 0;
2385}
2386
3a74cea5 2387static int setup_hostname(void) {
3a74cea5 2388
eb91eb18
LP
2389 if (arg_share_system)
2390 return 0;
2391
605f81a8 2392 if (sethostname_idempotent(arg_machine) < 0)
7027ff61 2393 return -errno;
3a74cea5 2394
7027ff61 2395 return 0;
3a74cea5
LP
2396}
2397
57fb9fb5 2398static int setup_journal(const char *directory) {
4d680aee 2399 sd_id128_t machine_id, this_id;
03cfe0d5
LP
2400 _cleanup_free_ char *b = NULL, *d = NULL;
2401 const char *etc_machine_id, *p, *q;
27407a01 2402 char *id;
57fb9fb5
LP
2403 int r;
2404
df9a75e4
LP
2405 /* Don't link journals in ephemeral mode */
2406 if (arg_ephemeral)
2407 return 0;
2408
03cfe0d5 2409 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
57fb9fb5 2410
03cfe0d5 2411 r = read_one_line_file(etc_machine_id, &b);
27407a01
ZJS
2412 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2413 return 0;
f647962d 2414 else if (r < 0)
03cfe0d5 2415 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
57fb9fb5 2416
27407a01
ZJS
2417 id = strstrip(b);
2418 if (isempty(id) && arg_link_journal == LINK_AUTO)
2419 return 0;
57fb9fb5 2420
27407a01
ZJS
2421 /* Verify validity */
2422 r = sd_id128_from_string(id, &machine_id);
f647962d 2423 if (r < 0)
03cfe0d5 2424 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
57fb9fb5 2425
4d680aee 2426 r = sd_id128_get_machine(&this_id);
f647962d
MS
2427 if (r < 0)
2428 return log_error_errno(r, "Failed to retrieve machine ID: %m");
4d680aee
ZJS
2429
2430 if (sd_id128_equal(machine_id, this_id)) {
2431 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2432 "Host and machine ids are equal (%s): refusing to link journals", id);
2433 if (arg_link_journal == LINK_AUTO)
2434 return 0;
df9a75e4 2435 return -EEXIST;
4d680aee
ZJS
2436 }
2437
2438 if (arg_link_journal == LINK_NO)
2439 return 0;
2440
03cfe0d5
LP
2441 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2442 if (r < 0)
2443 return log_error_errno(r, "Failed to create /var: %m");
2444
2445 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2446 if (r < 0)
2447 return log_error_errno(r, "Failed to create /var/log: %m");
2448
2449 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2450 if (r < 0)
2451 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2452
2453 p = strjoina("/var/log/journal/", id);
2454 q = prefix_roota(directory, p);
27407a01 2455
e26d6ce5 2456 if (path_is_mount_point(p, 0) > 0) {
27407a01
ZJS
2457 if (arg_link_journal != LINK_AUTO) {
2458 log_error("%s: already a mount point, refusing to use for journal", p);
2459 return -EEXIST;
2460 }
2461
2462 return 0;
57fb9fb5
LP
2463 }
2464
e26d6ce5 2465 if (path_is_mount_point(q, 0) > 0) {
57fb9fb5 2466 if (arg_link_journal != LINK_AUTO) {
27407a01
ZJS
2467 log_error("%s: already a mount point, refusing to use for journal", q);
2468 return -EEXIST;
57fb9fb5
LP
2469 }
2470
27407a01 2471 return 0;
57fb9fb5
LP
2472 }
2473
2474 r = readlink_and_make_absolute(p, &d);
2475 if (r >= 0) {
2476 if ((arg_link_journal == LINK_GUEST ||
2477 arg_link_journal == LINK_AUTO) &&
2478 path_equal(d, q)) {
2479
03cfe0d5 2480 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2481 if (r < 0)
56f64d95 2482 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2483 return 0;
57fb9fb5
LP
2484 }
2485
4a62c710
MS
2486 if (unlink(p) < 0)
2487 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
57fb9fb5
LP
2488 } else if (r == -EINVAL) {
2489
2490 if (arg_link_journal == LINK_GUEST &&
2491 rmdir(p) < 0) {
2492
27407a01
ZJS
2493 if (errno == ENOTDIR) {
2494 log_error("%s already exists and is neither a symlink nor a directory", p);
2495 return r;
2496 } else {
56f64d95 2497 log_error_errno(errno, "Failed to remove %s: %m", p);
27407a01 2498 return -errno;
57fb9fb5 2499 }
57fb9fb5
LP
2500 }
2501 } else if (r != -ENOENT) {
56f64d95 2502 log_error_errno(errno, "readlink(%s) failed: %m", p);
27407a01 2503 return r;
57fb9fb5
LP
2504 }
2505
2506 if (arg_link_journal == LINK_GUEST) {
2507
2508 if (symlink(q, p) < 0) {
574edc90 2509 if (arg_link_journal_try) {
56f64d95 2510 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
574edc90
MP
2511 return 0;
2512 } else {
56f64d95 2513 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
574edc90
MP
2514 return -errno;
2515 }
57fb9fb5
LP
2516 }
2517
03cfe0d5 2518 r = userns_mkdir(directory, p, 0755, 0, 0);
27407a01 2519 if (r < 0)
56f64d95 2520 log_warning_errno(errno, "Failed to create directory %s: %m", q);
27407a01 2521 return 0;
57fb9fb5
LP
2522 }
2523
2524 if (arg_link_journal == LINK_HOST) {
574edc90
MP
2525 /* don't create parents here -- if the host doesn't have
2526 * permanent journal set up, don't force it here */
2527 r = mkdir(p, 0755);
57fb9fb5 2528 if (r < 0) {
574edc90 2529 if (arg_link_journal_try) {
56f64d95 2530 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
574edc90
MP
2531 return 0;
2532 } else {
56f64d95 2533 log_error_errno(errno, "Failed to create %s: %m", p);
574edc90
MP
2534 return r;
2535 }
57fb9fb5
LP
2536 }
2537
27407a01
ZJS
2538 } else if (access(p, F_OK) < 0)
2539 return 0;
57fb9fb5 2540
cdb2b9d0
LP
2541 if (dir_is_empty(q) == 0)
2542 log_warning("%s is not empty, proceeding anyway.", q);
2543
03cfe0d5 2544 r = userns_mkdir(directory, p, 0755, 0, 0);
57fb9fb5 2545 if (r < 0) {
56f64d95 2546 log_error_errno(errno, "Failed to create %s: %m", q);
27407a01 2547 return r;
57fb9fb5
LP
2548 }
2549
4543768d 2550 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
4a62c710 2551 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
57fb9fb5 2552
27407a01 2553 return 0;
57fb9fb5
LP
2554}
2555
88213476 2556static int drop_capabilities(void) {
5076f0cc 2557 return capability_bounding_set_drop(~arg_retain, false);
88213476
LP
2558}
2559
5aa4bb6b 2560static int register_machine(pid_t pid, int local_ifindex) {
9444b1f2 2561 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
03976f7b 2562 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
9444b1f2
LP
2563 int r;
2564
eb91eb18
LP
2565 if (!arg_register)
2566 return 0;
2567
1c03020c 2568 r = sd_bus_default_system(&bus);
f647962d
MS
2569 if (r < 0)
2570 return log_error_errno(r, "Failed to open system bus: %m");
9444b1f2 2571
89f7c846
LP
2572 if (arg_keep_unit) {
2573 r = sd_bus_call_method(
2574 bus,
2575 "org.freedesktop.machine1",
2576 "/org/freedesktop/machine1",
2577 "org.freedesktop.machine1.Manager",
5aa4bb6b 2578 "RegisterMachineWithNetwork",
89f7c846
LP
2579 &error,
2580 NULL,
5aa4bb6b 2581 "sayssusai",
89f7c846
LP
2582 arg_machine,
2583 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2584 "nspawn",
2585 "container",
2586 (uint32_t) pid,
5aa4bb6b
LP
2587 strempty(arg_directory),
2588 local_ifindex > 0 ? 1 : 0, local_ifindex);
89f7c846 2589 } else {
9457ac5b 2590 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
f36933fe 2591 char **i;
ce5b3ad4 2592 unsigned j;
9457ac5b
LP
2593
2594 r = sd_bus_message_new_method_call(
89f7c846 2595 bus,
9457ac5b 2596 &m,
89f7c846
LP
2597 "org.freedesktop.machine1",
2598 "/org/freedesktop/machine1",
2599 "org.freedesktop.machine1.Manager",
5aa4bb6b 2600 "CreateMachineWithNetwork");
f647962d 2601 if (r < 0)
f36933fe 2602 return bus_log_create_error(r);
9457ac5b
LP
2603
2604 r = sd_bus_message_append(
2605 m,
5aa4bb6b 2606 "sayssusai",
89f7c846
LP
2607 arg_machine,
2608 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2609 "nspawn",
2610 "container",
2611 (uint32_t) pid,
5aa4bb6b
LP
2612 strempty(arg_directory),
2613 local_ifindex > 0 ? 1 : 0, local_ifindex);
f647962d 2614 if (r < 0)
f36933fe 2615 return bus_log_create_error(r);
9457ac5b
LP
2616
2617 r = sd_bus_message_open_container(m, 'a', "(sv)");
f647962d 2618 if (r < 0)
f36933fe 2619 return bus_log_create_error(r);
9457ac5b
LP
2620
2621 if (!isempty(arg_slice)) {
2622 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
f647962d 2623 if (r < 0)
f36933fe 2624 return bus_log_create_error(r);
9457ac5b
LP
2625 }
2626
2627 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
f647962d 2628 if (r < 0)
f36933fe 2629 return bus_log_create_error(r);
9457ac5b 2630
773ce3d8
LP
2631 /* If you make changes here, also make sure to update
2632 * systemd-nspawn@.service, to keep the device
2633 * policies in sync regardless if we are run with or
2634 * without the --keep-unit switch. */
63cc4c31 2635 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
9457ac5b
LP
2636 /* Allow the container to
2637 * access and create the API
2638 * device nodes, so that
2639 * PrivateDevices= in the
2640 * container can work
2641 * fine */
2642 "/dev/null", "rwm",
2643 "/dev/zero", "rwm",
2644 "/dev/full", "rwm",
2645 "/dev/random", "rwm",
2646 "/dev/urandom", "rwm",
2647 "/dev/tty", "rwm",
864e1706 2648 "/dev/net/tun", "rwm",
9457ac5b
LP
2649 /* Allow the container
2650 * access to ptys. However,
2651 * do not permit the
2652 * container to ever create
2653 * these device nodes. */
2654 "/dev/pts/ptmx", "rw",
63cc4c31 2655 "char-pts", "rw");
f647962d 2656 if (r < 0)
27023c0e
LP
2657 return bus_log_create_error(r);
2658
ce5b3ad4
SJ
2659 for (j = 0; j < arg_n_custom_mounts; j++) {
2660 CustomMount *cm = &arg_custom_mounts[j];
2661
2662 if (cm->type != CUSTOM_MOUNT_BIND)
2663 continue;
2664
2665 r = is_device_node(cm->source);
2666 if (r < 0)
2667 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2668
2669 if (r) {
2670 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2671 cm->source, cm->read_only ? "r" : "rw");
2672 if (r < 0)
2673 return log_error_errno(r, "Failed to append message arguments: %m");
2674 }
2675 }
2676
27023c0e
LP
2677 if (arg_kill_signal != 0) {
2678 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2679 if (r < 0)
2680 return bus_log_create_error(r);
2681
2682 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2683 if (r < 0)
2684 return bus_log_create_error(r);
2685 }
9457ac5b 2686
f36933fe
LP
2687 STRV_FOREACH(i, arg_property) {
2688 r = sd_bus_message_open_container(m, 'r', "sv");
2689 if (r < 0)
2690 return bus_log_create_error(r);
2691
2692 r = bus_append_unit_property_assignment(m, *i);
2693 if (r < 0)
2694 return r;
2695
2696 r = sd_bus_message_close_container(m);
2697 if (r < 0)
2698 return bus_log_create_error(r);
2699 }
2700
9457ac5b 2701 r = sd_bus_message_close_container(m);
f647962d 2702 if (r < 0)
f36933fe 2703 return bus_log_create_error(r);
9457ac5b
LP
2704
2705 r = sd_bus_call(bus, m, 0, &error, NULL);
89f7c846
LP
2706 }
2707
9444b1f2 2708 if (r < 0) {
1f0cd86b
LP
2709 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2710 return r;
2711 }
2712
2713 return 0;
2714}
2715
2716static int terminate_machine(pid_t pid) {
2717 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2718 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
03976f7b 2719 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
1f0cd86b
LP
2720 const char *path;
2721 int r;
2722
eb91eb18
LP
2723 if (!arg_register)
2724 return 0;
2725
1a2399e5
LP
2726 /* If we are reusing the unit, then just exit, systemd will do
2727 * the right thing when we exit. */
2728 if (arg_keep_unit)
2729 return 0;
2730
76b54375 2731 r = sd_bus_default_system(&bus);
f647962d
MS
2732 if (r < 0)
2733 return log_error_errno(r, "Failed to open system bus: %m");
1f0cd86b
LP
2734
2735 r = sd_bus_call_method(
2736 bus,
2737 "org.freedesktop.machine1",
2738 "/org/freedesktop/machine1",
2739 "org.freedesktop.machine1.Manager",
2740 "GetMachineByPID",
2741 &error,
2742 &reply,
2743 "u",
2744 (uint32_t) pid);
2745 if (r < 0) {
2746 /* Note that the machine might already have been
2747 * cleaned up automatically, hence don't consider it a
2748 * failure if we cannot get the machine object. */
2749 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2750 return 0;
2751 }
2752
2753 r = sd_bus_message_read(reply, "o", &path);
5b30bef8
LP
2754 if (r < 0)
2755 return bus_log_parse_error(r);
9444b1f2 2756
1f0cd86b
LP
2757 r = sd_bus_call_method(
2758 bus,
2759 "org.freedesktop.machine1",
2760 path,
2761 "org.freedesktop.machine1.Machine",
2762 "Terminate",
2763 &error,
2764 NULL,
2765 NULL);
2766 if (r < 0) {
2767 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2768 return 0;
2769 }
2770
9444b1f2
LP
2771 return 0;
2772}
2773
db999e0f
LP
2774static int reset_audit_loginuid(void) {
2775 _cleanup_free_ char *p = NULL;
2776 int r;
2777
2778 if (arg_share_system)
2779 return 0;
2780
2781 r = read_one_line_file("/proc/self/loginuid", &p);
13e8ceb8 2782 if (r == -ENOENT)
db999e0f 2783 return 0;
f647962d
MS
2784 if (r < 0)
2785 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
db999e0f
LP
2786
2787 /* Already reset? */
2788 if (streq(p, "4294967295"))
2789 return 0;
2790
ad118bda 2791 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
db999e0f 2792 if (r < 0) {
10a87006
LP
2793 log_error_errno(r,
2794 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2795 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2796 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2797 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2798 "using systemd-nspawn. Sleeping for 5s... (%m)");
77b6e194 2799
db999e0f 2800 sleep(5);
77b6e194 2801 }
db999e0f
LP
2802
2803 return 0;
77b6e194
LP
2804}
2805
4f758c23
LP
2806#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2807#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
e867ceb6 2808#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
01dde061 2809
a90e2305 2810static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
01dde061
TG
2811 uint8_t result[8];
2812 size_t l, sz;
a90e2305
LP
2813 uint8_t *v, *i;
2814 int r;
01dde061
TG
2815
2816 l = strlen(arg_machine);
2817 sz = sizeof(sd_id128_t) + l;
e867ceb6
LP
2818 if (idx > 0)
2819 sz += sizeof(idx);
a90e2305 2820
01dde061
TG
2821 v = alloca(sz);
2822
2823 /* fetch some persistent data unique to the host */
2824 r = sd_id128_get_machine((sd_id128_t*) v);
2825 if (r < 0)
2826 return r;
2827
2828 /* combine with some data unique (on this host) to this
2829 * container instance */
a90e2305
LP
2830 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2831 if (idx > 0) {
2832 idx = htole64(idx);
2833 memcpy(i, &idx, sizeof(idx));
2834 }
01dde061
TG
2835
2836 /* Let's hash the host machine ID plus the container name. We
2837 * use a fixed, but originally randomly created hash key here. */
4f758c23 2838 siphash24(result, v, sz, hash_key.bytes);
01dde061
TG
2839
2840 assert_cc(ETH_ALEN <= sizeof(result));
2841 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2842
2843 /* see eth_random_addr in the kernel */
2844 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2845 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2846
2847 return 0;
2848}
2849
5aa4bb6b 2850static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1c4baffc
TG
2851 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2852 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4f758c23 2853 struct ether_addr mac_host, mac_container;
5aa4bb6b 2854 int r, i;
69c79d3c
LP
2855
2856 if (!arg_private_network)
2857 return 0;
2858
2859 if (!arg_network_veth)
2860 return 0;
2861
08af0da2
LP
2862 /* Use two different interface name prefixes depending whether
2863 * we are in bridge mode or not. */
c00524c9 2864 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
4212a337 2865 arg_network_bridge ? "vb" : "ve", arg_machine);
69c79d3c 2866
e867ceb6
LP
2867 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2868 if (r < 0)
2869 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
4f758c23 2870
e867ceb6
LP
2871 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2872 if (r < 0)
2873 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
01dde061 2874
1c4baffc 2875 r = sd_netlink_open(&rtnl);
f647962d
MS
2876 if (r < 0)
2877 return log_error_errno(r, "Failed to connect to netlink: %m");
69c79d3c 2878
151b9b96 2879 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
2880 if (r < 0)
2881 return log_error_errno(r, "Failed to allocate netlink message: %m");
69c79d3c 2882
1c4baffc 2883 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
f647962d
MS
2884 if (r < 0)
2885 return log_error_errno(r, "Failed to add netlink interface name: %m");
69c79d3c 2886
1c4baffc 2887 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
f647962d
MS
2888 if (r < 0)
2889 return log_error_errno(r, "Failed to add netlink MAC address: %m");
4f758c23 2890
1c4baffc 2891 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
2892 if (r < 0)
2893 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2894
1c4baffc 2895 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
f647962d
MS
2896 if (r < 0)
2897 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2898
1c4baffc 2899 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
f647962d
MS
2900 if (r < 0)
2901 return log_error_errno(r, "Failed to open netlink container: %m");
69c79d3c 2902
1c4baffc 2903 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
f647962d
MS
2904 if (r < 0)
2905 return log_error_errno(r, "Failed to add netlink interface name: %m");
01dde061 2906
1c4baffc 2907 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
f647962d
MS
2908 if (r < 0)
2909 return log_error_errno(r, "Failed to add netlink MAC address: %m");
69c79d3c 2910
1c4baffc 2911 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
2912 if (r < 0)
2913 return log_error_errno(r, "Failed to add netlink namespace field: %m");
69c79d3c 2914
1c4baffc 2915 r = sd_netlink_message_close_container(m);
f647962d
MS
2916 if (r < 0)
2917 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2918
1c4baffc 2919 r = sd_netlink_message_close_container(m);
f647962d
MS
2920 if (r < 0)
2921 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2922
1c4baffc 2923 r = sd_netlink_message_close_container(m);
f647962d
MS
2924 if (r < 0)
2925 return log_error_errno(r, "Failed to close netlink container: %m");
69c79d3c 2926
1c4baffc 2927 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d 2928 if (r < 0)
637aa8a3 2929 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
69c79d3c 2930
5aa4bb6b 2931 i = (int) if_nametoindex(iface_name);
4a62c710
MS
2932 if (i <= 0)
2933 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
5aa4bb6b
LP
2934
2935 *ifi = i;
2936
69c79d3c
LP
2937 return 0;
2938}
2939
5aa4bb6b 2940static int setup_bridge(const char veth_name[], int *ifi) {
1c4baffc
TG
2941 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2942 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
ab046dde
TG
2943 int r, bridge;
2944
2945 if (!arg_private_network)
2946 return 0;
2947
2948 if (!arg_network_veth)
2949 return 0;
2950
2951 if (!arg_network_bridge)
2952 return 0;
2953
2954 bridge = (int) if_nametoindex(arg_network_bridge);
4a62c710
MS
2955 if (bridge <= 0)
2956 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
ab046dde 2957
5aa4bb6b
LP
2958 *ifi = bridge;
2959
1c4baffc 2960 r = sd_netlink_open(&rtnl);
f647962d
MS
2961 if (r < 0)
2962 return log_error_errno(r, "Failed to connect to netlink: %m");
ab046dde 2963
151b9b96 2964 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
f647962d
MS
2965 if (r < 0)
2966 return log_error_errno(r, "Failed to allocate netlink message: %m");
ab046dde 2967
039dd4af 2968 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
f647962d
MS
2969 if (r < 0)
2970 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
039dd4af 2971
1c4baffc 2972 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
f647962d
MS
2973 if (r < 0)
2974 return log_error_errno(r, "Failed to add netlink interface name field: %m");
ab046dde 2975
1c4baffc 2976 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
f647962d
MS
2977 if (r < 0)
2978 return log_error_errno(r, "Failed to add netlink master field: %m");
ab046dde 2979
1c4baffc 2980 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
2981 if (r < 0)
2982 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
ab046dde
TG
2983
2984 return 0;
2985}
2986
c74e630d
LP
2987static int parse_interface(struct udev *udev, const char *name) {
2988 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2989 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2990 int ifi;
2991
2992 ifi = (int) if_nametoindex(name);
4a62c710
MS
2993 if (ifi <= 0)
2994 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
c74e630d
LP
2995
2996 sprintf(ifi_str, "n%i", ifi);
2997 d = udev_device_new_from_device_id(udev, ifi_str);
4a62c710
MS
2998 if (!d)
2999 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
c74e630d
LP
3000
3001 if (udev_device_get_is_initialized(d) <= 0) {
3002 log_error("Network interface %s is not initialized yet.", name);
3003 return -EBUSY;
3004 }
3005
3006 return ifi;
3007}
3008
69c79d3c 3009static int move_network_interfaces(pid_t pid) {
7e227024 3010 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 3011 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
aa28aefe
LP
3012 char **i;
3013 int r;
3014
3015 if (!arg_private_network)
3016 return 0;
3017
3018 if (strv_isempty(arg_network_interfaces))
3019 return 0;
3020
1c4baffc 3021 r = sd_netlink_open(&rtnl);
f647962d
MS
3022 if (r < 0)
3023 return log_error_errno(r, "Failed to connect to netlink: %m");
aa28aefe 3024
7e227024
LP
3025 udev = udev_new();
3026 if (!udev) {
3027 log_error("Failed to connect to udev.");
3028 return -ENOMEM;
3029 }
3030
aa28aefe 3031 STRV_FOREACH(i, arg_network_interfaces) {
1c4baffc 3032 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
b88eb17a 3033 int ifi;
aa28aefe 3034
c74e630d
LP
3035 ifi = parse_interface(udev, *i);
3036 if (ifi < 0)
3037 return ifi;
3038
3125b3ef 3039 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
f647962d
MS
3040 if (r < 0)
3041 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 3042
1c4baffc 3043 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
3044 if (r < 0)
3045 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
7e227024 3046
1c4baffc 3047 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
3048 if (r < 0)
3049 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
c74e630d 3050 }
7e227024 3051
c74e630d
LP
3052 return 0;
3053}
3054
3055static int setup_macvlan(pid_t pid) {
3056 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 3057 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
e867ceb6 3058 unsigned idx = 0;
c74e630d
LP
3059 char **i;
3060 int r;
3061
3062 if (!arg_private_network)
3063 return 0;
3064
3065 if (strv_isempty(arg_network_macvlan))
3066 return 0;
3067
1c4baffc 3068 r = sd_netlink_open(&rtnl);
f647962d
MS
3069 if (r < 0)
3070 return log_error_errno(r, "Failed to connect to netlink: %m");
c74e630d
LP
3071
3072 udev = udev_new();
3073 if (!udev) {
3074 log_error("Failed to connect to udev.");
3075 return -ENOMEM;
3076 }
3077
3078 STRV_FOREACH(i, arg_network_macvlan) {
1c4baffc 3079 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
c74e630d 3080 _cleanup_free_ char *n = NULL;
e867ceb6 3081 struct ether_addr mac;
c74e630d
LP
3082 int ifi;
3083
3084 ifi = parse_interface(udev, *i);
3085 if (ifi < 0)
3086 return ifi;
3087
e867ceb6
LP
3088 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
3089 if (r < 0)
3090 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
3091
c74e630d 3092 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
f647962d
MS
3093 if (r < 0)
3094 return log_error_errno(r, "Failed to allocate netlink message: %m");
aa28aefe 3095
1c4baffc 3096 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
f647962d
MS
3097 if (r < 0)
3098 return log_error_errno(r, "Failed to add netlink interface index: %m");
c74e630d
LP
3099
3100 n = strappend("mv-", *i);
3101 if (!n)
3102 return log_oom();
3103
3104 strshorten(n, IFNAMSIZ-1);
3105
1c4baffc 3106 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
f647962d
MS
3107 if (r < 0)
3108 return log_error_errno(r, "Failed to add netlink interface name: %m");
c74e630d 3109
1c4baffc 3110 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
e867ceb6
LP
3111 if (r < 0)
3112 return log_error_errno(r, "Failed to add netlink MAC address: %m");
3113
1c4baffc 3114 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
f647962d
MS
3115 if (r < 0)
3116 return log_error_errno(r, "Failed to add netlink namespace field: %m");
c74e630d 3117
1c4baffc 3118 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
f647962d
MS
3119 if (r < 0)
3120 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 3121
1c4baffc 3122 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
f647962d
MS
3123 if (r < 0)
3124 return log_error_errno(r, "Failed to open netlink container: %m");
c74e630d 3125
1c4baffc 3126 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
f647962d
MS
3127 if (r < 0)
3128 return log_error_errno(r, "Failed to append macvlan mode: %m");
c74e630d 3129
1c4baffc 3130 r = sd_netlink_message_close_container(m);
f647962d
MS
3131 if (r < 0)
3132 return log_error_errno(r, "Failed to close netlink container: %m");
c74e630d 3133
1c4baffc 3134 r = sd_netlink_message_close_container(m);
f647962d
MS
3135 if (r < 0)
3136 return log_error_errno(r, "Failed to close netlink container: %m");
aa28aefe 3137
1c4baffc 3138 r = sd_netlink_call(rtnl, m, 0, NULL);
f647962d
MS
3139 if (r < 0)
3140 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
aa28aefe
LP
3141 }
3142
3143 return 0;
3144}
3145
4bbfe7ad
TG
3146static int setup_ipvlan(pid_t pid) {
3147 _cleanup_udev_unref_ struct udev *udev = NULL;
1c4baffc 3148 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4bbfe7ad
TG
3149 char **i;
3150 int r;
3151
3152 if (!arg_private_network)
3153 return 0;
3154
3155 if (strv_isempty(arg_network_ipvlan))
3156 return 0;
3157
1c4baffc 3158 r = sd_netlink_open(&rtnl);
4bbfe7ad
TG
3159 if (r < 0)
3160 return log_error_errno(r, "Failed to connect to netlink: %m");
3161
3162 udev = udev_new();
3163 if (!udev) {
3164 log_error("Failed to connect to udev.");
3165 return -ENOMEM;
3166 }
3167
3168 STRV_FOREACH(i, arg_network_ipvlan) {
1c4baffc 3169 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
4bbfe7ad
TG
3170 _cleanup_free_ char *n = NULL;
3171 int ifi;
3172
3173 ifi = parse_interface(udev, *i);
3174 if (ifi < 0)
3175 return ifi;
3176
3177 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
3178 if (r < 0)
3179 return log_error_errno(r, "Failed to allocate netlink message: %m");
3180
1c4baffc 3181 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
4bbfe7ad
TG
3182 if (r < 0)
3183 return log_error_errno(r, "Failed to add netlink interface index: %m");
3184
3185 n = strappend("iv-", *i);
3186 if (!n)
3187 return log_oom();
3188
3189 strshorten(n, IFNAMSIZ-1);
3190
1c4baffc 3191 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
4bbfe7ad
TG
3192 if (r < 0)
3193 return log_error_errno(r, "Failed to add netlink interface name: %m");
3194
1c4baffc 3195 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
4bbfe7ad
TG
3196 if (r < 0)
3197 return log_error_errno(r, "Failed to add netlink namespace field: %m");
3198
1c4baffc 3199 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
4bbfe7ad
TG
3200 if (r < 0)
3201 return log_error_errno(r, "Failed to open netlink container: %m");
3202
1c4baffc 3203 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
4bbfe7ad
TG
3204 if (r < 0)
3205 return log_error_errno(r, "Failed to open netlink container: %m");
3206
1c4baffc 3207 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
4bbfe7ad
TG
3208 if (r < 0)
3209 return log_error_errno(r, "Failed to add ipvlan mode: %m");
3210
1c4baffc 3211 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
3212 if (r < 0)
3213 return log_error_errno(r, "Failed to close netlink container: %m");
3214
1c4baffc 3215 r = sd_netlink_message_close_container(m);
4bbfe7ad
TG
3216 if (r < 0)
3217 return log_error_errno(r, "Failed to close netlink container: %m");
3218
1c4baffc 3219 r = sd_netlink_call(rtnl, m, 0, NULL);
4bbfe7ad
TG
3220 if (r < 0)
3221 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
3222 }
3223
3224 return 0;
3225}
3226
/* Builds and loads a seccomp filter with a default action of "allow" that
 *   - makes the syscalls in the table below fail with EPERM, unless the capability listed next to
 *     them is set in arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success, when built without HAVE_SECCOMP, or when seccomp_load() reports -EINVAL
 * (logged at debug level only); otherwise a negative error. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* capability -> syscall that is blocked when that capability is not retained */
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* Capability retained: leave the syscall alone */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                /* -EFAULT is treated as "unknown syscall" and skipped; anything else is fatal */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work inside containers, and much of the userspace audit hookup fails
         * there. We do not care and simply prevent audit sockets from being created.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with EAFNOSUPPORT, which audit
         * userspace takes as the indication that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3321
785890ac
LP
3322static int setup_propagate(const char *root) {
3323 const char *p, *q;
3324
3325 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3326 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
63c372cb 3327 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
785890ac
LP
3328 (void) mkdir_p(p, 0600);
3329
03cfe0d5
LP
3330 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3331 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3332
3333 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3334 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3335
3336 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3337 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
785890ac 3338
03cfe0d5 3339 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
785890ac
LP
3340 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3341 return log_error_errno(errno, "Failed to install propagation bind mount.");
3342
3343 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3344 return log_error_errno(errno, "Failed to make propagation mount read-only");
3345
3346 return 0;
3347}
3348
1b9e5b12
LP
3349static int setup_image(char **device_path, int *loop_nr) {
3350 struct loop_info64 info = {
3351 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3352 };
3353 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3354 _cleanup_free_ char* loopdev = NULL;
3355 struct stat st;
3356 int r, nr;
3357
3358 assert(device_path);
3359 assert(loop_nr);
ec16945e 3360 assert(arg_image);
1b9e5b12
LP
3361
3362 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
3363 if (fd < 0)
3364 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
1b9e5b12 3365
4a62c710
MS
3366 if (fstat(fd, &st) < 0)
3367 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
1b9e5b12
LP
3368
3369 if (S_ISBLK(st.st_mode)) {
3370 char *p;
3371
3372 p = strdup(arg_image);
3373 if (!p)
3374 return log_oom();
3375
3376 *device_path = p;
3377
3378 *loop_nr = -1;
3379
3380 r = fd;
3381 fd = -1;
3382
3383 return r;
3384 }
3385
3386 if (!S_ISREG(st.st_mode)) {
56f64d95 3387 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
1b9e5b12
LP
3388 return -EINVAL;
3389 }
3390
3391 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
4a62c710
MS
3392 if (control < 0)
3393 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12
LP
3394
3395 nr = ioctl(control, LOOP_CTL_GET_FREE);
4a62c710
MS
3396 if (nr < 0)
3397 return log_error_errno(errno, "Failed to allocate loop device: %m");
1b9e5b12
LP
3398
3399 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3400 return log_oom();
3401
3402 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
4a62c710
MS
3403 if (loop < 0)
3404 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
1b9e5b12 3405
4a62c710
MS
3406 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3407 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
1b9e5b12
LP
3408
3409 if (arg_read_only)
3410 info.lo_flags |= LO_FLAGS_READ_ONLY;
3411
4a62c710
MS
3412 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3413 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
1b9e5b12
LP
3414
3415 *device_path = loopdev;
3416 loopdev = NULL;
3417
3418 *loop_nr = nr;
3419
3420 r = loop;
3421 loop = -1;
3422
3423 return r;
3424}
3425
ada4799a
LP
/* Explanatory text appended to the error messages logged when a disk image's partition table
 * cannot be used. One string literal per line of output. */
#define PARTITION_TABLE_BLURB                                                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"                                     \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                                         \
        "to be bootable with systemd-nspawn."
3432
1b9e5b12
LP
/* Inspects the partition table of the block device open as fd and picks the partitions to mount.
 *
 * fd          - open block device (image or loop device)
 * root_device - receives the device node of the root partition (newly allocated)
 * home_device - receives the device node of the GPT_HOME partition, if one was found
 * srv_device  - receives the device node of the GPT_SRV partition, if one was found
 * *_device_rw - receive whether the respective partition may be mounted writable
 * secondary   - set to true if the root partition is of type GPT_ROOT_SECONDARY, false otherwise
 *
 * Root selection order: GPT_ROOT_NATIVE, then GPT_ROOT_SECONDARY, then a single "generic" Linux
 * partition (GPT_LINUX_GENERIC on GPT, bootable type 0x83 on MBR). If more than one generic
 * partition exists and no dedicated root partition was found, the image is rejected. Where
 * several partitions share a dedicated type, the one with the lowest partition number wins.
 *
 * Before the partitions are examined, the function waits (up to 10 rounds, re-reading the
 * partition table via BLKRRPART when needed) until the kernel exposes as many partition devices
 * as blkid found.
 *
 * Returns 0 on success, a negative error otherwise; -EOPNOTSUPP when built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The enumeration above is compared
                 * against m + 1, i.e. it is expected to contain one entry more than blkid's
                 * partition count. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Among several candidates keep the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as the "generic" candidate. Storing it
                                 * in "root" instead would leave "generic" unset forever, so a
                                 * second bootable Linux partition would never be noticed and
                                 * the last one enumerated would silently be used as root. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
/* Probes the file system type of block device "what" with blkid and mounts it.
 *
 * what      - device node to mount
 * where     - base directory of the mount point
 * directory - optional path appended to "where" to form the mount point; may be NULL
 * rw        - whether to mount writable; forced to false if arg_read_only is set
 *
 * The mount always uses MS_NODEV. LUKS containers ("crypto_LUKS") are rejected with -EOPNOTSUPP.
 * Returns 0 on success, a negative error otherwise; -EOPNOTSUPP when built without HAVE_BLKID. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if the result is
                 * ambivalent; both mean the type cannot be determined. -1 is a genuine probing
                 * error and is handled, together with errno, by the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3869
727fd4fd
LP
3870static int mount_devices(
3871 const char *where,
3872 const char *root_device, bool root_device_rw,
3873 const char *home_device, bool home_device_rw,
3874 const char *srv_device, bool srv_device_rw) {
1b9e5b12
LP
3875 int r;
3876
3877 assert(where);
3878
3879 if (root_device) {
727fd4fd 3880 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f647962d
MS
3881 if (r < 0)
3882 return log_error_errno(r, "Failed to mount root directory: %m");
1b9e5b12
LP
3883 }
3884
3885 if (home_device) {
727fd4fd 3886 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f647962d
MS
3887 if (r < 0)
3888 return log_error_errno(r, "Failed to mount home directory: %m");
1b9e5b12
LP
3889 }
3890
3891 if (srv_device) {
727fd4fd 3892 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f647962d
MS
3893 if (r < 0)
3894 return log_error_errno(r, "Failed to mount server data directory: %m");
1b9e5b12
LP
3895 }
3896
3897 return 0;
3898}
3899
3900static void loop_remove(int nr, int *image_fd) {
3901 _cleanup_close_ int control = -1;
e8c8ddcc 3902 int r;
1b9e5b12
LP
3903
3904 if (nr < 0)
3905 return;
3906
3907 if (image_fd && *image_fd >= 0) {
e8c8ddcc
TG
3908 r = ioctl(*image_fd, LOOP_CLR_FD);
3909 if (r < 0)
5e4074aa 3910 log_debug_errno(errno, "Failed to close loop image: %m");
03e334a1 3911 *image_fd = safe_close(*image_fd);
1b9e5b12
LP
3912 }
3913
3914 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
e8c8ddcc 3915 if (control < 0) {
56f64d95 3916 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
1b9e5b12 3917 return;
e8c8ddcc 3918 }
1b9e5b12 3919
e8c8ddcc
TG
3920 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3921 if (r < 0)
5e4074aa 3922 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
1b9e5b12
LP
3923}
3924
0cb9fbcd
LP
3925static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3926 int pipe_fds[2];
3927 pid_t pid;
3928
3929 assert(database);
3930 assert(key);
3931 assert(rpid);
3932
4a62c710
MS
3933 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3934 return log_error_errno(errno, "Failed to allocate pipe: %m");
0cb9fbcd
LP
3935
3936 pid = fork();
4a62c710
MS
3937 if (pid < 0)
3938 return log_error_errno(errno, "Failed to fork getent child: %m");
3939 else if (pid == 0) {
0cb9fbcd
LP
3940 int nullfd;
3941 char *empty_env = NULL;
3942
3943 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3944 _exit(EXIT_FAILURE);
3945
3946 if (pipe_fds[0] > 2)
03e334a1 3947 safe_close(pipe_fds[0]);
0cb9fbcd 3948 if (pipe_fds[1] > 2)
03e334a1 3949 safe_close(pipe_fds[1]);
0cb9fbcd
LP
3950
3951 nullfd = open("/dev/null", O_RDWR);
3952 if (nullfd < 0)
3953 _exit(EXIT_FAILURE);
3954
3955 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3956 _exit(EXIT_FAILURE);
3957
3958 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3959 _exit(EXIT_FAILURE);
3960
3961 if (nullfd > 2)
03e334a1 3962 safe_close(nullfd);
0cb9fbcd 3963
ce30c8dc
LP
3964 (void) reset_all_signal_handlers();
3965 (void) reset_signal_mask();
0cb9fbcd
LP
3966 close_all_fds(NULL, 0);
3967
4de82926
MM
3968 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3969 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
0cb9fbcd
LP
3970 _exit(EXIT_FAILURE);
3971 }
3972
03e334a1 3973 pipe_fds[1] = safe_close(pipe_fds[1]);
0cb9fbcd
LP
3974
3975 *rpid = pid;
3976
3977 return pipe_fds[0];
3978}
3979
3980static int change_uid_gid(char **_home) {
a2a5291b
ZJS
3981 char line[LINE_MAX], *x, *u, *g, *h;
3982 const char *word, *state;
0cb9fbcd
LP
3983 _cleanup_free_ uid_t *uids = NULL;
3984 _cleanup_free_ char *home = NULL;
3985 _cleanup_fclose_ FILE *f = NULL;
3986 _cleanup_close_ int fd = -1;
3987 unsigned n_uids = 0;
70f539ca 3988 size_t sz = 0, l;
0cb9fbcd
LP
3989 uid_t uid;
3990 gid_t gid;
3991 pid_t pid;
3992 int r;
3993
3994 assert(_home);
3995
3996 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3997 /* Reset everything fully to 0, just in case */
3998
03cfe0d5
LP
3999 r = reset_uid_gid();
4000 if (r < 0)
4001 return log_error_errno(r, "Failed to become root: %m");
0cb9fbcd
LP
4002
4003 *_home = NULL;
4004 return 0;
4005 }
4006
4007 /* First, get user credentials */
4008 fd = spawn_getent("passwd", arg_user, &pid);
4009 if (fd < 0)
4010 return fd;
4011
4012 f = fdopen(fd, "r");
4013 if (!f)
4014 return log_oom();
4015 fd = -1;
4016
4017 if (!fgets(line, sizeof(line), f)) {
4018
4019 if (!ferror(f)) {
4020 log_error("Failed to resolve user %s.", arg_user);
4021 return -ESRCH;
4022 }
4023
56f64d95 4024 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
4025 return -errno;
4026 }
4027
4028 truncate_nl(line);
4029
820d3acf 4030 wait_for_terminate_and_warn("getent passwd", pid, true);
0cb9fbcd
LP
4031
4032 x = strchr(line, ':');
4033 if (!x) {
4034 log_error("/etc/passwd entry has invalid user field.");
4035 return -EIO;
4036 }
4037
4038 u = strchr(x+1, ':');
4039 if (!u) {
4040 log_error("/etc/passwd entry has invalid password field.");
4041 return -EIO;
4042 }
4043
4044 u++;
4045 g = strchr(u, ':');
4046 if (!g) {
4047 log_error("/etc/passwd entry has invalid UID field.");
4048 return -EIO;
4049 }
4050
4051 *g = 0;
4052 g++;
4053 x = strchr(g, ':');
4054 if (!x) {
4055 log_error("/etc/passwd entry has invalid GID field.");
4056 return -EIO;
4057 }
4058
4059 *x = 0;
4060 h = strchr(x+1, ':');
4061 if (!h) {
4062 log_error("/etc/passwd entry has invalid GECOS field.");
4063 return -EIO;
4064 }
4065
4066 h++;
4067 x = strchr(h, ':');
4068 if (!x) {
4069 log_error("/etc/passwd entry has invalid home directory field.");
4070 return -EIO;
4071 }
4072
4073 *x = 0;
4074
4075 r = parse_uid(u, &uid);
4076 if (r < 0) {
4077 log_error("Failed to parse UID of user.");
4078 return -EIO;
4079 }
4080
4081 r = parse_gid(g, &gid);
4082 if (r < 0) {
4083 log_error("Failed to parse GID of user.");
4084 return -EIO;
4085 }
4086
4087 home = strdup(h);
4088 if (!home)
4089 return log_oom();
4090
4091 /* Second, get group memberships */
4092 fd = spawn_getent("initgroups", arg_user, &pid);
4093 if (fd < 0)
4094 return fd;
4095
4096 fclose(f);
4097 f = fdopen(fd, "r");
4098 if (!f)
4099 return log_oom();
4100 fd = -1;
4101
4102 if (!fgets(line, sizeof(line), f)) {
4103 if (!ferror(f)) {
4104 log_error("Failed to resolve user %s.", arg_user);
4105 return -ESRCH;
4106 }
4107
56f64d95 4108 log_error_errno(errno, "Failed to read from getent: %m");
0cb9fbcd
LP
4109 return -errno;
4110 }
4111
4112 truncate_nl(line);
4113
820d3acf 4114 wait_for_terminate_and_warn("getent initgroups", pid, true);
0cb9fbcd
LP
4115
4116 /* Skip over the username and subsequent separator whitespace */
4117 x = line;
4118 x += strcspn(x, WHITESPACE);
4119 x += strspn(x, WHITESPACE);
4120
a2a5291b 4121 FOREACH_WORD(word, l, x, state) {
0cb9fbcd
LP
4122 char c[l+1];
4123
a2a5291b 4124 memcpy(c, word, l);
0cb9fbcd
LP
4125 c[l] = 0;
4126
4127 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
4128 return log_oom();
4129
4130 r = parse_uid(c, &uids[n_uids++]);
4131 if (r < 0) {
4132 log_error("Failed to parse group data from getent.");
4133 return -EIO;
4134 }
4135 }
4136
4137 r = mkdir_parents(home, 0775);
f647962d
MS
4138 if (r < 0)
4139 return log_error_errno(r, "Failed to make home root directory: %m");
0cb9fbcd
LP
4140
4141 r = mkdir_safe(home, 0755, uid, gid);
f647962d
MS
4142 if (r < 0 && r != -EEXIST)
4143 return log_error_errno(r, "Failed to make home directory: %m");
0cb9fbcd 4144
03cfe0d5
LP
4145 (void) fchown(STDIN_FILENO, uid, gid);
4146 (void) fchown(STDOUT_FILENO, uid, gid);
4147 (void) fchown(STDERR_FILENO, uid, gid);
0cb9fbcd 4148
4a62c710
MS
4149 if (setgroups(n_uids, uids) < 0)
4150 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
0cb9fbcd 4151
4a62c710
MS
4152 if (setresgid(gid, gid, gid) < 0)
4153 return log_error_errno(errno, "setregid() failed: %m");
0cb9fbcd 4154
4a62c710
MS
4155 if (setresuid(uid, uid, uid) < 0)
4156 return log_error_errno(errno, "setreuid() failed: %m");
0cb9fbcd
LP
4157
4158 if (_home) {
4159 *_home = home;
4160 home = NULL;
4161 }
4162
4163 return 0;
4164}
4165
113cea80 4166/*
6d416b9c
LS
4167 * Return values:
4168 * < 0 : wait_for_terminate() failed to get the state of the
4169 * container, the container was terminated by a signal, or
4170 * failed for an unknown reason. No change is made to the
4171 * container argument.
4172 * > 0 : The program executed in the container terminated with an
4173 * error. The exit code of the program executed in the
919699ec
LP
4174 * container is returned. The container argument has been set
4175 * to CONTAINER_TERMINATED.
6d416b9c
LS
4176 * 0 : The container is being rebooted, has been shut down or exited
4177 * successfully. The container argument has been set to either
4178 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
113cea80 4179 *
6d416b9c
LS
4180 * That is, success is indicated by a return value of zero, and an
4181 * error is indicated by a non-zero value.
113cea80
DH
4182 */
4183static int wait_for_container(pid_t pid, ContainerStatus *container) {
113cea80 4184 siginfo_t status;
919699ec 4185 int r;
113cea80
DH
4186
4187 r = wait_for_terminate(pid, &status);
f647962d
MS
4188 if (r < 0)
4189 return log_warning_errno(r, "Failed to wait for container: %m");
113cea80
DH
4190
4191 switch (status.si_code) {
fddbb89c 4192
113cea80 4193 case CLD_EXITED:
919699ec
LP
4194 if (status.si_status == 0) {
4195 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
113cea80 4196
fddbb89c 4197 } else
919699ec 4198 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
fddbb89c 4199
919699ec
LP
4200 *container = CONTAINER_TERMINATED;
4201 return status.si_status;
113cea80
DH
4202
4203 case CLD_KILLED:
4204 if (status.si_status == SIGINT) {
113cea80 4205
919699ec 4206 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
113cea80 4207 *container = CONTAINER_TERMINATED;
919699ec
LP
4208 return 0;
4209
113cea80 4210 } else if (status.si_status == SIGHUP) {
113cea80 4211
919699ec 4212 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
113cea80 4213 *container = CONTAINER_REBOOTED;
919699ec 4214 return 0;
113cea80 4215 }
919699ec 4216
113cea80
DH
4217 /* CLD_KILLED fallthrough */
4218
4219 case CLD_DUMPED:
fddbb89c 4220 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
919699ec 4221 return -EIO;
113cea80
DH
4222
4223 default:
fddbb89c 4224 log_error("Container %s failed due to unknown reason.", arg_machine);
919699ec 4225 return -EIO;
113cea80
DH
4226 }
4227
4228 return r;
4229}
4230
e866af3a
DH
/* Signal handler that deliberately does nothing with the signal it receives. */
static void nop_handler(int sig) {
        (void) sig;
}
4232
023fb90b
LP
4233static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
4234 pid_t pid;
4235
4236 pid = PTR_TO_UINT32(userdata);
4237 if (pid > 0) {
c6c8f6e2 4238 if (kill(pid, arg_kill_signal) >= 0) {
023fb90b
LP
4239 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
4240 sd_event_source_set_userdata(s, NULL);
4241 return 0;
4242 }
4243 }
4244
4245 sd_event_exit(sd_event_source_get_event(s), 0);
4246 return 0;
4247}
4248
ec16945e 4249static int determine_names(void) {
1b9cebf6 4250 int r;
ec16945e 4251
c1521918
LP
4252 if (arg_template && !arg_directory && arg_machine) {
4253
4254 /* If --template= was specified then we should not
4255 * search for a machine, but instead create a new one
4256 * in /var/lib/machine. */
4257
4258 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
4259 if (!arg_directory)
4260 return log_oom();
4261 }
4262
ec16945e 4263 if (!arg_image && !arg_directory) {
1b9cebf6
LP
4264 if (arg_machine) {
4265 _cleanup_(image_unrefp) Image *i = NULL;
4266
4267 r = image_find(arg_machine, &i);
4268 if (r < 0)
4269 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
4270 else if (r == 0) {
4271 log_error("No image for machine '%s': %m", arg_machine);
4272 return -ENOENT;
4273 }
4274
aceac2f0 4275 if (i->type == IMAGE_RAW)
1b9cebf6
LP
4276 r = set_sanitized_path(&arg_image, i->path);
4277 else
4278 r = set_sanitized_path(&arg_directory, i->path);
4279 if (r < 0)
4280 return log_error_errno(r, "Invalid image directory: %m");
4281
aee327b8
LP
4282 if (!arg_ephemeral)
4283 arg_read_only = arg_read_only || i->read_only;
1b9cebf6 4284 } else
ec16945e
LP
4285 arg_directory = get_current_dir_name();
4286
1b9cebf6
LP
4287 if (!arg_directory && !arg_machine) {
4288 log_error("Failed to determine path, please use -D or -i.");
ec16945e
LP
4289 return -EINVAL;
4290 }
4291 }
4292
4293 if (!arg_machine) {
b9ba4dab
LP
4294 if (arg_directory && path_equal(arg_directory, "/"))
4295 arg_machine = gethostname_malloc();
4296 else
4297 arg_machine = strdup(basename(arg_image ?: arg_directory));
4298
ec16945e
LP
4299 if (!arg_machine)
4300 return log_oom();
4301
ae691c1d 4302 hostname_cleanup(arg_machine);
ec16945e
LP
4303 if (!machine_name_is_valid(arg_machine)) {
4304 log_error("Failed to determine machine name automatically, please use -M.");
4305 return -EINVAL;
4306 }
b9ba4dab
LP
4307
4308 if (arg_ephemeral) {
4309 char *b;
4310
4311 /* Add a random suffix when this is an
4312 * ephemeral machine, so that we can run many
4313 * instances at once without manually having
4314 * to specify -M each time. */
4315
4316 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4317 return log_oom();
4318
4319 free(arg_machine);
4320 arg_machine = b;
4321 }
ec16945e
LP
4322 }
4323
4324 return 0;
4325}
4326
03cfe0d5 4327static int determine_uid_shift(const char *directory) {
6dac160c
LP
4328 int r;
4329
03cfe0d5
LP
4330 if (!arg_userns) {
4331 arg_uid_shift = 0;
6dac160c 4332 return 0;
03cfe0d5 4333 }
6dac160c
LP
4334
4335 if (arg_uid_shift == UID_INVALID) {
4336 struct stat st;
4337
03cfe0d5 4338 r = stat(directory, &st);
6dac160c 4339 if (r < 0)
03cfe0d5 4340 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
6dac160c
LP
4341
4342 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4343
4344 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
03cfe0d5 4345 log_error("UID and GID base of %s don't match.", directory);
6dac160c
LP
4346 return -EINVAL;
4347 }
4348
4349 arg_uid_range = UINT32_C(0x10000);
4350 }
4351
4352 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4353 log_error("UID base too high for UID range.");
4354 return -EINVAL;
4355 }
4356
4357 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4358 return 0;
4359}
4360
03cfe0d5
LP
4361static int inner_child(
4362 Barrier *barrier,
4363 const char *directory,
4364 bool secondary,
4365 int kmsg_socket,
4366 int rtnl_socket,
f757855e 4367 FDSet *fds) {
69c79d3c 4368
03cfe0d5
LP
4369 _cleanup_free_ char *home = NULL;
4370 unsigned n_env = 2;
4371 const char *envp[] = {
4372 "PATH=" DEFAULT_PATH_SPLIT_USR,
4373 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4374 NULL, /* TERM */
4375 NULL, /* HOME */
4376 NULL, /* USER */
4377 NULL, /* LOGNAME */
4378 NULL, /* container_uuid */
4379 NULL, /* LISTEN_FDS */
4380 NULL, /* LISTEN_PID */
4381 NULL
4382 };
88213476 4383
2371271c 4384 _cleanup_strv_free_ char **env_use = NULL;
03cfe0d5 4385 int r;
88213476 4386
03cfe0d5
LP
4387 assert(barrier);
4388 assert(directory);
4389 assert(kmsg_socket >= 0);
88213476 4390
efdb0237
LP
4391 cg_unified_flush();
4392
03cfe0d5
LP
4393 if (arg_userns) {
4394 /* Tell the parent, that it now can write the UID map. */
4395 (void) barrier_place(barrier); /* #1 */
7027ff61 4396
03cfe0d5
LP
4397 /* Wait until the parent wrote the UID map */
4398 if (!barrier_place_and_sync(barrier)) { /* #2 */
4399 log_error("Parent died too early");
4400 return -ESRCH;
4401 }
88213476
LP
4402 }
4403
03cfe0d5
LP
4404 r = mount_all(NULL, true);
4405 if (r < 0)
4406 return r;
4407
4408 /* Wait until we are cgroup-ified, so that we
4409 * can mount the right cgroup path writable */
4410 if (!barrier_place_and_sync(barrier)) { /* #3 */
4411 log_error("Parent died too early");
4412 return -ESRCH;
88213476
LP
4413 }
4414
03cfe0d5
LP
4415 r = mount_systemd_cgroup_writable("");
4416 if (r < 0)
4417 return r;
ec16945e 4418
03cfe0d5
LP
4419 r = reset_uid_gid();
4420 if (r < 0)
4421 return log_error_errno(r, "Couldn't become new root: %m");
1b9e5b12 4422
03cfe0d5
LP
4423 r = setup_boot_id(NULL);
4424 if (r < 0)
4425 return r;
ec16945e 4426
03cfe0d5
LP
4427 r = setup_kmsg(NULL, kmsg_socket);
4428 if (r < 0)
4429 return r;
4430 kmsg_socket = safe_close(kmsg_socket);
ec16945e 4431
03cfe0d5 4432 umask(0022);
30535c16 4433
03cfe0d5
LP
4434 if (setsid() < 0)
4435 return log_error_errno(errno, "setsid() failed: %m");
4436
4437 if (arg_private_network)
4438 loopback_setup();
4439
4440 r = send_rtnl(rtnl_socket);
4441 if (r < 0)
4442 return r;
4443 rtnl_socket = safe_close(rtnl_socket);
4444
4445 if (drop_capabilities() < 0)
4446 return log_error_errno(errno, "drop_capabilities() failed: %m");
4447
4448 setup_hostname();
4449
050f7277 4450 if (arg_personality != PERSONALITY_INVALID) {
03cfe0d5
LP
4451 if (personality(arg_personality) < 0)
4452 return log_error_errno(errno, "personality() failed: %m");
4453 } else if (secondary) {
4454 if (personality(PER_LINUX32) < 0)
4455 return log_error_errno(errno, "personality() failed: %m");
4456 }
4457
4458#ifdef HAVE_SELINUX
4459 if (arg_selinux_context)
4460 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4461 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4462#endif
4463
4464 r = change_uid_gid(&home);
4465 if (r < 0)
4466 return r;
4467
4468 envp[n_env] = strv_find_prefix(environ, "TERM=");
4469 if (envp[n_env])
4470 n_env ++;
4471
4472 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4473 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4474 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4475 return log_oom();
4476
4477 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4478 char as_uuid[37];
4479
4480 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4481 return log_oom();
4482 }
4483
4484 if (fdset_size(fds) > 0) {
4485 r = fdset_cloexec(fds, false);
4486 if (r < 0)
4487 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4488
4489 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4490 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4491 return log_oom();
4492 }
4493
2371271c
TG
4494 env_use = strv_env_merge(2, envp, arg_setenv);
4495 if (!env_use)
4496 return log_oom();
03cfe0d5
LP
4497
4498 /* Let the parent know that we are ready and
4499 * wait until the parent is ready with the
4500 * setup, too... */
4501 if (!barrier_place_and_sync(barrier)) { /* #4 */
4502 log_error("Parent died too early");
4503 return -ESRCH;
4504 }
4505
4506 /* Now, explicitly close the log, so that we
4507 * then can close all remaining fds. Closing
4508 * the log explicitly first has the benefit
4509 * that the logging subsystem knows about it,
4510 * and is thus ready to be reopened should we
4511 * need it again. Note that the other fds
4512 * closed here are at least the locking and
4513 * barrier fds. */
4514 log_close();
4515 (void) fdset_close_others(fds);
4516
4517 if (arg_boot) {
4518 char **a;
4519 size_t m;
4520
4521 /* Automatically search for the init system */
4522
f757855e 4523 m = 1 + strv_length(arg_parameters);
03cfe0d5 4524 a = newa(char*, m + 1);
f757855e
LP
4525 if (strv_isempty(arg_parameters))
4526 a[1] = NULL;
4527 else
4528 memcpy(a + 1, arg_parameters, m * sizeof(char*));
03cfe0d5
LP
4529
4530 a[0] = (char*) "/usr/lib/systemd/systemd";
4531 execve(a[0], a, env_use);
4532
4533 a[0] = (char*) "/lib/systemd/systemd";
4534 execve(a[0], a, env_use);
4535
4536 a[0] = (char*) "/sbin/init";
4537 execve(a[0], a, env_use);
f757855e
LP
4538 } else if (!strv_isempty(arg_parameters))
4539 execvpe(arg_parameters[0], arg_parameters, env_use);
03cfe0d5 4540 else {
f757855e 4541 chdir(home ?: "/root");
03cfe0d5
LP
4542 execle("/bin/bash", "-bash", NULL, env_use);
4543 execle("/bin/sh", "-sh", NULL, env_use);
4544 }
4545
4546 (void) log_open();
4547 return log_error_errno(errno, "execv() failed: %m");
4548}
4549
4550static int outer_child(
4551 Barrier *barrier,
4552 const char *directory,
4553 const char *console,
4554 const char *root_device, bool root_device_rw,
4555 const char *home_device, bool home_device_rw,
4556 const char *srv_device, bool srv_device_rw,
4557 bool interactive,
4558 bool secondary,
4559 int pid_socket,
4560 int kmsg_socket,
4561 int rtnl_socket,
825d5287 4562 int uid_shift_socket,
f757855e 4563 FDSet *fds) {
03cfe0d5
LP
4564
4565 pid_t pid;
4566 ssize_t l;
4567 int r;
4568
4569 assert(barrier);
4570 assert(directory);
4571 assert(console);
4572 assert(pid_socket >= 0);
4573 assert(kmsg_socket >= 0);
4574
efdb0237
LP
4575 cg_unified_flush();
4576
03cfe0d5
LP
4577 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4578 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4579
4580 if (interactive) {
4581 close_nointr(STDIN_FILENO);
4582 close_nointr(STDOUT_FILENO);
4583 close_nointr(STDERR_FILENO);
4584
4585 r = open_terminal(console, O_RDWR);
4586 if (r != STDIN_FILENO) {
4587 if (r >= 0) {
4588 safe_close(r);
4589 r = -EINVAL;
4590 }
4591
4592 return log_error_errno(r, "Failed to open console: %m");
4593 }
4594
4595 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4596 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4597 return log_error_errno(errno, "Failed to duplicate console: %m");
4598 }
4599
4600 r = reset_audit_loginuid();
4601 if (r < 0)
4602 return r;
4603
4604 /* Mark everything as slave, so that we still
4605 * receive mounts from the real root, but don't
4606 * propagate mounts to the real root. */
4607 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4608 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4609
4610 r = mount_devices(directory,
4611 root_device, root_device_rw,
4612 home_device, home_device_rw,
4613 srv_device, srv_device_rw);
4614 if (r < 0)
4615 return r;
4616
391567f4
LP
4617 r = determine_uid_shift(directory);
4618 if (r < 0)
4619 return r;
4620
825d5287
RM
4621 if (arg_userns) {
4622 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4623 if (l < 0)
4624 return log_error_errno(errno, "Failed to send UID shift: %m");
4625 if (l != sizeof(arg_uid_shift)) {
4626 log_error("Short write while sending UID shift.");
4627 return -EIO;
4628 }
4629 }
4630
03cfe0d5
LP
4631 /* Turn directory into bind mount */
4632 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4633 return log_error_errno(errno, "Failed to make bind mount: %m");
4634
03cfe0d5
LP
4635 r = setup_volatile(directory);
4636 if (r < 0)
4637 return r;
4638
03cfe0d5
LP
4639 r = setup_volatile_state(directory);
4640 if (r < 0)
4641 return r;
4642
03cfe0d5
LP
4643 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4644 if (r < 0)
4645 return r;
4646
03cfe0d5
LP
4647 if (arg_read_only) {
4648 r = bind_remount_recursive(directory, true);
4649 if (r < 0)
4650 return log_error_errno(r, "Failed to make tree read-only: %m");
4651 }
4652
03cfe0d5
LP
4653 r = mount_all(directory, false);
4654 if (r < 0)
4655 return r;
4656
4657 if (copy_devnodes(directory) < 0)
4658 return r;
4659
4660 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4661
4662 if (setup_pts(directory) < 0)
4663 return r;
4664
4665 r = setup_propagate(directory);
4666 if (r < 0)
4667 return r;
4668
4669 r = setup_dev_console(directory, console);
4670 if (r < 0)
4671 return r;
4672
4673 r = setup_seccomp();
4674 if (r < 0)
4675 return r;
4676
4677 r = setup_timezone(directory);
4678 if (r < 0)
4679 return r;
4680
4681 r = setup_resolv_conf(directory);
4682 if (r < 0)
4683 return r;
4684
4685 r = setup_journal(directory);
4686 if (r < 0)
4687 return r;
4688
4689 r = mount_custom(directory);
4690 if (r < 0)
4691 return r;
4692
efdb0237 4693 r = mount_cgroups(directory);
03cfe0d5
LP
4694 if (r < 0)
4695 return r;
4696
4697 r = mount_move_root(directory);
4698 if (r < 0)
4699 return log_error_errno(r, "Failed to move root directory: %m");
4700
4701 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4702 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4703 (arg_private_network ? CLONE_NEWNET : 0) |
4704 (arg_userns ? CLONE_NEWUSER : 0),
4705 NULL);
4706 if (pid < 0)
4707 return log_error_errno(errno, "Failed to fork inner child: %m");
03cfe0d5
LP
4708 if (pid == 0) {
4709 pid_socket = safe_close(pid_socket);
825d5287 4710 uid_shift_socket = safe_close(uid_shift_socket);
03cfe0d5
LP
4711
4712 /* The inner child has all namespaces that are
4713 * requested, so that we all are owned by the user if
4714 * user namespaces are turned on. */
4715
f757855e 4716 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
03cfe0d5
LP
4717 if (r < 0)
4718 _exit(EXIT_FAILURE);
4719
4720 _exit(EXIT_SUCCESS);
4721 }
4722
4723 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4724 if (l < 0)
4725 return log_error_errno(errno, "Failed to send PID: %m");
4726 if (l != sizeof(pid)) {
4727 log_error("Short write while sending PID.");
4728 return -EIO;
4729 }
4730
4731 pid_socket = safe_close(pid_socket);
4732
4733 return 0;
4734}
4735
4736static int setup_uid_map(pid_t pid) {
4737 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4738 int r;
4739
4740 assert(pid > 1);
4741
4742 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4743 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
ad118bda 4744 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
4745 if (r < 0)
4746 return log_error_errno(r, "Failed to write UID map: %m");
4747
4748 /* We always assign the same UID and GID ranges */
4749 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
ad118bda 4750 r = write_string_file(uid_map, line, 0);
03cfe0d5
LP
4751 if (r < 0)
4752 return log_error_errno(r, "Failed to write GID map: %m");
4753
4754 return 0;
4755}
4756
4757static int chown_cgroup(pid_t pid) {
4758 _cleanup_free_ char *path = NULL, *fs = NULL;
4759 _cleanup_close_ int fd = -1;
4760 const char *fn;
4761 int r;
4762
4763 r = cg_pid_get_path(NULL, pid, &path);
4764 if (r < 0)
4765 return log_error_errno(r, "Failed to get container cgroup path: %m");
4766
4767 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4768 if (r < 0)
4769 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4770
4771 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4772 if (fd < 0)
4773 return log_error_errno(errno, "Failed to open %s: %m", fs);
4774
efdb0237
LP
4775 FOREACH_STRING(fn,
4776 ".",
4777 "tasks",
4778 "notify_on_release",
4779 "cgroup.procs",
4780 "cgroup.clone_children",
4781 "cgroup.controllers",
4782 "cgroup.subtree_control",
4783 "cgroup.populated")
03cfe0d5 4784 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
efdb0237
LP
4785 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
4786 "Failed to chown() cgroup file %s, ignoring: %m", fn);
4787
4788 return 0;
4789}
4790
4791static int sync_cgroup(pid_t pid) {
4792 _cleanup_free_ char *cgroup = NULL;
4793 char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
4794 bool undo_mount = false;
4795 const char *fn;
4796 int unified, r;
4797
4798 unified = cg_unified();
4799 if (unified < 0)
4800 return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
4801
4802 if ((unified > 0) == arg_unified_cgroup_hierarchy)
4803 return 0;
4804
4805 /* When the host uses the legacy cgroup setup, but the
4806 * container shall use the unified hierarchy, let's make sure
4807 * we copy the path from the name=systemd hierarchy into the
4808 * unified hierarchy. Similar for the reverse situation. */
4809
4810 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
4811 if (r < 0)
4812 return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
4813
4814 /* In order to access the unified hierarchy we need to mount it */
4815 if (!mkdtemp(tree))
4816 return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
4817
4818 if (unified)
4819 r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
4820 else
4821 r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
4822 if (r < 0) {
4823 r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
4824 goto finish;
4825 }
4826
4827 undo_mount = true;
4828
4829 fn = strjoina(tree, cgroup, "/cgroup.procs");
4830 (void) mkdir_parents(fn, 0755);
4831
4832 sprintf(pid_string, PID_FMT, pid);
4833 r = write_string_file(fn, pid_string, 0);
4834 if (r < 0)
4835 log_error_errno(r, "Failed to move process: %m");
4836
4837finish:
4838 if (undo_mount)
4839 (void) umount(tree);
4840
4841 (void) rmdir(tree);
4842 return r;
4843}
4844
4845static int create_subcgroup(pid_t pid) {
4846 _cleanup_free_ char *cgroup = NULL;
4847 const char *child;
4848 int unified, r;
98e4d8d7 4849 CGroupMask supported;
efdb0237
LP
4850
4851 /* In the unified hierarchy inner nodes may only only contain
4852 * subgroups, but not processes. Hence, if we running in the
4853 * unified hierarchy and the container does the same, and we
4854 * did not create a scope unit for the container move us and
4855 * the container into two separate subcgroups. */
4856
4857 if (!arg_keep_unit)
4858 return 0;
4859
4860 if (!arg_unified_cgroup_hierarchy)
4861 return 0;
4862
4863 unified = cg_unified();
4864 if (unified < 0)
4865 return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
4866 if (unified == 0)
4867 return 0;
4868
98e4d8d7
LP
4869 r = cg_mask_supported(&supported);
4870 if (r < 0)
4871 return log_error_errno(r, "Failed to determine supported controllers: %m");
4872
efdb0237
LP
4873 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
4874 if (r < 0)
4875 return log_error_errno(r, "Failed to get our control group: %m");
4876
4877 child = strjoina(cgroup, "/payload");
4878 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
4879 if (r < 0)
4880 return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
4881
4882 child = strjoina(cgroup, "/supervisor");
4883 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
4884 if (r < 0)
4885 return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
03cfe0d5 4886
98e4d8d7
LP
4887 /* Try to enable as many controllers as possible for the new payload. */
4888 (void) cg_enable_everywhere(supported, supported, cgroup);
03cfe0d5
LP
4889 return 0;
4890}
4891
f757855e
LP
4892static int load_settings(void) {
4893 _cleanup_(settings_freep) Settings *settings = NULL;
4894 _cleanup_fclose_ FILE *f = NULL;
4895 _cleanup_free_ char *p = NULL;
4896 const char *fn, *i;
4897 int r;
4898
4899 /* If all settings are masked, there's no point in looking for
4900 * the settings file */
4901 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4902 return 0;
4903
4904 fn = strjoina(arg_machine, ".nspawn");
4905
4906 /* We first look in the admin's directories in /etc and /run */
4907 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4908 _cleanup_free_ char *j = NULL;
4909
4910 j = strjoin(i, "/", fn, NULL);
4911 if (!j)
4912 return log_oom();
4913
4914 f = fopen(j, "re");
4915 if (f) {
4916 p = j;
4917 j = NULL;
4918
4919 /* By default we trust configuration from /etc and /run */
4920 if (arg_settings_trusted < 0)
4921 arg_settings_trusted = true;
4922
4923 break;
4924 }
4925
4926 if (errno != ENOENT)
4927 return log_error_errno(errno, "Failed to open %s: %m", j);
4928 }
4929
4930 if (!f) {
4931 /* After that, let's look for a file next to the
4932 * actual image we shall boot. */
4933
4934 if (arg_image) {
4935 p = file_in_same_dir(arg_image, fn);
4936 if (!p)
4937 return log_oom();
4938 } else if (arg_directory) {
4939 p = file_in_same_dir(arg_directory, fn);
4940 if (!p)
4941 return log_oom();
4942 }
4943
4944 if (p) {
4945 f = fopen(p, "re");
4946 if (!f && errno != ENOENT)
4947 return log_error_errno(errno, "Failed to open %s: %m", p);
4948
4949 /* By default we do not trust configuration from /var/lib/machines */
4950 if (arg_settings_trusted < 0)
4951 arg_settings_trusted = false;
4952 }
4953 }
4954
4955 if (!f)
4956 return 0;
4957
4958 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4959
4960 r = settings_load(f, p, &settings);
4961 if (r < 0)
4962 return r;
4963
4964 /* Copy over bits from the settings, unless they have been
4965 * explicitly masked by command line switches. */
4966
4967 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
4968 settings->boot >= 0) {
4969 arg_boot = settings->boot;
4970
4971 strv_free(arg_parameters);
4972 arg_parameters = settings->parameters;
4973 settings->parameters = NULL;
4974 }
4975
4976 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4977 settings->environment) {
4978 strv_free(arg_setenv);
4979 arg_setenv = settings->environment;
4980 settings->environment = NULL;
4981 }
4982
4983 if ((arg_settings_mask & SETTING_USER) == 0 &&
4984 settings->user) {
4985 free(arg_user);
4986 arg_user = settings->user;
4987 settings->user = NULL;
4988 }
4989
4990 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4991
4992 if (!arg_settings_trusted && settings->capability != 0)
4993 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
4994 else
4995 arg_retain |= settings->capability;
4996
4997 arg_retain &= ~settings->drop_capability;
4998 }
4999
5000 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
5001 settings->kill_signal > 0)
5002 arg_kill_signal = settings->kill_signal;
5003
5004 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
5005 settings->personality != PERSONALITY_INVALID)
5006 arg_personality = settings->personality;
5007
5008 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
5009 !sd_id128_is_null(settings->machine_id)) {
5010
5011 if (!arg_settings_trusted)
5012 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
5013 else
5014 arg_uuid = settings->machine_id;
5015 }
5016
5017 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
5018 settings->read_only >= 0)
5019 arg_read_only = settings->read_only;
5020
5021 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
5022 settings->volatile_mode != _VOLATILE_MODE_INVALID)
5023 arg_volatile_mode = settings->volatile_mode;
5024
5025 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
5026 settings->n_custom_mounts > 0) {
5027
5028 if (!arg_settings_trusted)
5029 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
5030 else {
5031 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5032 arg_custom_mounts = settings->custom_mounts;
5033 arg_n_custom_mounts = settings->n_custom_mounts;
5034
5035 settings->custom_mounts = NULL;
5036 settings->n_custom_mounts = 0;
5037 }
5038 }
5039
5040 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
5041 (settings->private_network >= 0 ||
5042 settings->network_veth >= 0 ||
5043 settings->network_bridge ||
5044 settings->network_interfaces ||
5045 settings->network_macvlan ||
5046 settings->network_ipvlan)) {
5047
5048 if (!arg_settings_trusted)
5049 log_warning("Ignoring network settings, file %s is not trusted.", p);
5050 else {
5051 strv_free(arg_network_interfaces);
5052 arg_network_interfaces = settings->network_interfaces;
5053 settings->network_interfaces = NULL;
5054
5055 strv_free(arg_network_macvlan);
5056 arg_network_macvlan = settings->network_macvlan;
5057 settings->network_macvlan = NULL;
5058
5059 strv_free(arg_network_ipvlan);
5060 arg_network_ipvlan = settings->network_ipvlan;
5061 settings->network_ipvlan = NULL;
5062
5063 free(arg_network_bridge);
5064 arg_network_bridge = settings->network_bridge;
5065 settings->network_bridge = NULL;
5066
5067 arg_network_veth = settings->network_veth > 0 || settings->network_bridge;
5068
5069 arg_private_network = true; /* all these settings imply private networking */
5070 }
5071 }
5072
5073 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
5074 settings->expose_ports) {
5075
5076 if (!arg_settings_trusted)
5077 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
5078 else {
5079 expose_port_free_all(arg_expose_ports);
5080 arg_expose_ports = settings->expose_ports;
5081 settings->expose_ports = NULL;
5082 }
5083 }
5084
5085 return 0;
5086}
5087
03cfe0d5
LP
5088int main(int argc, char *argv[]) {
5089
5090 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
5091 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
5092 _cleanup_close_ int master = -1, image_fd = -1;
5093 _cleanup_fdset_free_ FDSet *fds = NULL;
5094 int r, n_fd_passed, loop_nr = -1;
5095 char veth_name[IFNAMSIZ];
5096 bool secondary = false, remove_subvol = false;
72c0a2c2 5097 sigset_t mask_chld;
03cfe0d5
LP
5098 pid_t pid = 0;
5099 int ret = EXIT_SUCCESS;
5100 union in_addr_union exposed = {};
5101 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5102 bool interactive;
5103
5104 log_parse_environment();
5105 log_open();
5106
5107 r = parse_argv(argc, argv);
5108 if (r <= 0)
5109 goto finish;
5110
03cfe0d5
LP
5111 if (geteuid() != 0) {
5112 log_error("Need to be root.");
5113 r = -EPERM;
5114 goto finish;
5115 }
f757855e
LP
5116 r = determine_names();
5117 if (r < 0)
5118 goto finish;
5119
5120 r = load_settings();
5121 if (r < 0)
5122 goto finish;
5123
5124 r = verify_arguments();
5125 if (r < 0)
5126 goto finish;
03cfe0d5
LP
5127
5128 n_fd_passed = sd_listen_fds(false);
5129 if (n_fd_passed > 0) {
5130 r = fdset_new_listen_fds(&fds, false);
5131 if (r < 0) {
5132 log_error_errno(r, "Failed to collect file descriptors: %m");
5133 goto finish;
5134 }
5135 }
5136
5137 if (arg_directory) {
5138 assert(!arg_image);
5139
5140 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
5141 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
5142 r = -EINVAL;
5143 goto finish;
5144 }
5145
5146 if (arg_ephemeral) {
5147 _cleanup_free_ char *np = NULL;
5148
5149 /* If the specified path is a mount point we
5150 * generate the new snapshot immediately
5151 * inside it under a random name. However if
5152 * the specified is not a mount point we
5153 * create the new snapshot in the parent
5154 * directory, just next to it. */
e26d6ce5 5155 r = path_is_mount_point(arg_directory, 0);
03cfe0d5
LP
5156 if (r < 0) {
5157 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5158 goto finish;
5159 }
5160 if (r > 0)
770b5ce4 5161 r = tempfn_random_child(arg_directory, "machine.", &np);
03cfe0d5 5162 else
770b5ce4 5163 r = tempfn_random(arg_directory, "machine.", &np);
03cfe0d5
LP
5164 if (r < 0) {
5165 log_error_errno(r, "Failed to generate name for snapshot: %m");
5166 goto finish;
5167 }
5168
5169 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5170 if (r < 0) {
5171 log_error_errno(r, "Failed to lock %s: %m", np);
5172 goto finish;
5173 }
5174
5175 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
5176 if (r < 0) {
5177 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5178 goto finish;
ec16945e
LP
5179 }
5180
5181 free(arg_directory);
5182 arg_directory = np;
8a16a7b4 5183 np = NULL;
ec16945e
LP
5184
5185 remove_subvol = true;
30535c16
LP
5186
5187 } else {
5188 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5189 if (r == -EBUSY) {
5190 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5191 goto finish;
5192 }
5193 if (r < 0) {
5194 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5195 return r;
5196 }
5197
5198 if (arg_template) {
f70a17f8 5199 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
30535c16
LP
5200 if (r == -EEXIST) {
5201 if (!arg_quiet)
5202 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5203 } else if (r < 0) {
83521414 5204 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
30535c16
LP
5205 goto finish;
5206 } else {
5207 if (!arg_quiet)
5208 log_info("Populated %s from template %s.", arg_directory, arg_template);
5209 }
5210 }
ec16945e
LP
5211 }
5212
1b9e5b12
LP
5213 if (arg_boot) {
5214 if (path_is_os_tree(arg_directory) <= 0) {
5ae4d543 5215 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
ec16945e 5216 r = -EINVAL;
1b9e5b12
LP
5217 goto finish;
5218 }
5219 } else {
5220 const char *p;
5221
63c372cb 5222 p = strjoina(arg_directory,
1b9e5b12
LP
5223 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
5224 if (access(p, F_OK) < 0) {
5225 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
ec16945e 5226 r = -EINVAL;
1b9e5b12 5227 goto finish;
1b9e5b12
LP
5228 }
5229 }
ec16945e 5230
6b9132a9 5231 } else {
1b9e5b12 5232 char template[] = "/tmp/nspawn-root-XXXXXX";
6b9132a9 5233
ec16945e
LP
5234 assert(arg_image);
5235 assert(!arg_template);
5236
30535c16
LP
5237 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5238 if (r == -EBUSY) {
5239 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5240 goto finish;
5241 }
5242 if (r < 0) {
5243 r = log_error_errno(r, "Failed to create image lock: %m");
5244 goto finish;
5245 }
5246
1b9e5b12 5247 if (!mkdtemp(template)) {
56f64d95 5248 log_error_errno(errno, "Failed to create temporary directory: %m");
1b9e5b12 5249 r = -errno;
6b9132a9 5250 goto finish;
1b9e5b12 5251 }
6b9132a9 5252
1b9e5b12
LP
5253 arg_directory = strdup(template);
5254 if (!arg_directory) {
5255 r = log_oom();
5256 goto finish;
6b9132a9 5257 }
88213476 5258
1b9e5b12
LP
5259 image_fd = setup_image(&device_path, &loop_nr);
5260 if (image_fd < 0) {
5261 r = image_fd;
842f3b0f
LP
5262 goto finish;
5263 }
1b9e5b12 5264
4d9f07b4
LP
5265 r = dissect_image(image_fd,
5266 &root_device, &root_device_rw,
5267 &home_device, &home_device_rw,
5268 &srv_device, &srv_device_rw,
5269 &secondary);
1b9e5b12
LP
5270 if (r < 0)
5271 goto finish;
842f3b0f 5272 }
842f3b0f 5273
5a8af538
LP
5274 r = custom_mounts_prepare();
5275 if (r < 0)
5276 goto finish;
5277
03cfe0d5
LP
5278 interactive =
5279 isatty(STDIN_FILENO) > 0 &&
5280 isatty(STDOUT_FILENO) > 0;
9c857b9d 5281
db7feb7e
LP
5282 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
5283 if (master < 0) {
ec16945e 5284 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
a258bf26
LP
5285 goto finish;
5286 }
5287
611b312b
LP
5288 r = ptsname_malloc(master, &console);
5289 if (r < 0) {
5290 r = log_error_errno(r, "Failed to determine tty name: %m");
a258bf26
LP
5291 goto finish;
5292 }
5293
a258bf26 5294 if (unlockpt(master) < 0) {
ec16945e 5295 r = log_error_errno(errno, "Failed to unlock tty: %m");
a258bf26
LP
5296 goto finish;
5297 }
5298
9c857b9d
LP
5299 if (!arg_quiet)
5300 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5301 arg_machine, arg_image ?: arg_directory);
5302
72c0a2c2 5303 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
a258bf26 5304
023fb90b
LP
5305 assert_se(sigemptyset(&mask_chld) == 0);
5306 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
5307
03cfe0d5
LP
5308 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
5309 r = log_error_errno(errno, "Failed to become subreaper: %m");
5310 goto finish;
5311 }
5312
d87be9b0 5313 for (;;) {
825d5287
RM
5314 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
5315 uid_shift_socket_pair[2] = { -1, -1 };
113cea80 5316 ContainerStatus container_status;
7566e267 5317 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
03cfe0d5 5318 static const struct sigaction sa = {
e866af3a
DH
5319 .sa_handler = nop_handler,
5320 .sa_flags = SA_NOCLDSTOP,
5321 };
03cfe0d5
LP
5322 int ifi = 0;
5323 ssize_t l;
dbb60d69
LP
5324 _cleanup_event_unref_ sd_event *event = NULL;
5325 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
5326 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
5327 char last_char = 0;
e866af3a 5328
7566e267 5329 r = barrier_create(&barrier);
a2da110b 5330 if (r < 0) {
da927ba9 5331 log_error_errno(r, "Cannot initialize IPC barrier: %m");
a2da110b
DH
5332 goto finish;
5333 }
5334
6d0b55c2
LP
5335 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
5336 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
5337 goto finish;
5338 }
5339
5340 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
5341 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
5342 goto finish;
5343 }
5344
03cfe0d5
LP
5345 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
5346 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
5347 goto finish;
5348 }
5349
825d5287
RM
5350 if (arg_userns)
5351 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
5352 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
5353 goto finish;
5354 }
5355
e866af3a
DH
5356 /* Child can be killed before execv(), so handle SIGCHLD
5357 * in order to interrupt parent's blocking calls and
5358 * give it a chance to call wait() and terminate. */
5359 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
5360 if (r < 0) {
ec16945e 5361 r = log_error_errno(errno, "Failed to change the signal mask: %m");
d96c1ecf
LP
5362 goto finish;
5363 }
5364
e866af3a
DH
5365 r = sigaction(SIGCHLD, &sa, NULL);
5366 if (r < 0) {
ec16945e 5367 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
40ddbdf8
LP
5368 goto finish;
5369 }
5370
03cfe0d5 5371 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
d87be9b0
LP
5372 if (pid < 0) {
5373 if (errno == EINVAL)
ec16945e 5374 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
d87be9b0 5375 else
ec16945e 5376 r = log_error_errno(errno, "clone() failed: %m");
a258bf26 5377
d87be9b0
LP
5378 goto finish;
5379 }
a258bf26 5380
d87be9b0 5381 if (pid == 0) {
03cfe0d5 5382 /* The outer child only has a file system namespace. */
a2da110b
DH
5383 barrier_set_role(&barrier, BARRIER_CHILD);
5384
03e334a1 5385 master = safe_close(master);
a258bf26 5386
03e334a1 5387 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
6d0b55c2 5388 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
03cfe0d5 5389 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
825d5287 5390 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
a258bf26 5391
ce30c8dc
LP
5392 (void) reset_all_signal_handlers();
5393 (void) reset_signal_mask();
f5c1b9ee 5394
03cfe0d5
LP
5395 r = outer_child(&barrier,
5396 arg_directory,
5397 console,
5398 root_device, root_device_rw,
5399 home_device, home_device_rw,
5400 srv_device, srv_device_rw,
5401 interactive,
5402 secondary,
5403 pid_socket_pair[1],
5404 kmsg_socket_pair[1],
5405 rtnl_socket_pair[1],
825d5287 5406 uid_shift_socket_pair[1],
f757855e 5407 fds);
0cb9fbcd 5408 if (r < 0)
a2da110b 5409 _exit(EXIT_FAILURE);
d87be9b0 5410
03cfe0d5 5411 _exit(EXIT_SUCCESS);
da5b3bad 5412 }
88213476 5413
a2da110b 5414 barrier_set_role(&barrier, BARRIER_PARENT);
03cfe0d5 5415
842f3b0f
LP
5416 fdset_free(fds);
5417 fds = NULL;
5418
6d0b55c2
LP
5419 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
5420 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
03cfe0d5 5421 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
6d0b55c2 5422
03cfe0d5
LP
5423 /* Wait for the outer child. */
5424 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
5425 if (r < 0)
5426 goto finish;
5427 if (r != 0) {
5428 r = -EIO;
5429 goto finish;
5430 }
5431 pid = 0;
6dac160c 5432
03cfe0d5
LP
5433 /* And now retrieve the PID of the inner child. */
5434 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
5435 if (l < 0) {
5436 r = log_error_errno(errno, "Failed to read inner child PID: %m");
5437 goto finish;
5438 }
5439 if (l != sizeof(pid)) {
5440 log_error("Short read while reading inner child PID: %m");
5441 r = EIO;
5442 goto finish;
5443 }
354bfd2b 5444
03cfe0d5 5445 log_debug("Init process invoked as PID " PID_FMT, pid);
aa28aefe 5446
03cfe0d5
LP
5447 if (arg_userns) {
5448 if (!barrier_place_and_sync(&barrier)) { /* #1 */
5449 log_error("Child died too early.");
5450 r = -ESRCH;
840295fc 5451 goto finish;
03cfe0d5 5452 }
ab046dde 5453
825d5287
RM
5454 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
5455 if (l < 0) {
5456 r = log_error_errno(errno, "Failed to read UID shift: %m");
5457 goto finish;
5458 }
5459 if (l != sizeof(arg_uid_shift)) {
5460 log_error("Short read while reading UID shift: %m");
5461 r = EIO;
5462 goto finish;
5463 }
5464
03cfe0d5 5465 r = setup_uid_map(pid);
840295fc
LP
5466 if (r < 0)
5467 goto finish;
ab046dde 5468
03cfe0d5
LP
5469 (void) barrier_place(&barrier); /* #2 */
5470 }
c74e630d 5471
03cfe0d5
LP
5472 r = move_network_interfaces(pid);
5473 if (r < 0)
5474 goto finish;
4bbfe7ad 5475
03cfe0d5
LP
5476 r = setup_veth(pid, veth_name, &ifi);
5477 if (r < 0)
5478 goto finish;
5aa4bb6b 5479
03cfe0d5
LP
5480 r = setup_bridge(veth_name, &ifi);
5481 if (r < 0)
5482 goto finish;
6dac160c 5483
03cfe0d5
LP
5484 r = setup_macvlan(pid);
5485 if (r < 0)
5486 goto finish;
6dac160c 5487
03cfe0d5
LP
5488 r = setup_ipvlan(pid);
5489 if (r < 0)
5490 goto finish;
6dac160c 5491
03cfe0d5
LP
5492 r = register_machine(pid, ifi);
5493 if (r < 0)
5494 goto finish;
6dac160c 5495
efdb0237
LP
5496 r = sync_cgroup(pid);
5497 if (r < 0)
5498 goto finish;
5499
5500 r = create_subcgroup(pid);
5501 if (r < 0)
5502 goto finish;
5503
03cfe0d5
LP
5504 r = chown_cgroup(pid);
5505 if (r < 0)
5506 goto finish;
6dac160c 5507
03cfe0d5
LP
5508 /* Notify the child that the parent is ready with all
5509 * its setup (including cgroup-ification), and that
5510 * the child can now hand over control to the code to
5511 * run inside the container. */
5512 (void) barrier_place(&barrier); /* #3 */
6dac160c 5513
03cfe0d5
LP
5514 /* Block SIGCHLD here, before notifying child.
5515 * process_pty() will handle it with the other signals. */
5516 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
e866af3a 5517
03cfe0d5
LP
5518 /* Reset signal to default */
5519 r = default_signals(SIGCHLD, -1);
5520 if (r < 0) {
5521 log_error_errno(r, "Failed to reset SIGCHLD: %m");
5522 goto finish;
5523 }
e866af3a 5524
03cfe0d5
LP
5525 /* Let the child know that we are ready and wait that the child is completely ready now. */
5526 if (!barrier_place_and_sync(&barrier)) { /* #5 */
5527 log_error("Client died too early.");
5528 r = -ESRCH;
5529 goto finish;
5530 }
b12afc8c 5531
03cfe0d5
LP
5532 sd_notifyf(false,
5533 "READY=1\n"
5534 "STATUS=Container running.\n"
5535 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
354bfd2b 5536
03cfe0d5
LP
5537 r = sd_event_new(&event);
5538 if (r < 0) {
5539 log_error_errno(r, "Failed to get default event source: %m");
5540 goto finish;
5541 }
88213476 5542
03cfe0d5
LP
5543 if (arg_kill_signal > 0) {
5544 /* Try to kill the init system on SIGINT or SIGTERM */
5545 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
5546 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
5547 } else {
5548 /* Immediately exit */
5549 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5550 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5551 }
023fb90b 5552
03cfe0d5
LP
5553 /* simply exit on sigchld */
5554 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
023fb90b 5555
03cfe0d5
LP
5556 if (arg_expose_ports) {
5557 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
5558 if (r < 0)
5559 goto finish;
023fb90b 5560
03cfe0d5
LP
5561 (void) expose_ports(rtnl, &exposed);
5562 }
023fb90b 5563
03cfe0d5 5564 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
c7b7d449 5565
03cfe0d5
LP
5566 r = pty_forward_new(event, master, true, !interactive, &forward);
5567 if (r < 0) {
5568 log_error_errno(r, "Failed to create PTY forwarder: %m");
5569 goto finish;
5570 }
023fb90b 5571
03cfe0d5
LP
5572 r = sd_event_loop(event);
5573 if (r < 0) {
5574 log_error_errno(r, "Failed to run event loop: %m");
5575 goto finish;
5576 }
6d0b55c2 5577
03cfe0d5 5578 pty_forward_get_last_char(forward, &last_char);
6d0b55c2 5579
03cfe0d5 5580 forward = pty_forward_free(forward);
6d0b55c2 5581
03cfe0d5
LP
5582 if (!arg_quiet && last_char != '\n')
5583 putc('\n', stdout);
04d39279 5584
03cfe0d5
LP
5585 /* Kill if it is not dead yet anyway */
5586 terminate_machine(pid);
1f0cd86b 5587
840295fc 5588 /* Normally redundant, but better safe than sorry */
04d39279 5589 kill(pid, SIGKILL);
a258bf26 5590
113cea80 5591 r = wait_for_container(pid, &container_status);
04d39279
LP
5592 pid = 0;
5593
ec16945e 5594 if (r < 0)
ce9f1527
LP
5595 /* We failed to wait for the container, or the
5596 * container exited abnormally */
ec16945e
LP
5597 goto finish;
5598 else if (r > 0 || container_status == CONTAINER_TERMINATED){
ce9f1527
LP
5599 /* The container exited with a non-zero
5600 * status, or with zero status and no reboot
5601 * was requested. */
ec16945e 5602 ret = r;
d87be9b0 5603 break;
ec16945e 5604 }
88213476 5605
113cea80 5606 /* CONTAINER_REBOOTED, loop again */
ce38dbc8
LP
5607
5608 if (arg_keep_unit) {
5609 /* Special handling if we are running as a
5610 * service: instead of simply restarting the
5611 * machine we want to restart the entire
5612 * service, so let's inform systemd about this
5613 * with the special exit code 133. The service
5614 * file uses RestartForceExitStatus=133 so
5615 * that this results in a full nspawn
5616 * restart. This is necessary since we might
5617 * have cgroup parameters set we want to have
5618 * flushed out. */
ec16945e
LP
5619 ret = 133;
5620 r = 0;
ce38dbc8
LP
5621 break;
5622 }
6d0b55c2
LP
5623
5624 flush_ports(&exposed);
d87be9b0 5625 }
88213476
LP
5626
5627finish:
af4ec430
LP
5628 sd_notify(false,
5629 "STOPPING=1\n"
5630 "STATUS=Terminating...");
5631
9444b1f2
LP
5632 if (pid > 0)
5633 kill(pid, SIGKILL);
88213476 5634
503546da
LP
5635 /* Try to flush whatever is still queued in the pty */
5636 if (master >= 0)
5637 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5638
03cfe0d5
LP
5639 loop_remove(loop_nr, &image_fd);
5640
ec16945e
LP
5641 if (remove_subvol && arg_directory) {
5642 int k;
5643
d9e2daaf 5644 k = btrfs_subvol_remove(arg_directory, true);
ec16945e
LP
5645 if (k < 0)
5646 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5647 }
5648
785890ac
LP
5649 if (arg_machine) {
5650 const char *p;
5651
63c372cb 5652 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
c6878637 5653 (void) rm_rf(p, REMOVE_ROOT);
785890ac
LP
5654 }
5655
f757855e
LP
5656 flush_ports(&exposed);
5657
04d391da 5658 free(arg_directory);
ec16945e
LP
5659 free(arg_template);
5660 free(arg_image);
7027ff61 5661 free(arg_machine);
c74e630d
LP
5662 free(arg_user);
5663 strv_free(arg_setenv);
f757855e 5664 free(arg_network_bridge);
c74e630d
LP
5665 strv_free(arg_network_interfaces);
5666 strv_free(arg_network_macvlan);
4bbfe7ad 5667 strv_free(arg_network_ipvlan);
f757855e
LP
5668 strv_free(arg_parameters);
5669 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5670 expose_port_free_all(arg_expose_ports);
6d0b55c2 5671
ec16945e 5672 return r < 0 ? EXIT_FAILURE : ret;
88213476 5673}