]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: add new .nspawn files for container settings
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 #include "nspawn.h"
106 #include "nspawn-settings.h"
107
/* Outcome of a container run.
 * NOTE(review): the code that consumes these values is not visible in this part of the file; the
 * meanings below are inferred from the enumerator names only — confirm against the users. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* presumably: the container exited for good */
        CONTAINER_REBOOTED      /* presumably: the container requested a reboot */
} ContainerStatus;
112
/* Journal linking mode selected with --link-journal= (or -j). parse_argv() maps the strings "no",
 * "auto", "host" and "guest" to these values; the "try-host"/"try-guest" variants select the same
 * values but additionally set arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,
        LINK_AUTO,
        LINK_HOST,
        LINK_GUEST
} LinkJournal;
119
/* Global option state. Unless noted otherwise these are filled in by parse_argv() from the command
 * line; the initializers below are the defaults used when the option is not given. */

/* -D/--directory= and --template=, stored as sanitized paths (see set_sanitized_path()) */
static char *arg_directory = NULL;
static char *arg_template = NULL;
/* -u/--user= */
static char *arg_user = NULL;
/* --uuid= */
static sd_id128_t arg_uuid = {};
/* -M/--machine=, validated with machine_name_is_valid() */
static char *arg_machine = NULL;
/* -Z, -L and -S: these point directly at optarg and are not owned copies */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static const char *arg_slice = NULL;
/* --private-network; also switched on by every --network-* option */
static bool arg_private_network = false;
/* --read-only */
static bool arg_read_only = false;
/* -b/--boot */
static bool arg_boot = false;
/* -x/--ephemeral */
static bool arg_ephemeral = false;
/* --link-journal=/-j; the _try flag is set for -j and the "try-*" modes */
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Bit mask (one bit per CAP_* number) of capabilities to retain. This is the default set; at the
 * end of parse_argv() the bits from --capability= are added, CAP_NET_ADMIN is added when private
 * networking is on, and the bits from --drop-capability= are removed. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* Array of mounts collected from --bind=, --bind-ro=, --tmpfs=, --overlay= and --overlay-ro=, and
 * the number of entries in it */
static CustomMount *arg_custom_mounts = NULL;
static unsigned arg_n_custom_mounts = 0;
/* --setenv= assignments */
static char **arg_setenv = NULL;
/* -q/--quiet */
static bool arg_quiet = false;
/* --share-system; when set, parse_argv() forces arg_register to false */
static bool arg_share_system = false;
/* --register= */
static bool arg_register = true;
/* --keep-unit */
static bool arg_keep_unit = false;
/* --network-interface=, --network-macvlan=, --network-ipvlan= (each may be given multiple times) */
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
/* -n/--network-veth; also implied by --network-bridge= */
static bool arg_network_veth = false;
/* --network-bridge= */
static char *arg_network_bridge = NULL;
/* --personality= */
static unsigned long arg_personality = PERSONALITY_INVALID;
/* -i/--image=, stored as sanitized path */
static char *arg_image = NULL;
/* --volatile[=MODE] */
static VolatileMode arg_volatile_mode = VOLATILE_NO;
/* List of -p/--port= entries */
static ExposePort *arg_expose_ports = NULL;
/* --property= entries */
static char **arg_property = NULL;
/* --private-users[=UIDBASE[:NUIDS]]: base UID (UID_INVALID if none was given) and size of the range */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
/* Set whenever --private-users is given, with or without an argument */
static bool arg_userns = false;
/* --kill-signal=; verify_arguments() defaults it to SIGRTMIN+3 when booting and nothing valid was set */
static int arg_kill_signal = 0;
/* Set by detect_unified_cgroup_hierarchy() from $UNIFIED_CGROUP_HIERARCHY or cg_unified() */
static bool arg_unified_cgroup_hierarchy = false;
/* SETTING_* bits for everything that was configured explicitly on the command line; --settings=
 * may reset this to 0 ("override") or to _SETTINGS_MASK_ALL ("no") at the end of parse_argv() */
static SettingsMask arg_settings_mask = 0;
/* Tri-state from --settings=: true for "trusted", false for "no", -1 otherwise */
static int arg_settings_trusted = -1;
/* Copy of the non-option arguments that follow the options */
static char **arg_parameters = NULL;
185
186 static void help(void) {
187 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
188 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
189 " -h --help Show this help\n"
190 " --version Print version string\n"
191 " -q --quiet Do not show status information\n"
192 " -D --directory=PATH Root directory for the container\n"
193 " --template=PATH Initialize root directory from template directory,\n"
194 " if missing\n"
195 " -x --ephemeral Run container with snapshot of root directory, and\n"
196 " remove it after exit\n"
197 " -i --image=PATH File system device or disk image for the container\n"
198 " -b --boot Boot up full system (i.e. invoke init)\n"
199 " -u --user=USER Run the command under specified user or uid\n"
200 " -M --machine=NAME Set the machine name for the container\n"
201 " --uuid=UUID Set a specific machine UUID for the container\n"
202 " -S --slice=SLICE Place the container in the specified slice\n"
203 " --property=NAME=VALUE Set scope unit property\n"
204 " --private-users[=UIDBASE[:NUIDS]]\n"
205 " Run within user namespace\n"
206 " --private-network Disable network in container\n"
207 " --network-interface=INTERFACE\n"
208 " Assign an existing network interface to the\n"
209 " container\n"
210 " --network-macvlan=INTERFACE\n"
211 " Create a macvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " --network-ipvlan=INTERFACE\n"
214 " Create a ipvlan network interface based on an\n"
215 " existing network interface to the container\n"
216 " -n --network-veth Add a virtual ethernet connection between host\n"
217 " and container\n"
218 " --network-bridge=INTERFACE\n"
219 " Add a virtual ethernet connection between host\n"
220 " and container and add it to an existing bridge on\n"
221 " the host\n"
222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223 " Expose a container IP port on the host\n"
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
230 " --capability=CAP In addition to the default, retain specified\n"
231 " capability\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
237 " --read-only Mount the root directory read-only\n"
238 " --bind=PATH[:PATH[:OPTIONS]]\n"
239 " Bind mount a file or directory from the host into\n"
240 " the container\n"
241 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
242 " Similar, but creates a read-only bind mount\n"
243 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
244 " --overlay=PATH[:PATH...]:PATH\n"
245 " Create an overlay mount from the host to \n"
246 " the container\n"
247 " --overlay-ro=PATH[:PATH...]:PATH\n"
248 " Similar, but creates a read-only overlay mount\n"
249 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
250 " --share-system Share system namespaces with host\n"
251 " --register=BOOLEAN Register container as machine\n"
252 " --keep-unit Do not register a scope for the machine, reuse\n"
253 " the service unit nspawn is running in\n"
254 " --volatile[=MODE] Run the system in volatile mode\n"
255 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
256 , program_invocation_short_name);
257 }
258
259 static CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
260 CustomMount *c, *ret;
261
262 assert(l);
263 assert(n);
264 assert(t >= 0);
265 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
266
267 c = realloc(*l, (*n + 1) * sizeof(CustomMount));
268 if (!c)
269 return NULL;
270
271 *l = c;
272 ret = *l + *n;
273 (*n)++;
274
275 *ret = (CustomMount) { .type = t };
276
277 return ret;
278 }
279
280 void custom_mount_free_all(CustomMount *l, unsigned n) {
281 unsigned i;
282
283 for (i = 0; i < n; i++) {
284 CustomMount *m = l + i;
285
286 free(m->source);
287 free(m->destination);
288 free(m->options);
289
290 if (m->work_dir) {
291 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
292 free(m->work_dir);
293 }
294
295 strv_free(m->lower);
296 }
297
298 free(l);
299 }
300
301 static int custom_mount_compare(const void *a, const void *b) {
302 const CustomMount *x = a, *y = b;
303 int r;
304
305 r = path_compare(x->destination, y->destination);
306 if (r != 0)
307 return r;
308
309 if (x->type < y->type)
310 return -1;
311 if (x->type > y->type)
312 return 1;
313
314 return 0;
315 }
316
317 static int custom_mounts_prepare(void) {
318 unsigned i;
319 int r;
320
321 /* Ensure the mounts are applied prefix first. */
322 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
323
324 /* Allocate working directories for the overlay file systems that need it */
325 for (i = 0; i < arg_n_custom_mounts; i++) {
326 CustomMount *m = &arg_custom_mounts[i];
327
328 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
329 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
330 return -EINVAL;
331 }
332
333 if (m->type != CUSTOM_MOUNT_OVERLAY)
334 continue;
335
336 if (m->work_dir)
337 continue;
338
339 if (m->read_only)
340 continue;
341
342 r = tempfn_random(m->source, NULL, &m->work_dir);
343 if (r < 0)
344 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
345 }
346
347 return 0;
348 }
349
/* Replaces *b with a cleaned-up version of path. The path is canonicalized if it exists; if it does
 * not exist (ENOENT) it is merely made absolute relative to the current working directory. The
 * previous value of *b is freed. Returns 0 on success or a negative errno-style error. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Path does not exist (yet), fall back to making it absolute */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
370
371 static int detect_unified_cgroup_hierarchy(void) {
372 const char *e;
373 int r;
374
375 /* Allow the user to control whether the unified hierarchy is used */
376 e = getenv("UNIFIED_CGROUP_HIERARCHY");
377 if (e) {
378 r = parse_boolean(e);
379 if (r < 0)
380 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
381
382 arg_unified_cgroup_hierarchy = r;
383 return 0;
384 }
385
386 /* Otherwise inherit the default from the host system */
387 r = cg_unified();
388 if (r < 0)
389 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
390
391 arg_unified_cgroup_hierarchy = r;
392 return 0;
393 }
394
395 VolatileMode volatile_mode_from_string(const char *s) {
396 int b;
397
398 if (isempty(s))
399 return _VOLATILE_MODE_INVALID;
400
401 b = parse_boolean(s);
402 if (b > 0)
403 return VOLATILE_YES;
404 if (b == 0)
405 return VOLATILE_NO;
406
407 if (streq(s, "state"))
408 return VOLATILE_STATE;
409
410 return _VOLATILE_MODE_INVALID;
411 }
412
413 int expose_port_parse(ExposePort **l, const char *s) {
414
415 const char *split, *e;
416 uint16_t container_port, host_port;
417 int protocol;
418 ExposePort *p;
419 int r;
420
421 if ((e = startswith(s, "tcp:")))
422 protocol = IPPROTO_TCP;
423 else if ((e = startswith(s, "udp:")))
424 protocol = IPPROTO_UDP;
425 else {
426 e = s;
427 protocol = IPPROTO_TCP;
428 }
429
430 split = strchr(e, ':');
431 if (split) {
432 char v[split - e + 1];
433
434 memcpy(v, e, split - e);
435 v[split - e] = 0;
436
437 r = safe_atou16(v, &host_port);
438 if (r < 0 || host_port <= 0)
439 return -EINVAL;
440
441 r = safe_atou16(split + 1, &container_port);
442 } else {
443 r = safe_atou16(e, &container_port);
444 host_port = container_port;
445 }
446
447 if (r < 0 || container_port <= 0)
448 return -EINVAL;
449
450 LIST_FOREACH(ports, p, arg_expose_ports)
451 if (p->protocol == protocol && p->host_port == host_port)
452 return -EEXIST;
453
454 p = new(ExposePort, 1);
455 if (!p)
456 return -ENOMEM;
457
458 p->protocol = protocol;
459 p->host_port = host_port;
460 p->container_port = container_port;
461
462 LIST_PREPEND(ports, *l, p);
463
464 return 0;
465 }
466
467 int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
468 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
469 const char *p = s;
470 CustomMount *m;
471 int r;
472
473 assert(l);
474 assert(n);
475
476 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
477 if (r < 0)
478 return r;
479 if (r == 0)
480 return -EINVAL;
481
482 if (r == 1) {
483 destination = strdup(source);
484 if (!destination)
485 return -ENOMEM;
486 }
487
488 if (r == 2 && !isempty(p)) {
489 opts = strdup(p);
490 if (!opts)
491 return -ENOMEM;
492 }
493
494 if (!path_is_absolute(source))
495 return -EINVAL;
496
497 if (!path_is_absolute(destination))
498 return -EINVAL;
499
500 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
501 if (!m)
502 return log_oom();
503
504 m->source = source;
505 m->destination = destination;
506 m->read_only = read_only;
507 m->options = opts;
508
509 source = destination = opts = NULL;
510 return 0;
511 }
512
513 int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
514 _cleanup_free_ char *path = NULL, *opts = NULL;
515 const char *p = s;
516 CustomMount *m;
517 int r;
518
519 assert(l);
520 assert(n);
521 assert(s);
522
523 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
524 if (r < 0)
525 return r;
526 if (r == 0)
527 return -EINVAL;
528
529 if (isempty(p))
530 opts = strdup("mode=0755");
531 else
532 opts = strdup(p);
533 if (!opts)
534 return -ENOMEM;
535
536 if (!path_is_absolute(path))
537 return -EINVAL;
538
539 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
540 if (!m)
541 return -ENOMEM;
542
543 m->destination = path;
544 m->options = opts;
545
546 path = opts = NULL;
547 return 0;
548 }
549
550 static int parse_argv(int argc, char *argv[]) {
551
552 enum {
553 ARG_VERSION = 0x100,
554 ARG_PRIVATE_NETWORK,
555 ARG_UUID,
556 ARG_READ_ONLY,
557 ARG_CAPABILITY,
558 ARG_DROP_CAPABILITY,
559 ARG_LINK_JOURNAL,
560 ARG_BIND,
561 ARG_BIND_RO,
562 ARG_TMPFS,
563 ARG_OVERLAY,
564 ARG_OVERLAY_RO,
565 ARG_SETENV,
566 ARG_SHARE_SYSTEM,
567 ARG_REGISTER,
568 ARG_KEEP_UNIT,
569 ARG_NETWORK_INTERFACE,
570 ARG_NETWORK_MACVLAN,
571 ARG_NETWORK_IPVLAN,
572 ARG_NETWORK_BRIDGE,
573 ARG_PERSONALITY,
574 ARG_VOLATILE,
575 ARG_TEMPLATE,
576 ARG_PROPERTY,
577 ARG_PRIVATE_USERS,
578 ARG_KILL_SIGNAL,
579 ARG_SETTINGS,
580 };
581
582 static const struct option options[] = {
583 { "help", no_argument, NULL, 'h' },
584 { "version", no_argument, NULL, ARG_VERSION },
585 { "directory", required_argument, NULL, 'D' },
586 { "template", required_argument, NULL, ARG_TEMPLATE },
587 { "ephemeral", no_argument, NULL, 'x' },
588 { "user", required_argument, NULL, 'u' },
589 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
590 { "boot", no_argument, NULL, 'b' },
591 { "uuid", required_argument, NULL, ARG_UUID },
592 { "read-only", no_argument, NULL, ARG_READ_ONLY },
593 { "capability", required_argument, NULL, ARG_CAPABILITY },
594 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
595 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
596 { "bind", required_argument, NULL, ARG_BIND },
597 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
598 { "tmpfs", required_argument, NULL, ARG_TMPFS },
599 { "overlay", required_argument, NULL, ARG_OVERLAY },
600 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
601 { "machine", required_argument, NULL, 'M' },
602 { "slice", required_argument, NULL, 'S' },
603 { "setenv", required_argument, NULL, ARG_SETENV },
604 { "selinux-context", required_argument, NULL, 'Z' },
605 { "selinux-apifs-context", required_argument, NULL, 'L' },
606 { "quiet", no_argument, NULL, 'q' },
607 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
608 { "register", required_argument, NULL, ARG_REGISTER },
609 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
610 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
611 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
612 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
613 { "network-veth", no_argument, NULL, 'n' },
614 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
615 { "personality", required_argument, NULL, ARG_PERSONALITY },
616 { "image", required_argument, NULL, 'i' },
617 { "volatile", optional_argument, NULL, ARG_VOLATILE },
618 { "port", required_argument, NULL, 'p' },
619 { "property", required_argument, NULL, ARG_PROPERTY },
620 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
621 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
622 { "settings", required_argument, NULL, ARG_SETTINGS },
623 {}
624 };
625
626 int c, r;
627 uint64_t plus = 0, minus = 0;
628 bool mask_all_settings = false, mask_no_settings = false;
629
630 assert(argc >= 0);
631 assert(argv);
632
633 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
634
635 switch (c) {
636
637 case 'h':
638 help();
639 return 0;
640
641 case ARG_VERSION:
642 puts(PACKAGE_STRING);
643 puts(SYSTEMD_FEATURES);
644 return 0;
645
646 case 'D':
647 r = set_sanitized_path(&arg_directory, optarg);
648 if (r < 0)
649 return log_error_errno(r, "Invalid root directory: %m");
650
651 break;
652
653 case ARG_TEMPLATE:
654 r = set_sanitized_path(&arg_template, optarg);
655 if (r < 0)
656 return log_error_errno(r, "Invalid template directory: %m");
657
658 break;
659
660 case 'i':
661 r = set_sanitized_path(&arg_image, optarg);
662 if (r < 0)
663 return log_error_errno(r, "Invalid image path: %m");
664
665 break;
666
667 case 'x':
668 arg_ephemeral = true;
669 break;
670
671 case 'u':
672 r = free_and_strdup(&arg_user, optarg);
673 if (r < 0)
674 return log_oom();
675
676 arg_settings_mask |= SETTING_USER;
677 break;
678
679 case ARG_NETWORK_BRIDGE:
680 r = free_and_strdup(&arg_network_bridge, optarg);
681 if (r < 0)
682 return log_oom();
683
684 /* fall through */
685
686 case 'n':
687 arg_network_veth = true;
688 arg_private_network = true;
689 arg_settings_mask |= SETTING_NETWORK;
690 break;
691
692 case ARG_NETWORK_INTERFACE:
693 if (strv_extend(&arg_network_interfaces, optarg) < 0)
694 return log_oom();
695
696 arg_private_network = true;
697 arg_settings_mask |= SETTING_NETWORK;
698 break;
699
700 case ARG_NETWORK_MACVLAN:
701 if (strv_extend(&arg_network_macvlan, optarg) < 0)
702 return log_oom();
703
704 arg_private_network = true;
705 arg_settings_mask |= SETTING_NETWORK;
706 break;
707
708 case ARG_NETWORK_IPVLAN:
709 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
710 return log_oom();
711
712 /* fall through */
713
714 case ARG_PRIVATE_NETWORK:
715 arg_private_network = true;
716 arg_settings_mask |= SETTING_NETWORK;
717 break;
718
719 case 'b':
720 arg_boot = true;
721 arg_settings_mask |= SETTING_BOOT;
722 break;
723
724 case ARG_UUID:
725 r = sd_id128_from_string(optarg, &arg_uuid);
726 if (r < 0) {
727 log_error("Invalid UUID: %s", optarg);
728 return r;
729 }
730
731 arg_settings_mask |= SETTING_MACHINE_ID;
732 break;
733
734 case 'S':
735 arg_slice = optarg;
736 break;
737
738 case 'M':
739 if (isempty(optarg))
740 arg_machine = mfree(arg_machine);
741 else {
742 if (!machine_name_is_valid(optarg)) {
743 log_error("Invalid machine name: %s", optarg);
744 return -EINVAL;
745 }
746
747 r = free_and_strdup(&arg_machine, optarg);
748 if (r < 0)
749 return log_oom();
750
751 break;
752 }
753
754 case 'Z':
755 arg_selinux_context = optarg;
756 break;
757
758 case 'L':
759 arg_selinux_apifs_context = optarg;
760 break;
761
762 case ARG_READ_ONLY:
763 arg_read_only = true;
764 arg_settings_mask |= SETTING_READ_ONLY;
765 break;
766
767 case ARG_CAPABILITY:
768 case ARG_DROP_CAPABILITY: {
769 const char *state, *word;
770 size_t length;
771
772 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
773 _cleanup_free_ char *t;
774
775 t = strndup(word, length);
776 if (!t)
777 return log_oom();
778
779 if (streq(t, "all")) {
780 if (c == ARG_CAPABILITY)
781 plus = (uint64_t) -1;
782 else
783 minus = (uint64_t) -1;
784 } else {
785 int cap;
786
787 cap = capability_from_name(t);
788 if (cap < 0) {
789 log_error("Failed to parse capability %s.", t);
790 return -EINVAL;
791 }
792
793 if (c == ARG_CAPABILITY)
794 plus |= 1ULL << (uint64_t) cap;
795 else
796 minus |= 1ULL << (uint64_t) cap;
797 }
798 }
799
800 arg_settings_mask |= SETTING_CAPABILITY;
801 break;
802 }
803
804 case 'j':
805 arg_link_journal = LINK_GUEST;
806 arg_link_journal_try = true;
807 break;
808
809 case ARG_LINK_JOURNAL:
810 if (streq(optarg, "auto")) {
811 arg_link_journal = LINK_AUTO;
812 arg_link_journal_try = false;
813 } else if (streq(optarg, "no")) {
814 arg_link_journal = LINK_NO;
815 arg_link_journal_try = false;
816 } else if (streq(optarg, "guest")) {
817 arg_link_journal = LINK_GUEST;
818 arg_link_journal_try = false;
819 } else if (streq(optarg, "host")) {
820 arg_link_journal = LINK_HOST;
821 arg_link_journal_try = false;
822 } else if (streq(optarg, "try-guest")) {
823 arg_link_journal = LINK_GUEST;
824 arg_link_journal_try = true;
825 } else if (streq(optarg, "try-host")) {
826 arg_link_journal = LINK_HOST;
827 arg_link_journal_try = true;
828 } else {
829 log_error("Failed to parse link journal mode %s", optarg);
830 return -EINVAL;
831 }
832
833 break;
834
835 case ARG_BIND:
836 case ARG_BIND_RO:
837 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
838 if (r < 0)
839 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
840
841 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
842 break;
843
844 case ARG_TMPFS:
845 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
846 if (r < 0)
847 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
848
849 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
850 break;
851
852 case ARG_OVERLAY:
853 case ARG_OVERLAY_RO: {
854 _cleanup_free_ char *upper = NULL, *destination = NULL;
855 _cleanup_strv_free_ char **lower = NULL;
856 CustomMount *m;
857 unsigned n = 0;
858 char **i;
859
860 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
861 if (r == -ENOMEM)
862 return log_oom();
863 else if (r < 0) {
864 log_error("Invalid overlay specification: %s", optarg);
865 return r;
866 }
867
868 STRV_FOREACH(i, lower) {
869 if (!path_is_absolute(*i)) {
870 log_error("Overlay path %s is not absolute.", *i);
871 return -EINVAL;
872 }
873
874 n++;
875 }
876
877 if (n < 2) {
878 log_error("--overlay= needs at least two colon-separated directories specified.");
879 return -EINVAL;
880 }
881
882 if (n == 2) {
883 /* If two parameters are specified,
884 * the first one is the lower, the
885 * second one the upper directory. And
886 * we'll also define the destination
887 * mount point the same as the upper. */
888 upper = lower[1];
889 lower[1] = NULL;
890
891 destination = strdup(upper);
892 if (!destination)
893 return log_oom();
894
895 } else {
896 upper = lower[n - 2];
897 destination = lower[n - 1];
898 lower[n - 2] = NULL;
899 }
900
901 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
902 if (!m)
903 return log_oom();
904
905 m->destination = destination;
906 m->source = upper;
907 m->lower = lower;
908 m->read_only = c == ARG_OVERLAY_RO;
909
910 upper = destination = NULL;
911 lower = NULL;
912
913 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
914 break;
915 }
916
917 case ARG_SETENV: {
918 char **n;
919
920 if (!env_assignment_is_valid(optarg)) {
921 log_error("Environment variable assignment '%s' is not valid.", optarg);
922 return -EINVAL;
923 }
924
925 n = strv_env_set(arg_setenv, optarg);
926 if (!n)
927 return log_oom();
928
929 strv_free(arg_setenv);
930 arg_setenv = n;
931
932 arg_settings_mask |= SETTING_ENVIRONMENT;
933 break;
934 }
935
936 case 'q':
937 arg_quiet = true;
938 break;
939
940 case ARG_SHARE_SYSTEM:
941 arg_share_system = true;
942 break;
943
944 case ARG_REGISTER:
945 r = parse_boolean(optarg);
946 if (r < 0) {
947 log_error("Failed to parse --register= argument: %s", optarg);
948 return r;
949 }
950
951 arg_register = r;
952 break;
953
954 case ARG_KEEP_UNIT:
955 arg_keep_unit = true;
956 break;
957
958 case ARG_PERSONALITY:
959
960 arg_personality = personality_from_string(optarg);
961 if (arg_personality == PERSONALITY_INVALID) {
962 log_error("Unknown or unsupported personality '%s'.", optarg);
963 return -EINVAL;
964 }
965
966 arg_settings_mask |= SETTING_PERSONALITY;
967 break;
968
969 case ARG_VOLATILE:
970
971 if (!optarg)
972 arg_volatile_mode = VOLATILE_YES;
973 else {
974 VolatileMode m;
975
976 m = volatile_mode_from_string(optarg);
977 if (m < 0) {
978 log_error("Failed to parse --volatile= argument: %s", optarg);
979 return -EINVAL;
980 } else
981 arg_volatile_mode = m;
982 }
983
984 arg_settings_mask |= SETTING_VOLATILE_MODE;
985 break;
986
987 case 'p':
988 r = expose_port_parse(&arg_expose_ports, optarg);
989 if (r == -EEXIST)
990 return log_error_errno(r, "Duplicate port specification: %s", optarg);
991 if (r < 0)
992 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
993
994 arg_settings_mask |= SETTING_EXPOSE_PORTS;
995 break;
996
997 case ARG_PROPERTY:
998 if (strv_extend(&arg_property, optarg) < 0)
999 return log_oom();
1000
1001 break;
1002
1003 case ARG_PRIVATE_USERS:
1004 if (optarg) {
1005 _cleanup_free_ char *buffer = NULL;
1006 const char *range, *shift;
1007
1008 range = strchr(optarg, ':');
1009 if (range) {
1010 buffer = strndup(optarg, range - optarg);
1011 if (!buffer)
1012 return log_oom();
1013 shift = buffer;
1014
1015 range++;
1016 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
1017 log_error("Failed to parse UID range: %s", range);
1018 return -EINVAL;
1019 }
1020 } else
1021 shift = optarg;
1022
1023 if (parse_uid(shift, &arg_uid_shift) < 0) {
1024 log_error("Failed to parse UID: %s", optarg);
1025 return -EINVAL;
1026 }
1027 }
1028
1029 arg_userns = true;
1030 break;
1031
1032 case ARG_KILL_SIGNAL:
1033 arg_kill_signal = signal_from_string_try_harder(optarg);
1034 if (arg_kill_signal < 0) {
1035 log_error("Cannot parse signal: %s", optarg);
1036 return -EINVAL;
1037 }
1038
1039 arg_settings_mask |= SETTING_KILL_SIGNAL;
1040 break;
1041
1042 case ARG_SETTINGS:
1043
1044 /* no → do not read files
1045 * yes → read files, do not override cmdline, trust only subset
1046 * override → read files, override cmdline, trust only subset
1047 * trusted → read files, do not override cmdline, trust all
1048 */
1049
1050 r = parse_boolean(optarg);
1051 if (r < 0) {
1052 if (streq(optarg, "trusted")) {
1053 mask_all_settings = false;
1054 mask_no_settings = false;
1055 arg_settings_trusted = true;
1056
1057 } else if (streq(optarg, "override")) {
1058 mask_all_settings = false;
1059 mask_no_settings = true;
1060 arg_settings_trusted = -1;
1061 } else
1062 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1063 } else if (r > 0) {
1064 /* yes */
1065 mask_all_settings = false;
1066 mask_no_settings = false;
1067 arg_settings_trusted = -1;
1068 } else {
1069 /* no */
1070 mask_all_settings = true;
1071 mask_no_settings = false;
1072 arg_settings_trusted = false;
1073 }
1074
1075 break;
1076
1077 case '?':
1078 return -EINVAL;
1079
1080 default:
1081 assert_not_reached("Unhandled option");
1082 }
1083
1084 if (arg_share_system)
1085 arg_register = false;
1086
1087 if (arg_boot && arg_share_system) {
1088 log_error("--boot and --share-system may not be combined.");
1089 return -EINVAL;
1090 }
1091
1092 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
1093 log_error("--keep-unit may not be used when invoked from a user session.");
1094 return -EINVAL;
1095 }
1096
1097 if (arg_directory && arg_image) {
1098 log_error("--directory= and --image= may not be combined.");
1099 return -EINVAL;
1100 }
1101
1102 if (arg_template && arg_image) {
1103 log_error("--template= and --image= may not be combined.");
1104 return -EINVAL;
1105 }
1106
1107 if (arg_template && !(arg_directory || arg_machine)) {
1108 log_error("--template= needs --directory= or --machine=.");
1109 return -EINVAL;
1110 }
1111
1112 if (arg_ephemeral && arg_template) {
1113 log_error("--ephemeral and --template= may not be combined.");
1114 return -EINVAL;
1115 }
1116
1117 if (arg_ephemeral && arg_image) {
1118 log_error("--ephemeral and --image= may not be combined.");
1119 return -EINVAL;
1120 }
1121
1122 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1123 log_error("--ephemeral and --link-journal= may not be combined.");
1124 return -EINVAL;
1125 }
1126
1127 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1128 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1129
1130 if (argc > optind) {
1131 arg_parameters = strv_copy(argv + optind);
1132 if (!arg_parameters)
1133 return log_oom();
1134
1135 arg_settings_mask |= SETTING_BOOT;
1136 }
1137
1138 /* Load all settings from .nspawn files */
1139 if (mask_no_settings)
1140 arg_settings_mask = 0;
1141
1142 /* Don't load any settings from .nspawn files */
1143 if (mask_all_settings)
1144 arg_settings_mask = _SETTINGS_MASK_ALL;
1145
1146 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1147
1148 r = detect_unified_cgroup_hierarchy();
1149 if (r < 0)
1150 return r;
1151
1152 return 1;
1153 }
1154
1155 static int verify_arguments(void) {
1156
1157 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
1158 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1159 return -EINVAL;
1160 }
1161
1162 if (arg_expose_ports && !arg_private_network) {
1163 log_error("Cannot use --port= without private networking.");
1164 return -EINVAL;
1165 }
1166
1167 if (arg_boot && arg_kill_signal <= 0)
1168 arg_kill_signal = SIGRTMIN+3;
1169
1170 return 0;
1171 }
1172
1173 static int tmpfs_patch_options(const char *options, char **ret) {
1174 char *buf = NULL;
1175
1176 if (arg_userns && arg_uid_shift != 0) {
1177 assert(arg_uid_shift != UID_INVALID);
1178
1179 if (options)
1180 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1181 else
1182 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1183 if (!buf)
1184 return -ENOMEM;
1185
1186 options = buf;
1187 }
1188
1189 #ifdef HAVE_SELINUX
1190 if (arg_selinux_apifs_context) {
1191 char *t;
1192
1193 if (options)
1194 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1195 else
1196 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1197 if (!t) {
1198 free(buf);
1199 return -ENOMEM;
1200 }
1201
1202 free(buf);
1203 buf = t;
1204 }
1205 #endif
1206
1207 *ret = buf;
1208 return !!buf;
1209 }
1210
1211 static int mount_all(const char *dest, bool userns) {
1212
1213 typedef struct MountPoint {
1214 const char *what;
1215 const char *where;
1216 const char *type;
1217 const char *options;
1218 unsigned long flags;
1219 bool fatal;
1220 bool userns;
1221 } MountPoint;
1222
1223 static const MountPoint mount_table[] = {
1224 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1225 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1226 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1227 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1228 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1229 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1230 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1231 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1232 #ifdef HAVE_SELINUX
1233 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1234 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
1235 #endif
1236 };
1237
1238 unsigned k;
1239 int r;
1240
1241 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1242 _cleanup_free_ char *where = NULL, *options = NULL;
1243 const char *o;
1244
1245 if (userns != mount_table[k].userns)
1246 continue;
1247
1248 where = prefix_root(dest, mount_table[k].where);
1249 if (!where)
1250 return log_oom();
1251
1252 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1253 if (r < 0 && r != -ENOENT)
1254 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1255
1256 /* Skip this entry if it is not a remount. */
1257 if (mount_table[k].what && r > 0)
1258 continue;
1259
1260 r = mkdir_p(where, 0755);
1261 if (r < 0) {
1262 if (mount_table[k].fatal)
1263 return log_error_errno(r, "Failed to create directory %s: %m", where);
1264
1265 log_warning_errno(r, "Failed to create directory %s: %m", where);
1266 continue;
1267 }
1268
1269 o = mount_table[k].options;
1270 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1271 r = tmpfs_patch_options(o, &options);
1272 if (r < 0)
1273 return log_oom();
1274 if (r > 0)
1275 o = options;
1276 }
1277
1278 if (mount(mount_table[k].what,
1279 where,
1280 mount_table[k].type,
1281 mount_table[k].flags,
1282 o) < 0) {
1283
1284 if (mount_table[k].fatal)
1285 return log_error_errno(errno, "mount(%s) failed: %m", where);
1286
1287 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1288 }
1289 }
1290
1291 return 0;
1292 }
1293
1294 static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
1295 const char *p = options;
1296 unsigned long flags = *mount_flags;
1297 char *opts = NULL;
1298
1299 assert(options);
1300
1301 for (;;) {
1302 _cleanup_free_ char *word = NULL;
1303 int r = extract_first_word(&p, &word, ",", 0);
1304 if (r < 0)
1305 return log_error_errno(r, "Failed to extract mount option: %m");
1306 if (r == 0)
1307 break;
1308
1309 if (streq(word, "rbind"))
1310 flags |= MS_REC;
1311 else if (streq(word, "norbind"))
1312 flags &= ~MS_REC;
1313 else {
1314 log_error("Invalid bind mount option: %s", word);
1315 return -EINVAL;
1316 }
1317 }
1318
1319 *mount_flags = flags;
1320 /* in the future mount_opts will hold string options for mount(2) */
1321 *mount_opts = opts;
1322
1323 return 0;
1324 }
1325
1326 static int mount_bind(const char *dest, CustomMount *m) {
1327 struct stat source_st, dest_st;
1328 const char *where;
1329 unsigned long mount_flags = MS_BIND | MS_REC;
1330 _cleanup_free_ char *mount_opts = NULL;
1331 int r;
1332
1333 assert(m);
1334
1335 if (m->options) {
1336 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
1337 if (r < 0)
1338 return r;
1339 }
1340
1341 if (stat(m->source, &source_st) < 0)
1342 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1343
1344 where = prefix_roota(dest, m->destination);
1345
1346 if (stat(where, &dest_st) >= 0) {
1347 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1348 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1349 return -EINVAL;
1350 }
1351
1352 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1353 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1354 return -EINVAL;
1355 }
1356
1357 } else if (errno == ENOENT) {
1358 r = mkdir_parents_label(where, 0755);
1359 if (r < 0)
1360 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1361 } else {
1362 log_error_errno(errno, "Failed to stat %s: %m", where);
1363 return -errno;
1364 }
1365
1366 /* Create the mount point. Any non-directory file can be
1367 * mounted on any non-directory file (regular, fifo, socket,
1368 * char, block).
1369 */
1370 if (S_ISDIR(source_st.st_mode))
1371 r = mkdir_label(where, 0755);
1372 else
1373 r = touch(where);
1374 if (r < 0 && r != -EEXIST)
1375 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1376
1377 if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
1378 return log_error_errno(errno, "mount(%s) failed: %m", where);
1379
1380 if (m->read_only) {
1381 r = bind_remount_recursive(where, true);
1382 if (r < 0)
1383 return log_error_errno(r, "Read-only bind mount failed: %m");
1384 }
1385
1386 return 0;
1387 }
1388
1389 static int mount_tmpfs(const char *dest, CustomMount *m) {
1390 const char *where, *options;
1391 _cleanup_free_ char *buf = NULL;
1392 int r;
1393
1394 assert(dest);
1395 assert(m);
1396
1397 where = prefix_roota(dest, m->destination);
1398
1399 r = mkdir_p_label(where, 0755);
1400 if (r < 0 && r != -EEXIST)
1401 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1402
1403 r = tmpfs_patch_options(m->options, &buf);
1404 if (r < 0)
1405 return log_oom();
1406 options = r > 0 ? buf : m->options;
1407
1408 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1409 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1410
1411 return 0;
1412 }
1413
1414 static char *joined_and_escaped_lower_dirs(char * const *lower) {
1415 _cleanup_strv_free_ char **sv = NULL;
1416
1417 sv = strv_copy(lower);
1418 if (!sv)
1419 return NULL;
1420
1421 strv_reverse(sv);
1422
1423 if (!strv_shell_escape(sv, ",:"))
1424 return NULL;
1425
1426 return strv_join(sv, ":");
1427 }
1428
1429 static int mount_overlay(const char *dest, CustomMount *m) {
1430 _cleanup_free_ char *lower = NULL;
1431 const char *where, *options;
1432 int r;
1433
1434 assert(dest);
1435 assert(m);
1436
1437 where = prefix_roota(dest, m->destination);
1438
1439 r = mkdir_label(where, 0755);
1440 if (r < 0 && r != -EEXIST)
1441 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1442
1443 (void) mkdir_p_label(m->source, 0755);
1444
1445 lower = joined_and_escaped_lower_dirs(m->lower);
1446 if (!lower)
1447 return log_oom();
1448
1449 if (m->read_only) {
1450 _cleanup_free_ char *escaped_source = NULL;
1451
1452 escaped_source = shell_escape(m->source, ",:");
1453 if (!escaped_source)
1454 return log_oom();
1455
1456 options = strjoina("lowerdir=", escaped_source, ":", lower);
1457 } else {
1458 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
1459
1460 assert(m->work_dir);
1461 (void) mkdir_label(m->work_dir, 0700);
1462
1463 escaped_source = shell_escape(m->source, ",:");
1464 if (!escaped_source)
1465 return log_oom();
1466 escaped_work_dir = shell_escape(m->work_dir, ",:");
1467 if (!escaped_work_dir)
1468 return log_oom();
1469
1470 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
1471 }
1472
1473 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1474 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1475
1476 return 0;
1477 }
1478
1479 static int mount_custom(const char *dest) {
1480 unsigned i;
1481 int r;
1482
1483 assert(dest);
1484
1485 for (i = 0; i < arg_n_custom_mounts; i++) {
1486 CustomMount *m = &arg_custom_mounts[i];
1487
1488 switch (m->type) {
1489
1490 case CUSTOM_MOUNT_BIND:
1491 r = mount_bind(dest, m);
1492 break;
1493
1494 case CUSTOM_MOUNT_TMPFS:
1495 r = mount_tmpfs(dest, m);
1496 break;
1497
1498 case CUSTOM_MOUNT_OVERLAY:
1499 r = mount_overlay(dest, m);
1500 break;
1501
1502 default:
1503 assert_not_reached("Unknown custom mount type");
1504 }
1505
1506 if (r < 0)
1507 return r;
1508 }
1509
1510 return 0;
1511 }
1512
1513 static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1514 char *to;
1515 int r;
1516
1517 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1518
1519 r = path_is_mount_point(to, 0);
1520 if (r < 0 && r != -ENOENT)
1521 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1522 if (r > 0)
1523 return 0;
1524
1525 mkdir_p(to, 0755);
1526
1527 /* The superblock mount options of the mount point need to be
1528 * identical to the hosts', and hence writable... */
1529 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1530 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1531
1532 /* ... hence let's only make the bind mount read-only, not the
1533 * superblock. */
1534 if (read_only) {
1535 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1536 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1537 }
1538 return 1;
1539 }
1540
1541 static int mount_legacy_cgroups(const char *dest) {
1542 _cleanup_set_free_free_ Set *controllers = NULL;
1543 const char *cgroup_root;
1544 int r;
1545
1546 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1547
1548 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
1549 r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
1550 if (r < 0)
1551 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1552 if (r == 0) {
1553 _cleanup_free_ char *options = NULL;
1554
1555 r = tmpfs_patch_options("mode=755", &options);
1556 if (r < 0)
1557 return log_oom();
1558
1559 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
1560 return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
1561 }
1562
1563 if (cg_unified() > 0)
1564 goto skip_controllers;
1565
1566 controllers = set_new(&string_hash_ops);
1567 if (!controllers)
1568 return log_oom();
1569
1570 r = cg_kernel_controllers(controllers);
1571 if (r < 0)
1572 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1573
1574 for (;;) {
1575 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1576
1577 controller = set_steal_first(controllers);
1578 if (!controller)
1579 break;
1580
1581 origin = prefix_root("/sys/fs/cgroup/", controller);
1582 if (!origin)
1583 return log_oom();
1584
1585 r = readlink_malloc(origin, &combined);
1586 if (r == -EINVAL) {
1587 /* Not a symbolic link, but directly a single cgroup hierarchy */
1588
1589 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
1590 if (r < 0)
1591 return r;
1592
1593 } else if (r < 0)
1594 return log_error_errno(r, "Failed to read link %s: %m", origin);
1595 else {
1596 _cleanup_free_ char *target = NULL;
1597
1598 target = prefix_root(dest, origin);
1599 if (!target)
1600 return log_oom();
1601
1602 /* A symbolic link, a combination of controllers in one hierarchy */
1603
1604 if (!filename_is_valid(combined)) {
1605 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1606 continue;
1607 }
1608
1609 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
1610 if (r < 0)
1611 return r;
1612
1613 r = symlink_idempotent(combined, target);
1614 if (r == -EINVAL) {
1615 log_error("Invalid existing symlink for combined hierarchy");
1616 return r;
1617 }
1618 if (r < 0)
1619 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1620 }
1621 }
1622
1623 skip_controllers:
1624 r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
1625 if (r < 0)
1626 return r;
1627
1628 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1629 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1630
1631 return 0;
1632 }
1633
1634 static int mount_unified_cgroups(const char *dest) {
1635 const char *p;
1636 int r;
1637
1638 assert(dest);
1639
1640 p = strjoina(dest, "/sys/fs/cgroup");
1641
1642 r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
1643 if (r < 0)
1644 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
1645 if (r > 0) {
1646 p = strjoina(dest, "/sys/fs/cgroup/cgroup.procs");
1647 if (access(p, F_OK) >= 0)
1648 return 0;
1649 if (errno != ENOENT)
1650 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
1651
1652 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
1653 return -EINVAL;
1654 }
1655
1656 if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
1657 return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
1658
1659 return 0;
1660 }
1661
1662 static int mount_cgroups(const char *dest) {
1663 if (arg_unified_cgroup_hierarchy)
1664 return mount_unified_cgroups(dest);
1665 else
1666 return mount_legacy_cgroups(dest);
1667 }
1668
1669 static int mount_systemd_cgroup_writable(const char *dest) {
1670 _cleanup_free_ char *own_cgroup_path = NULL;
1671 const char *systemd_root, *systemd_own;
1672 int r;
1673
1674 assert(dest);
1675
1676 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1677 if (r < 0)
1678 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1679
1680 /* If we are living in the top-level, then there's nothing to do... */
1681 if (path_equal(own_cgroup_path, "/"))
1682 return 0;
1683
1684 if (arg_unified_cgroup_hierarchy) {
1685 systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
1686 systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
1687 } else {
1688 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1689 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1690 }
1691
1692 /* Make our own cgroup a (writable) bind mount */
1693 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1694 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1695
1696 /* And then remount the systemd cgroup root read-only */
1697 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1698 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1699
1700 return 0;
1701 }
1702
1703 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1704 assert(p);
1705
1706 if (!arg_userns)
1707 return 0;
1708
1709 if (uid == UID_INVALID && gid == GID_INVALID)
1710 return 0;
1711
1712 if (uid != UID_INVALID) {
1713 uid += arg_uid_shift;
1714
1715 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1716 return -EOVERFLOW;
1717 }
1718
1719 if (gid != GID_INVALID) {
1720 gid += (gid_t) arg_uid_shift;
1721
1722 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1723 return -EOVERFLOW;
1724 }
1725
1726 if (lchown(p, uid, gid) < 0)
1727 return -errno;
1728
1729 return 0;
1730 }
1731
/* Creates the directory "path" below "root" and, if it was newly created, hands it to userns_lchown() with the
 * given UID/GID. An already existing directory is left alone and reported as success. Returns 0 on success,
 * negative errno-style error on failure. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1744
1745 static int setup_timezone(const char *dest) {
1746 _cleanup_free_ char *p = NULL, *q = NULL;
1747 const char *where, *check, *what;
1748 char *z, *y;
1749 int r;
1750
1751 assert(dest);
1752
1753 /* Fix the timezone, if possible */
1754 r = readlink_malloc("/etc/localtime", &p);
1755 if (r < 0) {
1756 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1757 return 0;
1758 }
1759
1760 z = path_startswith(p, "../usr/share/zoneinfo/");
1761 if (!z)
1762 z = path_startswith(p, "/usr/share/zoneinfo/");
1763 if (!z) {
1764 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1765 return 0;
1766 }
1767
1768 where = prefix_roota(dest, "/etc/localtime");
1769 r = readlink_malloc(where, &q);
1770 if (r >= 0) {
1771 y = path_startswith(q, "../usr/share/zoneinfo/");
1772 if (!y)
1773 y = path_startswith(q, "/usr/share/zoneinfo/");
1774
1775 /* Already pointing to the right place? Then do nothing .. */
1776 if (y && streq(y, z))
1777 return 0;
1778 }
1779
1780 check = strjoina("/usr/share/zoneinfo/", z);
1781 check = prefix_root(dest, check);
1782 if (laccess(check, F_OK) < 0) {
1783 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1784 return 0;
1785 }
1786
1787 r = unlink(where);
1788 if (r < 0 && errno != ENOENT) {
1789 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1790 return 0;
1791 }
1792
1793 what = strjoina("../usr/share/zoneinfo/", z);
1794 if (symlink(what, where) < 0) {
1795 log_error_errno(errno, "Failed to correct timezone of container: %m");
1796 return 0;
1797 }
1798
1799 r = userns_lchown(where, 0, 0);
1800 if (r < 0)
1801 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1802
1803 return 0;
1804 }
1805
1806 static int setup_resolv_conf(const char *dest) {
1807 const char *where = NULL;
1808 int r;
1809
1810 assert(dest);
1811
1812 if (arg_private_network)
1813 return 0;
1814
1815 /* Fix resolv.conf, if possible */
1816 where = prefix_roota(dest, "/etc/resolv.conf");
1817
1818 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1819 if (r < 0) {
1820 /* If the file already exists as symlink, let's
1821 * suppress the warning, under the assumption that
1822 * resolved or something similar runs inside and the
1823 * symlink points there.
1824 *
1825 * If the disk image is read-only, there's also no
1826 * point in complaining.
1827 */
1828 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1829 "Failed to copy /etc/resolv.conf to %s: %m", where);
1830 return 0;
1831 }
1832
1833 r = userns_lchown(where, 0, 0);
1834 if (r < 0)
1835 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1836
1837 return 0;
1838 }
1839
1840 static int setup_volatile_state(const char *directory) {
1841 _cleanup_free_ char *buf = NULL;
1842 const char *p, *options;
1843 int r;
1844
1845 assert(directory);
1846
1847 if (arg_volatile_mode != VOLATILE_STATE)
1848 return 0;
1849
1850 /* --volatile=state means we simply overmount /var
1851 with a tmpfs, and the rest read-only. */
1852
1853 r = bind_remount_recursive(directory, true);
1854 if (r < 0)
1855 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1856
1857 p = prefix_roota(directory, "/var");
1858 r = mkdir(p, 0755);
1859 if (r < 0 && errno != EEXIST)
1860 return log_error_errno(errno, "Failed to create %s: %m", directory);
1861
1862 options = "mode=755";
1863 r = tmpfs_patch_options(options, &buf);
1864 if (r < 0)
1865 return log_oom();
1866 if (r > 0)
1867 options = buf;
1868
1869 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1870 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1871
1872 return 0;
1873 }
1874
1875 static int setup_volatile(const char *directory) {
1876 bool tmpfs_mounted = false, bind_mounted = false;
1877 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1878 _cleanup_free_ char *buf = NULL;
1879 const char *f, *t, *options;
1880 int r;
1881
1882 assert(directory);
1883
1884 if (arg_volatile_mode != VOLATILE_YES)
1885 return 0;
1886
1887 /* --volatile=yes means we mount a tmpfs to the root dir, and
1888 the original /usr to use inside it, and that read-only. */
1889
1890 if (!mkdtemp(template))
1891 return log_error_errno(errno, "Failed to create temporary directory: %m");
1892
1893 options = "mode=755";
1894 r = tmpfs_patch_options(options, &buf);
1895 if (r < 0)
1896 return log_oom();
1897 if (r > 0)
1898 options = buf;
1899
1900 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1901 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1902 goto fail;
1903 }
1904
1905 tmpfs_mounted = true;
1906
1907 f = prefix_roota(directory, "/usr");
1908 t = prefix_roota(template, "/usr");
1909
1910 r = mkdir(t, 0755);
1911 if (r < 0 && errno != EEXIST) {
1912 r = log_error_errno(errno, "Failed to create %s: %m", t);
1913 goto fail;
1914 }
1915
1916 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1917 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1918 goto fail;
1919 }
1920
1921 bind_mounted = true;
1922
1923 r = bind_remount_recursive(t, true);
1924 if (r < 0) {
1925 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1926 goto fail;
1927 }
1928
1929 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1930 r = log_error_errno(errno, "Failed to move root mount: %m");
1931 goto fail;
1932 }
1933
1934 (void) rmdir(template);
1935
1936 return 0;
1937
1938 fail:
1939 if (bind_mounted)
1940 (void) umount(t);
1941
1942 if (tmpfs_mounted)
1943 (void) umount(template);
1944 (void) rmdir(template);
1945 return r;
1946 }
1947
1948 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1949 assert(s);
1950
1951 snprintf(s, 37,
1952 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1953 SD_ID128_FORMAT_VAL(id));
1954
1955 return s;
1956 }
1957
1958 static int setup_boot_id(const char *dest) {
1959 const char *from, *to;
1960 sd_id128_t rnd = {};
1961 char as_uuid[37];
1962 int r;
1963
1964 if (arg_share_system)
1965 return 0;
1966
1967 /* Generate a new randomized boot ID, so that each boot-up of
1968 * the container gets a new one */
1969
1970 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1971 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1972
1973 r = sd_id128_randomize(&rnd);
1974 if (r < 0)
1975 return log_error_errno(r, "Failed to generate random boot id: %m");
1976
1977 id128_format_as_uuid(rnd, as_uuid);
1978
1979 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1980 if (r < 0)
1981 return log_error_errno(r, "Failed to write boot id: %m");
1982
1983 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1984 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1985 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1986 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1987
1988 unlink(from);
1989 return r;
1990 }
1991
1992 static int copy_devnodes(const char *dest) {
1993
1994 static const char devnodes[] =
1995 "null\0"
1996 "zero\0"
1997 "full\0"
1998 "random\0"
1999 "urandom\0"
2000 "tty\0"
2001 "net/tun\0";
2002
2003 const char *d;
2004 int r = 0;
2005 _cleanup_umask_ mode_t u;
2006
2007 assert(dest);
2008
2009 u = umask(0000);
2010
2011 /* Create /dev/net, so that we can create /dev/net/tun in it */
2012 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2013 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2014
2015 NULSTR_FOREACH(d, devnodes) {
2016 _cleanup_free_ char *from = NULL, *to = NULL;
2017 struct stat st;
2018
2019 from = strappend("/dev/", d);
2020 to = prefix_root(dest, from);
2021
2022 if (stat(from, &st) < 0) {
2023
2024 if (errno != ENOENT)
2025 return log_error_errno(errno, "Failed to stat %s: %m", from);
2026
2027 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
2028
2029 log_error("%s is not a char or block device, cannot copy.", from);
2030 return -EIO;
2031
2032 } else {
2033 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2034 if (errno != EPERM)
2035 return log_error_errno(errno, "mknod(%s) failed: %m", to);
2036
2037 /* Some systems abusively restrict mknod but
2038 * allow bind mounts. */
2039 r = touch(to);
2040 if (r < 0)
2041 return log_error_errno(r, "touch (%s) failed: %m", to);
2042 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
2043 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
2044 }
2045
2046 r = userns_lchown(to, 0, 0);
2047 if (r < 0)
2048 return log_error_errno(r, "chown() of device node %s failed: %m", to);
2049 }
2050 }
2051
2052 return r;
2053 }
2054
2055 static int setup_pts(const char *dest) {
2056 _cleanup_free_ char *options = NULL;
2057 const char *p;
2058
2059 #ifdef HAVE_SELINUX
2060 if (arg_selinux_apifs_context)
2061 (void) asprintf(&options,
2062 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2063 arg_uid_shift + TTY_GID,
2064 arg_selinux_apifs_context);
2065 else
2066 #endif
2067 (void) asprintf(&options,
2068 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2069 arg_uid_shift + TTY_GID);
2070
2071 if (!options)
2072 return log_oom();
2073
2074 /* Mount /dev/pts itself */
2075 p = prefix_roota(dest, "/dev/pts");
2076 if (mkdir(p, 0755) < 0)
2077 return log_error_errno(errno, "Failed to create /dev/pts: %m");
2078 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
2079 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
2080 if (userns_lchown(p, 0, 0) < 0)
2081 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
2082
2083 /* Create /dev/ptmx symlink */
2084 p = prefix_roota(dest, "/dev/ptmx");
2085 if (symlink("pts/ptmx", p) < 0)
2086 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2087 if (userns_lchown(p, 0, 0) < 0)
2088 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
2089
2090 /* And fix /dev/pts/ptmx ownership */
2091 p = prefix_roota(dest, "/dev/pts/ptmx");
2092 if (userns_lchown(p, 0, 0) < 0)
2093 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
2094
2095 return 0;
2096 }
2097
2098 static int setup_dev_console(const char *dest, const char *console) {
2099 _cleanup_umask_ mode_t u;
2100 const char *to;
2101 int r;
2102
2103 assert(dest);
2104 assert(console);
2105
2106 u = umask(0000);
2107
2108 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
2109 if (r < 0)
2110 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
2111
2112 /* We need to bind mount the right tty to /dev/console since
2113 * ptys can only exist on pts file systems. To have something
2114 * to bind mount things on we create a empty regular file. */
2115
2116 to = prefix_roota(dest, "/dev/console");
2117 r = touch(to);
2118 if (r < 0)
2119 return log_error_errno(r, "touch() for /dev/console failed: %m");
2120
2121 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
2122 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
2123
2124 return 0;
2125 }
2126
2127 static int setup_kmsg(const char *dest, int kmsg_socket) {
2128 const char *from, *to;
2129 _cleanup_umask_ mode_t u;
2130 int fd, k;
2131 union {
2132 struct cmsghdr cmsghdr;
2133 uint8_t buf[CMSG_SPACE(sizeof(int))];
2134 } control = {};
2135 struct msghdr mh = {
2136 .msg_control = &control,
2137 .msg_controllen = sizeof(control),
2138 };
2139 struct cmsghdr *cmsg;
2140
2141 assert(kmsg_socket >= 0);
2142
2143 u = umask(0000);
2144
2145 /* We create the kmsg FIFO as /run/kmsg, but immediately
2146 * delete it after bind mounting it to /proc/kmsg. While FIFOs
2147 * on the reading side behave very similar to /proc/kmsg,
2148 * their writing side behaves differently from /dev/kmsg in
2149 * that writing blocks when nothing is reading. In order to
2150 * avoid any problems with containers deadlocking due to this
2151 * we simply make /dev/kmsg unavailable to the container. */
2152 from = prefix_roota(dest, "/run/kmsg");
2153 to = prefix_roota(dest, "/proc/kmsg");
2154
2155 if (mkfifo(from, 0600) < 0)
2156 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2157 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
2158 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
2159
2160 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
2161 if (fd < 0)
2162 return log_error_errno(errno, "Failed to open fifo: %m");
2163
2164 cmsg = CMSG_FIRSTHDR(&mh);
2165 cmsg->cmsg_level = SOL_SOCKET;
2166 cmsg->cmsg_type = SCM_RIGHTS;
2167 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
2168 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
2169
2170 mh.msg_controllen = cmsg->cmsg_len;
2171
2172 /* Store away the fd in the socket, so that it stays open as
2173 * long as we run the child */
2174 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
2175 safe_close(fd);
2176
2177 if (k < 0)
2178 return log_error_errno(errno, "Failed to send FIFO fd: %m");
2179
2180 /* And now make the FIFO unavailable as /run/kmsg... */
2181 (void) unlink(from);
2182
2183 return 0;
2184 }
2185
2186 static int send_rtnl(int send_fd) {
2187 union {
2188 struct cmsghdr cmsghdr;
2189 uint8_t buf[CMSG_SPACE(sizeof(int))];
2190 } control = {};
2191 struct msghdr mh = {
2192 .msg_control = &control,
2193 .msg_controllen = sizeof(control),
2194 };
2195 struct cmsghdr *cmsg;
2196 _cleanup_close_ int fd = -1;
2197 ssize_t k;
2198
2199 assert(send_fd >= 0);
2200
2201 if (!arg_expose_ports)
2202 return 0;
2203
2204 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
2205 if (fd < 0)
2206 return log_error_errno(errno, "Failed to allocate container netlink: %m");
2207
2208 cmsg = CMSG_FIRSTHDR(&mh);
2209 cmsg->cmsg_level = SOL_SOCKET;
2210 cmsg->cmsg_type = SCM_RIGHTS;
2211 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
2212 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
2213
2214 mh.msg_controllen = cmsg->cmsg_len;
2215
2216 /* Store away the fd in the socket, so that it stays open as
2217 * long as we run the child */
2218 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
2219 if (k < 0)
2220 return log_error_errno(errno, "Failed to send netlink fd: %m");
2221
2222 return 0;
2223 }
2224
2225 static int flush_ports(union in_addr_union *exposed) {
2226 ExposePort *p;
2227 int r, af = AF_INET;
2228
2229 assert(exposed);
2230
2231 if (!arg_expose_ports)
2232 return 0;
2233
2234 if (in_addr_is_null(af, exposed))
2235 return 0;
2236
2237 log_debug("Lost IP address.");
2238
2239 LIST_FOREACH(ports, p, arg_expose_ports) {
2240 r = fw_add_local_dnat(false,
2241 af,
2242 p->protocol,
2243 NULL,
2244 NULL, 0,
2245 NULL, 0,
2246 p->host_port,
2247 exposed,
2248 p->container_port,
2249 NULL);
2250 if (r < 0)
2251 log_warning_errno(r, "Failed to modify firewall: %m");
2252 }
2253
2254 *exposed = IN_ADDR_NULL;
2255 return 0;
2256 }
2257
2258 static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
2259 _cleanup_free_ struct local_address *addresses = NULL;
2260 _cleanup_free_ char *pretty = NULL;
2261 union in_addr_union new_exposed;
2262 ExposePort *p;
2263 bool add;
2264 int af = AF_INET, r;
2265
2266 assert(exposed);
2267
2268 /* Invoked each time an address is added or removed inside the
2269 * container */
2270
2271 if (!arg_expose_ports)
2272 return 0;
2273
2274 r = local_addresses(rtnl, 0, af, &addresses);
2275 if (r < 0)
2276 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2277
2278 add = r > 0 &&
2279 addresses[0].family == af &&
2280 addresses[0].scope < RT_SCOPE_LINK;
2281
2282 if (!add)
2283 return flush_ports(exposed);
2284
2285 new_exposed = addresses[0].address;
2286 if (in_addr_equal(af, exposed, &new_exposed))
2287 return 0;
2288
2289 in_addr_to_string(af, &new_exposed, &pretty);
2290 log_debug("New container IP is %s.", strna(pretty));
2291
2292 LIST_FOREACH(ports, p, arg_expose_ports) {
2293
2294 r = fw_add_local_dnat(true,
2295 af,
2296 p->protocol,
2297 NULL,
2298 NULL, 0,
2299 NULL, 0,
2300 p->host_port,
2301 &new_exposed,
2302 p->container_port,
2303 in_addr_is_null(af, exposed) ? NULL : exposed);
2304 if (r < 0)
2305 log_warning_errno(r, "Failed to modify firewall: %m");
2306 }
2307
2308 *exposed = new_exposed;
2309 return 0;
2310 }
2311
2312 void expose_port_free_all(ExposePort *p) {
2313
2314 while (p) {
2315 ExposePort *q = p;
2316 LIST_REMOVE(ports, p, q);
2317 free(q);
2318 }
2319 }
2320
2321 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2322 union in_addr_union *exposed = userdata;
2323
2324 assert(rtnl);
2325 assert(m);
2326 assert(exposed);
2327
2328 expose_ports(rtnl, exposed);
2329 return 0;
2330 }
2331
2332 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
2333 union {
2334 struct cmsghdr cmsghdr;
2335 uint8_t buf[CMSG_SPACE(sizeof(int))];
2336 } control = {};
2337 struct msghdr mh = {
2338 .msg_control = &control,
2339 .msg_controllen = sizeof(control),
2340 };
2341 struct cmsghdr *cmsg;
2342 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2343 int fd, r;
2344 ssize_t k;
2345
2346 assert(event);
2347 assert(recv_fd >= 0);
2348 assert(ret);
2349
2350 if (!arg_expose_ports)
2351 return 0;
2352
2353 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2354 if (k < 0)
2355 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2356
2357 cmsg = CMSG_FIRSTHDR(&mh);
2358 assert(cmsg->cmsg_level == SOL_SOCKET);
2359 assert(cmsg->cmsg_type == SCM_RIGHTS);
2360 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2361 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2362
2363 r = sd_netlink_open_fd(&rtnl, fd);
2364 if (r < 0) {
2365 safe_close(fd);
2366 return log_error_errno(r, "Failed to create rtnl object: %m");
2367 }
2368
2369 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2370 if (r < 0)
2371 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2372
2373 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2374 if (r < 0)
2375 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2376
2377 r = sd_netlink_attach_event(rtnl, event, 0);
2378 if (r < 0)
2379 return log_error_errno(r, "Failed to add to even loop: %m");
2380
2381 *ret = rtnl;
2382 rtnl = NULL;
2383
2384 return 0;
2385 }
2386
2387 static int setup_hostname(void) {
2388
2389 if (arg_share_system)
2390 return 0;
2391
2392 if (sethostname_idempotent(arg_machine) < 0)
2393 return -errno;
2394
2395 return 0;
2396 }
2397
2398 static int setup_journal(const char *directory) {
2399 sd_id128_t machine_id, this_id;
2400 _cleanup_free_ char *b = NULL, *d = NULL;
2401 const char *etc_machine_id, *p, *q;
2402 char *id;
2403 int r;
2404
2405 /* Don't link journals in ephemeral mode */
2406 if (arg_ephemeral)
2407 return 0;
2408
2409 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2410
2411 r = read_one_line_file(etc_machine_id, &b);
2412 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2413 return 0;
2414 else if (r < 0)
2415 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2416
2417 id = strstrip(b);
2418 if (isempty(id) && arg_link_journal == LINK_AUTO)
2419 return 0;
2420
2421 /* Verify validity */
2422 r = sd_id128_from_string(id, &machine_id);
2423 if (r < 0)
2424 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2425
2426 r = sd_id128_get_machine(&this_id);
2427 if (r < 0)
2428 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2429
2430 if (sd_id128_equal(machine_id, this_id)) {
2431 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2432 "Host and machine ids are equal (%s): refusing to link journals", id);
2433 if (arg_link_journal == LINK_AUTO)
2434 return 0;
2435 return -EEXIST;
2436 }
2437
2438 if (arg_link_journal == LINK_NO)
2439 return 0;
2440
2441 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2442 if (r < 0)
2443 return log_error_errno(r, "Failed to create /var: %m");
2444
2445 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2446 if (r < 0)
2447 return log_error_errno(r, "Failed to create /var/log: %m");
2448
2449 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2450 if (r < 0)
2451 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2452
2453 p = strjoina("/var/log/journal/", id);
2454 q = prefix_roota(directory, p);
2455
2456 if (path_is_mount_point(p, 0) > 0) {
2457 if (arg_link_journal != LINK_AUTO) {
2458 log_error("%s: already a mount point, refusing to use for journal", p);
2459 return -EEXIST;
2460 }
2461
2462 return 0;
2463 }
2464
2465 if (path_is_mount_point(q, 0) > 0) {
2466 if (arg_link_journal != LINK_AUTO) {
2467 log_error("%s: already a mount point, refusing to use for journal", q);
2468 return -EEXIST;
2469 }
2470
2471 return 0;
2472 }
2473
2474 r = readlink_and_make_absolute(p, &d);
2475 if (r >= 0) {
2476 if ((arg_link_journal == LINK_GUEST ||
2477 arg_link_journal == LINK_AUTO) &&
2478 path_equal(d, q)) {
2479
2480 r = userns_mkdir(directory, p, 0755, 0, 0);
2481 if (r < 0)
2482 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2483 return 0;
2484 }
2485
2486 if (unlink(p) < 0)
2487 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2488 } else if (r == -EINVAL) {
2489
2490 if (arg_link_journal == LINK_GUEST &&
2491 rmdir(p) < 0) {
2492
2493 if (errno == ENOTDIR) {
2494 log_error("%s already exists and is neither a symlink nor a directory", p);
2495 return r;
2496 } else {
2497 log_error_errno(errno, "Failed to remove %s: %m", p);
2498 return -errno;
2499 }
2500 }
2501 } else if (r != -ENOENT) {
2502 log_error_errno(errno, "readlink(%s) failed: %m", p);
2503 return r;
2504 }
2505
2506 if (arg_link_journal == LINK_GUEST) {
2507
2508 if (symlink(q, p) < 0) {
2509 if (arg_link_journal_try) {
2510 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2511 return 0;
2512 } else {
2513 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2514 return -errno;
2515 }
2516 }
2517
2518 r = userns_mkdir(directory, p, 0755, 0, 0);
2519 if (r < 0)
2520 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2521 return 0;
2522 }
2523
2524 if (arg_link_journal == LINK_HOST) {
2525 /* don't create parents here -- if the host doesn't have
2526 * permanent journal set up, don't force it here */
2527 r = mkdir(p, 0755);
2528 if (r < 0) {
2529 if (arg_link_journal_try) {
2530 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2531 return 0;
2532 } else {
2533 log_error_errno(errno, "Failed to create %s: %m", p);
2534 return r;
2535 }
2536 }
2537
2538 } else if (access(p, F_OK) < 0)
2539 return 0;
2540
2541 if (dir_is_empty(q) == 0)
2542 log_warning("%s is not empty, proceeding anyway.", q);
2543
2544 r = userns_mkdir(directory, p, 0755, 0, 0);
2545 if (r < 0) {
2546 log_error_errno(errno, "Failed to create %s: %m", q);
2547 return r;
2548 }
2549
2550 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2551 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2552
2553 return 0;
2554 }
2555
2556 static int drop_capabilities(void) {
2557 return capability_bounding_set_drop(~arg_retain, false);
2558 }
2559
2560 static int register_machine(pid_t pid, int local_ifindex) {
2561 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2562 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2563 int r;
2564
2565 if (!arg_register)
2566 return 0;
2567
2568 r = sd_bus_default_system(&bus);
2569 if (r < 0)
2570 return log_error_errno(r, "Failed to open system bus: %m");
2571
2572 if (arg_keep_unit) {
2573 r = sd_bus_call_method(
2574 bus,
2575 "org.freedesktop.machine1",
2576 "/org/freedesktop/machine1",
2577 "org.freedesktop.machine1.Manager",
2578 "RegisterMachineWithNetwork",
2579 &error,
2580 NULL,
2581 "sayssusai",
2582 arg_machine,
2583 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2584 "nspawn",
2585 "container",
2586 (uint32_t) pid,
2587 strempty(arg_directory),
2588 local_ifindex > 0 ? 1 : 0, local_ifindex);
2589 } else {
2590 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2591 char **i;
2592 unsigned j;
2593
2594 r = sd_bus_message_new_method_call(
2595 bus,
2596 &m,
2597 "org.freedesktop.machine1",
2598 "/org/freedesktop/machine1",
2599 "org.freedesktop.machine1.Manager",
2600 "CreateMachineWithNetwork");
2601 if (r < 0)
2602 return bus_log_create_error(r);
2603
2604 r = sd_bus_message_append(
2605 m,
2606 "sayssusai",
2607 arg_machine,
2608 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2609 "nspawn",
2610 "container",
2611 (uint32_t) pid,
2612 strempty(arg_directory),
2613 local_ifindex > 0 ? 1 : 0, local_ifindex);
2614 if (r < 0)
2615 return bus_log_create_error(r);
2616
2617 r = sd_bus_message_open_container(m, 'a', "(sv)");
2618 if (r < 0)
2619 return bus_log_create_error(r);
2620
2621 if (!isempty(arg_slice)) {
2622 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2623 if (r < 0)
2624 return bus_log_create_error(r);
2625 }
2626
2627 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2628 if (r < 0)
2629 return bus_log_create_error(r);
2630
2631 /* If you make changes here, also make sure to update
2632 * systemd-nspawn@.service, to keep the device
2633 * policies in sync regardless if we are run with or
2634 * without the --keep-unit switch. */
2635 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2636 /* Allow the container to
2637 * access and create the API
2638 * device nodes, so that
2639 * PrivateDevices= in the
2640 * container can work
2641 * fine */
2642 "/dev/null", "rwm",
2643 "/dev/zero", "rwm",
2644 "/dev/full", "rwm",
2645 "/dev/random", "rwm",
2646 "/dev/urandom", "rwm",
2647 "/dev/tty", "rwm",
2648 "/dev/net/tun", "rwm",
2649 /* Allow the container
2650 * access to ptys. However,
2651 * do not permit the
2652 * container to ever create
2653 * these device nodes. */
2654 "/dev/pts/ptmx", "rw",
2655 "char-pts", "rw");
2656 if (r < 0)
2657 return bus_log_create_error(r);
2658
2659 for (j = 0; j < arg_n_custom_mounts; j++) {
2660 CustomMount *cm = &arg_custom_mounts[j];
2661
2662 if (cm->type != CUSTOM_MOUNT_BIND)
2663 continue;
2664
2665 r = is_device_node(cm->source);
2666 if (r < 0)
2667 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2668
2669 if (r) {
2670 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2671 cm->source, cm->read_only ? "r" : "rw");
2672 if (r < 0)
2673 return log_error_errno(r, "Failed to append message arguments: %m");
2674 }
2675 }
2676
2677 if (arg_kill_signal != 0) {
2678 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2679 if (r < 0)
2680 return bus_log_create_error(r);
2681
2682 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2683 if (r < 0)
2684 return bus_log_create_error(r);
2685 }
2686
2687 STRV_FOREACH(i, arg_property) {
2688 r = sd_bus_message_open_container(m, 'r', "sv");
2689 if (r < 0)
2690 return bus_log_create_error(r);
2691
2692 r = bus_append_unit_property_assignment(m, *i);
2693 if (r < 0)
2694 return r;
2695
2696 r = sd_bus_message_close_container(m);
2697 if (r < 0)
2698 return bus_log_create_error(r);
2699 }
2700
2701 r = sd_bus_message_close_container(m);
2702 if (r < 0)
2703 return bus_log_create_error(r);
2704
2705 r = sd_bus_call(bus, m, 0, &error, NULL);
2706 }
2707
2708 if (r < 0) {
2709 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2710 return r;
2711 }
2712
2713 return 0;
2714 }
2715
2716 static int terminate_machine(pid_t pid) {
2717 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2718 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2719 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2720 const char *path;
2721 int r;
2722
2723 if (!arg_register)
2724 return 0;
2725
2726 /* If we are reusing the unit, then just exit, systemd will do
2727 * the right thing when we exit. */
2728 if (arg_keep_unit)
2729 return 0;
2730
2731 r = sd_bus_default_system(&bus);
2732 if (r < 0)
2733 return log_error_errno(r, "Failed to open system bus: %m");
2734
2735 r = sd_bus_call_method(
2736 bus,
2737 "org.freedesktop.machine1",
2738 "/org/freedesktop/machine1",
2739 "org.freedesktop.machine1.Manager",
2740 "GetMachineByPID",
2741 &error,
2742 &reply,
2743 "u",
2744 (uint32_t) pid);
2745 if (r < 0) {
2746 /* Note that the machine might already have been
2747 * cleaned up automatically, hence don't consider it a
2748 * failure if we cannot get the machine object. */
2749 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2750 return 0;
2751 }
2752
2753 r = sd_bus_message_read(reply, "o", &path);
2754 if (r < 0)
2755 return bus_log_parse_error(r);
2756
2757 r = sd_bus_call_method(
2758 bus,
2759 "org.freedesktop.machine1",
2760 path,
2761 "org.freedesktop.machine1.Machine",
2762 "Terminate",
2763 &error,
2764 NULL,
2765 NULL);
2766 if (r < 0) {
2767 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2768 return 0;
2769 }
2770
2771 return 0;
2772 }
2773
2774 static int reset_audit_loginuid(void) {
2775 _cleanup_free_ char *p = NULL;
2776 int r;
2777
2778 if (arg_share_system)
2779 return 0;
2780
2781 r = read_one_line_file("/proc/self/loginuid", &p);
2782 if (r == -ENOENT)
2783 return 0;
2784 if (r < 0)
2785 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2786
2787 /* Already reset? */
2788 if (streq(p, "4294967295"))
2789 return 0;
2790
2791 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
2792 if (r < 0) {
2793 log_error_errno(r,
2794 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2795 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2796 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2797 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2798 "using systemd-nspawn. Sleeping for 5s... (%m)");
2799
2800 sleep(5);
2801 }
2802
2803 return 0;
2804 }
2805
/* Fixed 128bit keys handed to generate_mac() as siphash24 key. There
 * is one key per kind of interface (host side of the veth link,
 * container side of the veth link, macvlan interfaces). */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2809
2810 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2811 uint8_t result[8];
2812 size_t l, sz;
2813 uint8_t *v, *i;
2814 int r;
2815
2816 l = strlen(arg_machine);
2817 sz = sizeof(sd_id128_t) + l;
2818 if (idx > 0)
2819 sz += sizeof(idx);
2820
2821 v = alloca(sz);
2822
2823 /* fetch some persistent data unique to the host */
2824 r = sd_id128_get_machine((sd_id128_t*) v);
2825 if (r < 0)
2826 return r;
2827
2828 /* combine with some data unique (on this host) to this
2829 * container instance */
2830 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2831 if (idx > 0) {
2832 idx = htole64(idx);
2833 memcpy(i, &idx, sizeof(idx));
2834 }
2835
2836 /* Let's hash the host machine ID plus the container name. We
2837 * use a fixed, but originally randomly created hash key here. */
2838 siphash24(result, v, sz, hash_key.bytes);
2839
2840 assert_cc(ETH_ALEN <= sizeof(result));
2841 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2842
2843 /* see eth_random_addr in the kernel */
2844 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2845 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2846
2847 return 0;
2848 }
2849
2850 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2851 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2852 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2853 struct ether_addr mac_host, mac_container;
2854 int r, i;
2855
2856 if (!arg_private_network)
2857 return 0;
2858
2859 if (!arg_network_veth)
2860 return 0;
2861
2862 /* Use two different interface name prefixes depending whether
2863 * we are in bridge mode or not. */
2864 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2865 arg_network_bridge ? "vb" : "ve", arg_machine);
2866
2867 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2868 if (r < 0)
2869 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2870
2871 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2872 if (r < 0)
2873 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2874
2875 r = sd_netlink_open(&rtnl);
2876 if (r < 0)
2877 return log_error_errno(r, "Failed to connect to netlink: %m");
2878
2879 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2880 if (r < 0)
2881 return log_error_errno(r, "Failed to allocate netlink message: %m");
2882
2883 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
2884 if (r < 0)
2885 return log_error_errno(r, "Failed to add netlink interface name: %m");
2886
2887 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2888 if (r < 0)
2889 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2890
2891 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2892 if (r < 0)
2893 return log_error_errno(r, "Failed to open netlink container: %m");
2894
2895 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2896 if (r < 0)
2897 return log_error_errno(r, "Failed to open netlink container: %m");
2898
2899 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
2900 if (r < 0)
2901 return log_error_errno(r, "Failed to open netlink container: %m");
2902
2903 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
2904 if (r < 0)
2905 return log_error_errno(r, "Failed to add netlink interface name: %m");
2906
2907 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2908 if (r < 0)
2909 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2910
2911 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2912 if (r < 0)
2913 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2914
2915 r = sd_netlink_message_close_container(m);
2916 if (r < 0)
2917 return log_error_errno(r, "Failed to close netlink container: %m");
2918
2919 r = sd_netlink_message_close_container(m);
2920 if (r < 0)
2921 return log_error_errno(r, "Failed to close netlink container: %m");
2922
2923 r = sd_netlink_message_close_container(m);
2924 if (r < 0)
2925 return log_error_errno(r, "Failed to close netlink container: %m");
2926
2927 r = sd_netlink_call(rtnl, m, 0, NULL);
2928 if (r < 0)
2929 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2930
2931 i = (int) if_nametoindex(iface_name);
2932 if (i <= 0)
2933 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2934
2935 *ifi = i;
2936
2937 return 0;
2938 }
2939
2940 static int setup_bridge(const char veth_name[], int *ifi) {
2941 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2942 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2943 int r, bridge;
2944
2945 if (!arg_private_network)
2946 return 0;
2947
2948 if (!arg_network_veth)
2949 return 0;
2950
2951 if (!arg_network_bridge)
2952 return 0;
2953
2954 bridge = (int) if_nametoindex(arg_network_bridge);
2955 if (bridge <= 0)
2956 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2957
2958 *ifi = bridge;
2959
2960 r = sd_netlink_open(&rtnl);
2961 if (r < 0)
2962 return log_error_errno(r, "Failed to connect to netlink: %m");
2963
2964 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2965 if (r < 0)
2966 return log_error_errno(r, "Failed to allocate netlink message: %m");
2967
2968 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2969 if (r < 0)
2970 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2971
2972 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
2973 if (r < 0)
2974 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2975
2976 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
2977 if (r < 0)
2978 return log_error_errno(r, "Failed to add netlink master field: %m");
2979
2980 r = sd_netlink_call(rtnl, m, 0, NULL);
2981 if (r < 0)
2982 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2983
2984 return 0;
2985 }
2986
2987 static int parse_interface(struct udev *udev, const char *name) {
2988 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2989 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2990 int ifi;
2991
2992 ifi = (int) if_nametoindex(name);
2993 if (ifi <= 0)
2994 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2995
2996 sprintf(ifi_str, "n%i", ifi);
2997 d = udev_device_new_from_device_id(udev, ifi_str);
2998 if (!d)
2999 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
3000
3001 if (udev_device_get_is_initialized(d) <= 0) {
3002 log_error("Network interface %s is not initialized yet.", name);
3003 return -EBUSY;
3004 }
3005
3006 return ifi;
3007 }
3008
3009 static int move_network_interfaces(pid_t pid) {
3010 _cleanup_udev_unref_ struct udev *udev = NULL;
3011 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3012 char **i;
3013 int r;
3014
3015 if (!arg_private_network)
3016 return 0;
3017
3018 if (strv_isempty(arg_network_interfaces))
3019 return 0;
3020
3021 r = sd_netlink_open(&rtnl);
3022 if (r < 0)
3023 return log_error_errno(r, "Failed to connect to netlink: %m");
3024
3025 udev = udev_new();
3026 if (!udev) {
3027 log_error("Failed to connect to udev.");
3028 return -ENOMEM;
3029 }
3030
3031 STRV_FOREACH(i, arg_network_interfaces) {
3032 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
3033 int ifi;
3034
3035 ifi = parse_interface(udev, *i);
3036 if (ifi < 0)
3037 return ifi;
3038
3039 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
3040 if (r < 0)
3041 return log_error_errno(r, "Failed to allocate netlink message: %m");
3042
3043 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
3044 if (r < 0)
3045 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
3046
3047 r = sd_netlink_call(rtnl, m, 0, NULL);
3048 if (r < 0)
3049 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
3050 }
3051
3052 return 0;
3053 }
3054
3055 static int setup_macvlan(pid_t pid) {
3056 _cleanup_udev_unref_ struct udev *udev = NULL;
3057 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3058 unsigned idx = 0;
3059 char **i;
3060 int r;
3061
3062 if (!arg_private_network)
3063 return 0;
3064
3065 if (strv_isempty(arg_network_macvlan))
3066 return 0;
3067
3068 r = sd_netlink_open(&rtnl);
3069 if (r < 0)
3070 return log_error_errno(r, "Failed to connect to netlink: %m");
3071
3072 udev = udev_new();
3073 if (!udev) {
3074 log_error("Failed to connect to udev.");
3075 return -ENOMEM;
3076 }
3077
3078 STRV_FOREACH(i, arg_network_macvlan) {
3079 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
3080 _cleanup_free_ char *n = NULL;
3081 struct ether_addr mac;
3082 int ifi;
3083
3084 ifi = parse_interface(udev, *i);
3085 if (ifi < 0)
3086 return ifi;
3087
3088 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
3089 if (r < 0)
3090 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
3091
3092 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
3093 if (r < 0)
3094 return log_error_errno(r, "Failed to allocate netlink message: %m");
3095
3096 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
3097 if (r < 0)
3098 return log_error_errno(r, "Failed to add netlink interface index: %m");
3099
3100 n = strappend("mv-", *i);
3101 if (!n)
3102 return log_oom();
3103
3104 strshorten(n, IFNAMSIZ-1);
3105
3106 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
3107 if (r < 0)
3108 return log_error_errno(r, "Failed to add netlink interface name: %m");
3109
3110 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
3111 if (r < 0)
3112 return log_error_errno(r, "Failed to add netlink MAC address: %m");
3113
3114 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
3115 if (r < 0)
3116 return log_error_errno(r, "Failed to add netlink namespace field: %m");
3117
3118 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
3119 if (r < 0)
3120 return log_error_errno(r, "Failed to open netlink container: %m");
3121
3122 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
3123 if (r < 0)
3124 return log_error_errno(r, "Failed to open netlink container: %m");
3125
3126 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
3127 if (r < 0)
3128 return log_error_errno(r, "Failed to append macvlan mode: %m");
3129
3130 r = sd_netlink_message_close_container(m);
3131 if (r < 0)
3132 return log_error_errno(r, "Failed to close netlink container: %m");
3133
3134 r = sd_netlink_message_close_container(m);
3135 if (r < 0)
3136 return log_error_errno(r, "Failed to close netlink container: %m");
3137
3138 r = sd_netlink_call(rtnl, m, 0, NULL);
3139 if (r < 0)
3140 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
3141 }
3142
3143 return 0;
3144 }
3145
3146 static int setup_ipvlan(pid_t pid) {
3147 _cleanup_udev_unref_ struct udev *udev = NULL;
3148 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3149 char **i;
3150 int r;
3151
3152 if (!arg_private_network)
3153 return 0;
3154
3155 if (strv_isempty(arg_network_ipvlan))
3156 return 0;
3157
3158 r = sd_netlink_open(&rtnl);
3159 if (r < 0)
3160 return log_error_errno(r, "Failed to connect to netlink: %m");
3161
3162 udev = udev_new();
3163 if (!udev) {
3164 log_error("Failed to connect to udev.");
3165 return -ENOMEM;
3166 }
3167
3168 STRV_FOREACH(i, arg_network_ipvlan) {
3169 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
3170 _cleanup_free_ char *n = NULL;
3171 int ifi;
3172
3173 ifi = parse_interface(udev, *i);
3174 if (ifi < 0)
3175 return ifi;
3176
3177 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
3178 if (r < 0)
3179 return log_error_errno(r, "Failed to allocate netlink message: %m");
3180
3181 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
3182 if (r < 0)
3183 return log_error_errno(r, "Failed to add netlink interface index: %m");
3184
3185 n = strappend("iv-", *i);
3186 if (!n)
3187 return log_oom();
3188
3189 strshorten(n, IFNAMSIZ-1);
3190
3191 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
3192 if (r < 0)
3193 return log_error_errno(r, "Failed to add netlink interface name: %m");
3194
3195 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
3196 if (r < 0)
3197 return log_error_errno(r, "Failed to add netlink namespace field: %m");
3198
3199 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
3200 if (r < 0)
3201 return log_error_errno(r, "Failed to open netlink container: %m");
3202
3203 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
3204 if (r < 0)
3205 return log_error_errno(r, "Failed to open netlink container: %m");
3206
3207 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
3208 if (r < 0)
3209 return log_error_errno(r, "Failed to add ipvlan mode: %m");
3210
3211 r = sd_netlink_message_close_container(m);
3212 if (r < 0)
3213 return log_error_errno(r, "Failed to close netlink container: %m");
3214
3215 r = sd_netlink_message_close_container(m);
3216 if (r < 0)
3217 return log_error_errno(r, "Failed to close netlink container: %m");
3218
3219 r = sd_netlink_call(rtnl, m, 0, NULL);
3220 if (r < 0)
3221 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
3222 }
3223
3224 return 0;
3225 }
3226
/* Installs the seccomp filter for the container payload: a set of
 * system calls is refused with EPERM unless the capability associated
 * with them is part of arg_retain, and creation of NETLINK_AUDIT
 * sockets fails with EAFNOSUPPORT. When built without seccomp support
 * this is a NOP.
 *
 * Returns 0 on success (also when loading the filter fails with
 * -EINVAL, which is taken to mean the kernel lacks seccomp), a
 * negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        static const struct {
                uint64_t capability;
                int syscall_num;
        } denied[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        /* Everything not explicitly listed stays allowed */
        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(denied); n++) {
                /* Don't block what the retained capabilities permit */
                if (arg_retain & (1ULL << denied[n].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied[n].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto finish;
        }
        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(ctx);
        return r;
#endif

}
3321
3322 static int setup_propagate(const char *root) {
3323 const char *p, *q;
3324
3325 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3326 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3327 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3328 (void) mkdir_p(p, 0600);
3329
3330 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3331 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3332
3333 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3334 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3335
3336 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3337 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3338
3339 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3340 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3341 return log_error_errno(errno, "Failed to install propagation bind mount.");
3342
3343 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3344 return log_error_errno(errno, "Failed to make propagation mount read-only");
3345
3346 return 0;
3347 }
3348
3349 static int setup_image(char **device_path, int *loop_nr) {
3350 struct loop_info64 info = {
3351 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3352 };
3353 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3354 _cleanup_free_ char* loopdev = NULL;
3355 struct stat st;
3356 int r, nr;
3357
3358 assert(device_path);
3359 assert(loop_nr);
3360 assert(arg_image);
3361
3362 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3363 if (fd < 0)
3364 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3365
3366 if (fstat(fd, &st) < 0)
3367 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3368
3369 if (S_ISBLK(st.st_mode)) {
3370 char *p;
3371
3372 p = strdup(arg_image);
3373 if (!p)
3374 return log_oom();
3375
3376 *device_path = p;
3377
3378 *loop_nr = -1;
3379
3380 r = fd;
3381 fd = -1;
3382
3383 return r;
3384 }
3385
3386 if (!S_ISREG(st.st_mode)) {
3387 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3388 return -EINVAL;
3389 }
3390
3391 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3392 if (control < 0)
3393 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3394
3395 nr = ioctl(control, LOOP_CTL_GET_FREE);
3396 if (nr < 0)
3397 return log_error_errno(errno, "Failed to allocate loop device: %m");
3398
3399 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3400 return log_oom();
3401
3402 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3403 if (loop < 0)
3404 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3405
3406 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3407 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3408
3409 if (arg_read_only)
3410 info.lo_flags |= LO_FLAGS_READ_ONLY;
3411
3412 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3413 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3414
3415 *device_path = loopdev;
3416 loopdev = NULL;
3417
3418 *loop_nr = nr;
3419
3420 r = loop;
3421 loop = -1;
3422
3423 return r;
3424 }
3425
/* Explanatory text describing which partition layouts are accepted. It is a
 * plain string literal, so it can be concatenated directly onto the format
 * string of a log message. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
3432
/* Inspects the partition table of the block device open as fd and selects the
 * partitions to use as root, /home and /srv.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are skipped. For the home, srv,
 * native root and secondary root types the partition with the lowest partition
 * number wins. Generic Linux partitions are only used as root when no
 * dedicated root partition exists, and then only if there is exactly one.
 *
 * MBR: only partitions whose flags equal 0x80 (bootable) and whose type is
 * 0x83 are considered, and they are treated like generic partitions, i.e.
 * exactly one must exist.
 *
 * On success 0 is returned, *root_device / *root_device_rw / *secondary are
 * set, and *home_device / *srv_device (with their *_rw flags) are set only if
 * such a partition was found. The device strings are newly allocated and
 * owned by the caller. On failure a negative errno-style value is returned.
 * Without HAVE_BLKID the function always fails with -EOPNOTSUPP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until udev lists as many devices as blkid found partitions,
         * plus one for the whole-disk device itself. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Must be recorded as the generic candidate
                                 * (not as root), otherwise a second bootable
                                 * partition would never be noticed. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3805
/* Mounts the block device what at where, or at where with directory appended
 * if directory is non-NULL.
 *
 * The file system type is determined with libblkid. The mount uses MS_NODEV
 * and is read-only when rw is false or arg_read_only is set. LUKS containers
 * (type "crypto_LUKS") are rejected with -EOPNOTSUPP.
 *
 * Returns 0 on success, a negative errno-style value on failure. Without
 * HAVE_BLKID the function always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A plain error
                 * (-1) is handled below so that its errno is reported. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3869
/* Mounts the given devices below where: root_device at where itself,
 * home_device at where/home and srv_device at where/srv. A NULL device is
 * skipped. Each *_rw flag is passed on to mount_device().
 *
 * Returns 0 on success, or the negative result of the first mount that
 * failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount below the directory we were handed rather than reaching for
         * the global arg_directory. */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3899
3900 static void loop_remove(int nr, int *image_fd) {
3901 _cleanup_close_ int control = -1;
3902 int r;
3903
3904 if (nr < 0)
3905 return;
3906
3907 if (image_fd && *image_fd >= 0) {
3908 r = ioctl(*image_fd, LOOP_CLR_FD);
3909 if (r < 0)
3910 log_debug_errno(errno, "Failed to close loop image: %m");
3911 *image_fd = safe_close(*image_fd);
3912 }
3913
3914 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3915 if (control < 0) {
3916 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3917 return;
3918 }
3919
3920 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3921 if (r < 0)
3922 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3923 }
3924
/* Forks off "getent <database> <key>" with its stdout connected to a pipe.
 *
 * The child gets /dev/null as stdin and stderr, runs with an empty
 * environment, and tries /usr/bin/getent first, then /bin/getent.
 *
 * Returns the read end of the pipe (owned by the caller) and stores the
 * child's PID in *rpid. On failure a negative errno-style value is returned. */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0)
                return log_error_errno(errno, "Failed to allocate pipe: %m");

        pid = fork();
        if (pid < 0) {
                int r;

                r = log_error_errno(errno, "Failed to fork getent child: %m");

                /* No child was created, so nobody else will close the pipe */
                safe_close(pipe_fds[0]);
                safe_close(pipe_fds[1]);

                return r;
        } else if (pid == 0) {
                int nullfd;
                char *empty_env = NULL;

                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                /* Only close the originals if they aren't one of the standard fds */
                if (pipe_fds[0] > 2)
                        safe_close(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        safe_close(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        safe_close(nullfd);

                (void) reset_all_signal_handlers();
                (void) reset_signal_mask();
                close_all_fds(NULL, 0);

                execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* Parent: the write end belongs to the child only */
        pipe_fds[1] = safe_close(pipe_fds[1]);

        *rpid = pid;

        return pipe_fds[0];
}
3979
3980 static int change_uid_gid(char **_home) {
3981 char line[LINE_MAX], *x, *u, *g, *h;
3982 const char *word, *state;
3983 _cleanup_free_ uid_t *uids = NULL;
3984 _cleanup_free_ char *home = NULL;
3985 _cleanup_fclose_ FILE *f = NULL;
3986 _cleanup_close_ int fd = -1;
3987 unsigned n_uids = 0;
3988 size_t sz = 0, l;
3989 uid_t uid;
3990 gid_t gid;
3991 pid_t pid;
3992 int r;
3993
3994 assert(_home);
3995
3996 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3997 /* Reset everything fully to 0, just in case */
3998
3999 r = reset_uid_gid();
4000 if (r < 0)
4001 return log_error_errno(r, "Failed to become root: %m");
4002
4003 *_home = NULL;
4004 return 0;
4005 }
4006
4007 /* First, get user credentials */
4008 fd = spawn_getent("passwd", arg_user, &pid);
4009 if (fd < 0)
4010 return fd;
4011
4012 f = fdopen(fd, "r");
4013 if (!f)
4014 return log_oom();
4015 fd = -1;
4016
4017 if (!fgets(line, sizeof(line), f)) {
4018
4019 if (!ferror(f)) {
4020 log_error("Failed to resolve user %s.", arg_user);
4021 return -ESRCH;
4022 }
4023
4024 log_error_errno(errno, "Failed to read from getent: %m");
4025 return -errno;
4026 }
4027
4028 truncate_nl(line);
4029
4030 wait_for_terminate_and_warn("getent passwd", pid, true);
4031
4032 x = strchr(line, ':');
4033 if (!x) {
4034 log_error("/etc/passwd entry has invalid user field.");
4035 return -EIO;
4036 }
4037
4038 u = strchr(x+1, ':');
4039 if (!u) {
4040 log_error("/etc/passwd entry has invalid password field.");
4041 return -EIO;
4042 }
4043
4044 u++;
4045 g = strchr(u, ':');
4046 if (!g) {
4047 log_error("/etc/passwd entry has invalid UID field.");
4048 return -EIO;
4049 }
4050
4051 *g = 0;
4052 g++;
4053 x = strchr(g, ':');
4054 if (!x) {
4055 log_error("/etc/passwd entry has invalid GID field.");
4056 return -EIO;
4057 }
4058
4059 *x = 0;
4060 h = strchr(x+1, ':');
4061 if (!h) {
4062 log_error("/etc/passwd entry has invalid GECOS field.");
4063 return -EIO;
4064 }
4065
4066 h++;
4067 x = strchr(h, ':');
4068 if (!x) {
4069 log_error("/etc/passwd entry has invalid home directory field.");
4070 return -EIO;
4071 }
4072
4073 *x = 0;
4074
4075 r = parse_uid(u, &uid);
4076 if (r < 0) {
4077 log_error("Failed to parse UID of user.");
4078 return -EIO;
4079 }
4080
4081 r = parse_gid(g, &gid);
4082 if (r < 0) {
4083 log_error("Failed to parse GID of user.");
4084 return -EIO;
4085 }
4086
4087 home = strdup(h);
4088 if (!home)
4089 return log_oom();
4090
4091 /* Second, get group memberships */
4092 fd = spawn_getent("initgroups", arg_user, &pid);
4093 if (fd < 0)
4094 return fd;
4095
4096 fclose(f);
4097 f = fdopen(fd, "r");
4098 if (!f)
4099 return log_oom();
4100 fd = -1;
4101
4102 if (!fgets(line, sizeof(line), f)) {
4103 if (!ferror(f)) {
4104 log_error("Failed to resolve user %s.", arg_user);
4105 return -ESRCH;
4106 }
4107
4108 log_error_errno(errno, "Failed to read from getent: %m");
4109 return -errno;
4110 }
4111
4112 truncate_nl(line);
4113
4114 wait_for_terminate_and_warn("getent initgroups", pid, true);
4115
4116 /* Skip over the username and subsequent separator whitespace */
4117 x = line;
4118 x += strcspn(x, WHITESPACE);
4119 x += strspn(x, WHITESPACE);
4120
4121 FOREACH_WORD(word, l, x, state) {
4122 char c[l+1];
4123
4124 memcpy(c, word, l);
4125 c[l] = 0;
4126
4127 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
4128 return log_oom();
4129
4130 r = parse_uid(c, &uids[n_uids++]);
4131 if (r < 0) {
4132 log_error("Failed to parse group data from getent.");
4133 return -EIO;
4134 }
4135 }
4136
4137 r = mkdir_parents(home, 0775);
4138 if (r < 0)
4139 return log_error_errno(r, "Failed to make home root directory: %m");
4140
4141 r = mkdir_safe(home, 0755, uid, gid);
4142 if (r < 0 && r != -EEXIST)
4143 return log_error_errno(r, "Failed to make home directory: %m");
4144
4145 (void) fchown(STDIN_FILENO, uid, gid);
4146 (void) fchown(STDOUT_FILENO, uid, gid);
4147 (void) fchown(STDERR_FILENO, uid, gid);
4148
4149 if (setgroups(n_uids, uids) < 0)
4150 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
4151
4152 if (setresgid(gid, gid, gid) < 0)
4153 return log_error_errno(errno, "setregid() failed: %m");
4154
4155 if (setresuid(uid, uid, uid) < 0)
4156 return log_error_errno(errno, "setreuid() failed: %m");
4157
4158 if (_home) {
4159 *_home = home;
4160 home = NULL;
4161 }
4162
4163 return 0;
4164 }
4165
4166 /*
4167 * Return values:
4168 * < 0 : wait_for_terminate() failed to get the state of the
4169 * container, the container was terminated by a signal, or
4170 * failed for an unknown reason. No change is made to the
4171 * container argument.
4172 * > 0 : The program executed in the container terminated with an
4173 * error. The exit code of the program executed in the
4174 * container is returned. The container argument has been set
4175 * to CONTAINER_TERMINATED.
4176 * 0 : The container is being rebooted, has been shut down or exited
4177 * successfully. The container argument has been set to either
4178 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
4179 *
4180 * That is, success is indicated by a return value of zero, and an
4181 * error is indicated by a non-zero value.
4182 */
4183 static int wait_for_container(pid_t pid, ContainerStatus *container) {
4184 siginfo_t status;
4185 int r;
4186
4187 r = wait_for_terminate(pid, &status);
4188 if (r < 0)
4189 return log_warning_errno(r, "Failed to wait for container: %m");
4190
4191 switch (status.si_code) {
4192
4193 case CLD_EXITED:
4194 if (status.si_status == 0) {
4195 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
4196
4197 } else
4198 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
4199
4200 *container = CONTAINER_TERMINATED;
4201 return status.si_status;
4202
4203 case CLD_KILLED:
4204 if (status.si_status == SIGINT) {
4205
4206 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
4207 *container = CONTAINER_TERMINATED;
4208 return 0;
4209
4210 } else if (status.si_status == SIGHUP) {
4211
4212 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
4213 *container = CONTAINER_REBOOTED;
4214 return 0;
4215 }
4216
4217 /* CLD_KILLED fallthrough */
4218
4219 case CLD_DUMPED:
4220 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
4221 return -EIO;
4222
4223 default:
4224 log_error("Container %s failed due to unknown reason.", arg_machine);
4225 return -EIO;
4226 }
4227
4228 return r;
4229 }
4230
/* Handler with an intentionally empty body; the sig argument is not used. */
static void nop_handler(int sig) {}
4232
4233 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
4234 pid_t pid;
4235
4236 pid = PTR_TO_UINT32(userdata);
4237 if (pid > 0) {
4238 if (kill(pid, arg_kill_signal) >= 0) {
4239 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
4240 sd_event_source_set_userdata(s, NULL);
4241 return 0;
4242 }
4243 }
4244
4245 sd_event_exit(sd_event_source_get_event(s), 0);
4246 return 0;
4247 }
4248
4249 static int determine_names(void) {
4250 int r;
4251
4252 if (arg_template && !arg_directory && arg_machine) {
4253
4254 /* If --template= was specified then we should not
4255 * search for a machine, but instead create a new one
4256 * in /var/lib/machine. */
4257
4258 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
4259 if (!arg_directory)
4260 return log_oom();
4261 }
4262
4263 if (!arg_image && !arg_directory) {
4264 if (arg_machine) {
4265 _cleanup_(image_unrefp) Image *i = NULL;
4266
4267 r = image_find(arg_machine, &i);
4268 if (r < 0)
4269 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
4270 else if (r == 0) {
4271 log_error("No image for machine '%s': %m", arg_machine);
4272 return -ENOENT;
4273 }
4274
4275 if (i->type == IMAGE_RAW)
4276 r = set_sanitized_path(&arg_image, i->path);
4277 else
4278 r = set_sanitized_path(&arg_directory, i->path);
4279 if (r < 0)
4280 return log_error_errno(r, "Invalid image directory: %m");
4281
4282 if (!arg_ephemeral)
4283 arg_read_only = arg_read_only || i->read_only;
4284 } else
4285 arg_directory = get_current_dir_name();
4286
4287 if (!arg_directory && !arg_machine) {
4288 log_error("Failed to determine path, please use -D or -i.");
4289 return -EINVAL;
4290 }
4291 }
4292
4293 if (!arg_machine) {
4294 if (arg_directory && path_equal(arg_directory, "/"))
4295 arg_machine = gethostname_malloc();
4296 else
4297 arg_machine = strdup(basename(arg_image ?: arg_directory));
4298
4299 if (!arg_machine)
4300 return log_oom();
4301
4302 hostname_cleanup(arg_machine);
4303 if (!machine_name_is_valid(arg_machine)) {
4304 log_error("Failed to determine machine name automatically, please use -M.");
4305 return -EINVAL;
4306 }
4307
4308 if (arg_ephemeral) {
4309 char *b;
4310
4311 /* Add a random suffix when this is an
4312 * ephemeral machine, so that we can run many
4313 * instances at once without manually having
4314 * to specify -M each time. */
4315
4316 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4317 return log_oom();
4318
4319 free(arg_machine);
4320 arg_machine = b;
4321 }
4322 }
4323
4324 return 0;
4325 }
4326
4327 static int determine_uid_shift(const char *directory) {
4328 int r;
4329
4330 if (!arg_userns) {
4331 arg_uid_shift = 0;
4332 return 0;
4333 }
4334
4335 if (arg_uid_shift == UID_INVALID) {
4336 struct stat st;
4337
4338 r = stat(directory, &st);
4339 if (r < 0)
4340 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4341
4342 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4343
4344 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4345 log_error("UID and GID base of %s don't match.", directory);
4346 return -EINVAL;
4347 }
4348
4349 arg_uid_range = UINT32_C(0x10000);
4350 }
4351
4352 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4353 log_error("UID base too high for UID range.");
4354 return -EINVAL;
4355 }
4356
4357 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4358 return 0;
4359 }
4360
4361 static int inner_child(
4362 Barrier *barrier,
4363 const char *directory,
4364 bool secondary,
4365 int kmsg_socket,
4366 int rtnl_socket,
4367 FDSet *fds) {
4368
4369 _cleanup_free_ char *home = NULL;
4370 unsigned n_env = 2;
4371 const char *envp[] = {
4372 "PATH=" DEFAULT_PATH_SPLIT_USR,
4373 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4374 NULL, /* TERM */
4375 NULL, /* HOME */
4376 NULL, /* USER */
4377 NULL, /* LOGNAME */
4378 NULL, /* container_uuid */
4379 NULL, /* LISTEN_FDS */
4380 NULL, /* LISTEN_PID */
4381 NULL
4382 };
4383
4384 _cleanup_strv_free_ char **env_use = NULL;
4385 int r;
4386
4387 assert(barrier);
4388 assert(directory);
4389 assert(kmsg_socket >= 0);
4390
4391 cg_unified_flush();
4392
4393 if (arg_userns) {
4394 /* Tell the parent, that it now can write the UID map. */
4395 (void) barrier_place(barrier); /* #1 */
4396
4397 /* Wait until the parent wrote the UID map */
4398 if (!barrier_place_and_sync(barrier)) { /* #2 */
4399 log_error("Parent died too early");
4400 return -ESRCH;
4401 }
4402 }
4403
4404 r = mount_all(NULL, true);
4405 if (r < 0)
4406 return r;
4407
4408 /* Wait until we are cgroup-ified, so that we
4409 * can mount the right cgroup path writable */
4410 if (!barrier_place_and_sync(barrier)) { /* #3 */
4411 log_error("Parent died too early");
4412 return -ESRCH;
4413 }
4414
4415 r = mount_systemd_cgroup_writable("");
4416 if (r < 0)
4417 return r;
4418
4419 r = reset_uid_gid();
4420 if (r < 0)
4421 return log_error_errno(r, "Couldn't become new root: %m");
4422
4423 r = setup_boot_id(NULL);
4424 if (r < 0)
4425 return r;
4426
4427 r = setup_kmsg(NULL, kmsg_socket);
4428 if (r < 0)
4429 return r;
4430 kmsg_socket = safe_close(kmsg_socket);
4431
4432 umask(0022);
4433
4434 if (setsid() < 0)
4435 return log_error_errno(errno, "setsid() failed: %m");
4436
4437 if (arg_private_network)
4438 loopback_setup();
4439
4440 r = send_rtnl(rtnl_socket);
4441 if (r < 0)
4442 return r;
4443 rtnl_socket = safe_close(rtnl_socket);
4444
4445 if (drop_capabilities() < 0)
4446 return log_error_errno(errno, "drop_capabilities() failed: %m");
4447
4448 setup_hostname();
4449
4450 if (arg_personality != PERSONALITY_INVALID) {
4451 if (personality(arg_personality) < 0)
4452 return log_error_errno(errno, "personality() failed: %m");
4453 } else if (secondary) {
4454 if (personality(PER_LINUX32) < 0)
4455 return log_error_errno(errno, "personality() failed: %m");
4456 }
4457
4458 #ifdef HAVE_SELINUX
4459 if (arg_selinux_context)
4460 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4461 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4462 #endif
4463
4464 r = change_uid_gid(&home);
4465 if (r < 0)
4466 return r;
4467
4468 envp[n_env] = strv_find_prefix(environ, "TERM=");
4469 if (envp[n_env])
4470 n_env ++;
4471
4472 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4473 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4474 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4475 return log_oom();
4476
4477 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4478 char as_uuid[37];
4479
4480 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4481 return log_oom();
4482 }
4483
4484 if (fdset_size(fds) > 0) {
4485 r = fdset_cloexec(fds, false);
4486 if (r < 0)
4487 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4488
4489 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4490 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4491 return log_oom();
4492 }
4493
4494 env_use = strv_env_merge(2, envp, arg_setenv);
4495 if (!env_use)
4496 return log_oom();
4497
4498 /* Let the parent know that we are ready and
4499 * wait until the parent is ready with the
4500 * setup, too... */
4501 if (!barrier_place_and_sync(barrier)) { /* #4 */
4502 log_error("Parent died too early");
4503 return -ESRCH;
4504 }
4505
4506 /* Now, explicitly close the log, so that we
4507 * then can close all remaining fds. Closing
4508 * the log explicitly first has the benefit
4509 * that the logging subsystem knows about it,
4510 * and is thus ready to be reopened should we
4511 * need it again. Note that the other fds
4512 * closed here are at least the locking and
4513 * barrier fds. */
4514 log_close();
4515 (void) fdset_close_others(fds);
4516
4517 if (arg_boot) {
4518 char **a;
4519 size_t m;
4520
4521 /* Automatically search for the init system */
4522
4523 m = 1 + strv_length(arg_parameters);
4524 a = newa(char*, m + 1);
4525 if (strv_isempty(arg_parameters))
4526 a[1] = NULL;
4527 else
4528 memcpy(a + 1, arg_parameters, m * sizeof(char*));
4529
4530 a[0] = (char*) "/usr/lib/systemd/systemd";
4531 execve(a[0], a, env_use);
4532
4533 a[0] = (char*) "/lib/systemd/systemd";
4534 execve(a[0], a, env_use);
4535
4536 a[0] = (char*) "/sbin/init";
4537 execve(a[0], a, env_use);
4538 } else if (!strv_isempty(arg_parameters))
4539 execvpe(arg_parameters[0], arg_parameters, env_use);
4540 else {
4541 chdir(home ?: "/root");
4542 execle("/bin/bash", "-bash", NULL, env_use);
4543 execle("/bin/sh", "-sh", NULL, env_use);
4544 }
4545
4546 (void) log_open();
4547 return log_error_errno(errno, "execv() failed: %m");
4548 }
4549
4550 static int outer_child(
4551 Barrier *barrier,
4552 const char *directory,
4553 const char *console,
4554 const char *root_device, bool root_device_rw,
4555 const char *home_device, bool home_device_rw,
4556 const char *srv_device, bool srv_device_rw,
4557 bool interactive,
4558 bool secondary,
4559 int pid_socket,
4560 int kmsg_socket,
4561 int rtnl_socket,
4562 int uid_shift_socket,
4563 FDSet *fds) {
4564
4565 pid_t pid;
4566 ssize_t l;
4567 int r;
4568
4569 assert(barrier);
4570 assert(directory);
4571 assert(console);
4572 assert(pid_socket >= 0);
4573 assert(kmsg_socket >= 0);
4574
4575 cg_unified_flush();
4576
4577 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4578 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4579
4580 if (interactive) {
4581 close_nointr(STDIN_FILENO);
4582 close_nointr(STDOUT_FILENO);
4583 close_nointr(STDERR_FILENO);
4584
4585 r = open_terminal(console, O_RDWR);
4586 if (r != STDIN_FILENO) {
4587 if (r >= 0) {
4588 safe_close(r);
4589 r = -EINVAL;
4590 }
4591
4592 return log_error_errno(r, "Failed to open console: %m");
4593 }
4594
4595 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4596 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4597 return log_error_errno(errno, "Failed to duplicate console: %m");
4598 }
4599
4600 r = reset_audit_loginuid();
4601 if (r < 0)
4602 return r;
4603
4604 /* Mark everything as slave, so that we still
4605 * receive mounts from the real root, but don't
4606 * propagate mounts to the real root. */
4607 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4608 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4609
4610 r = mount_devices(directory,
4611 root_device, root_device_rw,
4612 home_device, home_device_rw,
4613 srv_device, srv_device_rw);
4614 if (r < 0)
4615 return r;
4616
4617 r = determine_uid_shift(directory);
4618 if (r < 0)
4619 return r;
4620
4621 if (arg_userns) {
4622 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4623 if (l < 0)
4624 return log_error_errno(errno, "Failed to send UID shift: %m");
4625 if (l != sizeof(arg_uid_shift)) {
4626 log_error("Short write while sending UID shift.");
4627 return -EIO;
4628 }
4629 }
4630
4631 /* Turn directory into bind mount */
4632 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4633 return log_error_errno(errno, "Failed to make bind mount: %m");
4634
4635 r = setup_volatile(directory);
4636 if (r < 0)
4637 return r;
4638
4639 r = setup_volatile_state(directory);
4640 if (r < 0)
4641 return r;
4642
4643 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4644 if (r < 0)
4645 return r;
4646
4647 if (arg_read_only) {
4648 r = bind_remount_recursive(directory, true);
4649 if (r < 0)
4650 return log_error_errno(r, "Failed to make tree read-only: %m");
4651 }
4652
4653 r = mount_all(directory, false);
4654 if (r < 0)
4655 return r;
4656
4657 if (copy_devnodes(directory) < 0)
4658 return r;
4659
4660 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4661
4662 if (setup_pts(directory) < 0)
4663 return r;
4664
4665 r = setup_propagate(directory);
4666 if (r < 0)
4667 return r;
4668
4669 r = setup_dev_console(directory, console);
4670 if (r < 0)
4671 return r;
4672
4673 r = setup_seccomp();
4674 if (r < 0)
4675 return r;
4676
4677 r = setup_timezone(directory);
4678 if (r < 0)
4679 return r;
4680
4681 r = setup_resolv_conf(directory);
4682 if (r < 0)
4683 return r;
4684
4685 r = setup_journal(directory);
4686 if (r < 0)
4687 return r;
4688
4689 r = mount_custom(directory);
4690 if (r < 0)
4691 return r;
4692
4693 r = mount_cgroups(directory);
4694 if (r < 0)
4695 return r;
4696
4697 r = mount_move_root(directory);
4698 if (r < 0)
4699 return log_error_errno(r, "Failed to move root directory: %m");
4700
4701 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4702 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4703 (arg_private_network ? CLONE_NEWNET : 0) |
4704 (arg_userns ? CLONE_NEWUSER : 0),
4705 NULL);
4706 if (pid < 0)
4707 return log_error_errno(errno, "Failed to fork inner child: %m");
4708 if (pid == 0) {
4709 pid_socket = safe_close(pid_socket);
4710 uid_shift_socket = safe_close(uid_shift_socket);
4711
4712 /* The inner child has all namespaces that are
4713 * requested, so that we all are owned by the user if
4714 * user namespaces are turned on. */
4715
4716 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
4717 if (r < 0)
4718 _exit(EXIT_FAILURE);
4719
4720 _exit(EXIT_SUCCESS);
4721 }
4722
4723 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4724 if (l < 0)
4725 return log_error_errno(errno, "Failed to send PID: %m");
4726 if (l != sizeof(pid)) {
4727 log_error("Short write while sending PID.");
4728 return -EIO;
4729 }
4730
4731 pid_socket = safe_close(pid_socket);
4732
4733 return 0;
4734 }
4735
4736 static int setup_uid_map(pid_t pid) {
4737 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4738 int r;
4739
4740 assert(pid > 1);
4741
4742 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4743 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4744 r = write_string_file(uid_map, line, 0);
4745 if (r < 0)
4746 return log_error_errno(r, "Failed to write UID map: %m");
4747
4748 /* We always assign the same UID and GID ranges */
4749 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4750 r = write_string_file(uid_map, line, 0);
4751 if (r < 0)
4752 return log_error_errno(r, "Failed to write GID map: %m");
4753
4754 return 0;
4755 }
4756
4757 static int chown_cgroup(pid_t pid) {
4758 _cleanup_free_ char *path = NULL, *fs = NULL;
4759 _cleanup_close_ int fd = -1;
4760 const char *fn;
4761 int r;
4762
4763 r = cg_pid_get_path(NULL, pid, &path);
4764 if (r < 0)
4765 return log_error_errno(r, "Failed to get container cgroup path: %m");
4766
4767 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4768 if (r < 0)
4769 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4770
4771 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4772 if (fd < 0)
4773 return log_error_errno(errno, "Failed to open %s: %m", fs);
4774
4775 FOREACH_STRING(fn,
4776 ".",
4777 "tasks",
4778 "notify_on_release",
4779 "cgroup.procs",
4780 "cgroup.clone_children",
4781 "cgroup.controllers",
4782 "cgroup.subtree_control",
4783 "cgroup.populated")
4784 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4785 log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
4786 "Failed to chown() cgroup file %s, ignoring: %m", fn);
4787
4788 return 0;
4789 }
4790
4791 static int sync_cgroup(pid_t pid) {
4792 _cleanup_free_ char *cgroup = NULL;
4793 char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
4794 bool undo_mount = false;
4795 const char *fn;
4796 int unified, r;
4797
4798 unified = cg_unified();
4799 if (unified < 0)
4800 return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
4801
4802 if ((unified > 0) == arg_unified_cgroup_hierarchy)
4803 return 0;
4804
4805 /* When the host uses the legacy cgroup setup, but the
4806 * container shall use the unified hierarchy, let's make sure
4807 * we copy the path from the name=systemd hierarchy into the
4808 * unified hierarchy. Similar for the reverse situation. */
4809
4810 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
4811 if (r < 0)
4812 return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
4813
4814 /* In order to access the unified hierarchy we need to mount it */
4815 if (!mkdtemp(tree))
4816 return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
4817
4818 if (unified)
4819 r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
4820 else
4821 r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
4822 if (r < 0) {
4823 r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
4824 goto finish;
4825 }
4826
4827 undo_mount = true;
4828
4829 fn = strjoina(tree, cgroup, "/cgroup.procs");
4830 (void) mkdir_parents(fn, 0755);
4831
4832 sprintf(pid_string, PID_FMT, pid);
4833 r = write_string_file(fn, pid_string, 0);
4834 if (r < 0)
4835 log_error_errno(r, "Failed to move process: %m");
4836
4837 finish:
4838 if (undo_mount)
4839 (void) umount(tree);
4840
4841 (void) rmdir(tree);
4842 return r;
4843 }
4844
4845 static int create_subcgroup(pid_t pid) {
4846 _cleanup_free_ char *cgroup = NULL;
4847 const char *child;
4848 int unified, r;
4849 CGroupMask supported;
4850
4851 /* In the unified hierarchy inner nodes may only only contain
4852 * subgroups, but not processes. Hence, if we running in the
4853 * unified hierarchy and the container does the same, and we
4854 * did not create a scope unit for the container move us and
4855 * the container into two separate subcgroups. */
4856
4857 if (!arg_keep_unit)
4858 return 0;
4859
4860 if (!arg_unified_cgroup_hierarchy)
4861 return 0;
4862
4863 unified = cg_unified();
4864 if (unified < 0)
4865 return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
4866 if (unified == 0)
4867 return 0;
4868
4869 r = cg_mask_supported(&supported);
4870 if (r < 0)
4871 return log_error_errno(r, "Failed to determine supported controllers: %m");
4872
4873 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
4874 if (r < 0)
4875 return log_error_errno(r, "Failed to get our control group: %m");
4876
4877 child = strjoina(cgroup, "/payload");
4878 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
4879 if (r < 0)
4880 return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
4881
4882 child = strjoina(cgroup, "/supervisor");
4883 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
4884 if (r < 0)
4885 return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
4886
4887 /* Try to enable as many controllers as possible for the new payload. */
4888 (void) cg_enable_everywhere(supported, supported, cgroup);
4889 return 0;
4890 }
4891
4892 static int load_settings(void) {
4893 _cleanup_(settings_freep) Settings *settings = NULL;
4894 _cleanup_fclose_ FILE *f = NULL;
4895 _cleanup_free_ char *p = NULL;
4896 const char *fn, *i;
4897 int r;
4898
4899 /* If all settings are masked, there's no point in looking for
4900 * the settings file */
4901 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
4902 return 0;
4903
4904 fn = strjoina(arg_machine, ".nspawn");
4905
4906 /* We first look in the admin's directories in /etc and /run */
4907 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4908 _cleanup_free_ char *j = NULL;
4909
4910 j = strjoin(i, "/", fn, NULL);
4911 if (!j)
4912 return log_oom();
4913
4914 f = fopen(j, "re");
4915 if (f) {
4916 p = j;
4917 j = NULL;
4918
4919 /* By default we trust configuration from /etc and /run */
4920 if (arg_settings_trusted < 0)
4921 arg_settings_trusted = true;
4922
4923 break;
4924 }
4925
4926 if (errno != ENOENT)
4927 return log_error_errno(errno, "Failed to open %s: %m", j);
4928 }
4929
4930 if (!f) {
4931 /* After that, let's look for a file next to the
4932 * actual image we shall boot. */
4933
4934 if (arg_image) {
4935 p = file_in_same_dir(arg_image, fn);
4936 if (!p)
4937 return log_oom();
4938 } else if (arg_directory) {
4939 p = file_in_same_dir(arg_directory, fn);
4940 if (!p)
4941 return log_oom();
4942 }
4943
4944 if (p) {
4945 f = fopen(p, "re");
4946 if (!f && errno != ENOENT)
4947 return log_error_errno(errno, "Failed to open %s: %m", p);
4948
4949 /* By default we do not trust configuration from /var/lib/machines */
4950 if (arg_settings_trusted < 0)
4951 arg_settings_trusted = false;
4952 }
4953 }
4954
4955 if (!f)
4956 return 0;
4957
4958 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4959
4960 r = settings_load(f, p, &settings);
4961 if (r < 0)
4962 return r;
4963
4964 /* Copy over bits from the settings, unless they have been
4965 * explicitly masked by command line switches. */
4966
4967 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
4968 settings->boot >= 0) {
4969 arg_boot = settings->boot;
4970
4971 strv_free(arg_parameters);
4972 arg_parameters = settings->parameters;
4973 settings->parameters = NULL;
4974 }
4975
4976 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
4977 settings->environment) {
4978 strv_free(arg_setenv);
4979 arg_setenv = settings->environment;
4980 settings->environment = NULL;
4981 }
4982
4983 if ((arg_settings_mask & SETTING_USER) == 0 &&
4984 settings->user) {
4985 free(arg_user);
4986 arg_user = settings->user;
4987 settings->user = NULL;
4988 }
4989
4990 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
4991
4992 if (!arg_settings_trusted && settings->capability != 0)
4993 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
4994 else
4995 arg_retain |= settings->capability;
4996
4997 arg_retain &= ~settings->drop_capability;
4998 }
4999
5000 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
5001 settings->kill_signal > 0)
5002 arg_kill_signal = settings->kill_signal;
5003
5004 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
5005 settings->personality != PERSONALITY_INVALID)
5006 arg_personality = settings->personality;
5007
5008 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
5009 !sd_id128_is_null(settings->machine_id)) {
5010
5011 if (!arg_settings_trusted)
5012 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
5013 else
5014 arg_uuid = settings->machine_id;
5015 }
5016
5017 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
5018 settings->read_only >= 0)
5019 arg_read_only = settings->read_only;
5020
5021 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
5022 settings->volatile_mode != _VOLATILE_MODE_INVALID)
5023 arg_volatile_mode = settings->volatile_mode;
5024
5025 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
5026 settings->n_custom_mounts > 0) {
5027
5028 if (!arg_settings_trusted)
5029 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
5030 else {
5031 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5032 arg_custom_mounts = settings->custom_mounts;
5033 arg_n_custom_mounts = settings->n_custom_mounts;
5034
5035 settings->custom_mounts = NULL;
5036 settings->n_custom_mounts = 0;
5037 }
5038 }
5039
5040 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
5041 (settings->private_network >= 0 ||
5042 settings->network_veth >= 0 ||
5043 settings->network_bridge ||
5044 settings->network_interfaces ||
5045 settings->network_macvlan ||
5046 settings->network_ipvlan)) {
5047
5048 if (!arg_settings_trusted)
5049 log_warning("Ignoring network settings, file %s is not trusted.", p);
5050 else {
5051 strv_free(arg_network_interfaces);
5052 arg_network_interfaces = settings->network_interfaces;
5053 settings->network_interfaces = NULL;
5054
5055 strv_free(arg_network_macvlan);
5056 arg_network_macvlan = settings->network_macvlan;
5057 settings->network_macvlan = NULL;
5058
5059 strv_free(arg_network_ipvlan);
5060 arg_network_ipvlan = settings->network_ipvlan;
5061 settings->network_ipvlan = NULL;
5062
5063 free(arg_network_bridge);
5064 arg_network_bridge = settings->network_bridge;
5065 settings->network_bridge = NULL;
5066
5067 arg_network_veth = settings->network_veth > 0 || settings->network_bridge;
5068
5069 arg_private_network = true; /* all these settings imply private networking */
5070 }
5071 }
5072
5073 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
5074 settings->expose_ports) {
5075
5076 if (!arg_settings_trusted)
5077 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
5078 else {
5079 expose_port_free_all(arg_expose_ports);
5080 arg_expose_ports = settings->expose_ports;
5081 settings->expose_ports = NULL;
5082 }
5083 }
5084
5085 return 0;
5086 }
5087
5088 int main(int argc, char *argv[]) {
5089
5090 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
5091 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
5092 _cleanup_close_ int master = -1, image_fd = -1;
5093 _cleanup_fdset_free_ FDSet *fds = NULL;
5094 int r, n_fd_passed, loop_nr = -1;
5095 char veth_name[IFNAMSIZ];
5096 bool secondary = false, remove_subvol = false;
5097 sigset_t mask_chld;
5098 pid_t pid = 0;
5099 int ret = EXIT_SUCCESS;
5100 union in_addr_union exposed = {};
5101 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5102 bool interactive;
5103
5104 log_parse_environment();
5105 log_open();
5106
5107 r = parse_argv(argc, argv);
5108 if (r <= 0)
5109 goto finish;
5110
5111 if (geteuid() != 0) {
5112 log_error("Need to be root.");
5113 r = -EPERM;
5114 goto finish;
5115 }
5116 r = determine_names();
5117 if (r < 0)
5118 goto finish;
5119
5120 r = load_settings();
5121 if (r < 0)
5122 goto finish;
5123
5124 r = verify_arguments();
5125 if (r < 0)
5126 goto finish;
5127
5128 n_fd_passed = sd_listen_fds(false);
5129 if (n_fd_passed > 0) {
5130 r = fdset_new_listen_fds(&fds, false);
5131 if (r < 0) {
5132 log_error_errno(r, "Failed to collect file descriptors: %m");
5133 goto finish;
5134 }
5135 }
5136
5137 if (arg_directory) {
5138 assert(!arg_image);
5139
5140 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
5141 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
5142 r = -EINVAL;
5143 goto finish;
5144 }
5145
5146 if (arg_ephemeral) {
5147 _cleanup_free_ char *np = NULL;
5148
5149 /* If the specified path is a mount point we
5150 * generate the new snapshot immediately
5151 * inside it under a random name. However if
5152 * the specified is not a mount point we
5153 * create the new snapshot in the parent
5154 * directory, just next to it. */
5155 r = path_is_mount_point(arg_directory, 0);
5156 if (r < 0) {
5157 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5158 goto finish;
5159 }
5160 if (r > 0)
5161 r = tempfn_random_child(arg_directory, "machine.", &np);
5162 else
5163 r = tempfn_random(arg_directory, "machine.", &np);
5164 if (r < 0) {
5165 log_error_errno(r, "Failed to generate name for snapshot: %m");
5166 goto finish;
5167 }
5168
5169 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5170 if (r < 0) {
5171 log_error_errno(r, "Failed to lock %s: %m", np);
5172 goto finish;
5173 }
5174
5175 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
5176 if (r < 0) {
5177 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5178 goto finish;
5179 }
5180
5181 free(arg_directory);
5182 arg_directory = np;
5183 np = NULL;
5184
5185 remove_subvol = true;
5186
5187 } else {
5188 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5189 if (r == -EBUSY) {
5190 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5191 goto finish;
5192 }
5193 if (r < 0) {
5194 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5195 return r;
5196 }
5197
5198 if (arg_template) {
5199 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
5200 if (r == -EEXIST) {
5201 if (!arg_quiet)
5202 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5203 } else if (r < 0) {
5204 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
5205 goto finish;
5206 } else {
5207 if (!arg_quiet)
5208 log_info("Populated %s from template %s.", arg_directory, arg_template);
5209 }
5210 }
5211 }
5212
5213 if (arg_boot) {
5214 if (path_is_os_tree(arg_directory) <= 0) {
5215 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
5216 r = -EINVAL;
5217 goto finish;
5218 }
5219 } else {
5220 const char *p;
5221
5222 p = strjoina(arg_directory,
5223 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
5224 if (access(p, F_OK) < 0) {
5225 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
5226 r = -EINVAL;
5227 goto finish;
5228 }
5229 }
5230
5231 } else {
5232 char template[] = "/tmp/nspawn-root-XXXXXX";
5233
5234 assert(arg_image);
5235 assert(!arg_template);
5236
5237 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5238 if (r == -EBUSY) {
5239 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5240 goto finish;
5241 }
5242 if (r < 0) {
5243 r = log_error_errno(r, "Failed to create image lock: %m");
5244 goto finish;
5245 }
5246
5247 if (!mkdtemp(template)) {
5248 log_error_errno(errno, "Failed to create temporary directory: %m");
5249 r = -errno;
5250 goto finish;
5251 }
5252
5253 arg_directory = strdup(template);
5254 if (!arg_directory) {
5255 r = log_oom();
5256 goto finish;
5257 }
5258
5259 image_fd = setup_image(&device_path, &loop_nr);
5260 if (image_fd < 0) {
5261 r = image_fd;
5262 goto finish;
5263 }
5264
5265 r = dissect_image(image_fd,
5266 &root_device, &root_device_rw,
5267 &home_device, &home_device_rw,
5268 &srv_device, &srv_device_rw,
5269 &secondary);
5270 if (r < 0)
5271 goto finish;
5272 }
5273
5274 r = custom_mounts_prepare();
5275 if (r < 0)
5276 goto finish;
5277
5278 interactive =
5279 isatty(STDIN_FILENO) > 0 &&
5280 isatty(STDOUT_FILENO) > 0;
5281
5282 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
5283 if (master < 0) {
5284 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
5285 goto finish;
5286 }
5287
5288 r = ptsname_malloc(master, &console);
5289 if (r < 0) {
5290 r = log_error_errno(r, "Failed to determine tty name: %m");
5291 goto finish;
5292 }
5293
5294 if (unlockpt(master) < 0) {
5295 r = log_error_errno(errno, "Failed to unlock tty: %m");
5296 goto finish;
5297 }
5298
5299 if (!arg_quiet)
5300 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5301 arg_machine, arg_image ?: arg_directory);
5302
5303 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
5304
5305 assert_se(sigemptyset(&mask_chld) == 0);
5306 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
5307
5308 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
5309 r = log_error_errno(errno, "Failed to become subreaper: %m");
5310 goto finish;
5311 }
5312
5313 for (;;) {
5314 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
5315 uid_shift_socket_pair[2] = { -1, -1 };
5316 ContainerStatus container_status;
5317 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
5318 static const struct sigaction sa = {
5319 .sa_handler = nop_handler,
5320 .sa_flags = SA_NOCLDSTOP,
5321 };
5322 int ifi = 0;
5323 ssize_t l;
5324 _cleanup_event_unref_ sd_event *event = NULL;
5325 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
5326 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
5327 char last_char = 0;
5328
5329 r = barrier_create(&barrier);
5330 if (r < 0) {
5331 log_error_errno(r, "Cannot initialize IPC barrier: %m");
5332 goto finish;
5333 }
5334
5335 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
5336 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
5337 goto finish;
5338 }
5339
5340 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
5341 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
5342 goto finish;
5343 }
5344
5345 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
5346 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
5347 goto finish;
5348 }
5349
5350 if (arg_userns)
5351 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
5352 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
5353 goto finish;
5354 }
5355
5356 /* Child can be killed before execv(), so handle SIGCHLD
5357 * in order to interrupt parent's blocking calls and
5358 * give it a chance to call wait() and terminate. */
5359 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
5360 if (r < 0) {
5361 r = log_error_errno(errno, "Failed to change the signal mask: %m");
5362 goto finish;
5363 }
5364
5365 r = sigaction(SIGCHLD, &sa, NULL);
5366 if (r < 0) {
5367 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
5368 goto finish;
5369 }
5370
5371 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
5372 if (pid < 0) {
5373 if (errno == EINVAL)
5374 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
5375 else
5376 r = log_error_errno(errno, "clone() failed: %m");
5377
5378 goto finish;
5379 }
5380
5381 if (pid == 0) {
5382 /* The outer child only has a file system namespace. */
5383 barrier_set_role(&barrier, BARRIER_CHILD);
5384
5385 master = safe_close(master);
5386
5387 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
5388 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
5389 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
5390 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
5391
5392 (void) reset_all_signal_handlers();
5393 (void) reset_signal_mask();
5394
5395 r = outer_child(&barrier,
5396 arg_directory,
5397 console,
5398 root_device, root_device_rw,
5399 home_device, home_device_rw,
5400 srv_device, srv_device_rw,
5401 interactive,
5402 secondary,
5403 pid_socket_pair[1],
5404 kmsg_socket_pair[1],
5405 rtnl_socket_pair[1],
5406 uid_shift_socket_pair[1],
5407 fds);
5408 if (r < 0)
5409 _exit(EXIT_FAILURE);
5410
5411 _exit(EXIT_SUCCESS);
5412 }
5413
5414 barrier_set_role(&barrier, BARRIER_PARENT);
5415
5416 fdset_free(fds);
5417 fds = NULL;
5418
5419 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
5420 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
5421 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
5422
5423 /* Wait for the outer child. */
5424 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
5425 if (r < 0)
5426 goto finish;
5427 if (r != 0) {
5428 r = -EIO;
5429 goto finish;
5430 }
5431 pid = 0;
5432
5433 /* And now retrieve the PID of the inner child. */
5434 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
5435 if (l < 0) {
5436 r = log_error_errno(errno, "Failed to read inner child PID: %m");
5437 goto finish;
5438 }
5439 if (l != sizeof(pid)) {
5440 log_error("Short read while reading inner child PID: %m");
5441 r = EIO;
5442 goto finish;
5443 }
5444
5445 log_debug("Init process invoked as PID " PID_FMT, pid);
5446
5447 if (arg_userns) {
5448 if (!barrier_place_and_sync(&barrier)) { /* #1 */
5449 log_error("Child died too early.");
5450 r = -ESRCH;
5451 goto finish;
5452 }
5453
5454 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
5455 if (l < 0) {
5456 r = log_error_errno(errno, "Failed to read UID shift: %m");
5457 goto finish;
5458 }
5459 if (l != sizeof(arg_uid_shift)) {
5460 log_error("Short read while reading UID shift: %m");
5461 r = EIO;
5462 goto finish;
5463 }
5464
5465 r = setup_uid_map(pid);
5466 if (r < 0)
5467 goto finish;
5468
5469 (void) barrier_place(&barrier); /* #2 */
5470 }
5471
5472 r = move_network_interfaces(pid);
5473 if (r < 0)
5474 goto finish;
5475
5476 r = setup_veth(pid, veth_name, &ifi);
5477 if (r < 0)
5478 goto finish;
5479
5480 r = setup_bridge(veth_name, &ifi);
5481 if (r < 0)
5482 goto finish;
5483
5484 r = setup_macvlan(pid);
5485 if (r < 0)
5486 goto finish;
5487
5488 r = setup_ipvlan(pid);
5489 if (r < 0)
5490 goto finish;
5491
5492 r = register_machine(pid, ifi);
5493 if (r < 0)
5494 goto finish;
5495
5496 r = sync_cgroup(pid);
5497 if (r < 0)
5498 goto finish;
5499
5500 r = create_subcgroup(pid);
5501 if (r < 0)
5502 goto finish;
5503
5504 r = chown_cgroup(pid);
5505 if (r < 0)
5506 goto finish;
5507
5508 /* Notify the child that the parent is ready with all
5509 * its setup (including cgroup-ification), and that
5510 * the child can now hand over control to the code to
5511 * run inside the container. */
5512 (void) barrier_place(&barrier); /* #3 */
5513
5514 /* Block SIGCHLD here, before notifying child.
5515 * process_pty() will handle it with the other signals. */
5516 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5517
5518 /* Reset signal to default */
5519 r = default_signals(SIGCHLD, -1);
5520 if (r < 0) {
5521 log_error_errno(r, "Failed to reset SIGCHLD: %m");
5522 goto finish;
5523 }
5524
5525 /* Let the child know that we are ready and wait that the child is completely ready now. */
5526 if (!barrier_place_and_sync(&barrier)) { /* #5 */
5527 log_error("Client died too early.");
5528 r = -ESRCH;
5529 goto finish;
5530 }
5531
5532 sd_notifyf(false,
5533 "READY=1\n"
5534 "STATUS=Container running.\n"
5535 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
5536
5537 r = sd_event_new(&event);
5538 if (r < 0) {
5539 log_error_errno(r, "Failed to get default event source: %m");
5540 goto finish;
5541 }
5542
5543 if (arg_kill_signal > 0) {
5544 /* Try to kill the init system on SIGINT or SIGTERM */
5545 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
5546 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
5547 } else {
5548 /* Immediately exit */
5549 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5550 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5551 }
5552
5553 /* simply exit on sigchld */
5554 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
5555
5556 if (arg_expose_ports) {
5557 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
5558 if (r < 0)
5559 goto finish;
5560
5561 (void) expose_ports(rtnl, &exposed);
5562 }
5563
5564 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
5565
5566 r = pty_forward_new(event, master, true, !interactive, &forward);
5567 if (r < 0) {
5568 log_error_errno(r, "Failed to create PTY forwarder: %m");
5569 goto finish;
5570 }
5571
5572 r = sd_event_loop(event);
5573 if (r < 0) {
5574 log_error_errno(r, "Failed to run event loop: %m");
5575 goto finish;
5576 }
5577
5578 pty_forward_get_last_char(forward, &last_char);
5579
5580 forward = pty_forward_free(forward);
5581
5582 if (!arg_quiet && last_char != '\n')
5583 putc('\n', stdout);
5584
5585 /* Kill if it is not dead yet anyway */
5586 terminate_machine(pid);
5587
5588 /* Normally redundant, but better safe than sorry */
5589 kill(pid, SIGKILL);
5590
5591 r = wait_for_container(pid, &container_status);
5592 pid = 0;
5593
5594 if (r < 0)
5595 /* We failed to wait for the container, or the
5596 * container exited abnormally */
5597 goto finish;
5598 else if (r > 0 || container_status == CONTAINER_TERMINATED){
5599 /* The container exited with a non-zero
5600 * status, or with zero status and no reboot
5601 * was requested. */
5602 ret = r;
5603 break;
5604 }
5605
5606 /* CONTAINER_REBOOTED, loop again */
5607
5608 if (arg_keep_unit) {
5609 /* Special handling if we are running as a
5610 * service: instead of simply restarting the
5611 * machine we want to restart the entire
5612 * service, so let's inform systemd about this
5613 * with the special exit code 133. The service
5614 * file uses RestartForceExitStatus=133 so
5615 * that this results in a full nspawn
5616 * restart. This is necessary since we might
5617 * have cgroup parameters set we want to have
5618 * flushed out. */
5619 ret = 133;
5620 r = 0;
5621 break;
5622 }
5623
5624 flush_ports(&exposed);
5625 }
5626
5627 finish:
5628 sd_notify(false,
5629 "STOPPING=1\n"
5630 "STATUS=Terminating...");
5631
5632 if (pid > 0)
5633 kill(pid, SIGKILL);
5634
5635 /* Try to flush whatever is still queued in the pty */
5636 if (master >= 0)
5637 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5638
5639 loop_remove(loop_nr, &image_fd);
5640
5641 if (remove_subvol && arg_directory) {
5642 int k;
5643
5644 k = btrfs_subvol_remove(arg_directory, true);
5645 if (k < 0)
5646 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5647 }
5648
5649 if (arg_machine) {
5650 const char *p;
5651
5652 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5653 (void) rm_rf(p, REMOVE_ROOT);
5654 }
5655
5656 flush_ports(&exposed);
5657
5658 free(arg_directory);
5659 free(arg_template);
5660 free(arg_image);
5661 free(arg_machine);
5662 free(arg_user);
5663 strv_free(arg_setenv);
5664 free(arg_network_bridge);
5665 strv_free(arg_network_interfaces);
5666 strv_free(arg_network_macvlan);
5667 strv_free(arg_network_ipvlan);
5668 strv_free(arg_parameters);
5669 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5670 expose_port_free_all(arg_expose_ports);
5671
5672 return r < 0 ? EXIT_FAILURE : ret;
5673 }