]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
fileio: consolidate write_string_file*()
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* How a container run ended.
 * NOTE(review): the users of this enum are outside this part of the file;
 * the meaning below is taken from the constant names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED, /* container ended */
        CONTAINER_REBOOTED    /* container asked to be restarted */
} ContainerStatus;
116
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of the option map to the same values plus arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,    /* --link-journal=no */
        LINK_AUTO,  /* --link-journal=auto (the default) */
        LINK_HOST,  /* --link-journal=host / try-host */
        LINK_GUEST  /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
123
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,    /* default, or --volatile=<false boolean> */
        VOLATILE_YES,   /* plain --volatile, or --volatile=<true boolean> */
        VOLATILE_STATE, /* --volatile=state */
} Volatile;
129
/* Kind of user-requested mount. The numeric order matters: it is the
 * tie-breaker used by custom_mount_compare() for equal destinations. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND,    /* --bind= / --bind-ro= */
        CUSTOM_MOUNT_TMPFS,   /* --tmpfs= */
        CUSTOM_MOUNT_OVERLAY, /* --overlay= / --overlay-ro= */
} CustomMountType;
135
136 typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144 } CustomMount;
145
146 static char *arg_directory = NULL;
147 static char *arg_template = NULL;
148 static char *arg_user = NULL;
149 static sd_id128_t arg_uuid = {};
150 static char *arg_machine = NULL;
151 static const char *arg_selinux_context = NULL;
152 static const char *arg_selinux_apifs_context = NULL;
153 static const char *arg_slice = NULL;
154 static bool arg_private_network = false;
155 static bool arg_read_only = false;
156 static bool arg_boot = false;
157 static bool arg_ephemeral = false;
158 static LinkJournal arg_link_journal = LINK_AUTO;
159 static bool arg_link_journal_try = false;
160 static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
182 (1ULL << CAP_SYS_RESOURCE) |
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
187 static CustomMount *arg_custom_mounts = NULL;
188 static unsigned arg_n_custom_mounts = 0;
189 static char **arg_setenv = NULL;
190 static bool arg_quiet = false;
191 static bool arg_share_system = false;
192 static bool arg_register = true;
193 static bool arg_keep_unit = false;
194 static char **arg_network_interfaces = NULL;
195 static char **arg_network_macvlan = NULL;
196 static char **arg_network_ipvlan = NULL;
197 static bool arg_network_veth = false;
198 static const char *arg_network_bridge = NULL;
199 static unsigned long arg_personality = PERSONALITY_INVALID;
200 static char *arg_image = NULL;
201 static Volatile arg_volatile = VOLATILE_NO;
202 static ExposePort *arg_expose_ports = NULL;
203 static char **arg_property = NULL;
204 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205 static bool arg_userns = false;
206 static int arg_kill_signal = 0;
207
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
239 " and container\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
261 " the container\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
266 " the container\n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name);
276 }
277
278 static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292 }
293
294 static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 free(arg_custom_mounts);
313 arg_custom_mounts = NULL;
314 arg_n_custom_mounts = 0;
315 }
316
317 static int custom_mount_compare(const void *a, const void *b) {
318 const CustomMount *x = a, *y = b;
319 int r;
320
321 r = path_compare(x->destination, y->destination);
322 if (r != 0)
323 return r;
324
325 if (x->type < y->type)
326 return -1;
327 if (x->type > y->type)
328 return 1;
329
330 return 0;
331 }
332
333 static int custom_mounts_prepare(void) {
334 unsigned i;
335 int r;
336
337 /* Ensure the mounts are applied prefix first. */
338 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
339
340 /* Allocate working directories for the overlay file systems that need it */
341 for (i = 0; i < arg_n_custom_mounts; i++) {
342 CustomMount *m = &arg_custom_mounts[i];
343
344 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
345 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
346 return -EINVAL;
347 }
348
349 if (m->type != CUSTOM_MOUNT_OVERLAY)
350 continue;
351
352 if (m->work_dir)
353 continue;
354
355 if (m->read_only)
356 continue;
357
358 r = tempfn_random(m->source, NULL, &m->work_dir);
359 if (r < 0)
360 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
361 }
362
363 return 0;
364 }
365
/* Stores a cleaned-up absolute version of path in *b, freeing the previous
 * value of *b.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT) it is
 * only made absolute relative to the current working directory. Returns 0 on
 * success, -errno if canonicalization fails for another reason, or -ENOMEM. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Path does not exist (yet): fall back to a plain absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
386
387 static int parse_argv(int argc, char *argv[]) {
388
389 enum {
390 ARG_VERSION = 0x100,
391 ARG_PRIVATE_NETWORK,
392 ARG_UUID,
393 ARG_READ_ONLY,
394 ARG_CAPABILITY,
395 ARG_DROP_CAPABILITY,
396 ARG_LINK_JOURNAL,
397 ARG_BIND,
398 ARG_BIND_RO,
399 ARG_TMPFS,
400 ARG_OVERLAY,
401 ARG_OVERLAY_RO,
402 ARG_SETENV,
403 ARG_SHARE_SYSTEM,
404 ARG_REGISTER,
405 ARG_KEEP_UNIT,
406 ARG_NETWORK_INTERFACE,
407 ARG_NETWORK_MACVLAN,
408 ARG_NETWORK_IPVLAN,
409 ARG_NETWORK_BRIDGE,
410 ARG_PERSONALITY,
411 ARG_VOLATILE,
412 ARG_TEMPLATE,
413 ARG_PROPERTY,
414 ARG_PRIVATE_USERS,
415 ARG_KILL_SIGNAL,
416 };
417
418 static const struct option options[] = {
419 { "help", no_argument, NULL, 'h' },
420 { "version", no_argument, NULL, ARG_VERSION },
421 { "directory", required_argument, NULL, 'D' },
422 { "template", required_argument, NULL, ARG_TEMPLATE },
423 { "ephemeral", no_argument, NULL, 'x' },
424 { "user", required_argument, NULL, 'u' },
425 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
426 { "boot", no_argument, NULL, 'b' },
427 { "uuid", required_argument, NULL, ARG_UUID },
428 { "read-only", no_argument, NULL, ARG_READ_ONLY },
429 { "capability", required_argument, NULL, ARG_CAPABILITY },
430 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
431 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
432 { "bind", required_argument, NULL, ARG_BIND },
433 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
434 { "tmpfs", required_argument, NULL, ARG_TMPFS },
435 { "overlay", required_argument, NULL, ARG_OVERLAY },
436 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
437 { "machine", required_argument, NULL, 'M' },
438 { "slice", required_argument, NULL, 'S' },
439 { "setenv", required_argument, NULL, ARG_SETENV },
440 { "selinux-context", required_argument, NULL, 'Z' },
441 { "selinux-apifs-context", required_argument, NULL, 'L' },
442 { "quiet", no_argument, NULL, 'q' },
443 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
444 { "register", required_argument, NULL, ARG_REGISTER },
445 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
446 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
447 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
448 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
449 { "network-veth", no_argument, NULL, 'n' },
450 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
451 { "personality", required_argument, NULL, ARG_PERSONALITY },
452 { "image", required_argument, NULL, 'i' },
453 { "volatile", optional_argument, NULL, ARG_VOLATILE },
454 { "port", required_argument, NULL, 'p' },
455 { "property", required_argument, NULL, ARG_PROPERTY },
456 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
457 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
458 {}
459 };
460
461 int c, r;
462 uint64_t plus = 0, minus = 0;
463
464 assert(argc >= 0);
465 assert(argv);
466
467 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
468
469 switch (c) {
470
471 case 'h':
472 help();
473 return 0;
474
475 case ARG_VERSION:
476 puts(PACKAGE_STRING);
477 puts(SYSTEMD_FEATURES);
478 return 0;
479
480 case 'D':
481 r = set_sanitized_path(&arg_directory, optarg);
482 if (r < 0)
483 return log_error_errno(r, "Invalid root directory: %m");
484
485 break;
486
487 case ARG_TEMPLATE:
488 r = set_sanitized_path(&arg_template, optarg);
489 if (r < 0)
490 return log_error_errno(r, "Invalid template directory: %m");
491
492 break;
493
494 case 'i':
495 r = set_sanitized_path(&arg_image, optarg);
496 if (r < 0)
497 return log_error_errno(r, "Invalid image path: %m");
498
499 break;
500
501 case 'x':
502 arg_ephemeral = true;
503 break;
504
505 case 'u':
506 free(arg_user);
507 arg_user = strdup(optarg);
508 if (!arg_user)
509 return log_oom();
510
511 break;
512
513 case ARG_NETWORK_BRIDGE:
514 arg_network_bridge = optarg;
515
516 /* fall through */
517
518 case 'n':
519 arg_network_veth = true;
520 arg_private_network = true;
521 break;
522
523 case ARG_NETWORK_INTERFACE:
524 if (strv_extend(&arg_network_interfaces, optarg) < 0)
525 return log_oom();
526
527 arg_private_network = true;
528 break;
529
530 case ARG_NETWORK_MACVLAN:
531 if (strv_extend(&arg_network_macvlan, optarg) < 0)
532 return log_oom();
533
534 arg_private_network = true;
535 break;
536
537 case ARG_NETWORK_IPVLAN:
538 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
539 return log_oom();
540
541 /* fall through */
542
543 case ARG_PRIVATE_NETWORK:
544 arg_private_network = true;
545 break;
546
547 case 'b':
548 arg_boot = true;
549 break;
550
551 case ARG_UUID:
552 r = sd_id128_from_string(optarg, &arg_uuid);
553 if (r < 0) {
554 log_error("Invalid UUID: %s", optarg);
555 return r;
556 }
557 break;
558
559 case 'S':
560 arg_slice = optarg;
561 break;
562
563 case 'M':
564 if (isempty(optarg)) {
565 free(arg_machine);
566 arg_machine = NULL;
567 } else {
568 if (!machine_name_is_valid(optarg)) {
569 log_error("Invalid machine name: %s", optarg);
570 return -EINVAL;
571 }
572
573 r = free_and_strdup(&arg_machine, optarg);
574 if (r < 0)
575 return log_oom();
576
577 break;
578 }
579
580 case 'Z':
581 arg_selinux_context = optarg;
582 break;
583
584 case 'L':
585 arg_selinux_apifs_context = optarg;
586 break;
587
588 case ARG_READ_ONLY:
589 arg_read_only = true;
590 break;
591
592 case ARG_CAPABILITY:
593 case ARG_DROP_CAPABILITY: {
594 const char *state, *word;
595 size_t length;
596
597 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
598 _cleanup_free_ char *t;
599
600 t = strndup(word, length);
601 if (!t)
602 return log_oom();
603
604 if (streq(t, "all")) {
605 if (c == ARG_CAPABILITY)
606 plus = (uint64_t) -1;
607 else
608 minus = (uint64_t) -1;
609 } else {
610 int cap;
611
612 cap = capability_from_name(t);
613 if (cap < 0) {
614 log_error("Failed to parse capability %s.", t);
615 return -EINVAL;
616 }
617
618 if (c == ARG_CAPABILITY)
619 plus |= 1ULL << (uint64_t) cap;
620 else
621 minus |= 1ULL << (uint64_t) cap;
622 }
623 }
624
625 break;
626 }
627
628 case 'j':
629 arg_link_journal = LINK_GUEST;
630 arg_link_journal_try = true;
631 break;
632
633 case ARG_LINK_JOURNAL:
634 if (streq(optarg, "auto")) {
635 arg_link_journal = LINK_AUTO;
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "no")) {
638 arg_link_journal = LINK_NO;
639 arg_link_journal_try = false;
640 } else if (streq(optarg, "guest")) {
641 arg_link_journal = LINK_GUEST;
642 arg_link_journal_try = false;
643 } else if (streq(optarg, "host")) {
644 arg_link_journal = LINK_HOST;
645 arg_link_journal_try = false;
646 } else if (streq(optarg, "try-guest")) {
647 arg_link_journal = LINK_GUEST;
648 arg_link_journal_try = true;
649 } else if (streq(optarg, "try-host")) {
650 arg_link_journal = LINK_HOST;
651 arg_link_journal_try = true;
652 } else {
653 log_error("Failed to parse link journal mode %s", optarg);
654 return -EINVAL;
655 }
656
657 break;
658
659 case ARG_BIND:
660 case ARG_BIND_RO: {
661 _cleanup_free_ char *source = NULL, *destination = NULL;
662 CustomMount *m;
663 char *e;
664
665 e = strchr(optarg, ':');
666 if (e) {
667 source = strndup(optarg, e - optarg);
668 destination = strdup(e + 1);
669 } else {
670 source = strdup(optarg);
671 destination = strdup(optarg);
672 }
673
674 if (!source || !destination)
675 return log_oom();
676
677 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
678 log_error("Invalid bind mount specification: %s", optarg);
679 return -EINVAL;
680 }
681
682 m = custom_mount_add(CUSTOM_MOUNT_BIND);
683 if (!m)
684 return log_oom();
685
686 m->source = source;
687 m->destination = destination;
688 m->read_only = c == ARG_BIND_RO;
689
690 source = destination = NULL;
691
692 break;
693 }
694
695 case ARG_TMPFS: {
696 _cleanup_free_ char *path = NULL, *opts = NULL;
697 CustomMount *m;
698 char *e;
699
700 e = strchr(optarg, ':');
701 if (e) {
702 path = strndup(optarg, e - optarg);
703 opts = strdup(e + 1);
704 } else {
705 path = strdup(optarg);
706 opts = strdup("mode=0755");
707 }
708
709 if (!path || !opts)
710 return log_oom();
711
712 if (!path_is_absolute(path)) {
713 log_error("Invalid tmpfs specification: %s", optarg);
714 return -EINVAL;
715 }
716
717 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
718 if (!m)
719 return log_oom();
720
721 m->destination = path;
722 m->options = opts;
723
724 path = opts = NULL;
725
726 break;
727 }
728
729 case ARG_OVERLAY:
730 case ARG_OVERLAY_RO: {
731 _cleanup_free_ char *upper = NULL, *destination = NULL;
732 _cleanup_strv_free_ char **lower = NULL;
733 CustomMount *m;
734 unsigned n = 0;
735 char **i;
736
737 lower = strv_split(optarg, ":");
738 if (!lower)
739 return log_oom();
740
741 STRV_FOREACH(i, lower) {
742 if (!path_is_absolute(*i)) {
743 log_error("Overlay path %s is not absolute.", *i);
744 return -EINVAL;
745 }
746
747 n++;
748 }
749
750 if (n < 2) {
751 log_error("--overlay= needs at least two colon-separated directories specified.");
752 return -EINVAL;
753 }
754
755 if (n == 2) {
756 /* If two parameters are specified,
757 * the first one is the lower, the
758 * second one the upper directory. And
759 * we'll also define the the
760 * destination mount point the same as
761 * the upper. */
762 upper = lower[1];
763 lower[1] = NULL;
764
765 destination = strdup(upper);
766 if (!destination)
767 return log_oom();
768
769 } else {
770 upper = lower[n - 2];
771 destination = lower[n - 1];
772 lower[n - 2] = NULL;
773 }
774
775 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
776 if (!m)
777 return log_oom();
778
779 m->destination = destination;
780 m->source = upper;
781 m->lower = lower;
782 m->read_only = c == ARG_OVERLAY_RO;
783
784 upper = destination = NULL;
785 lower = NULL;
786
787 break;
788 }
789
790 case ARG_SETENV: {
791 char **n;
792
793 if (!env_assignment_is_valid(optarg)) {
794 log_error("Environment variable assignment '%s' is not valid.", optarg);
795 return -EINVAL;
796 }
797
798 n = strv_env_set(arg_setenv, optarg);
799 if (!n)
800 return log_oom();
801
802 strv_free(arg_setenv);
803 arg_setenv = n;
804 break;
805 }
806
807 case 'q':
808 arg_quiet = true;
809 break;
810
811 case ARG_SHARE_SYSTEM:
812 arg_share_system = true;
813 break;
814
815 case ARG_REGISTER:
816 r = parse_boolean(optarg);
817 if (r < 0) {
818 log_error("Failed to parse --register= argument: %s", optarg);
819 return r;
820 }
821
822 arg_register = r;
823 break;
824
825 case ARG_KEEP_UNIT:
826 arg_keep_unit = true;
827 break;
828
829 case ARG_PERSONALITY:
830
831 arg_personality = personality_from_string(optarg);
832 if (arg_personality == PERSONALITY_INVALID) {
833 log_error("Unknown or unsupported personality '%s'.", optarg);
834 return -EINVAL;
835 }
836
837 break;
838
839 case ARG_VOLATILE:
840
841 if (!optarg)
842 arg_volatile = VOLATILE_YES;
843 else {
844 r = parse_boolean(optarg);
845 if (r < 0) {
846 if (streq(optarg, "state"))
847 arg_volatile = VOLATILE_STATE;
848 else {
849 log_error("Failed to parse --volatile= argument: %s", optarg);
850 return r;
851 }
852 } else
853 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
854 }
855
856 break;
857
858 case 'p': {
859 const char *split, *e;
860 uint16_t container_port, host_port;
861 int protocol;
862 ExposePort *p;
863
864 if ((e = startswith(optarg, "tcp:")))
865 protocol = IPPROTO_TCP;
866 else if ((e = startswith(optarg, "udp:")))
867 protocol = IPPROTO_UDP;
868 else {
869 e = optarg;
870 protocol = IPPROTO_TCP;
871 }
872
873 split = strchr(e, ':');
874 if (split) {
875 char v[split - e + 1];
876
877 memcpy(v, e, split - e);
878 v[split - e] = 0;
879
880 r = safe_atou16(v, &host_port);
881 if (r < 0 || host_port <= 0) {
882 log_error("Failed to parse host port: %s", optarg);
883 return -EINVAL;
884 }
885
886 r = safe_atou16(split + 1, &container_port);
887 } else {
888 r = safe_atou16(e, &container_port);
889 host_port = container_port;
890 }
891
892 if (r < 0 || container_port <= 0) {
893 log_error("Failed to parse host port: %s", optarg);
894 return -EINVAL;
895 }
896
897 LIST_FOREACH(ports, p, arg_expose_ports) {
898 if (p->protocol == protocol && p->host_port == host_port) {
899 log_error("Duplicate port specification: %s", optarg);
900 return -EINVAL;
901 }
902 }
903
904 p = new(ExposePort, 1);
905 if (!p)
906 return log_oom();
907
908 p->protocol = protocol;
909 p->host_port = host_port;
910 p->container_port = container_port;
911
912 LIST_PREPEND(ports, arg_expose_ports, p);
913
914 break;
915 }
916
917 case ARG_PROPERTY:
918 if (strv_extend(&arg_property, optarg) < 0)
919 return log_oom();
920
921 break;
922
923 case ARG_PRIVATE_USERS:
924 if (optarg) {
925 _cleanup_free_ char *buffer = NULL;
926 const char *range, *shift;
927
928 range = strchr(optarg, ':');
929 if (range) {
930 buffer = strndup(optarg, range - optarg);
931 if (!buffer)
932 return log_oom();
933 shift = buffer;
934
935 range++;
936 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
937 log_error("Failed to parse UID range: %s", range);
938 return -EINVAL;
939 }
940 } else
941 shift = optarg;
942
943 if (parse_uid(shift, &arg_uid_shift) < 0) {
944 log_error("Failed to parse UID: %s", optarg);
945 return -EINVAL;
946 }
947 }
948
949 arg_userns = true;
950 break;
951
952 case ARG_KILL_SIGNAL:
953 arg_kill_signal = signal_from_string_try_harder(optarg);
954 if (arg_kill_signal < 0) {
955 log_error("Cannot parse signal: %s", optarg);
956 return -EINVAL;
957 }
958
959 break;
960
961 case '?':
962 return -EINVAL;
963
964 default:
965 assert_not_reached("Unhandled option");
966 }
967
968 if (arg_share_system)
969 arg_register = false;
970
971 if (arg_boot && arg_share_system) {
972 log_error("--boot and --share-system may not be combined.");
973 return -EINVAL;
974 }
975
976 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
977 log_error("--keep-unit may not be used when invoked from a user session.");
978 return -EINVAL;
979 }
980
981 if (arg_directory && arg_image) {
982 log_error("--directory= and --image= may not be combined.");
983 return -EINVAL;
984 }
985
986 if (arg_template && arg_image) {
987 log_error("--template= and --image= may not be combined.");
988 return -EINVAL;
989 }
990
991 if (arg_template && !(arg_directory || arg_machine)) {
992 log_error("--template= needs --directory= or --machine=.");
993 return -EINVAL;
994 }
995
996 if (arg_ephemeral && arg_template) {
997 log_error("--ephemeral and --template= may not be combined.");
998 return -EINVAL;
999 }
1000
1001 if (arg_ephemeral && arg_image) {
1002 log_error("--ephemeral and --image= may not be combined.");
1003 return -EINVAL;
1004 }
1005
1006 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1007 log_error("--ephemeral and --link-journal= may not be combined.");
1008 return -EINVAL;
1009 }
1010
1011 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1012 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1013 return -EINVAL;
1014 }
1015
1016 if (arg_expose_ports && !arg_private_network) {
1017 log_error("Cannot use --port= without private networking.");
1018 return -EINVAL;
1019 }
1020
1021 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1022 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1023
1024 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1025
1026 if (arg_boot && arg_kill_signal <= 0)
1027 arg_kill_signal = SIGRTMIN+3;
1028
1029 return 1;
1030 }
1031
1032 static int tmpfs_patch_options(const char *options, char **ret) {
1033 char *buf = NULL;
1034
1035 if (arg_userns && arg_uid_shift != 0) {
1036 assert(arg_uid_shift != UID_INVALID);
1037
1038 if (options)
1039 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1040 else
1041 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1042 if (!buf)
1043 return -ENOMEM;
1044
1045 options = buf;
1046 }
1047
1048 #ifdef HAVE_SELINUX
1049 if (arg_selinux_apifs_context) {
1050 char *t;
1051
1052 if (options)
1053 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1054 else
1055 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1056 if (!t) {
1057 free(buf);
1058 return -ENOMEM;
1059 }
1060
1061 free(buf);
1062 buf = t;
1063 }
1064 #endif
1065
1066 *ret = buf;
1067 return !!buf;
1068 }
1069
1070 static int mount_all(const char *dest, bool userns) {
1071
1072 typedef struct MountPoint {
1073 const char *what;
1074 const char *where;
1075 const char *type;
1076 const char *options;
1077 unsigned long flags;
1078 bool fatal;
1079 bool userns;
1080 } MountPoint;
1081
1082 static const MountPoint mount_table[] = {
1083 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1084 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1085 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1086 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1087 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1088 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1089 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1090 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1091 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1092 #ifdef HAVE_SELINUX
1093 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1094 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
1095 #endif
1096 };
1097
1098 unsigned k;
1099 int r;
1100
1101 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1102 _cleanup_free_ char *where = NULL, *options = NULL;
1103 const char *o;
1104
1105 if (userns != mount_table[k].userns)
1106 continue;
1107
1108 where = prefix_root(dest, mount_table[k].where);
1109 if (!where)
1110 return log_oom();
1111
1112 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1113 if (r < 0 && r != -ENOENT)
1114 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1115
1116 /* Skip this entry if it is not a remount. */
1117 if (mount_table[k].what && r > 0)
1118 continue;
1119
1120 r = mkdir_p(where, 0755);
1121 if (r < 0) {
1122 if (mount_table[k].fatal)
1123 return log_error_errno(r, "Failed to create directory %s: %m", where);
1124
1125 log_warning_errno(r, "Failed to create directory %s: %m", where);
1126 continue;
1127 }
1128
1129 o = mount_table[k].options;
1130 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1131 r = tmpfs_patch_options(o, &options);
1132 if (r < 0)
1133 return log_oom();
1134 if (r > 0)
1135 o = options;
1136 }
1137
1138 if (mount(mount_table[k].what,
1139 where,
1140 mount_table[k].type,
1141 mount_table[k].flags,
1142 o) < 0) {
1143
1144 if (mount_table[k].fatal)
1145 return log_error_errno(errno, "mount(%s) failed: %m", where);
1146
1147 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1148 }
1149 }
1150
1151 return 0;
1152 }
1153
1154 static int mount_bind(const char *dest, CustomMount *m) {
1155 struct stat source_st, dest_st;
1156 const char *where;
1157 int r;
1158
1159 assert(m);
1160
1161 if (stat(m->source, &source_st) < 0)
1162 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1163
1164 where = prefix_roota(dest, m->destination);
1165
1166 if (stat(where, &dest_st) >= 0) {
1167 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1168 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1169 return -EINVAL;
1170 }
1171
1172 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1173 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1174 return -EINVAL;
1175 }
1176
1177 } else if (errno == ENOENT) {
1178 r = mkdir_parents_label(where, 0755);
1179 if (r < 0)
1180 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1181 } else {
1182 log_error_errno(errno, "Failed to stat %s: %m", where);
1183 return -errno;
1184 }
1185
1186 /* Create the mount point. Any non-directory file can be
1187 * mounted on any non-directory file (regular, fifo, socket,
1188 * char, block).
1189 */
1190 if (S_ISDIR(source_st.st_mode))
1191 r = mkdir_label(where, 0755);
1192 else
1193 r = touch(where);
1194 if (r < 0 && r != -EEXIST)
1195 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1196
1197 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1198 return log_error_errno(errno, "mount(%s) failed: %m", where);
1199
1200 if (m->read_only) {
1201 r = bind_remount_recursive(where, true);
1202 if (r < 0)
1203 return log_error_errno(r, "Read-only bind mount failed: %m");
1204 }
1205
1206 return 0;
1207 }
1208
1209 static int mount_tmpfs(const char *dest, CustomMount *m) {
1210 const char *where, *options;
1211 _cleanup_free_ char *buf = NULL;
1212 int r;
1213
1214 assert(dest);
1215 assert(m);
1216
1217 where = prefix_roota(dest, m->destination);
1218
1219 r = mkdir_p_label(where, 0755);
1220 if (r < 0 && r != -EEXIST)
1221 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1222
1223 r = tmpfs_patch_options(m->options, &buf);
1224 if (r < 0)
1225 return log_oom();
1226 options = r > 0 ? buf : m->options;
1227
1228 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1229 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1230
1231 return 0;
1232 }
1233
1234 static int mount_overlay(const char *dest, CustomMount *m) {
1235 _cleanup_free_ char *lower = NULL;
1236 const char *where, *options;
1237 int r;
1238
1239 assert(dest);
1240 assert(m);
1241
1242 where = prefix_roota(dest, m->destination);
1243
1244 r = mkdir_label(where, 0755);
1245 if (r < 0 && r != -EEXIST)
1246 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1247
1248 (void) mkdir_p_label(m->source, 0755);
1249
1250 strv_reverse(m->lower);
1251 lower = strv_join(m->lower, ":");
1252 strv_reverse(m->lower);
1253 if (!lower)
1254 return log_oom();
1255
1256 if (m->read_only)
1257 options = strjoina("lowerdir=", m->source, ":", lower);
1258 else {
1259 assert(m->work_dir);
1260 (void) mkdir_label(m->work_dir, 0700);
1261
1262 options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
1263 }
1264
1265 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1266 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1267
1268 return 0;
1269 }
1270
1271 static int mount_custom(const char *dest) {
1272 unsigned i;
1273 int r;
1274
1275 assert(dest);
1276
1277 for (i = 0; i < arg_n_custom_mounts; i++) {
1278 CustomMount *m = &arg_custom_mounts[i];
1279
1280 switch (m->type) {
1281
1282 case CUSTOM_MOUNT_BIND:
1283 r = mount_bind(dest, m);
1284 break;
1285
1286 case CUSTOM_MOUNT_TMPFS:
1287 r = mount_tmpfs(dest, m);
1288 break;
1289
1290 case CUSTOM_MOUNT_OVERLAY:
1291 r = mount_overlay(dest, m);
1292 break;
1293
1294 default:
1295 assert_not_reached("Unknown custom mount type");
1296 }
1297
1298 if (r < 0)
1299 return r;
1300 }
1301
1302 return 0;
1303 }
1304
1305 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1306 char *to;
1307 int r;
1308
1309 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1310
1311 r = path_is_mount_point(to, 0);
1312 if (r < 0 && r != -ENOENT)
1313 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1314 if (r > 0)
1315 return 0;
1316
1317 mkdir_p(to, 0755);
1318
1319 /* The superblock mount options of the mount point need to be
1320 * identical to the hosts', and hence writable... */
1321 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1322 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1323
1324 /* ... hence let's only make the bind mount read-only, not the
1325 * superblock. */
1326 if (read_only) {
1327 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1328 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1329 }
1330 return 1;
1331 }
1332
1333 static int mount_cgroup(const char *dest) {
1334 _cleanup_set_free_free_ Set *controllers = NULL;
1335 const char *cgroup_root;
1336 int r;
1337
1338 controllers = set_new(&string_hash_ops);
1339 if (!controllers)
1340 return log_oom();
1341
1342 r = cg_kernel_controllers(controllers);
1343 if (r < 0)
1344 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1345
1346 for (;;) {
1347 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1348
1349 controller = set_steal_first(controllers);
1350 if (!controller)
1351 break;
1352
1353 origin = prefix_root("/sys/fs/cgroup/", controller);
1354 if (!origin)
1355 return log_oom();
1356
1357 r = readlink_malloc(origin, &combined);
1358 if (r == -EINVAL) {
1359 /* Not a symbolic link, but directly a single cgroup hierarchy */
1360
1361 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1362 if (r < 0)
1363 return r;
1364
1365 } else if (r < 0)
1366 return log_error_errno(r, "Failed to read link %s: %m", origin);
1367 else {
1368 _cleanup_free_ char *target = NULL;
1369
1370 target = prefix_root(dest, origin);
1371 if (!target)
1372 return log_oom();
1373
1374 /* A symbolic link, a combination of controllers in one hierarchy */
1375
1376 if (!filename_is_valid(combined)) {
1377 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1378 continue;
1379 }
1380
1381 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1382 if (r < 0)
1383 return r;
1384
1385 r = symlink_idempotent(combined, target);
1386 if (r == -EINVAL) {
1387 log_error("Invalid existing symlink for combined hierarchy");
1388 return r;
1389 }
1390 if (r < 0)
1391 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1392 }
1393 }
1394
1395 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1396 if (r < 0)
1397 return r;
1398
1399 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1400 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1401 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1402
1403 return 0;
1404 }
1405
1406 static int mount_systemd_cgroup_writable(const char *dest) {
1407 _cleanup_free_ char *own_cgroup_path = NULL;
1408 const char *systemd_root, *systemd_own;
1409 int r;
1410
1411 assert(dest);
1412
1413 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1414 if (r < 0)
1415 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1416
1417 /* Make our own cgroup a (writable) bind mount */
1418 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1419 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1420 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1421
1422 /* And then remount the systemd cgroup root read-only */
1423 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1424 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1425 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1426
1427 return 0;
1428 }
1429
1430 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1431 assert(p);
1432
1433 if (!arg_userns)
1434 return 0;
1435
1436 if (uid == UID_INVALID && gid == GID_INVALID)
1437 return 0;
1438
1439 if (uid != UID_INVALID) {
1440 uid += arg_uid_shift;
1441
1442 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1443 return -EOVERFLOW;
1444 }
1445
1446 if (gid != GID_INVALID) {
1447 gid += (gid_t) arg_uid_shift;
1448
1449 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1450 return -EOVERFLOW;
1451 }
1452
1453 if (lchown(p, uid, gid) < 0)
1454 return -errno;
1455
1456 return 0;
1457 }
1458
/* Creates the directory "path" below "root" and, if it was newly created,
 * hands it to userns_lchown() with the given uid/gid.
 *
 * An already existing path counts as success and is left untouched.
 * Returns 0 on success, negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1471
1472 static int setup_timezone(const char *dest) {
1473 _cleanup_free_ char *p = NULL, *q = NULL;
1474 const char *where, *check, *what;
1475 char *z, *y;
1476 int r;
1477
1478 assert(dest);
1479
1480 /* Fix the timezone, if possible */
1481 r = readlink_malloc("/etc/localtime", &p);
1482 if (r < 0) {
1483 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1484 return 0;
1485 }
1486
1487 z = path_startswith(p, "../usr/share/zoneinfo/");
1488 if (!z)
1489 z = path_startswith(p, "/usr/share/zoneinfo/");
1490 if (!z) {
1491 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1492 return 0;
1493 }
1494
1495 where = prefix_roota(dest, "/etc/localtime");
1496 r = readlink_malloc(where, &q);
1497 if (r >= 0) {
1498 y = path_startswith(q, "../usr/share/zoneinfo/");
1499 if (!y)
1500 y = path_startswith(q, "/usr/share/zoneinfo/");
1501
1502 /* Already pointing to the right place? Then do nothing .. */
1503 if (y && streq(y, z))
1504 return 0;
1505 }
1506
1507 check = strjoina("/usr/share/zoneinfo/", z);
1508 check = prefix_root(dest, check);
1509 if (laccess(check, F_OK) < 0) {
1510 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1511 return 0;
1512 }
1513
1514 r = unlink(where);
1515 if (r < 0 && errno != ENOENT) {
1516 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1517 return 0;
1518 }
1519
1520 what = strjoina("../usr/share/zoneinfo/", z);
1521 if (symlink(what, where) < 0) {
1522 log_error_errno(errno, "Failed to correct timezone of container: %m");
1523 return 0;
1524 }
1525
1526 r = userns_lchown(where, 0, 0);
1527 if (r < 0)
1528 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1529
1530 return 0;
1531 }
1532
1533 static int setup_resolv_conf(const char *dest) {
1534 const char *where = NULL;
1535 int r;
1536
1537 assert(dest);
1538
1539 if (arg_private_network)
1540 return 0;
1541
1542 /* Fix resolv.conf, if possible */
1543 where = prefix_roota(dest, "/etc/resolv.conf");
1544
1545 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1546 if (r < 0) {
1547 /* If the file already exists as symlink, let's
1548 * suppress the warning, under the assumption that
1549 * resolved or something similar runs inside and the
1550 * symlink points there.
1551 *
1552 * If the disk image is read-only, there's also no
1553 * point in complaining.
1554 */
1555 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1556 "Failed to copy /etc/resolv.conf to %s: %m", where);
1557 return 0;
1558 }
1559
1560 r = userns_lchown(where, 0, 0);
1561 if (r < 0)
1562 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1563
1564 return 0;
1565 }
1566
1567 static int setup_volatile_state(const char *directory) {
1568 _cleanup_free_ char *buf = NULL;
1569 const char *p, *options;
1570 int r;
1571
1572 assert(directory);
1573
1574 if (arg_volatile != VOLATILE_STATE)
1575 return 0;
1576
1577 /* --volatile=state means we simply overmount /var
1578 with a tmpfs, and the rest read-only. */
1579
1580 r = bind_remount_recursive(directory, true);
1581 if (r < 0)
1582 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1583
1584 p = prefix_roota(directory, "/var");
1585 r = mkdir(p, 0755);
1586 if (r < 0 && errno != EEXIST)
1587 return log_error_errno(errno, "Failed to create %s: %m", directory);
1588
1589 options = "mode=755";
1590 r = tmpfs_patch_options(options, &buf);
1591 if (r < 0)
1592 return log_oom();
1593 if (r > 0)
1594 options = buf;
1595
1596 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1597 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1598
1599 return 0;
1600 }
1601
1602 static int setup_volatile(const char *directory) {
1603 bool tmpfs_mounted = false, bind_mounted = false;
1604 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1605 _cleanup_free_ char *buf = NULL;
1606 const char *f, *t, *options;
1607 int r;
1608
1609 assert(directory);
1610
1611 if (arg_volatile != VOLATILE_YES)
1612 return 0;
1613
1614 /* --volatile=yes means we mount a tmpfs to the root dir, and
1615 the original /usr to use inside it, and that read-only. */
1616
1617 if (!mkdtemp(template))
1618 return log_error_errno(errno, "Failed to create temporary directory: %m");
1619
1620 options = "mode=755";
1621 r = tmpfs_patch_options(options, &buf);
1622 if (r < 0)
1623 return log_oom();
1624 if (r > 0)
1625 options = buf;
1626
1627 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1628 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1629 goto fail;
1630 }
1631
1632 tmpfs_mounted = true;
1633
1634 f = prefix_roota(directory, "/usr");
1635 t = prefix_roota(template, "/usr");
1636
1637 r = mkdir(t, 0755);
1638 if (r < 0 && errno != EEXIST) {
1639 r = log_error_errno(errno, "Failed to create %s: %m", t);
1640 goto fail;
1641 }
1642
1643 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1644 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1645 goto fail;
1646 }
1647
1648 bind_mounted = true;
1649
1650 r = bind_remount_recursive(t, true);
1651 if (r < 0) {
1652 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1653 goto fail;
1654 }
1655
1656 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1657 r = log_error_errno(errno, "Failed to move root mount: %m");
1658 goto fail;
1659 }
1660
1661 (void) rmdir(template);
1662
1663 return 0;
1664
1665 fail:
1666 if (bind_mounted)
1667 (void) umount(t);
1668
1669 if (tmpfs_mounted)
1670 (void) umount(template);
1671 (void) rmdir(template);
1672 return r;
1673 }
1674
1675 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1676 assert(s);
1677
1678 snprintf(s, 37,
1679 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1680 SD_ID128_FORMAT_VAL(id));
1681
1682 return s;
1683 }
1684
1685 static int setup_boot_id(const char *dest) {
1686 const char *from, *to;
1687 sd_id128_t rnd = {};
1688 char as_uuid[37];
1689 int r;
1690
1691 if (arg_share_system)
1692 return 0;
1693
1694 /* Generate a new randomized boot ID, so that each boot-up of
1695 * the container gets a new one */
1696
1697 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1698 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1699
1700 r = sd_id128_randomize(&rnd);
1701 if (r < 0)
1702 return log_error_errno(r, "Failed to generate random boot id: %m");
1703
1704 id128_format_as_uuid(rnd, as_uuid);
1705
1706 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1707 if (r < 0)
1708 return log_error_errno(r, "Failed to write boot id: %m");
1709
1710 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1711 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1712 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1713 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1714
1715 unlink(from);
1716 return r;
1717 }
1718
1719 static int copy_devnodes(const char *dest) {
1720
1721 static const char devnodes[] =
1722 "null\0"
1723 "zero\0"
1724 "full\0"
1725 "random\0"
1726 "urandom\0"
1727 "tty\0"
1728 "net/tun\0";
1729
1730 const char *d;
1731 int r = 0;
1732 _cleanup_umask_ mode_t u;
1733
1734 assert(dest);
1735
1736 u = umask(0000);
1737
1738 /* Create /dev/net, so that we can create /dev/net/tun in it */
1739 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1740 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1741
1742 NULSTR_FOREACH(d, devnodes) {
1743 _cleanup_free_ char *from = NULL, *to = NULL;
1744 struct stat st;
1745
1746 from = strappend("/dev/", d);
1747 to = prefix_root(dest, from);
1748
1749 if (stat(from, &st) < 0) {
1750
1751 if (errno != ENOENT)
1752 return log_error_errno(errno, "Failed to stat %s: %m", from);
1753
1754 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1755
1756 log_error("%s is not a char or block device, cannot copy.", from);
1757 return -EIO;
1758
1759 } else {
1760 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1761 if (errno != EPERM)
1762 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1763
1764 /* Some systems abusively restrict mknod but
1765 * allow bind mounts. */
1766 r = touch(to);
1767 if (r < 0)
1768 return log_error_errno(r, "touch (%s) failed: %m", to);
1769 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1770 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1771 }
1772
1773 r = userns_lchown(to, 0, 0);
1774 if (r < 0)
1775 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1776 }
1777 }
1778
1779 return r;
1780 }
1781
1782 static int setup_pts(const char *dest) {
1783 _cleanup_free_ char *options = NULL;
1784 const char *p;
1785
1786 #ifdef HAVE_SELINUX
1787 if (arg_selinux_apifs_context)
1788 (void) asprintf(&options,
1789 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT ",context=\"%s\"",
1790 arg_uid_shift,
1791 arg_uid_shift + TTY_GID,
1792 arg_selinux_apifs_context);
1793 else
1794 #endif
1795 (void) asprintf(&options,
1796 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT,
1797 arg_uid_shift,
1798 arg_uid_shift + TTY_GID);
1799
1800 if (!options)
1801 return log_oom();
1802
1803 /* Mount /dev/pts itself */
1804 p = prefix_roota(dest, "/dev/pts");
1805 if (mkdir(p, 0755) < 0)
1806 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1807 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1808 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1809 if (userns_lchown(p, 0, 0) < 0)
1810 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1811
1812 /* Create /dev/ptmx symlink */
1813 p = prefix_roota(dest, "/dev/ptmx");
1814 if (symlink("pts/ptmx", p) < 0)
1815 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1816 if (userns_lchown(p, 0, 0) < 0)
1817 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1818
1819 /* And fix /dev/pts/ptmx ownership */
1820 p = prefix_roota(dest, "/dev/pts/ptmx");
1821 if (userns_lchown(p, 0, 0) < 0)
1822 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1823
1824 return 0;
1825 }
1826
1827 static int setup_dev_console(const char *dest, const char *console) {
1828 _cleanup_umask_ mode_t u;
1829 const char *to;
1830 int r;
1831
1832 assert(dest);
1833 assert(console);
1834
1835 u = umask(0000);
1836
1837 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1838 if (r < 0)
1839 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1840
1841 /* We need to bind mount the right tty to /dev/console since
1842 * ptys can only exist on pts file systems. To have something
1843 * to bind mount things on we create a empty regular file. */
1844
1845 to = prefix_roota(dest, "/dev/console");
1846 r = touch(to);
1847 if (r < 0)
1848 return log_error_errno(r, "touch() for /dev/console failed: %m");
1849
1850 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1851 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1852
1853 return 0;
1854 }
1855
1856 static int setup_kmsg(const char *dest, int kmsg_socket) {
1857 const char *from, *to;
1858 _cleanup_umask_ mode_t u;
1859 int fd, k;
1860 union {
1861 struct cmsghdr cmsghdr;
1862 uint8_t buf[CMSG_SPACE(sizeof(int))];
1863 } control = {};
1864 struct msghdr mh = {
1865 .msg_control = &control,
1866 .msg_controllen = sizeof(control),
1867 };
1868 struct cmsghdr *cmsg;
1869
1870 assert(kmsg_socket >= 0);
1871
1872 u = umask(0000);
1873
1874 /* We create the kmsg FIFO as /run/kmsg, but immediately
1875 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1876 * on the reading side behave very similar to /proc/kmsg,
1877 * their writing side behaves differently from /dev/kmsg in
1878 * that writing blocks when nothing is reading. In order to
1879 * avoid any problems with containers deadlocking due to this
1880 * we simply make /dev/kmsg unavailable to the container. */
1881 from = prefix_roota(dest, "/run/kmsg");
1882 to = prefix_roota(dest, "/proc/kmsg");
1883
1884 if (mkfifo(from, 0600) < 0)
1885 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1886 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1887 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1888
1889 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1890 if (fd < 0)
1891 return log_error_errno(errno, "Failed to open fifo: %m");
1892
1893 cmsg = CMSG_FIRSTHDR(&mh);
1894 cmsg->cmsg_level = SOL_SOCKET;
1895 cmsg->cmsg_type = SCM_RIGHTS;
1896 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1897 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1898
1899 mh.msg_controllen = cmsg->cmsg_len;
1900
1901 /* Store away the fd in the socket, so that it stays open as
1902 * long as we run the child */
1903 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1904 safe_close(fd);
1905
1906 if (k < 0)
1907 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1908
1909 /* And now make the FIFO unavailable as /run/kmsg... */
1910 (void) unlink(from);
1911
1912 return 0;
1913 }
1914
1915 static int send_rtnl(int send_fd) {
1916 union {
1917 struct cmsghdr cmsghdr;
1918 uint8_t buf[CMSG_SPACE(sizeof(int))];
1919 } control = {};
1920 struct msghdr mh = {
1921 .msg_control = &control,
1922 .msg_controllen = sizeof(control),
1923 };
1924 struct cmsghdr *cmsg;
1925 _cleanup_close_ int fd = -1;
1926 ssize_t k;
1927
1928 assert(send_fd >= 0);
1929
1930 if (!arg_expose_ports)
1931 return 0;
1932
1933 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1934 if (fd < 0)
1935 return log_error_errno(errno, "Failed to allocate container netlink: %m");
1936
1937 cmsg = CMSG_FIRSTHDR(&mh);
1938 cmsg->cmsg_level = SOL_SOCKET;
1939 cmsg->cmsg_type = SCM_RIGHTS;
1940 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1941 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1942
1943 mh.msg_controllen = cmsg->cmsg_len;
1944
1945 /* Store away the fd in the socket, so that it stays open as
1946 * long as we run the child */
1947 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1948 if (k < 0)
1949 return log_error_errno(errno, "Failed to send netlink fd: %m");
1950
1951 return 0;
1952 }
1953
1954 static int flush_ports(union in_addr_union *exposed) {
1955 ExposePort *p;
1956 int r, af = AF_INET;
1957
1958 assert(exposed);
1959
1960 if (!arg_expose_ports)
1961 return 0;
1962
1963 if (in_addr_is_null(af, exposed))
1964 return 0;
1965
1966 log_debug("Lost IP address.");
1967
1968 LIST_FOREACH(ports, p, arg_expose_ports) {
1969 r = fw_add_local_dnat(false,
1970 af,
1971 p->protocol,
1972 NULL,
1973 NULL, 0,
1974 NULL, 0,
1975 p->host_port,
1976 exposed,
1977 p->container_port,
1978 NULL);
1979 if (r < 0)
1980 log_warning_errno(r, "Failed to modify firewall: %m");
1981 }
1982
1983 *exposed = IN_ADDR_NULL;
1984 return 0;
1985 }
1986
1987 static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
1988 _cleanup_free_ struct local_address *addresses = NULL;
1989 _cleanup_free_ char *pretty = NULL;
1990 union in_addr_union new_exposed;
1991 ExposePort *p;
1992 bool add;
1993 int af = AF_INET, r;
1994
1995 assert(exposed);
1996
1997 /* Invoked each time an address is added or removed inside the
1998 * container */
1999
2000 if (!arg_expose_ports)
2001 return 0;
2002
2003 r = local_addresses(rtnl, 0, af, &addresses);
2004 if (r < 0)
2005 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2006
2007 add = r > 0 &&
2008 addresses[0].family == af &&
2009 addresses[0].scope < RT_SCOPE_LINK;
2010
2011 if (!add)
2012 return flush_ports(exposed);
2013
2014 new_exposed = addresses[0].address;
2015 if (in_addr_equal(af, exposed, &new_exposed))
2016 return 0;
2017
2018 in_addr_to_string(af, &new_exposed, &pretty);
2019 log_debug("New container IP is %s.", strna(pretty));
2020
2021 LIST_FOREACH(ports, p, arg_expose_ports) {
2022
2023 r = fw_add_local_dnat(true,
2024 af,
2025 p->protocol,
2026 NULL,
2027 NULL, 0,
2028 NULL, 0,
2029 p->host_port,
2030 &new_exposed,
2031 p->container_port,
2032 in_addr_is_null(af, exposed) ? NULL : exposed);
2033 if (r < 0)
2034 log_warning_errno(r, "Failed to modify firewall: %m");
2035 }
2036
2037 *exposed = new_exposed;
2038 return 0;
2039 }
2040
2041 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2042 union in_addr_union *exposed = userdata;
2043
2044 assert(rtnl);
2045 assert(m);
2046 assert(exposed);
2047
2048 expose_ports(rtnl, exposed);
2049 return 0;
2050 }
2051
2052 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
2053 union {
2054 struct cmsghdr cmsghdr;
2055 uint8_t buf[CMSG_SPACE(sizeof(int))];
2056 } control = {};
2057 struct msghdr mh = {
2058 .msg_control = &control,
2059 .msg_controllen = sizeof(control),
2060 };
2061 struct cmsghdr *cmsg;
2062 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2063 int fd, r;
2064 ssize_t k;
2065
2066 assert(event);
2067 assert(recv_fd >= 0);
2068 assert(ret);
2069
2070 if (!arg_expose_ports)
2071 return 0;
2072
2073 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2074 if (k < 0)
2075 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2076
2077 cmsg = CMSG_FIRSTHDR(&mh);
2078 assert(cmsg->cmsg_level == SOL_SOCKET);
2079 assert(cmsg->cmsg_type == SCM_RIGHTS);
2080 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2081 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2082
2083 r = sd_netlink_open_fd(&rtnl, fd);
2084 if (r < 0) {
2085 safe_close(fd);
2086 return log_error_errno(r, "Failed to create rtnl object: %m");
2087 }
2088
2089 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2090 if (r < 0)
2091 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2092
2093 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2094 if (r < 0)
2095 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2096
2097 r = sd_netlink_attach_event(rtnl, event, 0);
2098 if (r < 0)
2099 return log_error_errno(r, "Failed to add to even loop: %m");
2100
2101 *ret = rtnl;
2102 rtnl = NULL;
2103
2104 return 0;
2105 }
2106
2107 static int setup_hostname(void) {
2108
2109 if (arg_share_system)
2110 return 0;
2111
2112 if (sethostname_idempotent(arg_machine) < 0)
2113 return -errno;
2114
2115 return 0;
2116 }
2117
2118 static int setup_journal(const char *directory) {
2119 sd_id128_t machine_id, this_id;
2120 _cleanup_free_ char *b = NULL, *d = NULL;
2121 const char *etc_machine_id, *p, *q;
2122 char *id;
2123 int r;
2124
2125 /* Don't link journals in ephemeral mode */
2126 if (arg_ephemeral)
2127 return 0;
2128
2129 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2130
2131 r = read_one_line_file(etc_machine_id, &b);
2132 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2133 return 0;
2134 else if (r < 0)
2135 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2136
2137 id = strstrip(b);
2138 if (isempty(id) && arg_link_journal == LINK_AUTO)
2139 return 0;
2140
2141 /* Verify validity */
2142 r = sd_id128_from_string(id, &machine_id);
2143 if (r < 0)
2144 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2145
2146 r = sd_id128_get_machine(&this_id);
2147 if (r < 0)
2148 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2149
2150 if (sd_id128_equal(machine_id, this_id)) {
2151 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2152 "Host and machine ids are equal (%s): refusing to link journals", id);
2153 if (arg_link_journal == LINK_AUTO)
2154 return 0;
2155 return -EEXIST;
2156 }
2157
2158 if (arg_link_journal == LINK_NO)
2159 return 0;
2160
2161 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2162 if (r < 0)
2163 return log_error_errno(r, "Failed to create /var: %m");
2164
2165 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2166 if (r < 0)
2167 return log_error_errno(r, "Failed to create /var/log: %m");
2168
2169 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2170 if (r < 0)
2171 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2172
2173 p = strjoina("/var/log/journal/", id);
2174 q = prefix_roota(directory, p);
2175
2176 if (path_is_mount_point(p, 0) > 0) {
2177 if (arg_link_journal != LINK_AUTO) {
2178 log_error("%s: already a mount point, refusing to use for journal", p);
2179 return -EEXIST;
2180 }
2181
2182 return 0;
2183 }
2184
2185 if (path_is_mount_point(q, 0) > 0) {
2186 if (arg_link_journal != LINK_AUTO) {
2187 log_error("%s: already a mount point, refusing to use for journal", q);
2188 return -EEXIST;
2189 }
2190
2191 return 0;
2192 }
2193
2194 r = readlink_and_make_absolute(p, &d);
2195 if (r >= 0) {
2196 if ((arg_link_journal == LINK_GUEST ||
2197 arg_link_journal == LINK_AUTO) &&
2198 path_equal(d, q)) {
2199
2200 r = userns_mkdir(directory, p, 0755, 0, 0);
2201 if (r < 0)
2202 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2203 return 0;
2204 }
2205
2206 if (unlink(p) < 0)
2207 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2208 } else if (r == -EINVAL) {
2209
2210 if (arg_link_journal == LINK_GUEST &&
2211 rmdir(p) < 0) {
2212
2213 if (errno == ENOTDIR) {
2214 log_error("%s already exists and is neither a symlink nor a directory", p);
2215 return r;
2216 } else {
2217 log_error_errno(errno, "Failed to remove %s: %m", p);
2218 return -errno;
2219 }
2220 }
2221 } else if (r != -ENOENT) {
2222 log_error_errno(errno, "readlink(%s) failed: %m", p);
2223 return r;
2224 }
2225
2226 if (arg_link_journal == LINK_GUEST) {
2227
2228 if (symlink(q, p) < 0) {
2229 if (arg_link_journal_try) {
2230 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2231 return 0;
2232 } else {
2233 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2234 return -errno;
2235 }
2236 }
2237
2238 r = userns_mkdir(directory, p, 0755, 0, 0);
2239 if (r < 0)
2240 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2241 return 0;
2242 }
2243
2244 if (arg_link_journal == LINK_HOST) {
2245 /* don't create parents here -- if the host doesn't have
2246 * permanent journal set up, don't force it here */
2247 r = mkdir(p, 0755);
2248 if (r < 0) {
2249 if (arg_link_journal_try) {
2250 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2251 return 0;
2252 } else {
2253 log_error_errno(errno, "Failed to create %s: %m", p);
2254 return r;
2255 }
2256 }
2257
2258 } else if (access(p, F_OK) < 0)
2259 return 0;
2260
2261 if (dir_is_empty(q) == 0)
2262 log_warning("%s is not empty, proceeding anyway.", q);
2263
2264 r = userns_mkdir(directory, p, 0755, 0, 0);
2265 if (r < 0) {
2266 log_error_errno(errno, "Failed to create %s: %m", q);
2267 return r;
2268 }
2269
2270 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2271 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2272
2273 return 0;
2274 }
2275
2276 static int drop_capabilities(void) {
2277 return capability_bounding_set_drop(~arg_retain, false);
2278 }
2279
2280 static int register_machine(pid_t pid, int local_ifindex) {
2281 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2282 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2283 int r;
2284
2285 if (!arg_register)
2286 return 0;
2287
2288 r = sd_bus_default_system(&bus);
2289 if (r < 0)
2290 return log_error_errno(r, "Failed to open system bus: %m");
2291
2292 if (arg_keep_unit) {
2293 r = sd_bus_call_method(
2294 bus,
2295 "org.freedesktop.machine1",
2296 "/org/freedesktop/machine1",
2297 "org.freedesktop.machine1.Manager",
2298 "RegisterMachineWithNetwork",
2299 &error,
2300 NULL,
2301 "sayssusai",
2302 arg_machine,
2303 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2304 "nspawn",
2305 "container",
2306 (uint32_t) pid,
2307 strempty(arg_directory),
2308 local_ifindex > 0 ? 1 : 0, local_ifindex);
2309 } else {
2310 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2311 char **i;
2312 unsigned j;
2313
2314 r = sd_bus_message_new_method_call(
2315 bus,
2316 &m,
2317 "org.freedesktop.machine1",
2318 "/org/freedesktop/machine1",
2319 "org.freedesktop.machine1.Manager",
2320 "CreateMachineWithNetwork");
2321 if (r < 0)
2322 return bus_log_create_error(r);
2323
2324 r = sd_bus_message_append(
2325 m,
2326 "sayssusai",
2327 arg_machine,
2328 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2329 "nspawn",
2330 "container",
2331 (uint32_t) pid,
2332 strempty(arg_directory),
2333 local_ifindex > 0 ? 1 : 0, local_ifindex);
2334 if (r < 0)
2335 return bus_log_create_error(r);
2336
2337 r = sd_bus_message_open_container(m, 'a', "(sv)");
2338 if (r < 0)
2339 return bus_log_create_error(r);
2340
2341 if (!isempty(arg_slice)) {
2342 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2343 if (r < 0)
2344 return bus_log_create_error(r);
2345 }
2346
2347 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2348 if (r < 0)
2349 return bus_log_create_error(r);
2350
2351 /* If you make changes here, also make sure to update
2352 * systemd-nspawn@.service, to keep the device
2353 * policies in sync regardless if we are run with or
2354 * without the --keep-unit switch. */
2355 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2356 /* Allow the container to
2357 * access and create the API
2358 * device nodes, so that
2359 * PrivateDevices= in the
2360 * container can work
2361 * fine */
2362 "/dev/null", "rwm",
2363 "/dev/zero", "rwm",
2364 "/dev/full", "rwm",
2365 "/dev/random", "rwm",
2366 "/dev/urandom", "rwm",
2367 "/dev/tty", "rwm",
2368 "/dev/net/tun", "rwm",
2369 /* Allow the container
2370 * access to ptys. However,
2371 * do not permit the
2372 * container to ever create
2373 * these device nodes. */
2374 "/dev/pts/ptmx", "rw",
2375 "char-pts", "rw");
2376 if (r < 0)
2377 return bus_log_create_error(r);
2378
2379 for (j = 0; j < arg_n_custom_mounts; j++) {
2380 CustomMount *cm = &arg_custom_mounts[j];
2381
2382 if (cm->type != CUSTOM_MOUNT_BIND)
2383 continue;
2384
2385 r = is_device_node(cm->source);
2386 if (r < 0)
2387 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2388
2389 if (r) {
2390 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2391 cm->source, cm->read_only ? "r" : "rw");
2392 if (r < 0)
2393 return log_error_errno(r, "Failed to append message arguments: %m");
2394 }
2395 }
2396
2397 if (arg_kill_signal != 0) {
2398 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2399 if (r < 0)
2400 return bus_log_create_error(r);
2401
2402 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2403 if (r < 0)
2404 return bus_log_create_error(r);
2405 }
2406
2407 STRV_FOREACH(i, arg_property) {
2408 r = sd_bus_message_open_container(m, 'r', "sv");
2409 if (r < 0)
2410 return bus_log_create_error(r);
2411
2412 r = bus_append_unit_property_assignment(m, *i);
2413 if (r < 0)
2414 return r;
2415
2416 r = sd_bus_message_close_container(m);
2417 if (r < 0)
2418 return bus_log_create_error(r);
2419 }
2420
2421 r = sd_bus_message_close_container(m);
2422 if (r < 0)
2423 return bus_log_create_error(r);
2424
2425 r = sd_bus_call(bus, m, 0, &error, NULL);
2426 }
2427
2428 if (r < 0) {
2429 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2430 return r;
2431 }
2432
2433 return 0;
2434 }
2435
2436 static int terminate_machine(pid_t pid) {
2437 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2438 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2439 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2440 const char *path;
2441 int r;
2442
2443 if (!arg_register)
2444 return 0;
2445
2446 /* If we are reusing the unit, then just exit, systemd will do
2447 * the right thing when we exit. */
2448 if (arg_keep_unit)
2449 return 0;
2450
2451 r = sd_bus_default_system(&bus);
2452 if (r < 0)
2453 return log_error_errno(r, "Failed to open system bus: %m");
2454
2455 r = sd_bus_call_method(
2456 bus,
2457 "org.freedesktop.machine1",
2458 "/org/freedesktop/machine1",
2459 "org.freedesktop.machine1.Manager",
2460 "GetMachineByPID",
2461 &error,
2462 &reply,
2463 "u",
2464 (uint32_t) pid);
2465 if (r < 0) {
2466 /* Note that the machine might already have been
2467 * cleaned up automatically, hence don't consider it a
2468 * failure if we cannot get the machine object. */
2469 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2470 return 0;
2471 }
2472
2473 r = sd_bus_message_read(reply, "o", &path);
2474 if (r < 0)
2475 return bus_log_parse_error(r);
2476
2477 r = sd_bus_call_method(
2478 bus,
2479 "org.freedesktop.machine1",
2480 path,
2481 "org.freedesktop.machine1.Machine",
2482 "Terminate",
2483 &error,
2484 NULL,
2485 NULL);
2486 if (r < 0) {
2487 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2488 return 0;
2489 }
2490
2491 return 0;
2492 }
2493
2494 static int reset_audit_loginuid(void) {
2495 _cleanup_free_ char *p = NULL;
2496 int r;
2497
2498 if (arg_share_system)
2499 return 0;
2500
2501 r = read_one_line_file("/proc/self/loginuid", &p);
2502 if (r == -ENOENT)
2503 return 0;
2504 if (r < 0)
2505 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2506
2507 /* Already reset? */
2508 if (streq(p, "4294967295"))
2509 return 0;
2510
2511 r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_CREATE);
2512 if (r < 0) {
2513 log_error_errno(r,
2514 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2515 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2516 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2517 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2518 "using systemd-nspawn. Sleeping for 5s... (%m)");
2519
2520 sleep(5);
2521 }
2522
2523 return 0;
2524 }
2525
/* Fixed siphash24 keys handed to generate_mac(): one for the host side of
 * the veth link, one for its container side and one for macvlan links. */
#define HOST_HASH_KEY \
        SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY \
        SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY \
        SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2529
2530 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2531 uint8_t result[8];
2532 size_t l, sz;
2533 uint8_t *v, *i;
2534 int r;
2535
2536 l = strlen(arg_machine);
2537 sz = sizeof(sd_id128_t) + l;
2538 if (idx > 0)
2539 sz += sizeof(idx);
2540
2541 v = alloca(sz);
2542
2543 /* fetch some persistent data unique to the host */
2544 r = sd_id128_get_machine((sd_id128_t*) v);
2545 if (r < 0)
2546 return r;
2547
2548 /* combine with some data unique (on this host) to this
2549 * container instance */
2550 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2551 if (idx > 0) {
2552 idx = htole64(idx);
2553 memcpy(i, &idx, sizeof(idx));
2554 }
2555
2556 /* Let's hash the host machine ID plus the container name. We
2557 * use a fixed, but originally randomly created hash key here. */
2558 siphash24(result, v, sz, hash_key.bytes);
2559
2560 assert_cc(ETH_ALEN <= sizeof(result));
2561 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2562
2563 /* see eth_random_addr in the kernel */
2564 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2565 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2566
2567 return 0;
2568 }
2569
2570 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2571 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2572 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2573 struct ether_addr mac_host, mac_container;
2574 int r, i;
2575
2576 if (!arg_private_network)
2577 return 0;
2578
2579 if (!arg_network_veth)
2580 return 0;
2581
2582 /* Use two different interface name prefixes depending whether
2583 * we are in bridge mode or not. */
2584 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2585 arg_network_bridge ? "vb" : "ve", arg_machine);
2586
2587 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2588 if (r < 0)
2589 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2590
2591 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2592 if (r < 0)
2593 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2594
2595 r = sd_netlink_open(&rtnl);
2596 if (r < 0)
2597 return log_error_errno(r, "Failed to connect to netlink: %m");
2598
2599 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2600 if (r < 0)
2601 return log_error_errno(r, "Failed to allocate netlink message: %m");
2602
2603 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
2604 if (r < 0)
2605 return log_error_errno(r, "Failed to add netlink interface name: %m");
2606
2607 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2608 if (r < 0)
2609 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2610
2611 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2612 if (r < 0)
2613 return log_error_errno(r, "Failed to open netlink container: %m");
2614
2615 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2616 if (r < 0)
2617 return log_error_errno(r, "Failed to open netlink container: %m");
2618
2619 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
2620 if (r < 0)
2621 return log_error_errno(r, "Failed to open netlink container: %m");
2622
2623 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
2624 if (r < 0)
2625 return log_error_errno(r, "Failed to add netlink interface name: %m");
2626
2627 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2628 if (r < 0)
2629 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2630
2631 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2632 if (r < 0)
2633 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2634
2635 r = sd_netlink_message_close_container(m);
2636 if (r < 0)
2637 return log_error_errno(r, "Failed to close netlink container: %m");
2638
2639 r = sd_netlink_message_close_container(m);
2640 if (r < 0)
2641 return log_error_errno(r, "Failed to close netlink container: %m");
2642
2643 r = sd_netlink_message_close_container(m);
2644 if (r < 0)
2645 return log_error_errno(r, "Failed to close netlink container: %m");
2646
2647 r = sd_netlink_call(rtnl, m, 0, NULL);
2648 if (r < 0)
2649 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2650
2651 i = (int) if_nametoindex(iface_name);
2652 if (i <= 0)
2653 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2654
2655 *ifi = i;
2656
2657 return 0;
2658 }
2659
2660 static int setup_bridge(const char veth_name[], int *ifi) {
2661 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2662 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2663 int r, bridge;
2664
2665 if (!arg_private_network)
2666 return 0;
2667
2668 if (!arg_network_veth)
2669 return 0;
2670
2671 if (!arg_network_bridge)
2672 return 0;
2673
2674 bridge = (int) if_nametoindex(arg_network_bridge);
2675 if (bridge <= 0)
2676 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2677
2678 *ifi = bridge;
2679
2680 r = sd_netlink_open(&rtnl);
2681 if (r < 0)
2682 return log_error_errno(r, "Failed to connect to netlink: %m");
2683
2684 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2685 if (r < 0)
2686 return log_error_errno(r, "Failed to allocate netlink message: %m");
2687
2688 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2689 if (r < 0)
2690 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2691
2692 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
2693 if (r < 0)
2694 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2695
2696 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
2697 if (r < 0)
2698 return log_error_errno(r, "Failed to add netlink master field: %m");
2699
2700 r = sd_netlink_call(rtnl, m, 0, NULL);
2701 if (r < 0)
2702 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2703
2704 return 0;
2705 }
2706
2707 static int parse_interface(struct udev *udev, const char *name) {
2708 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2709 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2710 int ifi;
2711
2712 ifi = (int) if_nametoindex(name);
2713 if (ifi <= 0)
2714 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2715
2716 sprintf(ifi_str, "n%i", ifi);
2717 d = udev_device_new_from_device_id(udev, ifi_str);
2718 if (!d)
2719 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2720
2721 if (udev_device_get_is_initialized(d) <= 0) {
2722 log_error("Network interface %s is not initialized yet.", name);
2723 return -EBUSY;
2724 }
2725
2726 return ifi;
2727 }
2728
2729 static int move_network_interfaces(pid_t pid) {
2730 _cleanup_udev_unref_ struct udev *udev = NULL;
2731 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2732 char **i;
2733 int r;
2734
2735 if (!arg_private_network)
2736 return 0;
2737
2738 if (strv_isempty(arg_network_interfaces))
2739 return 0;
2740
2741 r = sd_netlink_open(&rtnl);
2742 if (r < 0)
2743 return log_error_errno(r, "Failed to connect to netlink: %m");
2744
2745 udev = udev_new();
2746 if (!udev) {
2747 log_error("Failed to connect to udev.");
2748 return -ENOMEM;
2749 }
2750
2751 STRV_FOREACH(i, arg_network_interfaces) {
2752 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2753 int ifi;
2754
2755 ifi = parse_interface(udev, *i);
2756 if (ifi < 0)
2757 return ifi;
2758
2759 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2760 if (r < 0)
2761 return log_error_errno(r, "Failed to allocate netlink message: %m");
2762
2763 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2764 if (r < 0)
2765 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2766
2767 r = sd_netlink_call(rtnl, m, 0, NULL);
2768 if (r < 0)
2769 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2770 }
2771
2772 return 0;
2773 }
2774
2775 static int setup_macvlan(pid_t pid) {
2776 _cleanup_udev_unref_ struct udev *udev = NULL;
2777 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2778 unsigned idx = 0;
2779 char **i;
2780 int r;
2781
2782 if (!arg_private_network)
2783 return 0;
2784
2785 if (strv_isempty(arg_network_macvlan))
2786 return 0;
2787
2788 r = sd_netlink_open(&rtnl);
2789 if (r < 0)
2790 return log_error_errno(r, "Failed to connect to netlink: %m");
2791
2792 udev = udev_new();
2793 if (!udev) {
2794 log_error("Failed to connect to udev.");
2795 return -ENOMEM;
2796 }
2797
2798 STRV_FOREACH(i, arg_network_macvlan) {
2799 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2800 _cleanup_free_ char *n = NULL;
2801 struct ether_addr mac;
2802 int ifi;
2803
2804 ifi = parse_interface(udev, *i);
2805 if (ifi < 0)
2806 return ifi;
2807
2808 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2809 if (r < 0)
2810 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2811
2812 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2813 if (r < 0)
2814 return log_error_errno(r, "Failed to allocate netlink message: %m");
2815
2816 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2817 if (r < 0)
2818 return log_error_errno(r, "Failed to add netlink interface index: %m");
2819
2820 n = strappend("mv-", *i);
2821 if (!n)
2822 return log_oom();
2823
2824 strshorten(n, IFNAMSIZ-1);
2825
2826 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2827 if (r < 0)
2828 return log_error_errno(r, "Failed to add netlink interface name: %m");
2829
2830 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2831 if (r < 0)
2832 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2833
2834 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2835 if (r < 0)
2836 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2837
2838 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2839 if (r < 0)
2840 return log_error_errno(r, "Failed to open netlink container: %m");
2841
2842 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2843 if (r < 0)
2844 return log_error_errno(r, "Failed to open netlink container: %m");
2845
2846 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2847 if (r < 0)
2848 return log_error_errno(r, "Failed to append macvlan mode: %m");
2849
2850 r = sd_netlink_message_close_container(m);
2851 if (r < 0)
2852 return log_error_errno(r, "Failed to close netlink container: %m");
2853
2854 r = sd_netlink_message_close_container(m);
2855 if (r < 0)
2856 return log_error_errno(r, "Failed to close netlink container: %m");
2857
2858 r = sd_netlink_call(rtnl, m, 0, NULL);
2859 if (r < 0)
2860 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2861 }
2862
2863 return 0;
2864 }
2865
2866 static int setup_ipvlan(pid_t pid) {
2867 _cleanup_udev_unref_ struct udev *udev = NULL;
2868 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2869 char **i;
2870 int r;
2871
2872 if (!arg_private_network)
2873 return 0;
2874
2875 if (strv_isempty(arg_network_ipvlan))
2876 return 0;
2877
2878 r = sd_netlink_open(&rtnl);
2879 if (r < 0)
2880 return log_error_errno(r, "Failed to connect to netlink: %m");
2881
2882 udev = udev_new();
2883 if (!udev) {
2884 log_error("Failed to connect to udev.");
2885 return -ENOMEM;
2886 }
2887
2888 STRV_FOREACH(i, arg_network_ipvlan) {
2889 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2890 _cleanup_free_ char *n = NULL;
2891 int ifi;
2892
2893 ifi = parse_interface(udev, *i);
2894 if (ifi < 0)
2895 return ifi;
2896
2897 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2898 if (r < 0)
2899 return log_error_errno(r, "Failed to allocate netlink message: %m");
2900
2901 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2902 if (r < 0)
2903 return log_error_errno(r, "Failed to add netlink interface index: %m");
2904
2905 n = strappend("iv-", *i);
2906 if (!n)
2907 return log_oom();
2908
2909 strshorten(n, IFNAMSIZ-1);
2910
2911 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2912 if (r < 0)
2913 return log_error_errno(r, "Failed to add netlink interface name: %m");
2914
2915 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2916 if (r < 0)
2917 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2918
2919 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2920 if (r < 0)
2921 return log_error_errno(r, "Failed to open netlink container: %m");
2922
2923 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2924 if (r < 0)
2925 return log_error_errno(r, "Failed to open netlink container: %m");
2926
2927 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2928 if (r < 0)
2929 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2930
2931 r = sd_netlink_message_close_container(m);
2932 if (r < 0)
2933 return log_error_errno(r, "Failed to close netlink container: %m");
2934
2935 r = sd_netlink_message_close_container(m);
2936 if (r < 0)
2937 return log_error_errno(r, "Failed to close netlink container: %m");
2938
2939 r = sd_netlink_call(rtnl, m, 0, NULL);
2940 if (r < 0)
2941 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2942 }
2943
2944 return 0;
2945 }
2946
/* Installs a seccomp filter that allows everything by default, except:
 *
 *  - the system calls in the table below fail with EPERM, unless the
 *    capability listed next to them is part of arg_retain;
 *  - socket(AF_NETLINK, *, NETLINK_AUDIT) fails with EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op. Returns 0 on success, also if loading
 * the filter fails with -EINVAL, and a negative error code otherwise. */
static int setup_seccomp(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        static const struct {
                uint64_t capability;
                int syscall_num;
        } cap_gated_syscalls[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(cap_gated_syscalls); n++) {

                /* A retained capability keeps its system calls available */
                if (arg_retain & (1ULL << cap_gated_syscalls[n].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), cap_gated_syscalls[n].syscall_num, 0);

                /* -EFAULT indicates an unknown syscall, which is simply skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace audit
         * hookup fails when running inside of one. We don't care and just
         * turn off creation of audit sockets: the rule below makes
         * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT,
         * which audit userspace takes as indication that audit is disabled
         * in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#endif

}
3041
3042 static int setup_propagate(const char *root) {
3043 const char *p, *q;
3044
3045 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3046 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3047 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3048 (void) mkdir_p(p, 0600);
3049
3050 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3051 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3052
3053 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3054 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3055
3056 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3057 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3058
3059 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3060 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3061 return log_error_errno(errno, "Failed to install propagation bind mount.");
3062
3063 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3064 return log_error_errno(errno, "Failed to make propagation mount read-only");
3065
3066 return 0;
3067 }
3068
3069 static int setup_image(char **device_path, int *loop_nr) {
3070 struct loop_info64 info = {
3071 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3072 };
3073 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3074 _cleanup_free_ char* loopdev = NULL;
3075 struct stat st;
3076 int r, nr;
3077
3078 assert(device_path);
3079 assert(loop_nr);
3080 assert(arg_image);
3081
3082 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3083 if (fd < 0)
3084 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3085
3086 if (fstat(fd, &st) < 0)
3087 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3088
3089 if (S_ISBLK(st.st_mode)) {
3090 char *p;
3091
3092 p = strdup(arg_image);
3093 if (!p)
3094 return log_oom();
3095
3096 *device_path = p;
3097
3098 *loop_nr = -1;
3099
3100 r = fd;
3101 fd = -1;
3102
3103 return r;
3104 }
3105
3106 if (!S_ISREG(st.st_mode)) {
3107 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3108 return -EINVAL;
3109 }
3110
3111 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3112 if (control < 0)
3113 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3114
3115 nr = ioctl(control, LOOP_CTL_GET_FREE);
3116 if (nr < 0)
3117 return log_error_errno(errno, "Failed to allocate loop device: %m");
3118
3119 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3120 return log_oom();
3121
3122 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3123 if (loop < 0)
3124 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3125
3126 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3127 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3128
3129 if (arg_read_only)
3130 info.lo_flags |= LO_FLAGS_READ_ONLY;
3131
3132 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3133 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3134
3135 *device_path = loopdev;
3136 loopdev = NULL;
3137
3138 *loop_nr = nr;
3139
3140 r = loop;
3141 loop = -1;
3142
3143 return r;
3144 }
3145
/* Explanation appended to error messages about disk images without a usable
 * partition table. Lines are wrapped with explicit newlines; the text itself
 * does not end in one. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"   \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                     \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"        \
        "to be bootable with systemd-nspawn."
3152
3153 static int dissect_image(
3154 int fd,
3155 char **root_device, bool *root_device_rw,
3156 char **home_device, bool *home_device_rw,
3157 char **srv_device, bool *srv_device_rw,
3158 bool *secondary) {
3159
3160 #ifdef HAVE_BLKID
3161 int home_nr = -1, srv_nr = -1;
3162 #ifdef GPT_ROOT_NATIVE
3163 int root_nr = -1;
3164 #endif
3165 #ifdef GPT_ROOT_SECONDARY
3166 int secondary_root_nr = -1;
3167 #endif
3168 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
3169 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
3170 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
3171 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3172 _cleanup_udev_unref_ struct udev *udev = NULL;
3173 struct udev_list_entry *first, *item;
3174 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
3175 bool is_gpt, is_mbr, multiple_generic = false;
3176 const char *pttype = NULL;
3177 blkid_partlist pl;
3178 struct stat st;
3179 unsigned i;
3180 int r;
3181
3182 assert(fd >= 0);
3183 assert(root_device);
3184 assert(home_device);
3185 assert(srv_device);
3186 assert(secondary);
3187 assert(arg_image);
3188
3189 b = blkid_new_probe();
3190 if (!b)
3191 return log_oom();
3192
3193 errno = 0;
3194 r = blkid_probe_set_device(b, fd, 0, 0);
3195 if (r != 0) {
3196 if (errno == 0)
3197 return log_oom();
3198
3199 log_error_errno(errno, "Failed to set device on blkid probe: %m");
3200 return -errno;
3201 }
3202
3203 blkid_probe_enable_partitions(b, 1);
3204 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
3205
3206 errno = 0;
3207 r = blkid_do_safeprobe(b);
3208 if (r == -2 || r == 1) {
3209 log_error("Failed to identify any partition table on\n"
3210 " %s\n"
3211 PARTITION_TABLE_BLURB, arg_image);
3212 return -EINVAL;
3213 } else if (r != 0) {
3214 if (errno == 0)
3215 errno = EIO;
3216 log_error_errno(errno, "Failed to probe: %m");
3217 return -errno;
3218 }
3219
3220 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
3221
3222 is_gpt = streq_ptr(pttype, "gpt");
3223 is_mbr = streq_ptr(pttype, "dos");
3224
3225 if (!is_gpt && !is_mbr) {
3226 log_error("No GPT or MBR partition table discovered on\n"
3227 " %s\n"
3228 PARTITION_TABLE_BLURB, arg_image);
3229 return -EINVAL;
3230 }
3231
3232 errno = 0;
3233 pl = blkid_probe_get_partitions(b);
3234 if (!pl) {
3235 if (errno == 0)
3236 return log_oom();
3237
3238 log_error("Failed to list partitions of %s", arg_image);
3239 return -errno;
3240 }
3241
3242 udev = udev_new();
3243 if (!udev)
3244 return log_oom();
3245
3246 if (fstat(fd, &st) < 0)
3247 return log_error_errno(errno, "Failed to stat block device: %m");
3248
3249 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
3250 if (!d)
3251 return log_oom();
3252
3253 for (i = 0;; i++) {
3254 int n, m;
3255
3256 if (i >= 10) {
3257 log_error("Kernel partitions never appeared.");
3258 return -ENXIO;
3259 }
3260
3261 e = udev_enumerate_new(udev);
3262 if (!e)
3263 return log_oom();
3264
3265 r = udev_enumerate_add_match_parent(e, d);
3266 if (r < 0)
3267 return log_oom();
3268
3269 r = udev_enumerate_scan_devices(e);
3270 if (r < 0)
3271 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
3272
3273 /* Count the partitions enumerated by the kernel */
3274 n = 0;
3275 first = udev_enumerate_get_list_entry(e);
3276 udev_list_entry_foreach(item, first)
3277 n++;
3278
3279 /* Count the partitions enumerated by blkid */
3280 m = blkid_partlist_numof_partitions(pl);
3281 if (n == m + 1)
3282 break;
3283 if (n > m + 1) {
3284 log_error("blkid and kernel partition list do not match.");
3285 return -EIO;
3286 }
3287 if (n < m + 1) {
3288 unsigned j;
3289
3290 /* The kernel has probed fewer partitions than
3291 * blkid? Maybe the kernel prober is still
3292 * running or it got EBUSY because udev
3293 * already opened the device. Let's reprobe
3294 * the device, which is a synchronous call
3295 * that waits until probing is complete. */
3296
3297 for (j = 0; j < 20; j++) {
3298
3299 r = ioctl(fd, BLKRRPART, 0);
3300 if (r < 0)
3301 r = -errno;
3302 if (r >= 0 || r != -EBUSY)
3303 break;
3304
3305 /* If something else has the device
3306 * open, such as an udev rule, the
3307 * ioctl will return EBUSY. Since
3308 * there's no way to wait until it
3309 * isn't busy anymore, let's just wait
3310 * a bit, and try again.
3311 *
3312 * This is really something they
3313 * should fix in the kernel! */
3314
3315 usleep(50 * USEC_PER_MSEC);
3316 }
3317
3318 if (r < 0)
3319 return log_error_errno(r, "Failed to reread partition table: %m");
3320 }
3321
3322 e = udev_enumerate_unref(e);
3323 }
3324
3325 first = udev_enumerate_get_list_entry(e);
3326 udev_list_entry_foreach(item, first) {
3327 _cleanup_udev_device_unref_ struct udev_device *q;
3328 const char *node;
3329 unsigned long long flags;
3330 blkid_partition pp;
3331 dev_t qn;
3332 int nr;
3333
3334 errno = 0;
3335 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
3336 if (!q) {
3337 if (!errno)
3338 errno = ENOMEM;
3339
3340 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
3341 return -errno;
3342 }
3343
3344 qn = udev_device_get_devnum(q);
3345 if (major(qn) == 0)
3346 continue;
3347
3348 if (st.st_rdev == qn)
3349 continue;
3350
3351 node = udev_device_get_devnode(q);
3352 if (!node)
3353 continue;
3354
3355 pp = blkid_partlist_devno_to_partition(pl, qn);
3356 if (!pp)
3357 continue;
3358
3359 flags = blkid_partition_get_flags(pp);
3360
3361 nr = blkid_partition_get_partno(pp);
3362 if (nr < 0)
3363 continue;
3364
3365 if (is_gpt) {
3366 sd_id128_t type_id;
3367 const char *stype;
3368
3369 if (flags & GPT_FLAG_NO_AUTO)
3370 continue;
3371
3372 stype = blkid_partition_get_type_string(pp);
3373 if (!stype)
3374 continue;
3375
3376 if (sd_id128_from_string(stype, &type_id) < 0)
3377 continue;
3378
3379 if (sd_id128_equal(type_id, GPT_HOME)) {
3380
3381 if (home && nr >= home_nr)
3382 continue;
3383
3384 home_nr = nr;
3385 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3386
3387 r = free_and_strdup(&home, node);
3388 if (r < 0)
3389 return log_oom();
3390
3391 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3392
3393 if (srv && nr >= srv_nr)
3394 continue;
3395
3396 srv_nr = nr;
3397 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3398
3399 r = free_and_strdup(&srv, node);
3400 if (r < 0)
3401 return log_oom();
3402 }
3403 #ifdef GPT_ROOT_NATIVE
3404 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3405
3406 if (root && nr >= root_nr)
3407 continue;
3408
3409 root_nr = nr;
3410 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3411
3412 r = free_and_strdup(&root, node);
3413 if (r < 0)
3414 return log_oom();
3415 }
3416 #endif
3417 #ifdef GPT_ROOT_SECONDARY
3418 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3419
3420 if (secondary_root && nr >= secondary_root_nr)
3421 continue;
3422
3423 secondary_root_nr = nr;
3424 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3425
3426 r = free_and_strdup(&secondary_root, node);
3427 if (r < 0)
3428 return log_oom();
3429 }
3430 #endif
3431 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3432
3433 if (generic)
3434 multiple_generic = true;
3435 else {
3436 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3437
3438 r = free_and_strdup(&generic, node);
3439 if (r < 0)
3440 return log_oom();
3441 }
3442 }
3443
3444 } else if (is_mbr) {
3445 int type;
3446
3447 if (flags != 0x80) /* Bootable flag */
3448 continue;
3449
3450 type = blkid_partition_get_type(pp);
3451 if (type != 0x83) /* Linux partition */
3452 continue;
3453
3454 if (generic)
3455 multiple_generic = true;
3456 else {
3457 generic_rw = true;
3458
3459 r = free_and_strdup(&root, node);
3460 if (r < 0)
3461 return log_oom();
3462 }
3463 }
3464 }
3465
3466 if (root) {
3467 *root_device = root;
3468 root = NULL;
3469
3470 *root_device_rw = root_rw;
3471 *secondary = false;
3472 } else if (secondary_root) {
3473 *root_device = secondary_root;
3474 secondary_root = NULL;
3475
3476 *root_device_rw = secondary_root_rw;
3477 *secondary = true;
3478 } else if (generic) {
3479
3480 /* There were no partitions with precise meanings
3481 * around, but we found generic partitions. In this
3482 * case, if there's only one, we can go ahead and boot
3483 * it, otherwise we bail out, because we really cannot
3484 * make any sense of it. */
3485
3486 if (multiple_generic) {
3487 log_error("Identified multiple bootable Linux partitions on\n"
3488 " %s\n"
3489 PARTITION_TABLE_BLURB, arg_image);
3490 return -EINVAL;
3491 }
3492
3493 *root_device = generic;
3494 generic = NULL;
3495
3496 *root_device_rw = generic_rw;
3497 *secondary = false;
3498 } else {
3499 log_error("Failed to identify root partition in disk image\n"
3500 " %s\n"
3501 PARTITION_TABLE_BLURB, arg_image);
3502 return -EINVAL;
3503 }
3504
3505 if (home) {
3506 *home_device = home;
3507 home = NULL;
3508
3509 *home_device_rw = home_rw;
3510 }
3511
3512 if (srv) {
3513 *srv_device = srv;
3514 srv = NULL;
3515
3516 *srv_device_rw = srv_rw;
3517 }
3518
3519 return 0;
3520 #else
3521 log_error("--image= is not supported, compiled without blkid support.");
3522 return -EOPNOTSUPP;
3523 #endif
3524 }
3525
/* Probes the file system type of the block device 'what' with libblkid and
 * mounts it at 'where', or at 'where' + 'directory' if a sub-directory is
 * given. The mount is read-only unless 'rw' is true and arg_read_only is
 * unset. LUKS containers are rejected.
 *
 * Returns 0 on success and a negative errno-style value on failure. Without
 * blkid support compiled in, always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* The global read-only switch overrides the per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1 means nothing was detected, -2 means the result was
                 * ambivalent; in both cases there is no usable type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Any other non-zero value (notably -1) is a probing error */
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3589
3590 static int mount_devices(
3591 const char *where,
3592 const char *root_device, bool root_device_rw,
3593 const char *home_device, bool home_device_rw,
3594 const char *srv_device, bool srv_device_rw) {
3595 int r;
3596
3597 assert(where);
3598
3599 if (root_device) {
3600 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3601 if (r < 0)
3602 return log_error_errno(r, "Failed to mount root directory: %m");
3603 }
3604
3605 if (home_device) {
3606 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3607 if (r < 0)
3608 return log_error_errno(r, "Failed to mount home directory: %m");
3609 }
3610
3611 if (srv_device) {
3612 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3613 if (r < 0)
3614 return log_error_errno(r, "Failed to mount server data directory: %m");
3615 }
3616
3617 return 0;
3618 }
3619
3620 static void loop_remove(int nr, int *image_fd) {
3621 _cleanup_close_ int control = -1;
3622 int r;
3623
3624 if (nr < 0)
3625 return;
3626
3627 if (image_fd && *image_fd >= 0) {
3628 r = ioctl(*image_fd, LOOP_CLR_FD);
3629 if (r < 0)
3630 log_debug_errno(errno, "Failed to close loop image: %m");
3631 *image_fd = safe_close(*image_fd);
3632 }
3633
3634 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3635 if (control < 0) {
3636 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3637 return;
3638 }
3639
3640 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3641 if (r < 0)
3642 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3643 }
3644
3645 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3646 int pipe_fds[2];
3647 pid_t pid;
3648
3649 assert(database);
3650 assert(key);
3651 assert(rpid);
3652
3653 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3654 return log_error_errno(errno, "Failed to allocate pipe: %m");
3655
3656 pid = fork();
3657 if (pid < 0)
3658 return log_error_errno(errno, "Failed to fork getent child: %m");
3659 else if (pid == 0) {
3660 int nullfd;
3661 char *empty_env = NULL;
3662
3663 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3664 _exit(EXIT_FAILURE);
3665
3666 if (pipe_fds[0] > 2)
3667 safe_close(pipe_fds[0]);
3668 if (pipe_fds[1] > 2)
3669 safe_close(pipe_fds[1]);
3670
3671 nullfd = open("/dev/null", O_RDWR);
3672 if (nullfd < 0)
3673 _exit(EXIT_FAILURE);
3674
3675 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3676 _exit(EXIT_FAILURE);
3677
3678 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3679 _exit(EXIT_FAILURE);
3680
3681 if (nullfd > 2)
3682 safe_close(nullfd);
3683
3684 (void) reset_all_signal_handlers();
3685 (void) reset_signal_mask();
3686 close_all_fds(NULL, 0);
3687
3688 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3689 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3690 _exit(EXIT_FAILURE);
3691 }
3692
3693 pipe_fds[1] = safe_close(pipe_fds[1]);
3694
3695 *rpid = pid;
3696
3697 return pipe_fds[0];
3698 }
3699
3700 static int change_uid_gid(char **_home) {
3701 char line[LINE_MAX], *x, *u, *g, *h;
3702 const char *word, *state;
3703 _cleanup_free_ uid_t *uids = NULL;
3704 _cleanup_free_ char *home = NULL;
3705 _cleanup_fclose_ FILE *f = NULL;
3706 _cleanup_close_ int fd = -1;
3707 unsigned n_uids = 0;
3708 size_t sz = 0, l;
3709 uid_t uid;
3710 gid_t gid;
3711 pid_t pid;
3712 int r;
3713
3714 assert(_home);
3715
3716 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3717 /* Reset everything fully to 0, just in case */
3718
3719 r = reset_uid_gid();
3720 if (r < 0)
3721 return log_error_errno(r, "Failed to become root: %m");
3722
3723 *_home = NULL;
3724 return 0;
3725 }
3726
3727 /* First, get user credentials */
3728 fd = spawn_getent("passwd", arg_user, &pid);
3729 if (fd < 0)
3730 return fd;
3731
3732 f = fdopen(fd, "r");
3733 if (!f)
3734 return log_oom();
3735 fd = -1;
3736
3737 if (!fgets(line, sizeof(line), f)) {
3738
3739 if (!ferror(f)) {
3740 log_error("Failed to resolve user %s.", arg_user);
3741 return -ESRCH;
3742 }
3743
3744 log_error_errno(errno, "Failed to read from getent: %m");
3745 return -errno;
3746 }
3747
3748 truncate_nl(line);
3749
3750 wait_for_terminate_and_warn("getent passwd", pid, true);
3751
3752 x = strchr(line, ':');
3753 if (!x) {
3754 log_error("/etc/passwd entry has invalid user field.");
3755 return -EIO;
3756 }
3757
3758 u = strchr(x+1, ':');
3759 if (!u) {
3760 log_error("/etc/passwd entry has invalid password field.");
3761 return -EIO;
3762 }
3763
3764 u++;
3765 g = strchr(u, ':');
3766 if (!g) {
3767 log_error("/etc/passwd entry has invalid UID field.");
3768 return -EIO;
3769 }
3770
3771 *g = 0;
3772 g++;
3773 x = strchr(g, ':');
3774 if (!x) {
3775 log_error("/etc/passwd entry has invalid GID field.");
3776 return -EIO;
3777 }
3778
3779 *x = 0;
3780 h = strchr(x+1, ':');
3781 if (!h) {
3782 log_error("/etc/passwd entry has invalid GECOS field.");
3783 return -EIO;
3784 }
3785
3786 h++;
3787 x = strchr(h, ':');
3788 if (!x) {
3789 log_error("/etc/passwd entry has invalid home directory field.");
3790 return -EIO;
3791 }
3792
3793 *x = 0;
3794
3795 r = parse_uid(u, &uid);
3796 if (r < 0) {
3797 log_error("Failed to parse UID of user.");
3798 return -EIO;
3799 }
3800
3801 r = parse_gid(g, &gid);
3802 if (r < 0) {
3803 log_error("Failed to parse GID of user.");
3804 return -EIO;
3805 }
3806
3807 home = strdup(h);
3808 if (!home)
3809 return log_oom();
3810
3811 /* Second, get group memberships */
3812 fd = spawn_getent("initgroups", arg_user, &pid);
3813 if (fd < 0)
3814 return fd;
3815
3816 fclose(f);
3817 f = fdopen(fd, "r");
3818 if (!f)
3819 return log_oom();
3820 fd = -1;
3821
3822 if (!fgets(line, sizeof(line), f)) {
3823 if (!ferror(f)) {
3824 log_error("Failed to resolve user %s.", arg_user);
3825 return -ESRCH;
3826 }
3827
3828 log_error_errno(errno, "Failed to read from getent: %m");
3829 return -errno;
3830 }
3831
3832 truncate_nl(line);
3833
3834 wait_for_terminate_and_warn("getent initgroups", pid, true);
3835
3836 /* Skip over the username and subsequent separator whitespace */
3837 x = line;
3838 x += strcspn(x, WHITESPACE);
3839 x += strspn(x, WHITESPACE);
3840
3841 FOREACH_WORD(word, l, x, state) {
3842 char c[l+1];
3843
3844 memcpy(c, word, l);
3845 c[l] = 0;
3846
3847 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3848 return log_oom();
3849
3850 r = parse_uid(c, &uids[n_uids++]);
3851 if (r < 0) {
3852 log_error("Failed to parse group data from getent.");
3853 return -EIO;
3854 }
3855 }
3856
3857 r = mkdir_parents(home, 0775);
3858 if (r < 0)
3859 return log_error_errno(r, "Failed to make home root directory: %m");
3860
3861 r = mkdir_safe(home, 0755, uid, gid);
3862 if (r < 0 && r != -EEXIST)
3863 return log_error_errno(r, "Failed to make home directory: %m");
3864
3865 (void) fchown(STDIN_FILENO, uid, gid);
3866 (void) fchown(STDOUT_FILENO, uid, gid);
3867 (void) fchown(STDERR_FILENO, uid, gid);
3868
3869 if (setgroups(n_uids, uids) < 0)
3870 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3871
3872 if (setresgid(gid, gid, gid) < 0)
3873 return log_error_errno(errno, "setregid() failed: %m");
3874
3875 if (setresuid(uid, uid, uid) < 0)
3876 return log_error_errno(errno, "setreuid() failed: %m");
3877
3878 if (_home) {
3879 *_home = home;
3880 home = NULL;
3881 }
3882
3883 return 0;
3884 }
3885
3886 /*
3887 * Return values:
3888 * < 0 : wait_for_terminate() failed to get the state of the
3889 * container, the container was terminated by a signal, or
3890 * failed for an unknown reason. No change is made to the
3891 * container argument.
3892 * > 0 : The program executed in the container terminated with an
3893 * error. The exit code of the program executed in the
3894 * container is returned. The container argument has been set
3895 * to CONTAINER_TERMINATED.
3896 * 0 : The container is being rebooted, has been shut down or exited
3897 * successfully. The container argument has been set to either
3898 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3899 *
3900 * That is, success is indicated by a return value of zero, and an
3901 * error is indicated by a non-zero value.
3902 */
3903 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3904 siginfo_t status;
3905 int r;
3906
3907 r = wait_for_terminate(pid, &status);
3908 if (r < 0)
3909 return log_warning_errno(r, "Failed to wait for container: %m");
3910
3911 switch (status.si_code) {
3912
3913 case CLD_EXITED:
3914 if (status.si_status == 0) {
3915 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3916
3917 } else
3918 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3919
3920 *container = CONTAINER_TERMINATED;
3921 return status.si_status;
3922
3923 case CLD_KILLED:
3924 if (status.si_status == SIGINT) {
3925
3926 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3927 *container = CONTAINER_TERMINATED;
3928 return 0;
3929
3930 } else if (status.si_status == SIGHUP) {
3931
3932 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3933 *container = CONTAINER_REBOOTED;
3934 return 0;
3935 }
3936
3937 /* CLD_KILLED fallthrough */
3938
3939 case CLD_DUMPED:
3940 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3941 return -EIO;
3942
3943 default:
3944 log_error("Container %s failed due to unknown reason.", arg_machine);
3945 return -EIO;
3946 }
3947
3948 return r;
3949 }
3950
/* Signal handler that deliberately does nothing with the signal it receives.
 * NOTE(review): presumably installed so a signal is delivered (rather than
 * ignored) without any action being taken; confirm at the installation site. */
static void nop_handler(int sig) {
        /* intentionally empty */
}
3952
3953 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3954 pid_t pid;
3955
3956 pid = PTR_TO_UINT32(userdata);
3957 if (pid > 0) {
3958 if (kill(pid, arg_kill_signal) >= 0) {
3959 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3960 sd_event_source_set_userdata(s, NULL);
3961 return 0;
3962 }
3963 }
3964
3965 sd_event_exit(sd_event_source_get_event(s), 0);
3966 return 0;
3967 }
3968
3969 static int determine_names(void) {
3970 int r;
3971
3972 if (!arg_image && !arg_directory) {
3973 if (arg_machine) {
3974 _cleanup_(image_unrefp) Image *i = NULL;
3975
3976 r = image_find(arg_machine, &i);
3977 if (r < 0)
3978 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3979 else if (r == 0) {
3980 log_error("No image for machine '%s': %m", arg_machine);
3981 return -ENOENT;
3982 }
3983
3984 if (i->type == IMAGE_RAW)
3985 r = set_sanitized_path(&arg_image, i->path);
3986 else
3987 r = set_sanitized_path(&arg_directory, i->path);
3988 if (r < 0)
3989 return log_error_errno(r, "Invalid image directory: %m");
3990
3991 if (!arg_ephemeral)
3992 arg_read_only = arg_read_only || i->read_only;
3993 } else
3994 arg_directory = get_current_dir_name();
3995
3996 if (!arg_directory && !arg_machine) {
3997 log_error("Failed to determine path, please use -D or -i.");
3998 return -EINVAL;
3999 }
4000 }
4001
4002 if (!arg_machine) {
4003 if (arg_directory && path_equal(arg_directory, "/"))
4004 arg_machine = gethostname_malloc();
4005 else
4006 arg_machine = strdup(basename(arg_image ?: arg_directory));
4007
4008 if (!arg_machine)
4009 return log_oom();
4010
4011 hostname_cleanup(arg_machine, false);
4012 if (!machine_name_is_valid(arg_machine)) {
4013 log_error("Failed to determine machine name automatically, please use -M.");
4014 return -EINVAL;
4015 }
4016
4017 if (arg_ephemeral) {
4018 char *b;
4019
4020 /* Add a random suffix when this is an
4021 * ephemeral machine, so that we can run many
4022 * instances at once without manually having
4023 * to specify -M each time. */
4024
4025 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4026 return log_oom();
4027
4028 free(arg_machine);
4029 arg_machine = b;
4030 }
4031 }
4032
4033 return 0;
4034 }
4035
4036 static int determine_uid_shift(const char *directory) {
4037 int r;
4038
4039 if (!arg_userns) {
4040 arg_uid_shift = 0;
4041 return 0;
4042 }
4043
4044 if (arg_uid_shift == UID_INVALID) {
4045 struct stat st;
4046
4047 r = stat(directory, &st);
4048 if (r < 0)
4049 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4050
4051 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4052
4053 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4054 log_error("UID and GID base of %s don't match.", directory);
4055 return -EINVAL;
4056 }
4057
4058 arg_uid_range = UINT32_C(0x10000);
4059 }
4060
4061 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4062 log_error("UID base too high for UID range.");
4063 return -EINVAL;
4064 }
4065
4066 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4067 return 0;
4068 }
4069
4070 static int inner_child(
4071 Barrier *barrier,
4072 const char *directory,
4073 bool secondary,
4074 int kmsg_socket,
4075 int rtnl_socket,
4076 FDSet *fds,
4077 int argc,
4078 char *argv[]) {
4079
4080 _cleanup_free_ char *home = NULL;
4081 unsigned n_env = 2;
4082 const char *envp[] = {
4083 "PATH=" DEFAULT_PATH_SPLIT_USR,
4084 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4085 NULL, /* TERM */
4086 NULL, /* HOME */
4087 NULL, /* USER */
4088 NULL, /* LOGNAME */
4089 NULL, /* container_uuid */
4090 NULL, /* LISTEN_FDS */
4091 NULL, /* LISTEN_PID */
4092 NULL
4093 };
4094
4095 _cleanup_strv_free_ char **env_use = NULL;
4096 int r;
4097
4098 assert(barrier);
4099 assert(directory);
4100 assert(kmsg_socket >= 0);
4101
4102 if (arg_userns) {
4103 /* Tell the parent, that it now can write the UID map. */
4104 (void) barrier_place(barrier); /* #1 */
4105
4106 /* Wait until the parent wrote the UID map */
4107 if (!barrier_place_and_sync(barrier)) { /* #2 */
4108 log_error("Parent died too early");
4109 return -ESRCH;
4110 }
4111 }
4112
4113 r = mount_all(NULL, true);
4114 if (r < 0)
4115 return r;
4116
4117 /* Wait until we are cgroup-ified, so that we
4118 * can mount the right cgroup path writable */
4119 if (!barrier_place_and_sync(barrier)) { /* #3 */
4120 log_error("Parent died too early");
4121 return -ESRCH;
4122 }
4123
4124 r = mount_systemd_cgroup_writable("");
4125 if (r < 0)
4126 return r;
4127
4128 r = reset_uid_gid();
4129 if (r < 0)
4130 return log_error_errno(r, "Couldn't become new root: %m");
4131
4132 r = setup_boot_id(NULL);
4133 if (r < 0)
4134 return r;
4135
4136 r = setup_kmsg(NULL, kmsg_socket);
4137 if (r < 0)
4138 return r;
4139 kmsg_socket = safe_close(kmsg_socket);
4140
4141 umask(0022);
4142
4143 if (setsid() < 0)
4144 return log_error_errno(errno, "setsid() failed: %m");
4145
4146 if (arg_private_network)
4147 loopback_setup();
4148
4149 r = send_rtnl(rtnl_socket);
4150 if (r < 0)
4151 return r;
4152 rtnl_socket = safe_close(rtnl_socket);
4153
4154 if (drop_capabilities() < 0)
4155 return log_error_errno(errno, "drop_capabilities() failed: %m");
4156
4157 setup_hostname();
4158
4159 if (arg_personality != PERSONALITY_INVALID) {
4160 if (personality(arg_personality) < 0)
4161 return log_error_errno(errno, "personality() failed: %m");
4162 } else if (secondary) {
4163 if (personality(PER_LINUX32) < 0)
4164 return log_error_errno(errno, "personality() failed: %m");
4165 }
4166
4167 #ifdef HAVE_SELINUX
4168 if (arg_selinux_context)
4169 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4170 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4171 #endif
4172
4173 r = change_uid_gid(&home);
4174 if (r < 0)
4175 return r;
4176
4177 envp[n_env] = strv_find_prefix(environ, "TERM=");
4178 if (envp[n_env])
4179 n_env ++;
4180
4181 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4182 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4183 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4184 return log_oom();
4185
4186 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4187 char as_uuid[37];
4188
4189 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4190 return log_oom();
4191 }
4192
4193 if (fdset_size(fds) > 0) {
4194 r = fdset_cloexec(fds, false);
4195 if (r < 0)
4196 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4197
4198 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4199 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4200 return log_oom();
4201 }
4202
4203 env_use = strv_env_merge(2, envp, arg_setenv);
4204 if (!env_use)
4205 return log_oom();
4206
4207 /* Let the parent know that we are ready and
4208 * wait until the parent is ready with the
4209 * setup, too... */
4210 if (!barrier_place_and_sync(barrier)) { /* #4 */
4211 log_error("Parent died too early");
4212 return -ESRCH;
4213 }
4214
4215 /* Now, explicitly close the log, so that we
4216 * then can close all remaining fds. Closing
4217 * the log explicitly first has the benefit
4218 * that the logging subsystem knows about it,
4219 * and is thus ready to be reopened should we
4220 * need it again. Note that the other fds
4221 * closed here are at least the locking and
4222 * barrier fds. */
4223 log_close();
4224 (void) fdset_close_others(fds);
4225
4226 if (arg_boot) {
4227 char **a;
4228 size_t m;
4229
4230 /* Automatically search for the init system */
4231
4232 m = 1 + argc - optind;
4233 a = newa(char*, m + 1);
4234 memcpy(a + 1, argv + optind, m * sizeof(char*));
4235
4236 a[0] = (char*) "/usr/lib/systemd/systemd";
4237 execve(a[0], a, env_use);
4238
4239 a[0] = (char*) "/lib/systemd/systemd";
4240 execve(a[0], a, env_use);
4241
4242 a[0] = (char*) "/sbin/init";
4243 execve(a[0], a, env_use);
4244 } else if (argc > optind)
4245 execvpe(argv[optind], argv + optind, env_use);
4246 else {
4247 chdir(home ? home : "/root");
4248 execle("/bin/bash", "-bash", NULL, env_use);
4249 execle("/bin/sh", "-sh", NULL, env_use);
4250 }
4251
4252 (void) log_open();
4253 return log_error_errno(errno, "execv() failed: %m");
4254 }
4255
4256 static int outer_child(
4257 Barrier *barrier,
4258 const char *directory,
4259 const char *console,
4260 const char *root_device, bool root_device_rw,
4261 const char *home_device, bool home_device_rw,
4262 const char *srv_device, bool srv_device_rw,
4263 bool interactive,
4264 bool secondary,
4265 int pid_socket,
4266 int kmsg_socket,
4267 int rtnl_socket,
4268 int uid_shift_socket,
4269 FDSet *fds,
4270 int argc,
4271 char *argv[]) {
4272
4273 pid_t pid;
4274 ssize_t l;
4275 int r;
4276
4277 assert(barrier);
4278 assert(directory);
4279 assert(console);
4280 assert(pid_socket >= 0);
4281 assert(kmsg_socket >= 0);
4282
4283 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4284 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4285
4286 if (interactive) {
4287 close_nointr(STDIN_FILENO);
4288 close_nointr(STDOUT_FILENO);
4289 close_nointr(STDERR_FILENO);
4290
4291 r = open_terminal(console, O_RDWR);
4292 if (r != STDIN_FILENO) {
4293 if (r >= 0) {
4294 safe_close(r);
4295 r = -EINVAL;
4296 }
4297
4298 return log_error_errno(r, "Failed to open console: %m");
4299 }
4300
4301 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4302 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4303 return log_error_errno(errno, "Failed to duplicate console: %m");
4304 }
4305
4306 r = reset_audit_loginuid();
4307 if (r < 0)
4308 return r;
4309
4310 /* Mark everything as slave, so that we still
4311 * receive mounts from the real root, but don't
4312 * propagate mounts to the real root. */
4313 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4314 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4315
4316 r = mount_devices(directory,
4317 root_device, root_device_rw,
4318 home_device, home_device_rw,
4319 srv_device, srv_device_rw);
4320 if (r < 0)
4321 return r;
4322
4323 r = determine_uid_shift(directory);
4324 if (r < 0)
4325 return r;
4326
4327 if (arg_userns) {
4328 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4329 if (l < 0)
4330 return log_error_errno(errno, "Failed to send UID shift: %m");
4331 if (l != sizeof(arg_uid_shift)) {
4332 log_error("Short write while sending UID shift.");
4333 return -EIO;
4334 }
4335 }
4336
4337 /* Turn directory into bind mount */
4338 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4339 return log_error_errno(errno, "Failed to make bind mount: %m");
4340
4341 r = setup_volatile(directory);
4342 if (r < 0)
4343 return r;
4344
4345 r = setup_volatile_state(directory);
4346 if (r < 0)
4347 return r;
4348
4349 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4350 if (r < 0)
4351 return r;
4352
4353 if (arg_read_only) {
4354 r = bind_remount_recursive(directory, true);
4355 if (r < 0)
4356 return log_error_errno(r, "Failed to make tree read-only: %m");
4357 }
4358
4359 r = mount_all(directory, false);
4360 if (r < 0)
4361 return r;
4362
4363 if (copy_devnodes(directory) < 0)
4364 return r;
4365
4366 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4367
4368 if (setup_pts(directory) < 0)
4369 return r;
4370
4371 r = setup_propagate(directory);
4372 if (r < 0)
4373 return r;
4374
4375 r = setup_dev_console(directory, console);
4376 if (r < 0)
4377 return r;
4378
4379 r = setup_seccomp();
4380 if (r < 0)
4381 return r;
4382
4383 r = setup_timezone(directory);
4384 if (r < 0)
4385 return r;
4386
4387 r = setup_resolv_conf(directory);
4388 if (r < 0)
4389 return r;
4390
4391 r = setup_journal(directory);
4392 if (r < 0)
4393 return r;
4394
4395 r = mount_custom(directory);
4396 if (r < 0)
4397 return r;
4398
4399 r = mount_cgroup(directory);
4400 if (r < 0)
4401 return r;
4402
4403 r = mount_move_root(directory);
4404 if (r < 0)
4405 return log_error_errno(r, "Failed to move root directory: %m");
4406
4407 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4408 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4409 (arg_private_network ? CLONE_NEWNET : 0) |
4410 (arg_userns ? CLONE_NEWUSER : 0),
4411 NULL);
4412 if (pid < 0)
4413 return log_error_errno(errno, "Failed to fork inner child: %m");
4414
4415 if (pid == 0) {
4416 pid_socket = safe_close(pid_socket);
4417 uid_shift_socket = safe_close(uid_shift_socket);
4418
4419 /* The inner child has all namespaces that are
4420 * requested, so that we all are owned by the user if
4421 * user namespaces are turned on. */
4422
4423 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4424 if (r < 0)
4425 _exit(EXIT_FAILURE);
4426
4427 _exit(EXIT_SUCCESS);
4428 }
4429
4430 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4431 if (l < 0)
4432 return log_error_errno(errno, "Failed to send PID: %m");
4433 if (l != sizeof(pid)) {
4434 log_error("Short write while sending PID.");
4435 return -EIO;
4436 }
4437
4438 pid_socket = safe_close(pid_socket);
4439
4440 return 0;
4441 }
4442
4443 static int setup_uid_map(pid_t pid) {
4444 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4445 int r;
4446
4447 assert(pid > 1);
4448
4449 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4450 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4451 r = write_string_file(uid_map, line, WRITE_STRING_FILE_CREATE);
4452 if (r < 0)
4453 return log_error_errno(r, "Failed to write UID map: %m");
4454
4455 /* We always assign the same UID and GID ranges */
4456 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4457 r = write_string_file(uid_map, line, WRITE_STRING_FILE_CREATE);
4458 if (r < 0)
4459 return log_error_errno(r, "Failed to write GID map: %m");
4460
4461 return 0;
4462 }
4463
4464 static int chown_cgroup(pid_t pid) {
4465 _cleanup_free_ char *path = NULL, *fs = NULL;
4466 _cleanup_close_ int fd = -1;
4467 const char *fn;
4468 int r;
4469
4470 r = cg_pid_get_path(NULL, pid, &path);
4471 if (r < 0)
4472 return log_error_errno(r, "Failed to get container cgroup path: %m");
4473
4474 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4475 if (r < 0)
4476 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4477
4478 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4479 if (fd < 0)
4480 return log_error_errno(errno, "Failed to open %s: %m", fs);
4481
4482 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4483 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4484 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4485
4486 return 0;
4487 }
4488
4489 int main(int argc, char *argv[]) {
4490
4491 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4492 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4493 _cleanup_close_ int master = -1, image_fd = -1;
4494 _cleanup_fdset_free_ FDSet *fds = NULL;
4495 int r, n_fd_passed, loop_nr = -1;
4496 char veth_name[IFNAMSIZ];
4497 bool secondary = false, remove_subvol = false;
4498 sigset_t mask_chld;
4499 pid_t pid = 0;
4500 int ret = EXIT_SUCCESS;
4501 union in_addr_union exposed = {};
4502 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4503 bool interactive;
4504
4505 log_parse_environment();
4506 log_open();
4507
4508 r = parse_argv(argc, argv);
4509 if (r <= 0)
4510 goto finish;
4511
4512 r = determine_names();
4513 if (r < 0)
4514 goto finish;
4515
4516 if (geteuid() != 0) {
4517 log_error("Need to be root.");
4518 r = -EPERM;
4519 goto finish;
4520 }
4521
4522 n_fd_passed = sd_listen_fds(false);
4523 if (n_fd_passed > 0) {
4524 r = fdset_new_listen_fds(&fds, false);
4525 if (r < 0) {
4526 log_error_errno(r, "Failed to collect file descriptors: %m");
4527 goto finish;
4528 }
4529 }
4530
4531 if (arg_directory) {
4532 assert(!arg_image);
4533
4534 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4535 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4536 r = -EINVAL;
4537 goto finish;
4538 }
4539
4540 if (arg_ephemeral) {
4541 _cleanup_free_ char *np = NULL;
4542
4543 /* If the specified path is a mount point we
4544 * generate the new snapshot immediately
4545 * inside it under a random name. However if
4546 * the specified is not a mount point we
4547 * create the new snapshot in the parent
4548 * directory, just next to it. */
4549 r = path_is_mount_point(arg_directory, 0);
4550 if (r < 0) {
4551 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4552 goto finish;
4553 }
4554 if (r > 0)
4555 r = tempfn_random_child(arg_directory, "machine.", &np);
4556 else
4557 r = tempfn_random(arg_directory, "machine.", &np);
4558 if (r < 0) {
4559 log_error_errno(r, "Failed to generate name for snapshot: %m");
4560 goto finish;
4561 }
4562
4563 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4564 if (r < 0) {
4565 log_error_errno(r, "Failed to lock %s: %m", np);
4566 goto finish;
4567 }
4568
4569 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4570 if (r < 0) {
4571 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4572 goto finish;
4573 }
4574
4575 free(arg_directory);
4576 arg_directory = np;
4577 np = NULL;
4578
4579 remove_subvol = true;
4580
4581 } else {
4582 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4583 if (r == -EBUSY) {
4584 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4585 goto finish;
4586 }
4587 if (r < 0) {
4588 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4589 return r;
4590 }
4591
4592 if (arg_template) {
4593 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4594 if (r == -EEXIST) {
4595 if (!arg_quiet)
4596 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4597 } else if (r < 0) {
4598 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4599 goto finish;
4600 } else {
4601 if (!arg_quiet)
4602 log_info("Populated %s from template %s.", arg_directory, arg_template);
4603 }
4604 }
4605 }
4606
4607 if (arg_boot) {
4608 if (path_is_os_tree(arg_directory) <= 0) {
4609 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4610 r = -EINVAL;
4611 goto finish;
4612 }
4613 } else {
4614 const char *p;
4615
4616 p = strjoina(arg_directory,
4617 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4618 if (access(p, F_OK) < 0) {
4619 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
4620 r = -EINVAL;
4621 goto finish;
4622 }
4623 }
4624
4625 } else {
4626 char template[] = "/tmp/nspawn-root-XXXXXX";
4627
4628 assert(arg_image);
4629 assert(!arg_template);
4630
4631 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4632 if (r == -EBUSY) {
4633 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4634 goto finish;
4635 }
4636 if (r < 0) {
4637 r = log_error_errno(r, "Failed to create image lock: %m");
4638 goto finish;
4639 }
4640
4641 if (!mkdtemp(template)) {
4642 log_error_errno(errno, "Failed to create temporary directory: %m");
4643 r = -errno;
4644 goto finish;
4645 }
4646
4647 arg_directory = strdup(template);
4648 if (!arg_directory) {
4649 r = log_oom();
4650 goto finish;
4651 }
4652
4653 image_fd = setup_image(&device_path, &loop_nr);
4654 if (image_fd < 0) {
4655 r = image_fd;
4656 goto finish;
4657 }
4658
4659 r = dissect_image(image_fd,
4660 &root_device, &root_device_rw,
4661 &home_device, &home_device_rw,
4662 &srv_device, &srv_device_rw,
4663 &secondary);
4664 if (r < 0)
4665 goto finish;
4666 }
4667
4668 r = custom_mounts_prepare();
4669 if (r < 0)
4670 goto finish;
4671
4672 interactive =
4673 isatty(STDIN_FILENO) > 0 &&
4674 isatty(STDOUT_FILENO) > 0;
4675
4676 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4677 if (master < 0) {
4678 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4679 goto finish;
4680 }
4681
4682 r = ptsname_malloc(master, &console);
4683 if (r < 0) {
4684 r = log_error_errno(r, "Failed to determine tty name: %m");
4685 goto finish;
4686 }
4687
4688 if (unlockpt(master) < 0) {
4689 r = log_error_errno(errno, "Failed to unlock tty: %m");
4690 goto finish;
4691 }
4692
4693 if (!arg_quiet)
4694 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4695 arg_machine, arg_image ?: arg_directory);
4696
4697 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4698
4699 assert_se(sigemptyset(&mask_chld) == 0);
4700 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4701
4702 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4703 r = log_error_errno(errno, "Failed to become subreaper: %m");
4704 goto finish;
4705 }
4706
4707 for (;;) {
4708 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4709 uid_shift_socket_pair[2] = { -1, -1 };
4710 ContainerStatus container_status;
4711 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4712 static const struct sigaction sa = {
4713 .sa_handler = nop_handler,
4714 .sa_flags = SA_NOCLDSTOP,
4715 };
4716 int ifi = 0;
4717 ssize_t l;
4718 _cleanup_event_unref_ sd_event *event = NULL;
4719 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4720 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4721 char last_char = 0;
4722
4723 r = barrier_create(&barrier);
4724 if (r < 0) {
4725 log_error_errno(r, "Cannot initialize IPC barrier: %m");
4726 goto finish;
4727 }
4728
4729 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4730 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4731 goto finish;
4732 }
4733
4734 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4735 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4736 goto finish;
4737 }
4738
4739 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4740 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4741 goto finish;
4742 }
4743
4744 if (arg_userns)
4745 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4746 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4747 goto finish;
4748 }
4749
4750 /* Child can be killed before execv(), so handle SIGCHLD
4751 * in order to interrupt parent's blocking calls and
4752 * give it a chance to call wait() and terminate. */
4753 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4754 if (r < 0) {
4755 r = log_error_errno(errno, "Failed to change the signal mask: %m");
4756 goto finish;
4757 }
4758
4759 r = sigaction(SIGCHLD, &sa, NULL);
4760 if (r < 0) {
4761 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4762 goto finish;
4763 }
4764
4765 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
4766 if (pid < 0) {
4767 if (errno == EINVAL)
4768 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4769 else
4770 r = log_error_errno(errno, "clone() failed: %m");
4771
4772 goto finish;
4773 }
4774
4775 if (pid == 0) {
4776 /* The outer child only has a file system namespace. */
4777 barrier_set_role(&barrier, BARRIER_CHILD);
4778
4779 master = safe_close(master);
4780
4781 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4782 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4783 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4784 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4785
4786 (void) reset_all_signal_handlers();
4787 (void) reset_signal_mask();
4788
4789 r = outer_child(&barrier,
4790 arg_directory,
4791 console,
4792 root_device, root_device_rw,
4793 home_device, home_device_rw,
4794 srv_device, srv_device_rw,
4795 interactive,
4796 secondary,
4797 pid_socket_pair[1],
4798 kmsg_socket_pair[1],
4799 rtnl_socket_pair[1],
4800 uid_shift_socket_pair[1],
4801 fds,
4802 argc, argv);
4803 if (r < 0)
4804 _exit(EXIT_FAILURE);
4805
4806 _exit(EXIT_SUCCESS);
4807 }
4808
4809 barrier_set_role(&barrier, BARRIER_PARENT);
4810
4811 fdset_free(fds);
4812 fds = NULL;
4813
4814 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4815 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4816 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4817
4818 /* Wait for the outer child. */
4819 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4820 if (r < 0)
4821 goto finish;
4822 if (r != 0) {
4823 r = -EIO;
4824 goto finish;
4825 }
4826 pid = 0;
4827
4828 /* And now retrieve the PID of the inner child. */
4829 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4830 if (l < 0) {
4831 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4832 goto finish;
4833 }
4834 if (l != sizeof(pid)) {
4835 log_error("Short read while reading inner child PID: %m");
4836 r = EIO;
4837 goto finish;
4838 }
4839
4840 log_debug("Init process invoked as PID " PID_FMT, pid);
4841
4842 if (arg_userns) {
4843 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4844 log_error("Child died too early.");
4845 r = -ESRCH;
4846 goto finish;
4847 }
4848
4849 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4850 if (l < 0) {
4851 r = log_error_errno(errno, "Failed to read UID shift: %m");
4852 goto finish;
4853 }
4854 if (l != sizeof(arg_uid_shift)) {
4855 log_error("Short read while reading UID shift: %m");
4856 r = EIO;
4857 goto finish;
4858 }
4859
4860 r = setup_uid_map(pid);
4861 if (r < 0)
4862 goto finish;
4863
4864 (void) barrier_place(&barrier); /* #2 */
4865 }
4866
4867 r = move_network_interfaces(pid);
4868 if (r < 0)
4869 goto finish;
4870
4871 r = setup_veth(pid, veth_name, &ifi);
4872 if (r < 0)
4873 goto finish;
4874
4875 r = setup_bridge(veth_name, &ifi);
4876 if (r < 0)
4877 goto finish;
4878
4879 r = setup_macvlan(pid);
4880 if (r < 0)
4881 goto finish;
4882
4883 r = setup_ipvlan(pid);
4884 if (r < 0)
4885 goto finish;
4886
4887 r = register_machine(pid, ifi);
4888 if (r < 0)
4889 goto finish;
4890
4891 r = chown_cgroup(pid);
4892 if (r < 0)
4893 goto finish;
4894
4895 /* Notify the child that the parent is ready with all
4896 * its setup (including cgroup-ification), and that
4897 * the child can now hand over control to the code to
4898 * run inside the container. */
4899 (void) barrier_place(&barrier); /* #3 */
4900
4901 /* Block SIGCHLD here, before notifying child.
4902 * process_pty() will handle it with the other signals. */
4903 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4904
4905 /* Reset signal to default */
4906 r = default_signals(SIGCHLD, -1);
4907 if (r < 0) {
4908 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4909 goto finish;
4910 }
4911
4912 /* Let the child know that we are ready and wait that the child is completely ready now. */
4913 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4914 log_error("Client died too early.");
4915 r = -ESRCH;
4916 goto finish;
4917 }
4918
4919 sd_notifyf(false,
4920 "READY=1\n"
4921 "STATUS=Container running.\n"
4922 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4923
4924 r = sd_event_new(&event);
4925 if (r < 0) {
4926 log_error_errno(r, "Failed to get default event source: %m");
4927 goto finish;
4928 }
4929
4930 if (arg_kill_signal > 0) {
4931 /* Try to kill the init system on SIGINT or SIGTERM */
4932 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4933 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4934 } else {
4935 /* Immediately exit */
4936 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4937 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4938 }
4939
4940 /* simply exit on sigchld */
4941 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4942
4943 if (arg_expose_ports) {
4944 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4945 if (r < 0)
4946 goto finish;
4947
4948 (void) expose_ports(rtnl, &exposed);
4949 }
4950
4951 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4952
4953 r = pty_forward_new(event, master, true, !interactive, &forward);
4954 if (r < 0) {
4955 log_error_errno(r, "Failed to create PTY forwarder: %m");
4956 goto finish;
4957 }
4958
4959 r = sd_event_loop(event);
4960 if (r < 0) {
4961 log_error_errno(r, "Failed to run event loop: %m");
4962 goto finish;
4963 }
4964
4965 pty_forward_get_last_char(forward, &last_char);
4966
4967 forward = pty_forward_free(forward);
4968
4969 if (!arg_quiet && last_char != '\n')
4970 putc('\n', stdout);
4971
4972 /* Kill if it is not dead yet anyway */
4973 terminate_machine(pid);
4974
4975 /* Normally redundant, but better safe than sorry */
4976 kill(pid, SIGKILL);
4977
4978 r = wait_for_container(pid, &container_status);
4979 pid = 0;
4980
4981 if (r < 0)
4982 /* We failed to wait for the container, or the
4983 * container exited abnormally */
4984 goto finish;
4985 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4986 /* The container exited with a non-zero
4987 * status, or with zero status and no reboot
4988 * was requested. */
4989 ret = r;
4990 break;
4991 }
4992
4993 /* CONTAINER_REBOOTED, loop again */
4994
4995 if (arg_keep_unit) {
4996 /* Special handling if we are running as a
4997 * service: instead of simply restarting the
4998 * machine we want to restart the entire
4999 * service, so let's inform systemd about this
5000 * with the special exit code 133. The service
5001 * file uses RestartForceExitStatus=133 so
5002 * that this results in a full nspawn
5003 * restart. This is necessary since we might
5004 * have cgroup parameters set we want to have
5005 * flushed out. */
5006 ret = 133;
5007 r = 0;
5008 break;
5009 }
5010
5011 flush_ports(&exposed);
5012 }
5013
5014 finish:
5015 sd_notify(false,
5016 "STOPPING=1\n"
5017 "STATUS=Terminating...");
5018
5019 if (pid > 0)
5020 kill(pid, SIGKILL);
5021
5022 /* Try to flush whatever is still queued in the pty */
5023 if (master >= 0)
5024 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5025
5026 loop_remove(loop_nr, &image_fd);
5027
5028 if (remove_subvol && arg_directory) {
5029 int k;
5030
5031 k = btrfs_subvol_remove(arg_directory, true);
5032 if (k < 0)
5033 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5034 }
5035
5036 if (arg_machine) {
5037 const char *p;
5038
5039 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5040 (void) rm_rf(p, REMOVE_ROOT);
5041 }
5042
5043 free(arg_directory);
5044 free(arg_template);
5045 free(arg_image);
5046 free(arg_machine);
5047 free(arg_user);
5048 strv_free(arg_setenv);
5049 strv_free(arg_network_interfaces);
5050 strv_free(arg_network_macvlan);
5051 strv_free(arg_network_ipvlan);
5052 custom_mount_free_all();
5053
5054 flush_ports(&exposed);
5055
5056 while (arg_expose_ports) {
5057 ExposePort *p = arg_expose_ports;
5058 LIST_REMOVE(ports, arg_expose_ports, p);
5059 free(p);
5060 }
5061
5062 return r < 0 ? EXIT_FAILURE : ret;
5063 }