]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: Allow : characters in nspawn --bind paths
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* The two ways a container run can end. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED,
} ContainerStatus;
116
/* Modes accepted by --link-journal=. The "try-" variants map onto
 * LINK_GUEST/LINK_HOST plus a separate "try" flag. */
typedef enum LinkJournal {
        LINK_NO,
        LINK_AUTO,
        LINK_HOST,
        LINK_GUEST,
} LinkJournal;
123
/* Modes accepted by --volatile[=MODE]: a boolean, or the literal "state". */
typedef enum Volatile {
        VOLATILE_NO,
        VOLATILE_YES,
        VOLATILE_STATE,
} Volatile;
129
/* Kind of user-requested mount: --bind=/--bind-ro=, --tmpfs=, or
 * --overlay=/--overlay-ro=. The numeric order is also used as a
 * tie-breaker when sorting mounts with equal destinations. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND,
        CUSTOM_MOUNT_TMPFS,
        CUSTOM_MOUNT_OVERLAY,
} CustomMountType;
135
136 typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144 } CustomMount;
145
146 static char *arg_directory = NULL;
147 static char *arg_template = NULL;
148 static char *arg_user = NULL;
149 static sd_id128_t arg_uuid = {};
150 static char *arg_machine = NULL;
151 static const char *arg_selinux_context = NULL;
152 static const char *arg_selinux_apifs_context = NULL;
153 static const char *arg_slice = NULL;
154 static bool arg_private_network = false;
155 static bool arg_read_only = false;
156 static bool arg_boot = false;
157 static bool arg_ephemeral = false;
158 static LinkJournal arg_link_journal = LINK_AUTO;
159 static bool arg_link_journal_try = false;
160 static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
182 (1ULL << CAP_SYS_RESOURCE) |
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
187 static CustomMount *arg_custom_mounts = NULL;
188 static unsigned arg_n_custom_mounts = 0;
189 static char **arg_setenv = NULL;
190 static bool arg_quiet = false;
191 static bool arg_share_system = false;
192 static bool arg_register = true;
193 static bool arg_keep_unit = false;
194 static char **arg_network_interfaces = NULL;
195 static char **arg_network_macvlan = NULL;
196 static char **arg_network_ipvlan = NULL;
197 static bool arg_network_veth = false;
198 static const char *arg_network_bridge = NULL;
199 static unsigned long arg_personality = PERSONALITY_INVALID;
200 static char *arg_image = NULL;
201 static Volatile arg_volatile = VOLATILE_NO;
202 static ExposePort *arg_expose_ports = NULL;
203 static char **arg_property = NULL;
204 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205 static bool arg_userns = false;
206 static int arg_kill_signal = 0;
207
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
239 " and container\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
261 " the container\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
266 " the container\n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name);
276 }
277
278 static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292 }
293
294 static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 arg_custom_mounts = mfree(arg_custom_mounts);
313 arg_n_custom_mounts = 0;
314 }
315
316 static int custom_mount_compare(const void *a, const void *b) {
317 const CustomMount *x = a, *y = b;
318 int r;
319
320 r = path_compare(x->destination, y->destination);
321 if (r != 0)
322 return r;
323
324 if (x->type < y->type)
325 return -1;
326 if (x->type > y->type)
327 return 1;
328
329 return 0;
330 }
331
332 static int custom_mounts_prepare(void) {
333 unsigned i;
334 int r;
335
336 /* Ensure the mounts are applied prefix first. */
337 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
338
339 /* Allocate working directories for the overlay file systems that need it */
340 for (i = 0; i < arg_n_custom_mounts; i++) {
341 CustomMount *m = &arg_custom_mounts[i];
342
343 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
344 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
345 return -EINVAL;
346 }
347
348 if (m->type != CUSTOM_MOUNT_OVERLAY)
349 continue;
350
351 if (m->work_dir)
352 continue;
353
354 if (m->read_only)
355 continue;
356
357 r = tempfn_random(m->source, NULL, &m->work_dir);
358 if (r < 0)
359 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
360 }
361
362 return 0;
363 }
364
/* Replace *b with a cleaned-up absolute version of path.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT) it
 * is merely made absolute relative to the current working directory. The
 * previous value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the negated errno
 * from canonicalization for any failure other than ENOENT. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
385
386 static int parse_argv(int argc, char *argv[]) {
387
388 enum {
389 ARG_VERSION = 0x100,
390 ARG_PRIVATE_NETWORK,
391 ARG_UUID,
392 ARG_READ_ONLY,
393 ARG_CAPABILITY,
394 ARG_DROP_CAPABILITY,
395 ARG_LINK_JOURNAL,
396 ARG_BIND,
397 ARG_BIND_RO,
398 ARG_TMPFS,
399 ARG_OVERLAY,
400 ARG_OVERLAY_RO,
401 ARG_SETENV,
402 ARG_SHARE_SYSTEM,
403 ARG_REGISTER,
404 ARG_KEEP_UNIT,
405 ARG_NETWORK_INTERFACE,
406 ARG_NETWORK_MACVLAN,
407 ARG_NETWORK_IPVLAN,
408 ARG_NETWORK_BRIDGE,
409 ARG_PERSONALITY,
410 ARG_VOLATILE,
411 ARG_TEMPLATE,
412 ARG_PROPERTY,
413 ARG_PRIVATE_USERS,
414 ARG_KILL_SIGNAL,
415 };
416
417 static const struct option options[] = {
418 { "help", no_argument, NULL, 'h' },
419 { "version", no_argument, NULL, ARG_VERSION },
420 { "directory", required_argument, NULL, 'D' },
421 { "template", required_argument, NULL, ARG_TEMPLATE },
422 { "ephemeral", no_argument, NULL, 'x' },
423 { "user", required_argument, NULL, 'u' },
424 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
425 { "boot", no_argument, NULL, 'b' },
426 { "uuid", required_argument, NULL, ARG_UUID },
427 { "read-only", no_argument, NULL, ARG_READ_ONLY },
428 { "capability", required_argument, NULL, ARG_CAPABILITY },
429 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
430 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
431 { "bind", required_argument, NULL, ARG_BIND },
432 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
433 { "tmpfs", required_argument, NULL, ARG_TMPFS },
434 { "overlay", required_argument, NULL, ARG_OVERLAY },
435 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
436 { "machine", required_argument, NULL, 'M' },
437 { "slice", required_argument, NULL, 'S' },
438 { "setenv", required_argument, NULL, ARG_SETENV },
439 { "selinux-context", required_argument, NULL, 'Z' },
440 { "selinux-apifs-context", required_argument, NULL, 'L' },
441 { "quiet", no_argument, NULL, 'q' },
442 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
443 { "register", required_argument, NULL, ARG_REGISTER },
444 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
445 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
446 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
447 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
448 { "network-veth", no_argument, NULL, 'n' },
449 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
450 { "personality", required_argument, NULL, ARG_PERSONALITY },
451 { "image", required_argument, NULL, 'i' },
452 { "volatile", optional_argument, NULL, ARG_VOLATILE },
453 { "port", required_argument, NULL, 'p' },
454 { "property", required_argument, NULL, ARG_PROPERTY },
455 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
456 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
457 {}
458 };
459
460 int c, r;
461 uint64_t plus = 0, minus = 0;
462
463 assert(argc >= 0);
464 assert(argv);
465
466 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
467
468 switch (c) {
469
470 case 'h':
471 help();
472 return 0;
473
474 case ARG_VERSION:
475 puts(PACKAGE_STRING);
476 puts(SYSTEMD_FEATURES);
477 return 0;
478
479 case 'D':
480 r = set_sanitized_path(&arg_directory, optarg);
481 if (r < 0)
482 return log_error_errno(r, "Invalid root directory: %m");
483
484 break;
485
486 case ARG_TEMPLATE:
487 r = set_sanitized_path(&arg_template, optarg);
488 if (r < 0)
489 return log_error_errno(r, "Invalid template directory: %m");
490
491 break;
492
493 case 'i':
494 r = set_sanitized_path(&arg_image, optarg);
495 if (r < 0)
496 return log_error_errno(r, "Invalid image path: %m");
497
498 break;
499
500 case 'x':
501 arg_ephemeral = true;
502 break;
503
504 case 'u':
505 r = free_and_strdup(&arg_user, optarg);
506 if (r < 0)
507 return log_oom();
508
509 break;
510
511 case ARG_NETWORK_BRIDGE:
512 arg_network_bridge = optarg;
513
514 /* fall through */
515
516 case 'n':
517 arg_network_veth = true;
518 arg_private_network = true;
519 break;
520
521 case ARG_NETWORK_INTERFACE:
522 if (strv_extend(&arg_network_interfaces, optarg) < 0)
523 return log_oom();
524
525 arg_private_network = true;
526 break;
527
528 case ARG_NETWORK_MACVLAN:
529 if (strv_extend(&arg_network_macvlan, optarg) < 0)
530 return log_oom();
531
532 arg_private_network = true;
533 break;
534
535 case ARG_NETWORK_IPVLAN:
536 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
537 return log_oom();
538
539 /* fall through */
540
541 case ARG_PRIVATE_NETWORK:
542 arg_private_network = true;
543 break;
544
545 case 'b':
546 arg_boot = true;
547 break;
548
549 case ARG_UUID:
550 r = sd_id128_from_string(optarg, &arg_uuid);
551 if (r < 0) {
552 log_error("Invalid UUID: %s", optarg);
553 return r;
554 }
555 break;
556
557 case 'S':
558 arg_slice = optarg;
559 break;
560
561 case 'M':
562 if (isempty(optarg)) {
563 arg_machine = mfree(arg_machine);
564 } else {
565 if (!machine_name_is_valid(optarg)) {
566 log_error("Invalid machine name: %s", optarg);
567 return -EINVAL;
568 }
569
570 r = free_and_strdup(&arg_machine, optarg);
571 if (r < 0)
572 return log_oom();
573
574 break;
575 }
576
577 case 'Z':
578 arg_selinux_context = optarg;
579 break;
580
581 case 'L':
582 arg_selinux_apifs_context = optarg;
583 break;
584
585 case ARG_READ_ONLY:
586 arg_read_only = true;
587 break;
588
589 case ARG_CAPABILITY:
590 case ARG_DROP_CAPABILITY: {
591 const char *state, *word;
592 size_t length;
593
594 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
595 _cleanup_free_ char *t;
596
597 t = strndup(word, length);
598 if (!t)
599 return log_oom();
600
601 if (streq(t, "all")) {
602 if (c == ARG_CAPABILITY)
603 plus = (uint64_t) -1;
604 else
605 minus = (uint64_t) -1;
606 } else {
607 int cap;
608
609 cap = capability_from_name(t);
610 if (cap < 0) {
611 log_error("Failed to parse capability %s.", t);
612 return -EINVAL;
613 }
614
615 if (c == ARG_CAPABILITY)
616 plus |= 1ULL << (uint64_t) cap;
617 else
618 minus |= 1ULL << (uint64_t) cap;
619 }
620 }
621
622 break;
623 }
624
625 case 'j':
626 arg_link_journal = LINK_GUEST;
627 arg_link_journal_try = true;
628 break;
629
630 case ARG_LINK_JOURNAL:
631 if (streq(optarg, "auto")) {
632 arg_link_journal = LINK_AUTO;
633 arg_link_journal_try = false;
634 } else if (streq(optarg, "no")) {
635 arg_link_journal = LINK_NO;
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "guest")) {
638 arg_link_journal = LINK_GUEST;
639 arg_link_journal_try = false;
640 } else if (streq(optarg, "host")) {
641 arg_link_journal = LINK_HOST;
642 arg_link_journal_try = false;
643 } else if (streq(optarg, "try-guest")) {
644 arg_link_journal = LINK_GUEST;
645 arg_link_journal_try = true;
646 } else if (streq(optarg, "try-host")) {
647 arg_link_journal = LINK_HOST;
648 arg_link_journal_try = true;
649 } else {
650 log_error("Failed to parse link journal mode %s", optarg);
651 return -EINVAL;
652 }
653
654 break;
655
656 case ARG_BIND:
657 case ARG_BIND_RO: {
658 const char *current = optarg;
659 _cleanup_free_ char *source = NULL, *destination = NULL;
660 CustomMount *m;
661 _cleanup_strv_free_ char **strv = NULL;
662
663 r = extract_many_words(&current, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
664 switch (r) {
665 case 1:
666 destination = strdup(source);
667 case 2:
668 break;
669 case -ENOMEM:
670 return log_oom();
671 default:
672 log_error("Invalid bind mount specification: %s", optarg);
673 return -EINVAL;
674 }
675
676 if (!source || !destination)
677 return log_oom();
678
679 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
680 log_error("Invalid bind mount specification: %s", optarg);
681 return -EINVAL;
682 }
683
684 m = custom_mount_add(CUSTOM_MOUNT_BIND);
685 if (!m)
686 return log_oom();
687
688 m->source = source;
689 m->destination = destination;
690 m->read_only = c == ARG_BIND_RO;
691
692 source = destination = NULL;
693
694 break;
695 }
696
697 case ARG_TMPFS: {
698 const char *current = optarg;
699 _cleanup_free_ char *path = NULL, *opts = NULL;
700 CustomMount *m;
701
702 r = extract_first_word(&current, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
703 if (r == -ENOMEM)
704 return log_oom();
705 else if (r < 0) {
706 log_error("Invalid tmpfs specification: %s", optarg);
707 return r;
708 }
709 if (r)
710 opts = strdup(current);
711 else
712 opts = strdup("mode=0755");
713
714 if (!path || !opts)
715 return log_oom();
716
717 if (!path_is_absolute(path)) {
718 log_error("Invalid tmpfs specification: %s", optarg);
719 return -EINVAL;
720 }
721
722 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
723 if (!m)
724 return log_oom();
725
726 m->destination = path;
727 m->options = opts;
728
729 path = opts = NULL;
730
731 break;
732 }
733
734 case ARG_OVERLAY:
735 case ARG_OVERLAY_RO: {
736 _cleanup_free_ char *upper = NULL, *destination = NULL;
737 _cleanup_strv_free_ char **lower = NULL;
738 CustomMount *m;
739 unsigned n = 0;
740 char **i;
741
742 lower = strv_split(optarg, ":");
743 if (!lower)
744 return log_oom();
745
746 STRV_FOREACH(i, lower) {
747 if (!path_is_absolute(*i)) {
748 log_error("Overlay path %s is not absolute.", *i);
749 return -EINVAL;
750 }
751
752 n++;
753 }
754
755 if (n < 2) {
756 log_error("--overlay= needs at least two colon-separated directories specified.");
757 return -EINVAL;
758 }
759
760 if (n == 2) {
761 /* If two parameters are specified,
762 * the first one is the lower, the
763 * second one the upper directory. And
764 * we'll also define the destination
765 * mount point the same as the upper. */
766 upper = lower[1];
767 lower[1] = NULL;
768
769 destination = strdup(upper);
770 if (!destination)
771 return log_oom();
772
773 } else {
774 upper = lower[n - 2];
775 destination = lower[n - 1];
776 lower[n - 2] = NULL;
777 }
778
779 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
780 if (!m)
781 return log_oom();
782
783 m->destination = destination;
784 m->source = upper;
785 m->lower = lower;
786 m->read_only = c == ARG_OVERLAY_RO;
787
788 upper = destination = NULL;
789 lower = NULL;
790
791 break;
792 }
793
794 case ARG_SETENV: {
795 char **n;
796
797 if (!env_assignment_is_valid(optarg)) {
798 log_error("Environment variable assignment '%s' is not valid.", optarg);
799 return -EINVAL;
800 }
801
802 n = strv_env_set(arg_setenv, optarg);
803 if (!n)
804 return log_oom();
805
806 strv_free(arg_setenv);
807 arg_setenv = n;
808 break;
809 }
810
811 case 'q':
812 arg_quiet = true;
813 break;
814
815 case ARG_SHARE_SYSTEM:
816 arg_share_system = true;
817 break;
818
819 case ARG_REGISTER:
820 r = parse_boolean(optarg);
821 if (r < 0) {
822 log_error("Failed to parse --register= argument: %s", optarg);
823 return r;
824 }
825
826 arg_register = r;
827 break;
828
829 case ARG_KEEP_UNIT:
830 arg_keep_unit = true;
831 break;
832
833 case ARG_PERSONALITY:
834
835 arg_personality = personality_from_string(optarg);
836 if (arg_personality == PERSONALITY_INVALID) {
837 log_error("Unknown or unsupported personality '%s'.", optarg);
838 return -EINVAL;
839 }
840
841 break;
842
843 case ARG_VOLATILE:
844
845 if (!optarg)
846 arg_volatile = VOLATILE_YES;
847 else {
848 r = parse_boolean(optarg);
849 if (r < 0) {
850 if (streq(optarg, "state"))
851 arg_volatile = VOLATILE_STATE;
852 else {
853 log_error("Failed to parse --volatile= argument: %s", optarg);
854 return r;
855 }
856 } else
857 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
858 }
859
860 break;
861
862 case 'p': {
863 const char *split, *e;
864 uint16_t container_port, host_port;
865 int protocol;
866 ExposePort *p;
867
868 if ((e = startswith(optarg, "tcp:")))
869 protocol = IPPROTO_TCP;
870 else if ((e = startswith(optarg, "udp:")))
871 protocol = IPPROTO_UDP;
872 else {
873 e = optarg;
874 protocol = IPPROTO_TCP;
875 }
876
877 split = strchr(e, ':');
878 if (split) {
879 char v[split - e + 1];
880
881 memcpy(v, e, split - e);
882 v[split - e] = 0;
883
884 r = safe_atou16(v, &host_port);
885 if (r < 0 || host_port <= 0) {
886 log_error("Failed to parse host port: %s", optarg);
887 return -EINVAL;
888 }
889
890 r = safe_atou16(split + 1, &container_port);
891 } else {
892 r = safe_atou16(e, &container_port);
893 host_port = container_port;
894 }
895
896 if (r < 0 || container_port <= 0) {
897 log_error("Failed to parse host port: %s", optarg);
898 return -EINVAL;
899 }
900
901 LIST_FOREACH(ports, p, arg_expose_ports) {
902 if (p->protocol == protocol && p->host_port == host_port) {
903 log_error("Duplicate port specification: %s", optarg);
904 return -EINVAL;
905 }
906 }
907
908 p = new(ExposePort, 1);
909 if (!p)
910 return log_oom();
911
912 p->protocol = protocol;
913 p->host_port = host_port;
914 p->container_port = container_port;
915
916 LIST_PREPEND(ports, arg_expose_ports, p);
917
918 break;
919 }
920
921 case ARG_PROPERTY:
922 if (strv_extend(&arg_property, optarg) < 0)
923 return log_oom();
924
925 break;
926
927 case ARG_PRIVATE_USERS:
928 if (optarg) {
929 _cleanup_free_ char *buffer = NULL;
930 const char *range, *shift;
931
932 range = strchr(optarg, ':');
933 if (range) {
934 buffer = strndup(optarg, range - optarg);
935 if (!buffer)
936 return log_oom();
937 shift = buffer;
938
939 range++;
940 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
941 log_error("Failed to parse UID range: %s", range);
942 return -EINVAL;
943 }
944 } else
945 shift = optarg;
946
947 if (parse_uid(shift, &arg_uid_shift) < 0) {
948 log_error("Failed to parse UID: %s", optarg);
949 return -EINVAL;
950 }
951 }
952
953 arg_userns = true;
954 break;
955
956 case ARG_KILL_SIGNAL:
957 arg_kill_signal = signal_from_string_try_harder(optarg);
958 if (arg_kill_signal < 0) {
959 log_error("Cannot parse signal: %s", optarg);
960 return -EINVAL;
961 }
962
963 break;
964
965 case '?':
966 return -EINVAL;
967
968 default:
969 assert_not_reached("Unhandled option");
970 }
971
972 if (arg_share_system)
973 arg_register = false;
974
975 if (arg_boot && arg_share_system) {
976 log_error("--boot and --share-system may not be combined.");
977 return -EINVAL;
978 }
979
980 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
981 log_error("--keep-unit may not be used when invoked from a user session.");
982 return -EINVAL;
983 }
984
985 if (arg_directory && arg_image) {
986 log_error("--directory= and --image= may not be combined.");
987 return -EINVAL;
988 }
989
990 if (arg_template && arg_image) {
991 log_error("--template= and --image= may not be combined.");
992 return -EINVAL;
993 }
994
995 if (arg_template && !(arg_directory || arg_machine)) {
996 log_error("--template= needs --directory= or --machine=.");
997 return -EINVAL;
998 }
999
1000 if (arg_ephemeral && arg_template) {
1001 log_error("--ephemeral and --template= may not be combined.");
1002 return -EINVAL;
1003 }
1004
1005 if (arg_ephemeral && arg_image) {
1006 log_error("--ephemeral and --image= may not be combined.");
1007 return -EINVAL;
1008 }
1009
1010 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1011 log_error("--ephemeral and --link-journal= may not be combined.");
1012 return -EINVAL;
1013 }
1014
1015 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1016 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1017 return -EINVAL;
1018 }
1019
1020 if (arg_expose_ports && !arg_private_network) {
1021 log_error("Cannot use --port= without private networking.");
1022 return -EINVAL;
1023 }
1024
1025 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1026 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1027
1028 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1029
1030 if (arg_boot && arg_kill_signal <= 0)
1031 arg_kill_signal = SIGRTMIN+3;
1032
1033 return 1;
1034 }
1035
1036 static int tmpfs_patch_options(const char *options, char **ret) {
1037 char *buf = NULL;
1038
1039 if (arg_userns && arg_uid_shift != 0) {
1040 assert(arg_uid_shift != UID_INVALID);
1041
1042 if (options)
1043 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1044 else
1045 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1046 if (!buf)
1047 return -ENOMEM;
1048
1049 options = buf;
1050 }
1051
1052 #ifdef HAVE_SELINUX
1053 if (arg_selinux_apifs_context) {
1054 char *t;
1055
1056 if (options)
1057 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1058 else
1059 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1060 if (!t) {
1061 free(buf);
1062 return -ENOMEM;
1063 }
1064
1065 free(buf);
1066 buf = t;
1067 }
1068 #endif
1069
1070 *ret = buf;
1071 return !!buf;
1072 }
1073
1074 static int mount_all(const char *dest, bool userns) {
1075
1076 typedef struct MountPoint {
1077 const char *what;
1078 const char *where;
1079 const char *type;
1080 const char *options;
1081 unsigned long flags;
1082 bool fatal;
1083 bool userns;
1084 } MountPoint;
1085
1086 static const MountPoint mount_table[] = {
1087 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1088 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1089 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1090 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1091 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1092 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1093 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1094 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1095 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1096 #ifdef HAVE_SELINUX
1097 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1098 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
1099 #endif
1100 };
1101
1102 unsigned k;
1103 int r;
1104
1105 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1106 _cleanup_free_ char *where = NULL, *options = NULL;
1107 const char *o;
1108
1109 if (userns != mount_table[k].userns)
1110 continue;
1111
1112 where = prefix_root(dest, mount_table[k].where);
1113 if (!where)
1114 return log_oom();
1115
1116 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1117 if (r < 0 && r != -ENOENT)
1118 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1119
1120 /* Skip this entry if it is not a remount. */
1121 if (mount_table[k].what && r > 0)
1122 continue;
1123
1124 r = mkdir_p(where, 0755);
1125 if (r < 0) {
1126 if (mount_table[k].fatal)
1127 return log_error_errno(r, "Failed to create directory %s: %m", where);
1128
1129 log_warning_errno(r, "Failed to create directory %s: %m", where);
1130 continue;
1131 }
1132
1133 o = mount_table[k].options;
1134 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1135 r = tmpfs_patch_options(o, &options);
1136 if (r < 0)
1137 return log_oom();
1138 if (r > 0)
1139 o = options;
1140 }
1141
1142 if (mount(mount_table[k].what,
1143 where,
1144 mount_table[k].type,
1145 mount_table[k].flags,
1146 o) < 0) {
1147
1148 if (mount_table[k].fatal)
1149 return log_error_errno(errno, "mount(%s) failed: %m", where);
1150
1151 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1152 }
1153 }
1154
1155 return 0;
1156 }
1157
1158 static int mount_bind(const char *dest, CustomMount *m) {
1159 struct stat source_st, dest_st;
1160 const char *where;
1161 int r;
1162
1163 assert(m);
1164
1165 if (stat(m->source, &source_st) < 0)
1166 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1167
1168 where = prefix_roota(dest, m->destination);
1169
1170 if (stat(where, &dest_st) >= 0) {
1171 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1172 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1173 return -EINVAL;
1174 }
1175
1176 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1177 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1178 return -EINVAL;
1179 }
1180
1181 } else if (errno == ENOENT) {
1182 r = mkdir_parents_label(where, 0755);
1183 if (r < 0)
1184 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1185 } else {
1186 log_error_errno(errno, "Failed to stat %s: %m", where);
1187 return -errno;
1188 }
1189
1190 /* Create the mount point. Any non-directory file can be
1191 * mounted on any non-directory file (regular, fifo, socket,
1192 * char, block).
1193 */
1194 if (S_ISDIR(source_st.st_mode))
1195 r = mkdir_label(where, 0755);
1196 else
1197 r = touch(where);
1198 if (r < 0 && r != -EEXIST)
1199 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1200
1201 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1202 return log_error_errno(errno, "mount(%s) failed: %m", where);
1203
1204 if (m->read_only) {
1205 r = bind_remount_recursive(where, true);
1206 if (r < 0)
1207 return log_error_errno(r, "Read-only bind mount failed: %m");
1208 }
1209
1210 return 0;
1211 }
1212
1213 static int mount_tmpfs(const char *dest, CustomMount *m) {
1214 const char *where, *options;
1215 _cleanup_free_ char *buf = NULL;
1216 int r;
1217
1218 assert(dest);
1219 assert(m);
1220
1221 where = prefix_roota(dest, m->destination);
1222
1223 r = mkdir_p_label(where, 0755);
1224 if (r < 0 && r != -EEXIST)
1225 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1226
1227 r = tmpfs_patch_options(m->options, &buf);
1228 if (r < 0)
1229 return log_oom();
1230 options = r > 0 ? buf : m->options;
1231
1232 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1233 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1234
1235 return 0;
1236 }
1237
1238 static int mount_overlay(const char *dest, CustomMount *m) {
1239 _cleanup_free_ char *lower = NULL;
1240 const char *where, *options;
1241 int r;
1242
1243 assert(dest);
1244 assert(m);
1245
1246 where = prefix_roota(dest, m->destination);
1247
1248 r = mkdir_label(where, 0755);
1249 if (r < 0 && r != -EEXIST)
1250 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1251
1252 (void) mkdir_p_label(m->source, 0755);
1253
1254 strv_reverse(m->lower);
1255 lower = strv_join(m->lower, ":");
1256 strv_reverse(m->lower);
1257 if (!lower)
1258 return log_oom();
1259
1260 if (m->read_only)
1261 options = strjoina("lowerdir=", m->source, ":", lower);
1262 else {
1263 assert(m->work_dir);
1264 (void) mkdir_label(m->work_dir, 0700);
1265
1266 options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
1267 }
1268
1269 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1270 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1271
1272 return 0;
1273 }
1274
1275 static int mount_custom(const char *dest) {
1276 unsigned i;
1277 int r;
1278
1279 assert(dest);
1280
1281 for (i = 0; i < arg_n_custom_mounts; i++) {
1282 CustomMount *m = &arg_custom_mounts[i];
1283
1284 switch (m->type) {
1285
1286 case CUSTOM_MOUNT_BIND:
1287 r = mount_bind(dest, m);
1288 break;
1289
1290 case CUSTOM_MOUNT_TMPFS:
1291 r = mount_tmpfs(dest, m);
1292 break;
1293
1294 case CUSTOM_MOUNT_OVERLAY:
1295 r = mount_overlay(dest, m);
1296 break;
1297
1298 default:
1299 assert_not_reached("Unknown custom mount type");
1300 }
1301
1302 if (r < 0)
1303 return r;
1304 }
1305
1306 return 0;
1307 }
1308
1309 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1310 char *to;
1311 int r;
1312
1313 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1314
1315 r = path_is_mount_point(to, 0);
1316 if (r < 0 && r != -ENOENT)
1317 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1318 if (r > 0)
1319 return 0;
1320
1321 mkdir_p(to, 0755);
1322
1323 /* The superblock mount options of the mount point need to be
1324 * identical to the hosts', and hence writable... */
1325 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1326 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1327
1328 /* ... hence let's only make the bind mount read-only, not the
1329 * superblock. */
1330 if (read_only) {
1331 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1332 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1333 }
1334 return 1;
1335 }
1336
1337 static int mount_cgroup(const char *dest) {
1338 _cleanup_set_free_free_ Set *controllers = NULL;
1339 const char *cgroup_root;
1340 int r;
1341
1342 controllers = set_new(&string_hash_ops);
1343 if (!controllers)
1344 return log_oom();
1345
1346 r = cg_kernel_controllers(controllers);
1347 if (r < 0)
1348 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1349
1350 for (;;) {
1351 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1352
1353 controller = set_steal_first(controllers);
1354 if (!controller)
1355 break;
1356
1357 origin = prefix_root("/sys/fs/cgroup/", controller);
1358 if (!origin)
1359 return log_oom();
1360
1361 r = readlink_malloc(origin, &combined);
1362 if (r == -EINVAL) {
1363 /* Not a symbolic link, but directly a single cgroup hierarchy */
1364
1365 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1366 if (r < 0)
1367 return r;
1368
1369 } else if (r < 0)
1370 return log_error_errno(r, "Failed to read link %s: %m", origin);
1371 else {
1372 _cleanup_free_ char *target = NULL;
1373
1374 target = prefix_root(dest, origin);
1375 if (!target)
1376 return log_oom();
1377
1378 /* A symbolic link, a combination of controllers in one hierarchy */
1379
1380 if (!filename_is_valid(combined)) {
1381 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1382 continue;
1383 }
1384
1385 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1386 if (r < 0)
1387 return r;
1388
1389 r = symlink_idempotent(combined, target);
1390 if (r == -EINVAL) {
1391 log_error("Invalid existing symlink for combined hierarchy");
1392 return r;
1393 }
1394 if (r < 0)
1395 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1396 }
1397 }
1398
1399 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1400 if (r < 0)
1401 return r;
1402
1403 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1404 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1405 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1406
1407 return 0;
1408 }
1409
1410 static int mount_systemd_cgroup_writable(const char *dest) {
1411 _cleanup_free_ char *own_cgroup_path = NULL;
1412 const char *systemd_root, *systemd_own;
1413 int r;
1414
1415 assert(dest);
1416
1417 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1418 if (r < 0)
1419 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1420
1421 /* Make our own cgroup a (writable) bind mount */
1422 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1423 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1424 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1425
1426 /* And then remount the systemd cgroup root read-only */
1427 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1428 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1429 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1430
1431 return 0;
1432 }
1433
1434 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1435 assert(p);
1436
1437 if (!arg_userns)
1438 return 0;
1439
1440 if (uid == UID_INVALID && gid == GID_INVALID)
1441 return 0;
1442
1443 if (uid != UID_INVALID) {
1444 uid += arg_uid_shift;
1445
1446 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1447 return -EOVERFLOW;
1448 }
1449
1450 if (gid != GID_INVALID) {
1451 gid += (gid_t) arg_uid_shift;
1452
1453 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1454 return -EOVERFLOW;
1455 }
1456
1457 if (lchown(p, uid, gid) < 0)
1458 return -errno;
1459
1460 return 0;
1461 }
1462
/* Creates directory "path" below "root" with the given mode and, if it was
 * newly created, hands it to userns_lchown() with uid/gid. An already
 * existing path is treated as success and its ownership is left alone.
 * Returns 0 on success, negative errno-style value on failure. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1475
1476 static int setup_timezone(const char *dest) {
1477 _cleanup_free_ char *p = NULL, *q = NULL;
1478 const char *where, *check, *what;
1479 char *z, *y;
1480 int r;
1481
1482 assert(dest);
1483
1484 /* Fix the timezone, if possible */
1485 r = readlink_malloc("/etc/localtime", &p);
1486 if (r < 0) {
1487 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1488 return 0;
1489 }
1490
1491 z = path_startswith(p, "../usr/share/zoneinfo/");
1492 if (!z)
1493 z = path_startswith(p, "/usr/share/zoneinfo/");
1494 if (!z) {
1495 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1496 return 0;
1497 }
1498
1499 where = prefix_roota(dest, "/etc/localtime");
1500 r = readlink_malloc(where, &q);
1501 if (r >= 0) {
1502 y = path_startswith(q, "../usr/share/zoneinfo/");
1503 if (!y)
1504 y = path_startswith(q, "/usr/share/zoneinfo/");
1505
1506 /* Already pointing to the right place? Then do nothing .. */
1507 if (y && streq(y, z))
1508 return 0;
1509 }
1510
1511 check = strjoina("/usr/share/zoneinfo/", z);
1512 check = prefix_root(dest, check);
1513 if (laccess(check, F_OK) < 0) {
1514 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1515 return 0;
1516 }
1517
1518 r = unlink(where);
1519 if (r < 0 && errno != ENOENT) {
1520 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1521 return 0;
1522 }
1523
1524 what = strjoina("../usr/share/zoneinfo/", z);
1525 if (symlink(what, where) < 0) {
1526 log_error_errno(errno, "Failed to correct timezone of container: %m");
1527 return 0;
1528 }
1529
1530 r = userns_lchown(where, 0, 0);
1531 if (r < 0)
1532 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1533
1534 return 0;
1535 }
1536
1537 static int setup_resolv_conf(const char *dest) {
1538 const char *where = NULL;
1539 int r;
1540
1541 assert(dest);
1542
1543 if (arg_private_network)
1544 return 0;
1545
1546 /* Fix resolv.conf, if possible */
1547 where = prefix_roota(dest, "/etc/resolv.conf");
1548
1549 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1550 if (r < 0) {
1551 /* If the file already exists as symlink, let's
1552 * suppress the warning, under the assumption that
1553 * resolved or something similar runs inside and the
1554 * symlink points there.
1555 *
1556 * If the disk image is read-only, there's also no
1557 * point in complaining.
1558 */
1559 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1560 "Failed to copy /etc/resolv.conf to %s: %m", where);
1561 return 0;
1562 }
1563
1564 r = userns_lchown(where, 0, 0);
1565 if (r < 0)
1566 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1567
1568 return 0;
1569 }
1570
1571 static int setup_volatile_state(const char *directory) {
1572 _cleanup_free_ char *buf = NULL;
1573 const char *p, *options;
1574 int r;
1575
1576 assert(directory);
1577
1578 if (arg_volatile != VOLATILE_STATE)
1579 return 0;
1580
1581 /* --volatile=state means we simply overmount /var
1582 with a tmpfs, and the rest read-only. */
1583
1584 r = bind_remount_recursive(directory, true);
1585 if (r < 0)
1586 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1587
1588 p = prefix_roota(directory, "/var");
1589 r = mkdir(p, 0755);
1590 if (r < 0 && errno != EEXIST)
1591 return log_error_errno(errno, "Failed to create %s: %m", directory);
1592
1593 options = "mode=755";
1594 r = tmpfs_patch_options(options, &buf);
1595 if (r < 0)
1596 return log_oom();
1597 if (r > 0)
1598 options = buf;
1599
1600 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1601 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1602
1603 return 0;
1604 }
1605
1606 static int setup_volatile(const char *directory) {
1607 bool tmpfs_mounted = false, bind_mounted = false;
1608 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1609 _cleanup_free_ char *buf = NULL;
1610 const char *f, *t, *options;
1611 int r;
1612
1613 assert(directory);
1614
1615 if (arg_volatile != VOLATILE_YES)
1616 return 0;
1617
1618 /* --volatile=yes means we mount a tmpfs to the root dir, and
1619 the original /usr to use inside it, and that read-only. */
1620
1621 if (!mkdtemp(template))
1622 return log_error_errno(errno, "Failed to create temporary directory: %m");
1623
1624 options = "mode=755";
1625 r = tmpfs_patch_options(options, &buf);
1626 if (r < 0)
1627 return log_oom();
1628 if (r > 0)
1629 options = buf;
1630
1631 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1632 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1633 goto fail;
1634 }
1635
1636 tmpfs_mounted = true;
1637
1638 f = prefix_roota(directory, "/usr");
1639 t = prefix_roota(template, "/usr");
1640
1641 r = mkdir(t, 0755);
1642 if (r < 0 && errno != EEXIST) {
1643 r = log_error_errno(errno, "Failed to create %s: %m", t);
1644 goto fail;
1645 }
1646
1647 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1648 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1649 goto fail;
1650 }
1651
1652 bind_mounted = true;
1653
1654 r = bind_remount_recursive(t, true);
1655 if (r < 0) {
1656 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1657 goto fail;
1658 }
1659
1660 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1661 r = log_error_errno(errno, "Failed to move root mount: %m");
1662 goto fail;
1663 }
1664
1665 (void) rmdir(template);
1666
1667 return 0;
1668
1669 fail:
1670 if (bind_mounted)
1671 (void) umount(t);
1672
1673 if (tmpfs_mounted)
1674 (void) umount(template);
1675 (void) rmdir(template);
1676 return r;
1677 }
1678
1679 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1680 assert(s);
1681
1682 snprintf(s, 37,
1683 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1684 SD_ID128_FORMAT_VAL(id));
1685
1686 return s;
1687 }
1688
1689 static int setup_boot_id(const char *dest) {
1690 const char *from, *to;
1691 sd_id128_t rnd = {};
1692 char as_uuid[37];
1693 int r;
1694
1695 if (arg_share_system)
1696 return 0;
1697
1698 /* Generate a new randomized boot ID, so that each boot-up of
1699 * the container gets a new one */
1700
1701 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1702 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1703
1704 r = sd_id128_randomize(&rnd);
1705 if (r < 0)
1706 return log_error_errno(r, "Failed to generate random boot id: %m");
1707
1708 id128_format_as_uuid(rnd, as_uuid);
1709
1710 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1711 if (r < 0)
1712 return log_error_errno(r, "Failed to write boot id: %m");
1713
1714 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1715 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1716 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1717 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1718
1719 unlink(from);
1720 return r;
1721 }
1722
1723 static int copy_devnodes(const char *dest) {
1724
1725 static const char devnodes[] =
1726 "null\0"
1727 "zero\0"
1728 "full\0"
1729 "random\0"
1730 "urandom\0"
1731 "tty\0"
1732 "net/tun\0";
1733
1734 const char *d;
1735 int r = 0;
1736 _cleanup_umask_ mode_t u;
1737
1738 assert(dest);
1739
1740 u = umask(0000);
1741
1742 /* Create /dev/net, so that we can create /dev/net/tun in it */
1743 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1744 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1745
1746 NULSTR_FOREACH(d, devnodes) {
1747 _cleanup_free_ char *from = NULL, *to = NULL;
1748 struct stat st;
1749
1750 from = strappend("/dev/", d);
1751 to = prefix_root(dest, from);
1752
1753 if (stat(from, &st) < 0) {
1754
1755 if (errno != ENOENT)
1756 return log_error_errno(errno, "Failed to stat %s: %m", from);
1757
1758 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1759
1760 log_error("%s is not a char or block device, cannot copy.", from);
1761 return -EIO;
1762
1763 } else {
1764 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1765 if (errno != EPERM)
1766 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1767
1768 /* Some systems abusively restrict mknod but
1769 * allow bind mounts. */
1770 r = touch(to);
1771 if (r < 0)
1772 return log_error_errno(r, "touch (%s) failed: %m", to);
1773 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1774 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1775 }
1776
1777 r = userns_lchown(to, 0, 0);
1778 if (r < 0)
1779 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1780 }
1781 }
1782
1783 return r;
1784 }
1785
1786 static int setup_pts(const char *dest) {
1787 _cleanup_free_ char *options = NULL;
1788 const char *p;
1789
1790 #ifdef HAVE_SELINUX
1791 if (arg_selinux_apifs_context)
1792 (void) asprintf(&options,
1793 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1794 arg_uid_shift + TTY_GID,
1795 arg_selinux_apifs_context);
1796 else
1797 #endif
1798 (void) asprintf(&options,
1799 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1800 arg_uid_shift + TTY_GID);
1801
1802 if (!options)
1803 return log_oom();
1804
1805 /* Mount /dev/pts itself */
1806 p = prefix_roota(dest, "/dev/pts");
1807 if (mkdir(p, 0755) < 0)
1808 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1809 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1810 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1811 if (userns_lchown(p, 0, 0) < 0)
1812 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1813
1814 /* Create /dev/ptmx symlink */
1815 p = prefix_roota(dest, "/dev/ptmx");
1816 if (symlink("pts/ptmx", p) < 0)
1817 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1818 if (userns_lchown(p, 0, 0) < 0)
1819 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1820
1821 /* And fix /dev/pts/ptmx ownership */
1822 p = prefix_roota(dest, "/dev/pts/ptmx");
1823 if (userns_lchown(p, 0, 0) < 0)
1824 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1825
1826 return 0;
1827 }
1828
1829 static int setup_dev_console(const char *dest, const char *console) {
1830 _cleanup_umask_ mode_t u;
1831 const char *to;
1832 int r;
1833
1834 assert(dest);
1835 assert(console);
1836
1837 u = umask(0000);
1838
1839 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1840 if (r < 0)
1841 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1842
1843 /* We need to bind mount the right tty to /dev/console since
1844 * ptys can only exist on pts file systems. To have something
1845 * to bind mount things on we create a empty regular file. */
1846
1847 to = prefix_roota(dest, "/dev/console");
1848 r = touch(to);
1849 if (r < 0)
1850 return log_error_errno(r, "touch() for /dev/console failed: %m");
1851
1852 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1853 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1854
1855 return 0;
1856 }
1857
1858 static int setup_kmsg(const char *dest, int kmsg_socket) {
1859 const char *from, *to;
1860 _cleanup_umask_ mode_t u;
1861 int fd, k;
1862 union {
1863 struct cmsghdr cmsghdr;
1864 uint8_t buf[CMSG_SPACE(sizeof(int))];
1865 } control = {};
1866 struct msghdr mh = {
1867 .msg_control = &control,
1868 .msg_controllen = sizeof(control),
1869 };
1870 struct cmsghdr *cmsg;
1871
1872 assert(kmsg_socket >= 0);
1873
1874 u = umask(0000);
1875
1876 /* We create the kmsg FIFO as /run/kmsg, but immediately
1877 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1878 * on the reading side behave very similar to /proc/kmsg,
1879 * their writing side behaves differently from /dev/kmsg in
1880 * that writing blocks when nothing is reading. In order to
1881 * avoid any problems with containers deadlocking due to this
1882 * we simply make /dev/kmsg unavailable to the container. */
1883 from = prefix_roota(dest, "/run/kmsg");
1884 to = prefix_roota(dest, "/proc/kmsg");
1885
1886 if (mkfifo(from, 0600) < 0)
1887 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1888 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1889 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1890
1891 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1892 if (fd < 0)
1893 return log_error_errno(errno, "Failed to open fifo: %m");
1894
1895 cmsg = CMSG_FIRSTHDR(&mh);
1896 cmsg->cmsg_level = SOL_SOCKET;
1897 cmsg->cmsg_type = SCM_RIGHTS;
1898 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1899 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1900
1901 mh.msg_controllen = cmsg->cmsg_len;
1902
1903 /* Store away the fd in the socket, so that it stays open as
1904 * long as we run the child */
1905 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1906 safe_close(fd);
1907
1908 if (k < 0)
1909 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1910
1911 /* And now make the FIFO unavailable as /run/kmsg... */
1912 (void) unlink(from);
1913
1914 return 0;
1915 }
1916
1917 static int send_rtnl(int send_fd) {
1918 union {
1919 struct cmsghdr cmsghdr;
1920 uint8_t buf[CMSG_SPACE(sizeof(int))];
1921 } control = {};
1922 struct msghdr mh = {
1923 .msg_control = &control,
1924 .msg_controllen = sizeof(control),
1925 };
1926 struct cmsghdr *cmsg;
1927 _cleanup_close_ int fd = -1;
1928 ssize_t k;
1929
1930 assert(send_fd >= 0);
1931
1932 if (!arg_expose_ports)
1933 return 0;
1934
1935 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1936 if (fd < 0)
1937 return log_error_errno(errno, "Failed to allocate container netlink: %m");
1938
1939 cmsg = CMSG_FIRSTHDR(&mh);
1940 cmsg->cmsg_level = SOL_SOCKET;
1941 cmsg->cmsg_type = SCM_RIGHTS;
1942 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1943 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1944
1945 mh.msg_controllen = cmsg->cmsg_len;
1946
1947 /* Store away the fd in the socket, so that it stays open as
1948 * long as we run the child */
1949 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1950 if (k < 0)
1951 return log_error_errno(errno, "Failed to send netlink fd: %m");
1952
1953 return 0;
1954 }
1955
1956 static int flush_ports(union in_addr_union *exposed) {
1957 ExposePort *p;
1958 int r, af = AF_INET;
1959
1960 assert(exposed);
1961
1962 if (!arg_expose_ports)
1963 return 0;
1964
1965 if (in_addr_is_null(af, exposed))
1966 return 0;
1967
1968 log_debug("Lost IP address.");
1969
1970 LIST_FOREACH(ports, p, arg_expose_ports) {
1971 r = fw_add_local_dnat(false,
1972 af,
1973 p->protocol,
1974 NULL,
1975 NULL, 0,
1976 NULL, 0,
1977 p->host_port,
1978 exposed,
1979 p->container_port,
1980 NULL);
1981 if (r < 0)
1982 log_warning_errno(r, "Failed to modify firewall: %m");
1983 }
1984
1985 *exposed = IN_ADDR_NULL;
1986 return 0;
1987 }
1988
1989 static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
1990 _cleanup_free_ struct local_address *addresses = NULL;
1991 _cleanup_free_ char *pretty = NULL;
1992 union in_addr_union new_exposed;
1993 ExposePort *p;
1994 bool add;
1995 int af = AF_INET, r;
1996
1997 assert(exposed);
1998
1999 /* Invoked each time an address is added or removed inside the
2000 * container */
2001
2002 if (!arg_expose_ports)
2003 return 0;
2004
2005 r = local_addresses(rtnl, 0, af, &addresses);
2006 if (r < 0)
2007 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2008
2009 add = r > 0 &&
2010 addresses[0].family == af &&
2011 addresses[0].scope < RT_SCOPE_LINK;
2012
2013 if (!add)
2014 return flush_ports(exposed);
2015
2016 new_exposed = addresses[0].address;
2017 if (in_addr_equal(af, exposed, &new_exposed))
2018 return 0;
2019
2020 in_addr_to_string(af, &new_exposed, &pretty);
2021 log_debug("New container IP is %s.", strna(pretty));
2022
2023 LIST_FOREACH(ports, p, arg_expose_ports) {
2024
2025 r = fw_add_local_dnat(true,
2026 af,
2027 p->protocol,
2028 NULL,
2029 NULL, 0,
2030 NULL, 0,
2031 p->host_port,
2032 &new_exposed,
2033 p->container_port,
2034 in_addr_is_null(af, exposed) ? NULL : exposed);
2035 if (r < 0)
2036 log_warning_errno(r, "Failed to modify firewall: %m");
2037 }
2038
2039 *exposed = new_exposed;
2040 return 0;
2041 }
2042
2043 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2044 union in_addr_union *exposed = userdata;
2045
2046 assert(rtnl);
2047 assert(m);
2048 assert(exposed);
2049
2050 expose_ports(rtnl, exposed);
2051 return 0;
2052 }
2053
2054 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
2055 union {
2056 struct cmsghdr cmsghdr;
2057 uint8_t buf[CMSG_SPACE(sizeof(int))];
2058 } control = {};
2059 struct msghdr mh = {
2060 .msg_control = &control,
2061 .msg_controllen = sizeof(control),
2062 };
2063 struct cmsghdr *cmsg;
2064 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2065 int fd, r;
2066 ssize_t k;
2067
2068 assert(event);
2069 assert(recv_fd >= 0);
2070 assert(ret);
2071
2072 if (!arg_expose_ports)
2073 return 0;
2074
2075 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2076 if (k < 0)
2077 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2078
2079 cmsg = CMSG_FIRSTHDR(&mh);
2080 assert(cmsg->cmsg_level == SOL_SOCKET);
2081 assert(cmsg->cmsg_type == SCM_RIGHTS);
2082 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2083 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2084
2085 r = sd_netlink_open_fd(&rtnl, fd);
2086 if (r < 0) {
2087 safe_close(fd);
2088 return log_error_errno(r, "Failed to create rtnl object: %m");
2089 }
2090
2091 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2092 if (r < 0)
2093 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2094
2095 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2096 if (r < 0)
2097 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2098
2099 r = sd_netlink_attach_event(rtnl, event, 0);
2100 if (r < 0)
2101 return log_error_errno(r, "Failed to add to even loop: %m");
2102
2103 *ret = rtnl;
2104 rtnl = NULL;
2105
2106 return 0;
2107 }
2108
2109 static int setup_hostname(void) {
2110
2111 if (arg_share_system)
2112 return 0;
2113
2114 if (sethostname_idempotent(arg_machine) < 0)
2115 return -errno;
2116
2117 return 0;
2118 }
2119
2120 static int setup_journal(const char *directory) {
2121 sd_id128_t machine_id, this_id;
2122 _cleanup_free_ char *b = NULL, *d = NULL;
2123 const char *etc_machine_id, *p, *q;
2124 char *id;
2125 int r;
2126
2127 /* Don't link journals in ephemeral mode */
2128 if (arg_ephemeral)
2129 return 0;
2130
2131 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2132
2133 r = read_one_line_file(etc_machine_id, &b);
2134 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2135 return 0;
2136 else if (r < 0)
2137 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2138
2139 id = strstrip(b);
2140 if (isempty(id) && arg_link_journal == LINK_AUTO)
2141 return 0;
2142
2143 /* Verify validity */
2144 r = sd_id128_from_string(id, &machine_id);
2145 if (r < 0)
2146 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2147
2148 r = sd_id128_get_machine(&this_id);
2149 if (r < 0)
2150 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2151
2152 if (sd_id128_equal(machine_id, this_id)) {
2153 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2154 "Host and machine ids are equal (%s): refusing to link journals", id);
2155 if (arg_link_journal == LINK_AUTO)
2156 return 0;
2157 return -EEXIST;
2158 }
2159
2160 if (arg_link_journal == LINK_NO)
2161 return 0;
2162
2163 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2164 if (r < 0)
2165 return log_error_errno(r, "Failed to create /var: %m");
2166
2167 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2168 if (r < 0)
2169 return log_error_errno(r, "Failed to create /var/log: %m");
2170
2171 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2172 if (r < 0)
2173 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2174
2175 p = strjoina("/var/log/journal/", id);
2176 q = prefix_roota(directory, p);
2177
2178 if (path_is_mount_point(p, 0) > 0) {
2179 if (arg_link_journal != LINK_AUTO) {
2180 log_error("%s: already a mount point, refusing to use for journal", p);
2181 return -EEXIST;
2182 }
2183
2184 return 0;
2185 }
2186
2187 if (path_is_mount_point(q, 0) > 0) {
2188 if (arg_link_journal != LINK_AUTO) {
2189 log_error("%s: already a mount point, refusing to use for journal", q);
2190 return -EEXIST;
2191 }
2192
2193 return 0;
2194 }
2195
2196 r = readlink_and_make_absolute(p, &d);
2197 if (r >= 0) {
2198 if ((arg_link_journal == LINK_GUEST ||
2199 arg_link_journal == LINK_AUTO) &&
2200 path_equal(d, q)) {
2201
2202 r = userns_mkdir(directory, p, 0755, 0, 0);
2203 if (r < 0)
2204 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2205 return 0;
2206 }
2207
2208 if (unlink(p) < 0)
2209 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2210 } else if (r == -EINVAL) {
2211
2212 if (arg_link_journal == LINK_GUEST &&
2213 rmdir(p) < 0) {
2214
2215 if (errno == ENOTDIR) {
2216 log_error("%s already exists and is neither a symlink nor a directory", p);
2217 return r;
2218 } else {
2219 log_error_errno(errno, "Failed to remove %s: %m", p);
2220 return -errno;
2221 }
2222 }
2223 } else if (r != -ENOENT) {
2224 log_error_errno(errno, "readlink(%s) failed: %m", p);
2225 return r;
2226 }
2227
2228 if (arg_link_journal == LINK_GUEST) {
2229
2230 if (symlink(q, p) < 0) {
2231 if (arg_link_journal_try) {
2232 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2233 return 0;
2234 } else {
2235 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2236 return -errno;
2237 }
2238 }
2239
2240 r = userns_mkdir(directory, p, 0755, 0, 0);
2241 if (r < 0)
2242 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2243 return 0;
2244 }
2245
2246 if (arg_link_journal == LINK_HOST) {
2247 /* don't create parents here -- if the host doesn't have
2248 * permanent journal set up, don't force it here */
2249 r = mkdir(p, 0755);
2250 if (r < 0) {
2251 if (arg_link_journal_try) {
2252 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2253 return 0;
2254 } else {
2255 log_error_errno(errno, "Failed to create %s: %m", p);
2256 return r;
2257 }
2258 }
2259
2260 } else if (access(p, F_OK) < 0)
2261 return 0;
2262
2263 if (dir_is_empty(q) == 0)
2264 log_warning("%s is not empty, proceeding anyway.", q);
2265
2266 r = userns_mkdir(directory, p, 0755, 0, 0);
2267 if (r < 0) {
2268 log_error_errno(errno, "Failed to create %s: %m", q);
2269 return r;
2270 }
2271
2272 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2273 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2274
2275 return 0;
2276 }
2277
2278 static int drop_capabilities(void) {
2279 return capability_bounding_set_drop(~arg_retain, false);
2280 }
2281
2282 static int register_machine(pid_t pid, int local_ifindex) {
2283 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2284 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2285 int r;
2286
2287 if (!arg_register)
2288 return 0;
2289
2290 r = sd_bus_default_system(&bus);
2291 if (r < 0)
2292 return log_error_errno(r, "Failed to open system bus: %m");
2293
2294 if (arg_keep_unit) {
2295 r = sd_bus_call_method(
2296 bus,
2297 "org.freedesktop.machine1",
2298 "/org/freedesktop/machine1",
2299 "org.freedesktop.machine1.Manager",
2300 "RegisterMachineWithNetwork",
2301 &error,
2302 NULL,
2303 "sayssusai",
2304 arg_machine,
2305 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2306 "nspawn",
2307 "container",
2308 (uint32_t) pid,
2309 strempty(arg_directory),
2310 local_ifindex > 0 ? 1 : 0, local_ifindex);
2311 } else {
2312 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2313 char **i;
2314 unsigned j;
2315
2316 r = sd_bus_message_new_method_call(
2317 bus,
2318 &m,
2319 "org.freedesktop.machine1",
2320 "/org/freedesktop/machine1",
2321 "org.freedesktop.machine1.Manager",
2322 "CreateMachineWithNetwork");
2323 if (r < 0)
2324 return bus_log_create_error(r);
2325
2326 r = sd_bus_message_append(
2327 m,
2328 "sayssusai",
2329 arg_machine,
2330 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2331 "nspawn",
2332 "container",
2333 (uint32_t) pid,
2334 strempty(arg_directory),
2335 local_ifindex > 0 ? 1 : 0, local_ifindex);
2336 if (r < 0)
2337 return bus_log_create_error(r);
2338
2339 r = sd_bus_message_open_container(m, 'a', "(sv)");
2340 if (r < 0)
2341 return bus_log_create_error(r);
2342
2343 if (!isempty(arg_slice)) {
2344 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2345 if (r < 0)
2346 return bus_log_create_error(r);
2347 }
2348
2349 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2350 if (r < 0)
2351 return bus_log_create_error(r);
2352
2353 /* If you make changes here, also make sure to update
2354 * systemd-nspawn@.service, to keep the device
2355 * policies in sync regardless if we are run with or
2356 * without the --keep-unit switch. */
2357 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2358 /* Allow the container to
2359 * access and create the API
2360 * device nodes, so that
2361 * PrivateDevices= in the
2362 * container can work
2363 * fine */
2364 "/dev/null", "rwm",
2365 "/dev/zero", "rwm",
2366 "/dev/full", "rwm",
2367 "/dev/random", "rwm",
2368 "/dev/urandom", "rwm",
2369 "/dev/tty", "rwm",
2370 "/dev/net/tun", "rwm",
2371 /* Allow the container
2372 * access to ptys. However,
2373 * do not permit the
2374 * container to ever create
2375 * these device nodes. */
2376 "/dev/pts/ptmx", "rw",
2377 "char-pts", "rw");
2378 if (r < 0)
2379 return bus_log_create_error(r);
2380
2381 for (j = 0; j < arg_n_custom_mounts; j++) {
2382 CustomMount *cm = &arg_custom_mounts[j];
2383
2384 if (cm->type != CUSTOM_MOUNT_BIND)
2385 continue;
2386
2387 r = is_device_node(cm->source);
2388 if (r < 0)
2389 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2390
2391 if (r) {
2392 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2393 cm->source, cm->read_only ? "r" : "rw");
2394 if (r < 0)
2395 return log_error_errno(r, "Failed to append message arguments: %m");
2396 }
2397 }
2398
2399 if (arg_kill_signal != 0) {
2400 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2401 if (r < 0)
2402 return bus_log_create_error(r);
2403
2404 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2405 if (r < 0)
2406 return bus_log_create_error(r);
2407 }
2408
2409 STRV_FOREACH(i, arg_property) {
2410 r = sd_bus_message_open_container(m, 'r', "sv");
2411 if (r < 0)
2412 return bus_log_create_error(r);
2413
2414 r = bus_append_unit_property_assignment(m, *i);
2415 if (r < 0)
2416 return r;
2417
2418 r = sd_bus_message_close_container(m);
2419 if (r < 0)
2420 return bus_log_create_error(r);
2421 }
2422
2423 r = sd_bus_message_close_container(m);
2424 if (r < 0)
2425 return bus_log_create_error(r);
2426
2427 r = sd_bus_call(bus, m, 0, &error, NULL);
2428 }
2429
2430 if (r < 0) {
2431 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2432 return r;
2433 }
2434
2435 return 0;
2436 }
2437
2438 static int terminate_machine(pid_t pid) {
2439 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2440 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2441 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2442 const char *path;
2443 int r;
2444
2445 if (!arg_register)
2446 return 0;
2447
2448 /* If we are reusing the unit, then just exit, systemd will do
2449 * the right thing when we exit. */
2450 if (arg_keep_unit)
2451 return 0;
2452
2453 r = sd_bus_default_system(&bus);
2454 if (r < 0)
2455 return log_error_errno(r, "Failed to open system bus: %m");
2456
2457 r = sd_bus_call_method(
2458 bus,
2459 "org.freedesktop.machine1",
2460 "/org/freedesktop/machine1",
2461 "org.freedesktop.machine1.Manager",
2462 "GetMachineByPID",
2463 &error,
2464 &reply,
2465 "u",
2466 (uint32_t) pid);
2467 if (r < 0) {
2468 /* Note that the machine might already have been
2469 * cleaned up automatically, hence don't consider it a
2470 * failure if we cannot get the machine object. */
2471 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2472 return 0;
2473 }
2474
2475 r = sd_bus_message_read(reply, "o", &path);
2476 if (r < 0)
2477 return bus_log_parse_error(r);
2478
2479 r = sd_bus_call_method(
2480 bus,
2481 "org.freedesktop.machine1",
2482 path,
2483 "org.freedesktop.machine1.Machine",
2484 "Terminate",
2485 &error,
2486 NULL,
2487 NULL);
2488 if (r < 0) {
2489 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2490 return 0;
2491 }
2492
2493 return 0;
2494 }
2495
2496 static int reset_audit_loginuid(void) {
2497 _cleanup_free_ char *p = NULL;
2498 int r;
2499
2500 if (arg_share_system)
2501 return 0;
2502
2503 r = read_one_line_file("/proc/self/loginuid", &p);
2504 if (r == -ENOENT)
2505 return 0;
2506 if (r < 0)
2507 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2508
2509 /* Already reset? */
2510 if (streq(p, "4294967295"))
2511 return 0;
2512
2513 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
2514 if (r < 0) {
2515 log_error_errno(r,
2516 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2517 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2518 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2519 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2520 "using systemd-nspawn. Sleeping for 5s... (%m)");
2521
2522 sleep(5);
2523 }
2524
2525 return 0;
2526 }
2527
/* Fixed 128-bit keys handed to generate_mac() as its hash key. There is one
 * key per kind of address generated in this file: the host side of the veth
 * link, the container side of the veth link, and macvlan interfaces. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2531
2532 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2533 uint8_t result[8];
2534 size_t l, sz;
2535 uint8_t *v, *i;
2536 int r;
2537
2538 l = strlen(arg_machine);
2539 sz = sizeof(sd_id128_t) + l;
2540 if (idx > 0)
2541 sz += sizeof(idx);
2542
2543 v = alloca(sz);
2544
2545 /* fetch some persistent data unique to the host */
2546 r = sd_id128_get_machine((sd_id128_t*) v);
2547 if (r < 0)
2548 return r;
2549
2550 /* combine with some data unique (on this host) to this
2551 * container instance */
2552 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2553 if (idx > 0) {
2554 idx = htole64(idx);
2555 memcpy(i, &idx, sizeof(idx));
2556 }
2557
2558 /* Let's hash the host machine ID plus the container name. We
2559 * use a fixed, but originally randomly created hash key here. */
2560 siphash24(result, v, sz, hash_key.bytes);
2561
2562 assert_cc(ETH_ALEN <= sizeof(result));
2563 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2564
2565 /* see eth_random_addr in the kernel */
2566 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2567 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2568
2569 return 0;
2570 }
2571
2572 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2573 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2574 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2575 struct ether_addr mac_host, mac_container;
2576 int r, i;
2577
2578 if (!arg_private_network)
2579 return 0;
2580
2581 if (!arg_network_veth)
2582 return 0;
2583
2584 /* Use two different interface name prefixes depending whether
2585 * we are in bridge mode or not. */
2586 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2587 arg_network_bridge ? "vb" : "ve", arg_machine);
2588
2589 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2590 if (r < 0)
2591 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2592
2593 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2594 if (r < 0)
2595 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2596
2597 r = sd_netlink_open(&rtnl);
2598 if (r < 0)
2599 return log_error_errno(r, "Failed to connect to netlink: %m");
2600
2601 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2602 if (r < 0)
2603 return log_error_errno(r, "Failed to allocate netlink message: %m");
2604
2605 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
2606 if (r < 0)
2607 return log_error_errno(r, "Failed to add netlink interface name: %m");
2608
2609 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2610 if (r < 0)
2611 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2612
2613 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2614 if (r < 0)
2615 return log_error_errno(r, "Failed to open netlink container: %m");
2616
2617 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2618 if (r < 0)
2619 return log_error_errno(r, "Failed to open netlink container: %m");
2620
2621 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
2622 if (r < 0)
2623 return log_error_errno(r, "Failed to open netlink container: %m");
2624
2625 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
2626 if (r < 0)
2627 return log_error_errno(r, "Failed to add netlink interface name: %m");
2628
2629 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2630 if (r < 0)
2631 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2632
2633 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2634 if (r < 0)
2635 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2636
2637 r = sd_netlink_message_close_container(m);
2638 if (r < 0)
2639 return log_error_errno(r, "Failed to close netlink container: %m");
2640
2641 r = sd_netlink_message_close_container(m);
2642 if (r < 0)
2643 return log_error_errno(r, "Failed to close netlink container: %m");
2644
2645 r = sd_netlink_message_close_container(m);
2646 if (r < 0)
2647 return log_error_errno(r, "Failed to close netlink container: %m");
2648
2649 r = sd_netlink_call(rtnl, m, 0, NULL);
2650 if (r < 0)
2651 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2652
2653 i = (int) if_nametoindex(iface_name);
2654 if (i <= 0)
2655 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2656
2657 *ifi = i;
2658
2659 return 0;
2660 }
2661
2662 static int setup_bridge(const char veth_name[], int *ifi) {
2663 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2664 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2665 int r, bridge;
2666
2667 if (!arg_private_network)
2668 return 0;
2669
2670 if (!arg_network_veth)
2671 return 0;
2672
2673 if (!arg_network_bridge)
2674 return 0;
2675
2676 bridge = (int) if_nametoindex(arg_network_bridge);
2677 if (bridge <= 0)
2678 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2679
2680 *ifi = bridge;
2681
2682 r = sd_netlink_open(&rtnl);
2683 if (r < 0)
2684 return log_error_errno(r, "Failed to connect to netlink: %m");
2685
2686 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2687 if (r < 0)
2688 return log_error_errno(r, "Failed to allocate netlink message: %m");
2689
2690 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2691 if (r < 0)
2692 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2693
2694 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
2695 if (r < 0)
2696 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2697
2698 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
2699 if (r < 0)
2700 return log_error_errno(r, "Failed to add netlink master field: %m");
2701
2702 r = sd_netlink_call(rtnl, m, 0, NULL);
2703 if (r < 0)
2704 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2705
2706 return 0;
2707 }
2708
2709 static int parse_interface(struct udev *udev, const char *name) {
2710 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2711 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2712 int ifi;
2713
2714 ifi = (int) if_nametoindex(name);
2715 if (ifi <= 0)
2716 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2717
2718 sprintf(ifi_str, "n%i", ifi);
2719 d = udev_device_new_from_device_id(udev, ifi_str);
2720 if (!d)
2721 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2722
2723 if (udev_device_get_is_initialized(d) <= 0) {
2724 log_error("Network interface %s is not initialized yet.", name);
2725 return -EBUSY;
2726 }
2727
2728 return ifi;
2729 }
2730
2731 static int move_network_interfaces(pid_t pid) {
2732 _cleanup_udev_unref_ struct udev *udev = NULL;
2733 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2734 char **i;
2735 int r;
2736
2737 if (!arg_private_network)
2738 return 0;
2739
2740 if (strv_isempty(arg_network_interfaces))
2741 return 0;
2742
2743 r = sd_netlink_open(&rtnl);
2744 if (r < 0)
2745 return log_error_errno(r, "Failed to connect to netlink: %m");
2746
2747 udev = udev_new();
2748 if (!udev) {
2749 log_error("Failed to connect to udev.");
2750 return -ENOMEM;
2751 }
2752
2753 STRV_FOREACH(i, arg_network_interfaces) {
2754 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2755 int ifi;
2756
2757 ifi = parse_interface(udev, *i);
2758 if (ifi < 0)
2759 return ifi;
2760
2761 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2762 if (r < 0)
2763 return log_error_errno(r, "Failed to allocate netlink message: %m");
2764
2765 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2766 if (r < 0)
2767 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2768
2769 r = sd_netlink_call(rtnl, m, 0, NULL);
2770 if (r < 0)
2771 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2772 }
2773
2774 return 0;
2775 }
2776
2777 static int setup_macvlan(pid_t pid) {
2778 _cleanup_udev_unref_ struct udev *udev = NULL;
2779 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2780 unsigned idx = 0;
2781 char **i;
2782 int r;
2783
2784 if (!arg_private_network)
2785 return 0;
2786
2787 if (strv_isempty(arg_network_macvlan))
2788 return 0;
2789
2790 r = sd_netlink_open(&rtnl);
2791 if (r < 0)
2792 return log_error_errno(r, "Failed to connect to netlink: %m");
2793
2794 udev = udev_new();
2795 if (!udev) {
2796 log_error("Failed to connect to udev.");
2797 return -ENOMEM;
2798 }
2799
2800 STRV_FOREACH(i, arg_network_macvlan) {
2801 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2802 _cleanup_free_ char *n = NULL;
2803 struct ether_addr mac;
2804 int ifi;
2805
2806 ifi = parse_interface(udev, *i);
2807 if (ifi < 0)
2808 return ifi;
2809
2810 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2811 if (r < 0)
2812 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2813
2814 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2815 if (r < 0)
2816 return log_error_errno(r, "Failed to allocate netlink message: %m");
2817
2818 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2819 if (r < 0)
2820 return log_error_errno(r, "Failed to add netlink interface index: %m");
2821
2822 n = strappend("mv-", *i);
2823 if (!n)
2824 return log_oom();
2825
2826 strshorten(n, IFNAMSIZ-1);
2827
2828 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2829 if (r < 0)
2830 return log_error_errno(r, "Failed to add netlink interface name: %m");
2831
2832 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2833 if (r < 0)
2834 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2835
2836 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2837 if (r < 0)
2838 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2839
2840 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2841 if (r < 0)
2842 return log_error_errno(r, "Failed to open netlink container: %m");
2843
2844 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2845 if (r < 0)
2846 return log_error_errno(r, "Failed to open netlink container: %m");
2847
2848 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2849 if (r < 0)
2850 return log_error_errno(r, "Failed to append macvlan mode: %m");
2851
2852 r = sd_netlink_message_close_container(m);
2853 if (r < 0)
2854 return log_error_errno(r, "Failed to close netlink container: %m");
2855
2856 r = sd_netlink_message_close_container(m);
2857 if (r < 0)
2858 return log_error_errno(r, "Failed to close netlink container: %m");
2859
2860 r = sd_netlink_call(rtnl, m, 0, NULL);
2861 if (r < 0)
2862 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2863 }
2864
2865 return 0;
2866 }
2867
2868 static int setup_ipvlan(pid_t pid) {
2869 _cleanup_udev_unref_ struct udev *udev = NULL;
2870 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2871 char **i;
2872 int r;
2873
2874 if (!arg_private_network)
2875 return 0;
2876
2877 if (strv_isempty(arg_network_ipvlan))
2878 return 0;
2879
2880 r = sd_netlink_open(&rtnl);
2881 if (r < 0)
2882 return log_error_errno(r, "Failed to connect to netlink: %m");
2883
2884 udev = udev_new();
2885 if (!udev) {
2886 log_error("Failed to connect to udev.");
2887 return -ENOMEM;
2888 }
2889
2890 STRV_FOREACH(i, arg_network_ipvlan) {
2891 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2892 _cleanup_free_ char *n = NULL;
2893 int ifi;
2894
2895 ifi = parse_interface(udev, *i);
2896 if (ifi < 0)
2897 return ifi;
2898
2899 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2900 if (r < 0)
2901 return log_error_errno(r, "Failed to allocate netlink message: %m");
2902
2903 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2904 if (r < 0)
2905 return log_error_errno(r, "Failed to add netlink interface index: %m");
2906
2907 n = strappend("iv-", *i);
2908 if (!n)
2909 return log_oom();
2910
2911 strshorten(n, IFNAMSIZ-1);
2912
2913 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2914 if (r < 0)
2915 return log_error_errno(r, "Failed to add netlink interface name: %m");
2916
2917 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2918 if (r < 0)
2919 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2920
2921 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2922 if (r < 0)
2923 return log_error_errno(r, "Failed to open netlink container: %m");
2924
2925 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2926 if (r < 0)
2927 return log_error_errno(r, "Failed to open netlink container: %m");
2928
2929 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2930 if (r < 0)
2931 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2932
2933 r = sd_netlink_message_close_container(m);
2934 if (r < 0)
2935 return log_error_errno(r, "Failed to close netlink container: %m");
2936
2937 r = sd_netlink_message_close_container(m);
2938 if (r < 0)
2939 return log_error_errno(r, "Failed to close netlink container: %m");
2940
2941 r = sd_netlink_call(rtnl, m, 0, NULL);
2942 if (r < 0)
2943 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2944 }
2945
2946 return 0;
2947 }
2948
/* Installs the container's seccomp filter (only when built with
 * HAVE_SECCOMP; otherwise this simply returns 0).
 *
 * The filter allows everything by default, makes a fixed set of system
 * calls fail with EPERM unless the capability associated with each is part
 * of arg_retain, and makes creation of NETLINK_AUDIT sockets fail with
 * EAFNOSUPPORT. A seccomp_load() result of -EINVAL is logged at debug level
 * and treated as success. Returns 0 on success, a negative error code
 * otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                /* A retained capability keeps its system calls usable */
                if ((arg_retain & (1ULL << blacklist[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                /* -EFAULT means the syscall is unknown; that is fine and
                 * we simply move on to the next entry */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers, and much of the userspace
         * audit hookup fails when run inside one. We do not care and
         * simply prevent audit sockets from being created:
         * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with
         * EAFNOSUPPORT, which audit userspace takes as the sign that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3043
3044 static int setup_propagate(const char *root) {
3045 const char *p, *q;
3046
3047 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3048 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3049 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3050 (void) mkdir_p(p, 0600);
3051
3052 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3053 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3054
3055 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3056 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3057
3058 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3059 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3060
3061 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3062 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3063 return log_error_errno(errno, "Failed to install propagation bind mount.");
3064
3065 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3066 return log_error_errno(errno, "Failed to make propagation mount read-only");
3067
3068 return 0;
3069 }
3070
3071 static int setup_image(char **device_path, int *loop_nr) {
3072 struct loop_info64 info = {
3073 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3074 };
3075 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3076 _cleanup_free_ char* loopdev = NULL;
3077 struct stat st;
3078 int r, nr;
3079
3080 assert(device_path);
3081 assert(loop_nr);
3082 assert(arg_image);
3083
3084 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3085 if (fd < 0)
3086 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3087
3088 if (fstat(fd, &st) < 0)
3089 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3090
3091 if (S_ISBLK(st.st_mode)) {
3092 char *p;
3093
3094 p = strdup(arg_image);
3095 if (!p)
3096 return log_oom();
3097
3098 *device_path = p;
3099
3100 *loop_nr = -1;
3101
3102 r = fd;
3103 fd = -1;
3104
3105 return r;
3106 }
3107
3108 if (!S_ISREG(st.st_mode)) {
3109 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3110 return -EINVAL;
3111 }
3112
3113 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3114 if (control < 0)
3115 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3116
3117 nr = ioctl(control, LOOP_CTL_GET_FREE);
3118 if (nr < 0)
3119 return log_error_errno(errno, "Failed to allocate loop device: %m");
3120
3121 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3122 return log_oom();
3123
3124 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3125 if (loop < 0)
3126 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3127
3128 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3129 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3130
3131 if (arg_read_only)
3132 info.lo_flags |= LO_FLAGS_READ_ONLY;
3133
3134 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3135 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3136
3137 *device_path = loopdev;
3138 loopdev = NULL;
3139
3140 *loop_nr = nr;
3141
3142 r = loop;
3143 loop = -1;
3144
3145 return r;
3146 }
3147
/* Explanatory text appended to the error messages that dissect_image() logs
 * when it cannot find a usable partition table on the image. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
3154
3155 static int dissect_image(
3156 int fd,
3157 char **root_device, bool *root_device_rw,
3158 char **home_device, bool *home_device_rw,
3159 char **srv_device, bool *srv_device_rw,
3160 bool *secondary) {
3161
3162 #ifdef HAVE_BLKID
3163 int home_nr = -1, srv_nr = -1;
3164 #ifdef GPT_ROOT_NATIVE
3165 int root_nr = -1;
3166 #endif
3167 #ifdef GPT_ROOT_SECONDARY
3168 int secondary_root_nr = -1;
3169 #endif
3170 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
3171 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
3172 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
3173 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3174 _cleanup_udev_unref_ struct udev *udev = NULL;
3175 struct udev_list_entry *first, *item;
3176 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
3177 bool is_gpt, is_mbr, multiple_generic = false;
3178 const char *pttype = NULL;
3179 blkid_partlist pl;
3180 struct stat st;
3181 unsigned i;
3182 int r;
3183
3184 assert(fd >= 0);
3185 assert(root_device);
3186 assert(home_device);
3187 assert(srv_device);
3188 assert(secondary);
3189 assert(arg_image);
3190
3191 b = blkid_new_probe();
3192 if (!b)
3193 return log_oom();
3194
3195 errno = 0;
3196 r = blkid_probe_set_device(b, fd, 0, 0);
3197 if (r != 0) {
3198 if (errno == 0)
3199 return log_oom();
3200
3201 log_error_errno(errno, "Failed to set device on blkid probe: %m");
3202 return -errno;
3203 }
3204
3205 blkid_probe_enable_partitions(b, 1);
3206 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
3207
3208 errno = 0;
3209 r = blkid_do_safeprobe(b);
3210 if (r == -2 || r == 1) {
3211 log_error("Failed to identify any partition table on\n"
3212 " %s\n"
3213 PARTITION_TABLE_BLURB, arg_image);
3214 return -EINVAL;
3215 } else if (r != 0) {
3216 if (errno == 0)
3217 errno = EIO;
3218 log_error_errno(errno, "Failed to probe: %m");
3219 return -errno;
3220 }
3221
3222 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
3223
3224 is_gpt = streq_ptr(pttype, "gpt");
3225 is_mbr = streq_ptr(pttype, "dos");
3226
3227 if (!is_gpt && !is_mbr) {
3228 log_error("No GPT or MBR partition table discovered on\n"
3229 " %s\n"
3230 PARTITION_TABLE_BLURB, arg_image);
3231 return -EINVAL;
3232 }
3233
3234 errno = 0;
3235 pl = blkid_probe_get_partitions(b);
3236 if (!pl) {
3237 if (errno == 0)
3238 return log_oom();
3239
3240 log_error("Failed to list partitions of %s", arg_image);
3241 return -errno;
3242 }
3243
3244 udev = udev_new();
3245 if (!udev)
3246 return log_oom();
3247
3248 if (fstat(fd, &st) < 0)
3249 return log_error_errno(errno, "Failed to stat block device: %m");
3250
3251 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
3252 if (!d)
3253 return log_oom();
3254
3255 for (i = 0;; i++) {
3256 int n, m;
3257
3258 if (i >= 10) {
3259 log_error("Kernel partitions never appeared.");
3260 return -ENXIO;
3261 }
3262
3263 e = udev_enumerate_new(udev);
3264 if (!e)
3265 return log_oom();
3266
3267 r = udev_enumerate_add_match_parent(e, d);
3268 if (r < 0)
3269 return log_oom();
3270
3271 r = udev_enumerate_scan_devices(e);
3272 if (r < 0)
3273 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
3274
3275 /* Count the partitions enumerated by the kernel */
3276 n = 0;
3277 first = udev_enumerate_get_list_entry(e);
3278 udev_list_entry_foreach(item, first)
3279 n++;
3280
3281 /* Count the partitions enumerated by blkid */
3282 m = blkid_partlist_numof_partitions(pl);
3283 if (n == m + 1)
3284 break;
3285 if (n > m + 1) {
3286 log_error("blkid and kernel partition list do not match.");
3287 return -EIO;
3288 }
3289 if (n < m + 1) {
3290 unsigned j;
3291
3292 /* The kernel has probed fewer partitions than
3293 * blkid? Maybe the kernel prober is still
3294 * running or it got EBUSY because udev
3295 * already opened the device. Let's reprobe
3296 * the device, which is a synchronous call
3297 * that waits until probing is complete. */
3298
3299 for (j = 0; j < 20; j++) {
3300
3301 r = ioctl(fd, BLKRRPART, 0);
3302 if (r < 0)
3303 r = -errno;
3304 if (r >= 0 || r != -EBUSY)
3305 break;
3306
3307 /* If something else has the device
3308 * open, such as an udev rule, the
3309 * ioctl will return EBUSY. Since
3310 * there's no way to wait until it
3311 * isn't busy anymore, let's just wait
3312 * a bit, and try again.
3313 *
3314 * This is really something they
3315 * should fix in the kernel! */
3316
3317 usleep(50 * USEC_PER_MSEC);
3318 }
3319
3320 if (r < 0)
3321 return log_error_errno(r, "Failed to reread partition table: %m");
3322 }
3323
3324 e = udev_enumerate_unref(e);
3325 }
3326
3327 first = udev_enumerate_get_list_entry(e);
3328 udev_list_entry_foreach(item, first) {
3329 _cleanup_udev_device_unref_ struct udev_device *q;
3330 const char *node;
3331 unsigned long long flags;
3332 blkid_partition pp;
3333 dev_t qn;
3334 int nr;
3335
3336 errno = 0;
3337 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
3338 if (!q) {
3339 if (!errno)
3340 errno = ENOMEM;
3341
3342 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
3343 return -errno;
3344 }
3345
3346 qn = udev_device_get_devnum(q);
3347 if (major(qn) == 0)
3348 continue;
3349
3350 if (st.st_rdev == qn)
3351 continue;
3352
3353 node = udev_device_get_devnode(q);
3354 if (!node)
3355 continue;
3356
3357 pp = blkid_partlist_devno_to_partition(pl, qn);
3358 if (!pp)
3359 continue;
3360
3361 flags = blkid_partition_get_flags(pp);
3362
3363 nr = blkid_partition_get_partno(pp);
3364 if (nr < 0)
3365 continue;
3366
3367 if (is_gpt) {
3368 sd_id128_t type_id;
3369 const char *stype;
3370
3371 if (flags & GPT_FLAG_NO_AUTO)
3372 continue;
3373
3374 stype = blkid_partition_get_type_string(pp);
3375 if (!stype)
3376 continue;
3377
3378 if (sd_id128_from_string(stype, &type_id) < 0)
3379 continue;
3380
3381 if (sd_id128_equal(type_id, GPT_HOME)) {
3382
3383 if (home && nr >= home_nr)
3384 continue;
3385
3386 home_nr = nr;
3387 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3388
3389 r = free_and_strdup(&home, node);
3390 if (r < 0)
3391 return log_oom();
3392
3393 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3394
3395 if (srv && nr >= srv_nr)
3396 continue;
3397
3398 srv_nr = nr;
3399 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3400
3401 r = free_and_strdup(&srv, node);
3402 if (r < 0)
3403 return log_oom();
3404 }
3405 #ifdef GPT_ROOT_NATIVE
3406 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3407
3408 if (root && nr >= root_nr)
3409 continue;
3410
3411 root_nr = nr;
3412 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3413
3414 r = free_and_strdup(&root, node);
3415 if (r < 0)
3416 return log_oom();
3417 }
3418 #endif
3419 #ifdef GPT_ROOT_SECONDARY
3420 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3421
3422 if (secondary_root && nr >= secondary_root_nr)
3423 continue;
3424
3425 secondary_root_nr = nr;
3426 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3427
3428 r = free_and_strdup(&secondary_root, node);
3429 if (r < 0)
3430 return log_oom();
3431 }
3432 #endif
3433 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3434
3435 if (generic)
3436 multiple_generic = true;
3437 else {
3438 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3439
3440 r = free_and_strdup(&generic, node);
3441 if (r < 0)
3442 return log_oom();
3443 }
3444 }
3445
3446 } else if (is_mbr) {
3447 int type;
3448
3449 if (flags != 0x80) /* Bootable flag */
3450 continue;
3451
3452 type = blkid_partition_get_type(pp);
3453 if (type != 0x83) /* Linux partition */
3454 continue;
3455
3456 if (generic)
3457 multiple_generic = true;
3458 else {
3459 generic_rw = true;
3460
3461 r = free_and_strdup(&root, node);
3462 if (r < 0)
3463 return log_oom();
3464 }
3465 }
3466 }
3467
3468 if (root) {
3469 *root_device = root;
3470 root = NULL;
3471
3472 *root_device_rw = root_rw;
3473 *secondary = false;
3474 } else if (secondary_root) {
3475 *root_device = secondary_root;
3476 secondary_root = NULL;
3477
3478 *root_device_rw = secondary_root_rw;
3479 *secondary = true;
3480 } else if (generic) {
3481
3482 /* There were no partitions with precise meanings
3483 * around, but we found generic partitions. In this
3484 * case, if there's only one, we can go ahead and boot
3485 * it, otherwise we bail out, because we really cannot
3486 * make any sense of it. */
3487
3488 if (multiple_generic) {
3489 log_error("Identified multiple bootable Linux partitions on\n"
3490 " %s\n"
3491 PARTITION_TABLE_BLURB, arg_image);
3492 return -EINVAL;
3493 }
3494
3495 *root_device = generic;
3496 generic = NULL;
3497
3498 *root_device_rw = generic_rw;
3499 *secondary = false;
3500 } else {
3501 log_error("Failed to identify root partition in disk image\n"
3502 " %s\n"
3503 PARTITION_TABLE_BLURB, arg_image);
3504 return -EINVAL;
3505 }
3506
3507 if (home) {
3508 *home_device = home;
3509 home = NULL;
3510
3511 *home_device_rw = home_rw;
3512 }
3513
3514 if (srv) {
3515 *srv_device = srv;
3516 srv = NULL;
3517
3518 *srv_device_rw = srv_rw;
3519 }
3520
3521 return 0;
3522 #else
3523 log_error("--image= is not supported, compiled without blkid support.");
3524 return -EOPNOTSUPP;
3525 #endif
3526 }
3527
/* Probes the file system type of "what" with libblkid and mounts it on
 * "where", or on "where" with "directory" appended when "directory" is
 * non-NULL. The mount is read-only when "rw" is false or arg_read_only is
 * set. LUKS volumes are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure. Without
 * blkid support compiled in, always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *fs_type, *target;
        unsigned long mount_flags = MS_NODEV;
        int probe_result;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-device setting */
        if (arg_read_only || !rw)
                mount_flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strjoina(where, directory);

        /* Clear errno first, so that we can tell whether the library
         * reported a reason for the failure */
        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        /* We are only interested in the superblock type */
        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        probe_result = blkid_do_safeprobe(probe);
        if (probe_result == -1 || probe_result == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        }
        if (probe_result != 0) {
                if (errno == 0)
                        errno = EIO;

                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &fs_type, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fs_type, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, target, fs_type, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3591
3592 static int mount_devices(
3593 const char *where,
3594 const char *root_device, bool root_device_rw,
3595 const char *home_device, bool home_device_rw,
3596 const char *srv_device, bool srv_device_rw) {
3597 int r;
3598
3599 assert(where);
3600
3601 if (root_device) {
3602 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3603 if (r < 0)
3604 return log_error_errno(r, "Failed to mount root directory: %m");
3605 }
3606
3607 if (home_device) {
3608 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3609 if (r < 0)
3610 return log_error_errno(r, "Failed to mount home directory: %m");
3611 }
3612
3613 if (srv_device) {
3614 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3615 if (r < 0)
3616 return log_error_errno(r, "Failed to mount server data directory: %m");
3617 }
3618
3619 return 0;
3620 }
3621
3622 static void loop_remove(int nr, int *image_fd) {
3623 _cleanup_close_ int control = -1;
3624 int r;
3625
3626 if (nr < 0)
3627 return;
3628
3629 if (image_fd && *image_fd >= 0) {
3630 r = ioctl(*image_fd, LOOP_CLR_FD);
3631 if (r < 0)
3632 log_debug_errno(errno, "Failed to close loop image: %m");
3633 *image_fd = safe_close(*image_fd);
3634 }
3635
3636 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3637 if (control < 0) {
3638 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3639 return;
3640 }
3641
3642 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3643 if (r < 0)
3644 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3645 }
3646
3647 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3648 int pipe_fds[2];
3649 pid_t pid;
3650
3651 assert(database);
3652 assert(key);
3653 assert(rpid);
3654
3655 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3656 return log_error_errno(errno, "Failed to allocate pipe: %m");
3657
3658 pid = fork();
3659 if (pid < 0)
3660 return log_error_errno(errno, "Failed to fork getent child: %m");
3661 else if (pid == 0) {
3662 int nullfd;
3663 char *empty_env = NULL;
3664
3665 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3666 _exit(EXIT_FAILURE);
3667
3668 if (pipe_fds[0] > 2)
3669 safe_close(pipe_fds[0]);
3670 if (pipe_fds[1] > 2)
3671 safe_close(pipe_fds[1]);
3672
3673 nullfd = open("/dev/null", O_RDWR);
3674 if (nullfd < 0)
3675 _exit(EXIT_FAILURE);
3676
3677 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3678 _exit(EXIT_FAILURE);
3679
3680 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3681 _exit(EXIT_FAILURE);
3682
3683 if (nullfd > 2)
3684 safe_close(nullfd);
3685
3686 (void) reset_all_signal_handlers();
3687 (void) reset_signal_mask();
3688 close_all_fds(NULL, 0);
3689
3690 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3691 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3692 _exit(EXIT_FAILURE);
3693 }
3694
3695 pipe_fds[1] = safe_close(pipe_fds[1]);
3696
3697 *rpid = pid;
3698
3699 return pipe_fds[0];
3700 }
3701
3702 static int change_uid_gid(char **_home) {
3703 char line[LINE_MAX], *x, *u, *g, *h;
3704 const char *word, *state;
3705 _cleanup_free_ uid_t *uids = NULL;
3706 _cleanup_free_ char *home = NULL;
3707 _cleanup_fclose_ FILE *f = NULL;
3708 _cleanup_close_ int fd = -1;
3709 unsigned n_uids = 0;
3710 size_t sz = 0, l;
3711 uid_t uid;
3712 gid_t gid;
3713 pid_t pid;
3714 int r;
3715
3716 assert(_home);
3717
3718 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3719 /* Reset everything fully to 0, just in case */
3720
3721 r = reset_uid_gid();
3722 if (r < 0)
3723 return log_error_errno(r, "Failed to become root: %m");
3724
3725 *_home = NULL;
3726 return 0;
3727 }
3728
3729 /* First, get user credentials */
3730 fd = spawn_getent("passwd", arg_user, &pid);
3731 if (fd < 0)
3732 return fd;
3733
3734 f = fdopen(fd, "r");
3735 if (!f)
3736 return log_oom();
3737 fd = -1;
3738
3739 if (!fgets(line, sizeof(line), f)) {
3740
3741 if (!ferror(f)) {
3742 log_error("Failed to resolve user %s.", arg_user);
3743 return -ESRCH;
3744 }
3745
3746 log_error_errno(errno, "Failed to read from getent: %m");
3747 return -errno;
3748 }
3749
3750 truncate_nl(line);
3751
3752 wait_for_terminate_and_warn("getent passwd", pid, true);
3753
3754 x = strchr(line, ':');
3755 if (!x) {
3756 log_error("/etc/passwd entry has invalid user field.");
3757 return -EIO;
3758 }
3759
3760 u = strchr(x+1, ':');
3761 if (!u) {
3762 log_error("/etc/passwd entry has invalid password field.");
3763 return -EIO;
3764 }
3765
3766 u++;
3767 g = strchr(u, ':');
3768 if (!g) {
3769 log_error("/etc/passwd entry has invalid UID field.");
3770 return -EIO;
3771 }
3772
3773 *g = 0;
3774 g++;
3775 x = strchr(g, ':');
3776 if (!x) {
3777 log_error("/etc/passwd entry has invalid GID field.");
3778 return -EIO;
3779 }
3780
3781 *x = 0;
3782 h = strchr(x+1, ':');
3783 if (!h) {
3784 log_error("/etc/passwd entry has invalid GECOS field.");
3785 return -EIO;
3786 }
3787
3788 h++;
3789 x = strchr(h, ':');
3790 if (!x) {
3791 log_error("/etc/passwd entry has invalid home directory field.");
3792 return -EIO;
3793 }
3794
3795 *x = 0;
3796
3797 r = parse_uid(u, &uid);
3798 if (r < 0) {
3799 log_error("Failed to parse UID of user.");
3800 return -EIO;
3801 }
3802
3803 r = parse_gid(g, &gid);
3804 if (r < 0) {
3805 log_error("Failed to parse GID of user.");
3806 return -EIO;
3807 }
3808
3809 home = strdup(h);
3810 if (!home)
3811 return log_oom();
3812
3813 /* Second, get group memberships */
3814 fd = spawn_getent("initgroups", arg_user, &pid);
3815 if (fd < 0)
3816 return fd;
3817
3818 fclose(f);
3819 f = fdopen(fd, "r");
3820 if (!f)
3821 return log_oom();
3822 fd = -1;
3823
3824 if (!fgets(line, sizeof(line), f)) {
3825 if (!ferror(f)) {
3826 log_error("Failed to resolve user %s.", arg_user);
3827 return -ESRCH;
3828 }
3829
3830 log_error_errno(errno, "Failed to read from getent: %m");
3831 return -errno;
3832 }
3833
3834 truncate_nl(line);
3835
3836 wait_for_terminate_and_warn("getent initgroups", pid, true);
3837
3838 /* Skip over the username and subsequent separator whitespace */
3839 x = line;
3840 x += strcspn(x, WHITESPACE);
3841 x += strspn(x, WHITESPACE);
3842
3843 FOREACH_WORD(word, l, x, state) {
3844 char c[l+1];
3845
3846 memcpy(c, word, l);
3847 c[l] = 0;
3848
3849 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3850 return log_oom();
3851
3852 r = parse_uid(c, &uids[n_uids++]);
3853 if (r < 0) {
3854 log_error("Failed to parse group data from getent.");
3855 return -EIO;
3856 }
3857 }
3858
3859 r = mkdir_parents(home, 0775);
3860 if (r < 0)
3861 return log_error_errno(r, "Failed to make home root directory: %m");
3862
3863 r = mkdir_safe(home, 0755, uid, gid);
3864 if (r < 0 && r != -EEXIST)
3865 return log_error_errno(r, "Failed to make home directory: %m");
3866
3867 (void) fchown(STDIN_FILENO, uid, gid);
3868 (void) fchown(STDOUT_FILENO, uid, gid);
3869 (void) fchown(STDERR_FILENO, uid, gid);
3870
3871 if (setgroups(n_uids, uids) < 0)
3872 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3873
3874 if (setresgid(gid, gid, gid) < 0)
3875 return log_error_errno(errno, "setregid() failed: %m");
3876
3877 if (setresuid(uid, uid, uid) < 0)
3878 return log_error_errno(errno, "setreuid() failed: %m");
3879
3880 if (_home) {
3881 *_home = home;
3882 home = NULL;
3883 }
3884
3885 return 0;
3886 }
3887
3888 /*
3889 * Return values:
3890 * < 0 : wait_for_terminate() failed to get the state of the
3891 * container, the container was terminated by a signal, or
3892 * failed for an unknown reason. No change is made to the
3893 * container argument.
3894 * > 0 : The program executed in the container terminated with an
3895 * error. The exit code of the program executed in the
3896 * container is returned. The container argument has been set
3897 * to CONTAINER_TERMINATED.
3898 * 0 : The container is being rebooted, has been shut down or exited
3899 * successfully. The container argument has been set to either
3900 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3901 *
3902 * That is, success is indicated by a return value of zero, and an
3903 * error is indicated by a non-zero value.
3904 */
3905 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3906 siginfo_t status;
3907 int r;
3908
3909 r = wait_for_terminate(pid, &status);
3910 if (r < 0)
3911 return log_warning_errno(r, "Failed to wait for container: %m");
3912
3913 switch (status.si_code) {
3914
3915 case CLD_EXITED:
3916 if (status.si_status == 0) {
3917 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3918
3919 } else
3920 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3921
3922 *container = CONTAINER_TERMINATED;
3923 return status.si_status;
3924
3925 case CLD_KILLED:
3926 if (status.si_status == SIGINT) {
3927
3928 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3929 *container = CONTAINER_TERMINATED;
3930 return 0;
3931
3932 } else if (status.si_status == SIGHUP) {
3933
3934 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3935 *container = CONTAINER_REBOOTED;
3936 return 0;
3937 }
3938
3939 /* CLD_KILLED fallthrough */
3940
3941 case CLD_DUMPED:
3942 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3943 return -EIO;
3944
3945 default:
3946 log_error("Container %s failed due to unknown reason.", arg_machine);
3947 return -EIO;
3948 }
3949
3950 return r;
3951 }
3952
/* Handler that ignores its "sig" argument and does nothing at all. */
static void nop_handler(int sig) {
}
3954
3955 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3956 pid_t pid;
3957
3958 pid = PTR_TO_UINT32(userdata);
3959 if (pid > 0) {
3960 if (kill(pid, arg_kill_signal) >= 0) {
3961 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3962 sd_event_source_set_userdata(s, NULL);
3963 return 0;
3964 }
3965 }
3966
3967 sd_event_exit(sd_event_source_get_event(s), 0);
3968 return 0;
3969 }
3970
3971 static int determine_names(void) {
3972 int r;
3973
3974 if (!arg_image && !arg_directory) {
3975 if (arg_machine) {
3976 _cleanup_(image_unrefp) Image *i = NULL;
3977
3978 r = image_find(arg_machine, &i);
3979 if (r < 0)
3980 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3981 else if (r == 0) {
3982 log_error("No image for machine '%s': %m", arg_machine);
3983 return -ENOENT;
3984 }
3985
3986 if (i->type == IMAGE_RAW)
3987 r = set_sanitized_path(&arg_image, i->path);
3988 else
3989 r = set_sanitized_path(&arg_directory, i->path);
3990 if (r < 0)
3991 return log_error_errno(r, "Invalid image directory: %m");
3992
3993 if (!arg_ephemeral)
3994 arg_read_only = arg_read_only || i->read_only;
3995 } else
3996 arg_directory = get_current_dir_name();
3997
3998 if (!arg_directory && !arg_machine) {
3999 log_error("Failed to determine path, please use -D or -i.");
4000 return -EINVAL;
4001 }
4002 }
4003
4004 if (!arg_machine) {
4005 if (arg_directory && path_equal(arg_directory, "/"))
4006 arg_machine = gethostname_malloc();
4007 else
4008 arg_machine = strdup(basename(arg_image ?: arg_directory));
4009
4010 if (!arg_machine)
4011 return log_oom();
4012
4013 hostname_cleanup(arg_machine);
4014 if (!machine_name_is_valid(arg_machine)) {
4015 log_error("Failed to determine machine name automatically, please use -M.");
4016 return -EINVAL;
4017 }
4018
4019 if (arg_ephemeral) {
4020 char *b;
4021
4022 /* Add a random suffix when this is an
4023 * ephemeral machine, so that we can run many
4024 * instances at once without manually having
4025 * to specify -M each time. */
4026
4027 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4028 return log_oom();
4029
4030 free(arg_machine);
4031 arg_machine = b;
4032 }
4033 }
4034
4035 return 0;
4036 }
4037
4038 static int determine_uid_shift(const char *directory) {
4039 int r;
4040
4041 if (!arg_userns) {
4042 arg_uid_shift = 0;
4043 return 0;
4044 }
4045
4046 if (arg_uid_shift == UID_INVALID) {
4047 struct stat st;
4048
4049 r = stat(directory, &st);
4050 if (r < 0)
4051 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4052
4053 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4054
4055 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4056 log_error("UID and GID base of %s don't match.", directory);
4057 return -EINVAL;
4058 }
4059
4060 arg_uid_range = UINT32_C(0x10000);
4061 }
4062
4063 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4064 log_error("UID base too high for UID range.");
4065 return -EINVAL;
4066 }
4067
4068 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4069 return 0;
4070 }
4071
4072 static int inner_child(
4073 Barrier *barrier,
4074 const char *directory,
4075 bool secondary,
4076 int kmsg_socket,
4077 int rtnl_socket,
4078 FDSet *fds,
4079 int argc,
4080 char *argv[]) {
4081
4082 _cleanup_free_ char *home = NULL;
4083 unsigned n_env = 2;
4084 const char *envp[] = {
4085 "PATH=" DEFAULT_PATH_SPLIT_USR,
4086 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4087 NULL, /* TERM */
4088 NULL, /* HOME */
4089 NULL, /* USER */
4090 NULL, /* LOGNAME */
4091 NULL, /* container_uuid */
4092 NULL, /* LISTEN_FDS */
4093 NULL, /* LISTEN_PID */
4094 NULL
4095 };
4096
4097 _cleanup_strv_free_ char **env_use = NULL;
4098 int r;
4099
4100 assert(barrier);
4101 assert(directory);
4102 assert(kmsg_socket >= 0);
4103
4104 if (arg_userns) {
4105 /* Tell the parent, that it now can write the UID map. */
4106 (void) barrier_place(barrier); /* #1 */
4107
4108 /* Wait until the parent wrote the UID map */
4109 if (!barrier_place_and_sync(barrier)) { /* #2 */
4110 log_error("Parent died too early");
4111 return -ESRCH;
4112 }
4113 }
4114
4115 r = mount_all(NULL, true);
4116 if (r < 0)
4117 return r;
4118
4119 /* Wait until we are cgroup-ified, so that we
4120 * can mount the right cgroup path writable */
4121 if (!barrier_place_and_sync(barrier)) { /* #3 */
4122 log_error("Parent died too early");
4123 return -ESRCH;
4124 }
4125
4126 r = mount_systemd_cgroup_writable("");
4127 if (r < 0)
4128 return r;
4129
4130 r = reset_uid_gid();
4131 if (r < 0)
4132 return log_error_errno(r, "Couldn't become new root: %m");
4133
4134 r = setup_boot_id(NULL);
4135 if (r < 0)
4136 return r;
4137
4138 r = setup_kmsg(NULL, kmsg_socket);
4139 if (r < 0)
4140 return r;
4141 kmsg_socket = safe_close(kmsg_socket);
4142
4143 umask(0022);
4144
4145 if (setsid() < 0)
4146 return log_error_errno(errno, "setsid() failed: %m");
4147
4148 if (arg_private_network)
4149 loopback_setup();
4150
4151 r = send_rtnl(rtnl_socket);
4152 if (r < 0)
4153 return r;
4154 rtnl_socket = safe_close(rtnl_socket);
4155
4156 if (drop_capabilities() < 0)
4157 return log_error_errno(errno, "drop_capabilities() failed: %m");
4158
4159 setup_hostname();
4160
4161 if (arg_personality != PERSONALITY_INVALID) {
4162 if (personality(arg_personality) < 0)
4163 return log_error_errno(errno, "personality() failed: %m");
4164 } else if (secondary) {
4165 if (personality(PER_LINUX32) < 0)
4166 return log_error_errno(errno, "personality() failed: %m");
4167 }
4168
4169 #ifdef HAVE_SELINUX
4170 if (arg_selinux_context)
4171 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4172 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4173 #endif
4174
4175 r = change_uid_gid(&home);
4176 if (r < 0)
4177 return r;
4178
4179 envp[n_env] = strv_find_prefix(environ, "TERM=");
4180 if (envp[n_env])
4181 n_env ++;
4182
4183 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4184 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4185 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4186 return log_oom();
4187
4188 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4189 char as_uuid[37];
4190
4191 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4192 return log_oom();
4193 }
4194
4195 if (fdset_size(fds) > 0) {
4196 r = fdset_cloexec(fds, false);
4197 if (r < 0)
4198 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4199
4200 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4201 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4202 return log_oom();
4203 }
4204
4205 env_use = strv_env_merge(2, envp, arg_setenv);
4206 if (!env_use)
4207 return log_oom();
4208
4209 /* Let the parent know that we are ready and
4210 * wait until the parent is ready with the
4211 * setup, too... */
4212 if (!barrier_place_and_sync(barrier)) { /* #4 */
4213 log_error("Parent died too early");
4214 return -ESRCH;
4215 }
4216
4217 /* Now, explicitly close the log, so that we
4218 * then can close all remaining fds. Closing
4219 * the log explicitly first has the benefit
4220 * that the logging subsystem knows about it,
4221 * and is thus ready to be reopened should we
4222 * need it again. Note that the other fds
4223 * closed here are at least the locking and
4224 * barrier fds. */
4225 log_close();
4226 (void) fdset_close_others(fds);
4227
4228 if (arg_boot) {
4229 char **a;
4230 size_t m;
4231
4232 /* Automatically search for the init system */
4233
4234 m = 1 + argc - optind;
4235 a = newa(char*, m + 1);
4236 memcpy(a + 1, argv + optind, m * sizeof(char*));
4237
4238 a[0] = (char*) "/usr/lib/systemd/systemd";
4239 execve(a[0], a, env_use);
4240
4241 a[0] = (char*) "/lib/systemd/systemd";
4242 execve(a[0], a, env_use);
4243
4244 a[0] = (char*) "/sbin/init";
4245 execve(a[0], a, env_use);
4246 } else if (argc > optind)
4247 execvpe(argv[optind], argv + optind, env_use);
4248 else {
4249 chdir(home ? home : "/root");
4250 execle("/bin/bash", "-bash", NULL, env_use);
4251 execle("/bin/sh", "-sh", NULL, env_use);
4252 }
4253
4254 (void) log_open();
4255 return log_error_errno(errno, "execv() failed: %m");
4256 }
4257
4258 static int outer_child(
4259 Barrier *barrier,
4260 const char *directory,
4261 const char *console,
4262 const char *root_device, bool root_device_rw,
4263 const char *home_device, bool home_device_rw,
4264 const char *srv_device, bool srv_device_rw,
4265 bool interactive,
4266 bool secondary,
4267 int pid_socket,
4268 int kmsg_socket,
4269 int rtnl_socket,
4270 int uid_shift_socket,
4271 FDSet *fds,
4272 int argc,
4273 char *argv[]) {
4274
4275 pid_t pid;
4276 ssize_t l;
4277 int r;
4278
4279 assert(barrier);
4280 assert(directory);
4281 assert(console);
4282 assert(pid_socket >= 0);
4283 assert(kmsg_socket >= 0);
4284
4285 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4286 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4287
4288 if (interactive) {
4289 close_nointr(STDIN_FILENO);
4290 close_nointr(STDOUT_FILENO);
4291 close_nointr(STDERR_FILENO);
4292
4293 r = open_terminal(console, O_RDWR);
4294 if (r != STDIN_FILENO) {
4295 if (r >= 0) {
4296 safe_close(r);
4297 r = -EINVAL;
4298 }
4299
4300 return log_error_errno(r, "Failed to open console: %m");
4301 }
4302
4303 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4304 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4305 return log_error_errno(errno, "Failed to duplicate console: %m");
4306 }
4307
4308 r = reset_audit_loginuid();
4309 if (r < 0)
4310 return r;
4311
4312 /* Mark everything as slave, so that we still
4313 * receive mounts from the real root, but don't
4314 * propagate mounts to the real root. */
4315 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4316 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4317
4318 r = mount_devices(directory,
4319 root_device, root_device_rw,
4320 home_device, home_device_rw,
4321 srv_device, srv_device_rw);
4322 if (r < 0)
4323 return r;
4324
4325 r = determine_uid_shift(directory);
4326 if (r < 0)
4327 return r;
4328
4329 if (arg_userns) {
4330 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4331 if (l < 0)
4332 return log_error_errno(errno, "Failed to send UID shift: %m");
4333 if (l != sizeof(arg_uid_shift)) {
4334 log_error("Short write while sending UID shift.");
4335 return -EIO;
4336 }
4337 }
4338
4339 /* Turn directory into bind mount */
4340 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4341 return log_error_errno(errno, "Failed to make bind mount: %m");
4342
4343 r = setup_volatile(directory);
4344 if (r < 0)
4345 return r;
4346
4347 r = setup_volatile_state(directory);
4348 if (r < 0)
4349 return r;
4350
4351 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4352 if (r < 0)
4353 return r;
4354
4355 if (arg_read_only) {
4356 r = bind_remount_recursive(directory, true);
4357 if (r < 0)
4358 return log_error_errno(r, "Failed to make tree read-only: %m");
4359 }
4360
4361 r = mount_all(directory, false);
4362 if (r < 0)
4363 return r;
4364
4365 if (copy_devnodes(directory) < 0)
4366 return r;
4367
4368 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4369
4370 if (setup_pts(directory) < 0)
4371 return r;
4372
4373 r = setup_propagate(directory);
4374 if (r < 0)
4375 return r;
4376
4377 r = setup_dev_console(directory, console);
4378 if (r < 0)
4379 return r;
4380
4381 r = setup_seccomp();
4382 if (r < 0)
4383 return r;
4384
4385 r = setup_timezone(directory);
4386 if (r < 0)
4387 return r;
4388
4389 r = setup_resolv_conf(directory);
4390 if (r < 0)
4391 return r;
4392
4393 r = setup_journal(directory);
4394 if (r < 0)
4395 return r;
4396
4397 r = mount_custom(directory);
4398 if (r < 0)
4399 return r;
4400
4401 r = mount_cgroup(directory);
4402 if (r < 0)
4403 return r;
4404
4405 r = mount_move_root(directory);
4406 if (r < 0)
4407 return log_error_errno(r, "Failed to move root directory: %m");
4408
4409 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4410 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4411 (arg_private_network ? CLONE_NEWNET : 0) |
4412 (arg_userns ? CLONE_NEWUSER : 0),
4413 NULL);
4414 if (pid < 0)
4415 return log_error_errno(errno, "Failed to fork inner child: %m");
4416
4417 if (pid == 0) {
4418 pid_socket = safe_close(pid_socket);
4419 uid_shift_socket = safe_close(uid_shift_socket);
4420
4421 /* The inner child has all namespaces that are
4422 * requested, so that we all are owned by the user if
4423 * user namespaces are turned on. */
4424
4425 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4426 if (r < 0)
4427 _exit(EXIT_FAILURE);
4428
4429 _exit(EXIT_SUCCESS);
4430 }
4431
4432 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4433 if (l < 0)
4434 return log_error_errno(errno, "Failed to send PID: %m");
4435 if (l != sizeof(pid)) {
4436 log_error("Short write while sending PID.");
4437 return -EIO;
4438 }
4439
4440 pid_socket = safe_close(pid_socket);
4441
4442 return 0;
4443 }
4444
4445 static int setup_uid_map(pid_t pid) {
4446 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4447 int r;
4448
4449 assert(pid > 1);
4450
4451 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4452 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4453 r = write_string_file(uid_map, line, 0);
4454 if (r < 0)
4455 return log_error_errno(r, "Failed to write UID map: %m");
4456
4457 /* We always assign the same UID and GID ranges */
4458 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4459 r = write_string_file(uid_map, line, 0);
4460 if (r < 0)
4461 return log_error_errno(r, "Failed to write GID map: %m");
4462
4463 return 0;
4464 }
4465
4466 static int chown_cgroup(pid_t pid) {
4467 _cleanup_free_ char *path = NULL, *fs = NULL;
4468 _cleanup_close_ int fd = -1;
4469 const char *fn;
4470 int r;
4471
4472 r = cg_pid_get_path(NULL, pid, &path);
4473 if (r < 0)
4474 return log_error_errno(r, "Failed to get container cgroup path: %m");
4475
4476 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4477 if (r < 0)
4478 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4479
4480 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4481 if (fd < 0)
4482 return log_error_errno(errno, "Failed to open %s: %m", fs);
4483
4484 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4485 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4486 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4487
4488 return 0;
4489 }
4490
4491 int main(int argc, char *argv[]) {
4492
4493 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4494 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4495 _cleanup_close_ int master = -1, image_fd = -1;
4496 _cleanup_fdset_free_ FDSet *fds = NULL;
4497 int r, n_fd_passed, loop_nr = -1;
4498 char veth_name[IFNAMSIZ];
4499 bool secondary = false, remove_subvol = false;
4500 sigset_t mask_chld;
4501 pid_t pid = 0;
4502 int ret = EXIT_SUCCESS;
4503 union in_addr_union exposed = {};
4504 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4505 bool interactive;
4506
4507 log_parse_environment();
4508 log_open();
4509
4510 r = parse_argv(argc, argv);
4511 if (r <= 0)
4512 goto finish;
4513
4514 r = determine_names();
4515 if (r < 0)
4516 goto finish;
4517
4518 if (geteuid() != 0) {
4519 log_error("Need to be root.");
4520 r = -EPERM;
4521 goto finish;
4522 }
4523
4524 n_fd_passed = sd_listen_fds(false);
4525 if (n_fd_passed > 0) {
4526 r = fdset_new_listen_fds(&fds, false);
4527 if (r < 0) {
4528 log_error_errno(r, "Failed to collect file descriptors: %m");
4529 goto finish;
4530 }
4531 }
4532
4533 if (arg_directory) {
4534 assert(!arg_image);
4535
4536 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4537 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4538 r = -EINVAL;
4539 goto finish;
4540 }
4541
4542 if (arg_ephemeral) {
4543 _cleanup_free_ char *np = NULL;
4544
4545 /* If the specified path is a mount point we
4546 * generate the new snapshot immediately
4547 * inside it under a random name. However if
4548 * the specified is not a mount point we
4549 * create the new snapshot in the parent
4550 * directory, just next to it. */
4551 r = path_is_mount_point(arg_directory, 0);
4552 if (r < 0) {
4553 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4554 goto finish;
4555 }
4556 if (r > 0)
4557 r = tempfn_random_child(arg_directory, "machine.", &np);
4558 else
4559 r = tempfn_random(arg_directory, "machine.", &np);
4560 if (r < 0) {
4561 log_error_errno(r, "Failed to generate name for snapshot: %m");
4562 goto finish;
4563 }
4564
4565 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4566 if (r < 0) {
4567 log_error_errno(r, "Failed to lock %s: %m", np);
4568 goto finish;
4569 }
4570
4571 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4572 if (r < 0) {
4573 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4574 goto finish;
4575 }
4576
4577 free(arg_directory);
4578 arg_directory = np;
4579 np = NULL;
4580
4581 remove_subvol = true;
4582
4583 } else {
4584 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4585 if (r == -EBUSY) {
4586 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4587 goto finish;
4588 }
4589 if (r < 0) {
4590 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4591 return r;
4592 }
4593
4594 if (arg_template) {
4595 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4596 if (r == -EEXIST) {
4597 if (!arg_quiet)
4598 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4599 } else if (r < 0) {
4600 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4601 goto finish;
4602 } else {
4603 if (!arg_quiet)
4604 log_info("Populated %s from template %s.", arg_directory, arg_template);
4605 }
4606 }
4607 }
4608
4609 if (arg_boot) {
4610 if (path_is_os_tree(arg_directory) <= 0) {
4611 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4612 r = -EINVAL;
4613 goto finish;
4614 }
4615 } else {
4616 const char *p;
4617
4618 p = strjoina(arg_directory,
4619 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4620 if (access(p, F_OK) < 0) {
4621 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
4622 r = -EINVAL;
4623 goto finish;
4624 }
4625 }
4626
4627 } else {
4628 char template[] = "/tmp/nspawn-root-XXXXXX";
4629
4630 assert(arg_image);
4631 assert(!arg_template);
4632
4633 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4634 if (r == -EBUSY) {
4635 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4636 goto finish;
4637 }
4638 if (r < 0) {
4639 r = log_error_errno(r, "Failed to create image lock: %m");
4640 goto finish;
4641 }
4642
4643 if (!mkdtemp(template)) {
4644 log_error_errno(errno, "Failed to create temporary directory: %m");
4645 r = -errno;
4646 goto finish;
4647 }
4648
4649 arg_directory = strdup(template);
4650 if (!arg_directory) {
4651 r = log_oom();
4652 goto finish;
4653 }
4654
4655 image_fd = setup_image(&device_path, &loop_nr);
4656 if (image_fd < 0) {
4657 r = image_fd;
4658 goto finish;
4659 }
4660
4661 r = dissect_image(image_fd,
4662 &root_device, &root_device_rw,
4663 &home_device, &home_device_rw,
4664 &srv_device, &srv_device_rw,
4665 &secondary);
4666 if (r < 0)
4667 goto finish;
4668 }
4669
4670 r = custom_mounts_prepare();
4671 if (r < 0)
4672 goto finish;
4673
4674 interactive =
4675 isatty(STDIN_FILENO) > 0 &&
4676 isatty(STDOUT_FILENO) > 0;
4677
4678 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4679 if (master < 0) {
4680 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4681 goto finish;
4682 }
4683
4684 r = ptsname_malloc(master, &console);
4685 if (r < 0) {
4686 r = log_error_errno(r, "Failed to determine tty name: %m");
4687 goto finish;
4688 }
4689
4690 if (unlockpt(master) < 0) {
4691 r = log_error_errno(errno, "Failed to unlock tty: %m");
4692 goto finish;
4693 }
4694
4695 if (!arg_quiet)
4696 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4697 arg_machine, arg_image ?: arg_directory);
4698
4699 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4700
4701 assert_se(sigemptyset(&mask_chld) == 0);
4702 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4703
4704 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4705 r = log_error_errno(errno, "Failed to become subreaper: %m");
4706 goto finish;
4707 }
4708
4709 for (;;) {
4710 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4711 uid_shift_socket_pair[2] = { -1, -1 };
4712 ContainerStatus container_status;
4713 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4714 static const struct sigaction sa = {
4715 .sa_handler = nop_handler,
4716 .sa_flags = SA_NOCLDSTOP,
4717 };
4718 int ifi = 0;
4719 ssize_t l;
4720 _cleanup_event_unref_ sd_event *event = NULL;
4721 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4722 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4723 char last_char = 0;
4724
4725 r = barrier_create(&barrier);
4726 if (r < 0) {
4727 log_error_errno(r, "Cannot initialize IPC barrier: %m");
4728 goto finish;
4729 }
4730
4731 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4732 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4733 goto finish;
4734 }
4735
4736 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4737 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4738 goto finish;
4739 }
4740
4741 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4742 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4743 goto finish;
4744 }
4745
4746 if (arg_userns)
4747 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4748 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4749 goto finish;
4750 }
4751
4752 /* Child can be killed before execv(), so handle SIGCHLD
4753 * in order to interrupt parent's blocking calls and
4754 * give it a chance to call wait() and terminate. */
4755 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4756 if (r < 0) {
4757 r = log_error_errno(errno, "Failed to change the signal mask: %m");
4758 goto finish;
4759 }
4760
4761 r = sigaction(SIGCHLD, &sa, NULL);
4762 if (r < 0) {
4763 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4764 goto finish;
4765 }
4766
4767 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
4768 if (pid < 0) {
4769 if (errno == EINVAL)
4770 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4771 else
4772 r = log_error_errno(errno, "clone() failed: %m");
4773
4774 goto finish;
4775 }
4776
4777 if (pid == 0) {
4778 /* The outer child only has a file system namespace. */
4779 barrier_set_role(&barrier, BARRIER_CHILD);
4780
4781 master = safe_close(master);
4782
4783 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4784 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4785 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4786 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4787
4788 (void) reset_all_signal_handlers();
4789 (void) reset_signal_mask();
4790
4791 r = outer_child(&barrier,
4792 arg_directory,
4793 console,
4794 root_device, root_device_rw,
4795 home_device, home_device_rw,
4796 srv_device, srv_device_rw,
4797 interactive,
4798 secondary,
4799 pid_socket_pair[1],
4800 kmsg_socket_pair[1],
4801 rtnl_socket_pair[1],
4802 uid_shift_socket_pair[1],
4803 fds,
4804 argc, argv);
4805 if (r < 0)
4806 _exit(EXIT_FAILURE);
4807
4808 _exit(EXIT_SUCCESS);
4809 }
4810
4811 barrier_set_role(&barrier, BARRIER_PARENT);
4812
4813 fdset_free(fds);
4814 fds = NULL;
4815
4816 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4817 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4818 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4819
4820 /* Wait for the outer child. */
4821 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4822 if (r < 0)
4823 goto finish;
4824 if (r != 0) {
4825 r = -EIO;
4826 goto finish;
4827 }
4828 pid = 0;
4829
4830 /* And now retrieve the PID of the inner child. */
4831 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4832 if (l < 0) {
4833 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4834 goto finish;
4835 }
4836 if (l != sizeof(pid)) {
4837 log_error("Short read while reading inner child PID: %m");
4838 r = EIO;
4839 goto finish;
4840 }
4841
4842 log_debug("Init process invoked as PID " PID_FMT, pid);
4843
4844 if (arg_userns) {
4845 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4846 log_error("Child died too early.");
4847 r = -ESRCH;
4848 goto finish;
4849 }
4850
4851 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4852 if (l < 0) {
4853 r = log_error_errno(errno, "Failed to read UID shift: %m");
4854 goto finish;
4855 }
4856 if (l != sizeof(arg_uid_shift)) {
4857 log_error("Short read while reading UID shift: %m");
4858 r = EIO;
4859 goto finish;
4860 }
4861
4862 r = setup_uid_map(pid);
4863 if (r < 0)
4864 goto finish;
4865
4866 (void) barrier_place(&barrier); /* #2 */
4867 }
4868
4869 r = move_network_interfaces(pid);
4870 if (r < 0)
4871 goto finish;
4872
4873 r = setup_veth(pid, veth_name, &ifi);
4874 if (r < 0)
4875 goto finish;
4876
4877 r = setup_bridge(veth_name, &ifi);
4878 if (r < 0)
4879 goto finish;
4880
4881 r = setup_macvlan(pid);
4882 if (r < 0)
4883 goto finish;
4884
4885 r = setup_ipvlan(pid);
4886 if (r < 0)
4887 goto finish;
4888
4889 r = register_machine(pid, ifi);
4890 if (r < 0)
4891 goto finish;
4892
4893 r = chown_cgroup(pid);
4894 if (r < 0)
4895 goto finish;
4896
4897 /* Notify the child that the parent is ready with all
4898 * its setup (including cgroup-ification), and that
4899 * the child can now hand over control to the code to
4900 * run inside the container. */
4901 (void) barrier_place(&barrier); /* #3 */
4902
4903 /* Block SIGCHLD here, before notifying child.
4904 * process_pty() will handle it with the other signals. */
4905 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4906
4907 /* Reset signal to default */
4908 r = default_signals(SIGCHLD, -1);
4909 if (r < 0) {
4910 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4911 goto finish;
4912 }
4913
4914 /* Let the child know that we are ready and wait that the child is completely ready now. */
4915 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4916 log_error("Client died too early.");
4917 r = -ESRCH;
4918 goto finish;
4919 }
4920
4921 sd_notifyf(false,
4922 "READY=1\n"
4923 "STATUS=Container running.\n"
4924 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4925
4926 r = sd_event_new(&event);
4927 if (r < 0) {
4928 log_error_errno(r, "Failed to get default event source: %m");
4929 goto finish;
4930 }
4931
4932 if (arg_kill_signal > 0) {
4933 /* Try to kill the init system on SIGINT or SIGTERM */
4934 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4935 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4936 } else {
4937 /* Immediately exit */
4938 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4939 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4940 }
4941
4942 /* simply exit on sigchld */
4943 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4944
4945 if (arg_expose_ports) {
4946 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4947 if (r < 0)
4948 goto finish;
4949
4950 (void) expose_ports(rtnl, &exposed);
4951 }
4952
4953 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4954
4955 r = pty_forward_new(event, master, true, !interactive, &forward);
4956 if (r < 0) {
4957 log_error_errno(r, "Failed to create PTY forwarder: %m");
4958 goto finish;
4959 }
4960
4961 r = sd_event_loop(event);
4962 if (r < 0) {
4963 log_error_errno(r, "Failed to run event loop: %m");
4964 goto finish;
4965 }
4966
4967 pty_forward_get_last_char(forward, &last_char);
4968
4969 forward = pty_forward_free(forward);
4970
4971 if (!arg_quiet && last_char != '\n')
4972 putc('\n', stdout);
4973
4974 /* Kill if it is not dead yet anyway */
4975 terminate_machine(pid);
4976
4977 /* Normally redundant, but better safe than sorry */
4978 kill(pid, SIGKILL);
4979
4980 r = wait_for_container(pid, &container_status);
4981 pid = 0;
4982
4983 if (r < 0)
4984 /* We failed to wait for the container, or the
4985 * container exited abnormally */
4986 goto finish;
4987 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4988 /* The container exited with a non-zero
4989 * status, or with zero status and no reboot
4990 * was requested. */
4991 ret = r;
4992 break;
4993 }
4994
4995 /* CONTAINER_REBOOTED, loop again */
4996
4997 if (arg_keep_unit) {
4998 /* Special handling if we are running as a
4999 * service: instead of simply restarting the
5000 * machine we want to restart the entire
5001 * service, so let's inform systemd about this
5002 * with the special exit code 133. The service
5003 * file uses RestartForceExitStatus=133 so
5004 * that this results in a full nspawn
5005 * restart. This is necessary since we might
5006 * have cgroup parameters set we want to have
5007 * flushed out. */
5008 ret = 133;
5009 r = 0;
5010 break;
5011 }
5012
5013 flush_ports(&exposed);
5014 }
5015
5016 finish:
5017 sd_notify(false,
5018 "STOPPING=1\n"
5019 "STATUS=Terminating...");
5020
5021 if (pid > 0)
5022 kill(pid, SIGKILL);
5023
5024 /* Try to flush whatever is still queued in the pty */
5025 if (master >= 0)
5026 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5027
5028 loop_remove(loop_nr, &image_fd);
5029
5030 if (remove_subvol && arg_directory) {
5031 int k;
5032
5033 k = btrfs_subvol_remove(arg_directory, true);
5034 if (k < 0)
5035 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5036 }
5037
5038 if (arg_machine) {
5039 const char *p;
5040
5041 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5042 (void) rm_rf(p, REMOVE_ROOT);
5043 }
5044
5045 free(arg_directory);
5046 free(arg_template);
5047 free(arg_image);
5048 free(arg_machine);
5049 free(arg_user);
5050 strv_free(arg_setenv);
5051 strv_free(arg_network_interfaces);
5052 strv_free(arg_network_macvlan);
5053 strv_free(arg_network_ipvlan);
5054 custom_mount_free_all();
5055
5056 flush_ports(&exposed);
5057
5058 while (arg_expose_ports) {
5059 ExposePort *p = arg_expose_ports;
5060 LIST_REMOVE(ports, arg_expose_ports, p);
5061 free(p);
5062 }
5063
5064 return r < 0 ? EXIT_FAILURE : ret;
5065 }