]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: escape paths in overlay mount options
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* How a container run ended. NOTE(review): the consumers of this enum are outside the
 * visible part of the file; the value names suggest "exited for good" vs. "asked to reboot". */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
116
/* Journal linking mode selected with --link-journal= (or -j). The "try-" variants of the
 * option map to the same values, with arg_link_journal_try set in addition. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default) */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
123
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,            /* option absent, or --volatile=<false> */
        VOLATILE_YES,           /* bare --volatile, or --volatile=<true> */
        VOLATILE_STATE,         /* --volatile=state */
} Volatile;
129
/* Kind of user-requested mount. The numeric order matters: custom_mount_compare() uses it
 * as the tie-breaker for entries that share a destination. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND,      /* --bind= / --bind-ro= */
        CUSTOM_MOUNT_TMPFS,     /* --tmpfs= */
        CUSTOM_MOUNT_OVERLAY,   /* --overlay= / --overlay-ro= */
} CustomMountType;
135
136 typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144 } CustomMount;
145
146 static char *arg_directory = NULL;
147 static char *arg_template = NULL;
148 static char *arg_user = NULL;
149 static sd_id128_t arg_uuid = {};
150 static char *arg_machine = NULL;
151 static const char *arg_selinux_context = NULL;
152 static const char *arg_selinux_apifs_context = NULL;
153 static const char *arg_slice = NULL;
154 static bool arg_private_network = false;
155 static bool arg_read_only = false;
156 static bool arg_boot = false;
157 static bool arg_ephemeral = false;
158 static LinkJournal arg_link_journal = LINK_AUTO;
159 static bool arg_link_journal_try = false;
160 static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
182 (1ULL << CAP_SYS_RESOURCE) |
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
187 static CustomMount *arg_custom_mounts = NULL;
188 static unsigned arg_n_custom_mounts = 0;
189 static char **arg_setenv = NULL;
190 static bool arg_quiet = false;
191 static bool arg_share_system = false;
192 static bool arg_register = true;
193 static bool arg_keep_unit = false;
194 static char **arg_network_interfaces = NULL;
195 static char **arg_network_macvlan = NULL;
196 static char **arg_network_ipvlan = NULL;
197 static bool arg_network_veth = false;
198 static const char *arg_network_bridge = NULL;
199 static unsigned long arg_personality = PERSONALITY_INVALID;
200 static char *arg_image = NULL;
201 static Volatile arg_volatile = VOLATILE_NO;
202 static ExposePort *arg_expose_ports = NULL;
203 static char **arg_property = NULL;
204 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205 static bool arg_userns = false;
206 static int arg_kill_signal = 0;
207
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
239 " and container\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
261 " the container\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
266 " the container\n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name);
276 }
277
278 static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292 }
293
294 static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 arg_custom_mounts = mfree(arg_custom_mounts);
313 arg_n_custom_mounts = 0;
314 }
315
316 static int custom_mount_compare(const void *a, const void *b) {
317 const CustomMount *x = a, *y = b;
318 int r;
319
320 r = path_compare(x->destination, y->destination);
321 if (r != 0)
322 return r;
323
324 if (x->type < y->type)
325 return -1;
326 if (x->type > y->type)
327 return 1;
328
329 return 0;
330 }
331
332 static int custom_mounts_prepare(void) {
333 unsigned i;
334 int r;
335
336 /* Ensure the mounts are applied prefix first. */
337 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
338
339 /* Allocate working directories for the overlay file systems that need it */
340 for (i = 0; i < arg_n_custom_mounts; i++) {
341 CustomMount *m = &arg_custom_mounts[i];
342
343 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
344 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
345 return -EINVAL;
346 }
347
348 if (m->type != CUSTOM_MOUNT_OVERLAY)
349 continue;
350
351 if (m->work_dir)
352 continue;
353
354 if (m->read_only)
355 continue;
356
357 r = tempfn_random(m->source, NULL, &m->work_dir);
358 if (r < 0)
359 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
360 }
361
362 return 0;
363 }
364
/* Stores a cleaned-up, absolute version of "path" in *b, freeing whatever *b held before.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT) it is merely made
 * absolute relative to the current working directory. In both cases the result is passed
 * through path_kill_slashes().
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the negated errno of the failed
 * canonicalization. *b is left untouched on failure. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Anything but "does not exist" is a hard error */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
385
386 static int parse_argv(int argc, char *argv[]) {
387
388 enum {
389 ARG_VERSION = 0x100,
390 ARG_PRIVATE_NETWORK,
391 ARG_UUID,
392 ARG_READ_ONLY,
393 ARG_CAPABILITY,
394 ARG_DROP_CAPABILITY,
395 ARG_LINK_JOURNAL,
396 ARG_BIND,
397 ARG_BIND_RO,
398 ARG_TMPFS,
399 ARG_OVERLAY,
400 ARG_OVERLAY_RO,
401 ARG_SETENV,
402 ARG_SHARE_SYSTEM,
403 ARG_REGISTER,
404 ARG_KEEP_UNIT,
405 ARG_NETWORK_INTERFACE,
406 ARG_NETWORK_MACVLAN,
407 ARG_NETWORK_IPVLAN,
408 ARG_NETWORK_BRIDGE,
409 ARG_PERSONALITY,
410 ARG_VOLATILE,
411 ARG_TEMPLATE,
412 ARG_PROPERTY,
413 ARG_PRIVATE_USERS,
414 ARG_KILL_SIGNAL,
415 };
416
417 static const struct option options[] = {
418 { "help", no_argument, NULL, 'h' },
419 { "version", no_argument, NULL, ARG_VERSION },
420 { "directory", required_argument, NULL, 'D' },
421 { "template", required_argument, NULL, ARG_TEMPLATE },
422 { "ephemeral", no_argument, NULL, 'x' },
423 { "user", required_argument, NULL, 'u' },
424 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
425 { "boot", no_argument, NULL, 'b' },
426 { "uuid", required_argument, NULL, ARG_UUID },
427 { "read-only", no_argument, NULL, ARG_READ_ONLY },
428 { "capability", required_argument, NULL, ARG_CAPABILITY },
429 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
430 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
431 { "bind", required_argument, NULL, ARG_BIND },
432 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
433 { "tmpfs", required_argument, NULL, ARG_TMPFS },
434 { "overlay", required_argument, NULL, ARG_OVERLAY },
435 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
436 { "machine", required_argument, NULL, 'M' },
437 { "slice", required_argument, NULL, 'S' },
438 { "setenv", required_argument, NULL, ARG_SETENV },
439 { "selinux-context", required_argument, NULL, 'Z' },
440 { "selinux-apifs-context", required_argument, NULL, 'L' },
441 { "quiet", no_argument, NULL, 'q' },
442 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
443 { "register", required_argument, NULL, ARG_REGISTER },
444 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
445 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
446 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
447 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
448 { "network-veth", no_argument, NULL, 'n' },
449 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
450 { "personality", required_argument, NULL, ARG_PERSONALITY },
451 { "image", required_argument, NULL, 'i' },
452 { "volatile", optional_argument, NULL, ARG_VOLATILE },
453 { "port", required_argument, NULL, 'p' },
454 { "property", required_argument, NULL, ARG_PROPERTY },
455 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
456 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
457 {}
458 };
459
460 int c, r;
461 uint64_t plus = 0, minus = 0;
462
463 assert(argc >= 0);
464 assert(argv);
465
466 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
467
468 switch (c) {
469
470 case 'h':
471 help();
472 return 0;
473
474 case ARG_VERSION:
475 puts(PACKAGE_STRING);
476 puts(SYSTEMD_FEATURES);
477 return 0;
478
479 case 'D':
480 r = set_sanitized_path(&arg_directory, optarg);
481 if (r < 0)
482 return log_error_errno(r, "Invalid root directory: %m");
483
484 break;
485
486 case ARG_TEMPLATE:
487 r = set_sanitized_path(&arg_template, optarg);
488 if (r < 0)
489 return log_error_errno(r, "Invalid template directory: %m");
490
491 break;
492
493 case 'i':
494 r = set_sanitized_path(&arg_image, optarg);
495 if (r < 0)
496 return log_error_errno(r, "Invalid image path: %m");
497
498 break;
499
500 case 'x':
501 arg_ephemeral = true;
502 break;
503
504 case 'u':
505 r = free_and_strdup(&arg_user, optarg);
506 if (r < 0)
507 return log_oom();
508
509 break;
510
511 case ARG_NETWORK_BRIDGE:
512 arg_network_bridge = optarg;
513
514 /* fall through */
515
516 case 'n':
517 arg_network_veth = true;
518 arg_private_network = true;
519 break;
520
521 case ARG_NETWORK_INTERFACE:
522 if (strv_extend(&arg_network_interfaces, optarg) < 0)
523 return log_oom();
524
525 arg_private_network = true;
526 break;
527
528 case ARG_NETWORK_MACVLAN:
529 if (strv_extend(&arg_network_macvlan, optarg) < 0)
530 return log_oom();
531
532 arg_private_network = true;
533 break;
534
535 case ARG_NETWORK_IPVLAN:
536 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
537 return log_oom();
538
539 /* fall through */
540
541 case ARG_PRIVATE_NETWORK:
542 arg_private_network = true;
543 break;
544
545 case 'b':
546 arg_boot = true;
547 break;
548
549 case ARG_UUID:
550 r = sd_id128_from_string(optarg, &arg_uuid);
551 if (r < 0) {
552 log_error("Invalid UUID: %s", optarg);
553 return r;
554 }
555 break;
556
557 case 'S':
558 arg_slice = optarg;
559 break;
560
561 case 'M':
562 if (isempty(optarg)) {
563 arg_machine = mfree(arg_machine);
564 } else {
565 if (!machine_name_is_valid(optarg)) {
566 log_error("Invalid machine name: %s", optarg);
567 return -EINVAL;
568 }
569
570 r = free_and_strdup(&arg_machine, optarg);
571 if (r < 0)
572 return log_oom();
573
574 break;
575 }
576
577 case 'Z':
578 arg_selinux_context = optarg;
579 break;
580
581 case 'L':
582 arg_selinux_apifs_context = optarg;
583 break;
584
585 case ARG_READ_ONLY:
586 arg_read_only = true;
587 break;
588
589 case ARG_CAPABILITY:
590 case ARG_DROP_CAPABILITY: {
591 const char *state, *word;
592 size_t length;
593
594 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
595 _cleanup_free_ char *t;
596
597 t = strndup(word, length);
598 if (!t)
599 return log_oom();
600
601 if (streq(t, "all")) {
602 if (c == ARG_CAPABILITY)
603 plus = (uint64_t) -1;
604 else
605 minus = (uint64_t) -1;
606 } else {
607 int cap;
608
609 cap = capability_from_name(t);
610 if (cap < 0) {
611 log_error("Failed to parse capability %s.", t);
612 return -EINVAL;
613 }
614
615 if (c == ARG_CAPABILITY)
616 plus |= 1ULL << (uint64_t) cap;
617 else
618 minus |= 1ULL << (uint64_t) cap;
619 }
620 }
621
622 break;
623 }
624
625 case 'j':
626 arg_link_journal = LINK_GUEST;
627 arg_link_journal_try = true;
628 break;
629
630 case ARG_LINK_JOURNAL:
631 if (streq(optarg, "auto")) {
632 arg_link_journal = LINK_AUTO;
633 arg_link_journal_try = false;
634 } else if (streq(optarg, "no")) {
635 arg_link_journal = LINK_NO;
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "guest")) {
638 arg_link_journal = LINK_GUEST;
639 arg_link_journal_try = false;
640 } else if (streq(optarg, "host")) {
641 arg_link_journal = LINK_HOST;
642 arg_link_journal_try = false;
643 } else if (streq(optarg, "try-guest")) {
644 arg_link_journal = LINK_GUEST;
645 arg_link_journal_try = true;
646 } else if (streq(optarg, "try-host")) {
647 arg_link_journal = LINK_HOST;
648 arg_link_journal_try = true;
649 } else {
650 log_error("Failed to parse link journal mode %s", optarg);
651 return -EINVAL;
652 }
653
654 break;
655
656 case ARG_BIND:
657 case ARG_BIND_RO: {
658 const char *current = optarg;
659 _cleanup_free_ char *source = NULL, *destination = NULL;
660 CustomMount *m;
661 _cleanup_strv_free_ char **strv = NULL;
662
663 r = extract_many_words(&current, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
664 switch (r) {
665 case 1:
666 destination = strdup(source);
667 case 2:
668 break;
669 case -ENOMEM:
670 return log_oom();
671 default:
672 log_error("Invalid bind mount specification: %s", optarg);
673 return -EINVAL;
674 }
675
676 if (!source || !destination)
677 return log_oom();
678
679 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
680 log_error("Invalid bind mount specification: %s", optarg);
681 return -EINVAL;
682 }
683
684 m = custom_mount_add(CUSTOM_MOUNT_BIND);
685 if (!m)
686 return log_oom();
687
688 m->source = source;
689 m->destination = destination;
690 m->read_only = c == ARG_BIND_RO;
691
692 source = destination = NULL;
693
694 break;
695 }
696
697 case ARG_TMPFS: {
698 const char *current = optarg;
699 _cleanup_free_ char *path = NULL, *opts = NULL;
700 CustomMount *m;
701
702 r = extract_first_word(&current, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
703 if (r == -ENOMEM)
704 return log_oom();
705 else if (r < 0) {
706 log_error("Invalid tmpfs specification: %s", optarg);
707 return r;
708 }
709 if (r)
710 opts = strdup(current);
711 else
712 opts = strdup("mode=0755");
713
714 if (!path || !opts)
715 return log_oom();
716
717 if (!path_is_absolute(path)) {
718 log_error("Invalid tmpfs specification: %s", optarg);
719 return -EINVAL;
720 }
721
722 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
723 if (!m)
724 return log_oom();
725
726 m->destination = path;
727 m->options = opts;
728
729 path = opts = NULL;
730
731 break;
732 }
733
734 case ARG_OVERLAY:
735 case ARG_OVERLAY_RO: {
736 _cleanup_free_ char *upper = NULL, *destination = NULL;
737 _cleanup_strv_free_ char **lower = NULL;
738 CustomMount *m;
739 unsigned n = 0;
740 char **i;
741
742 lower = strv_split(optarg, ":");
743 if (!lower)
744 return log_oom();
745
746 STRV_FOREACH(i, lower) {
747 if (!path_is_absolute(*i)) {
748 log_error("Overlay path %s is not absolute.", *i);
749 return -EINVAL;
750 }
751
752 n++;
753 }
754
755 if (n < 2) {
756 log_error("--overlay= needs at least two colon-separated directories specified.");
757 return -EINVAL;
758 }
759
760 if (n == 2) {
761 /* If two parameters are specified,
762 * the first one is the lower, the
763 * second one the upper directory. And
764 * we'll also define the destination
765 * mount point the same as the upper. */
766 upper = lower[1];
767 lower[1] = NULL;
768
769 destination = strdup(upper);
770 if (!destination)
771 return log_oom();
772
773 } else {
774 upper = lower[n - 2];
775 destination = lower[n - 1];
776 lower[n - 2] = NULL;
777 }
778
779 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
780 if (!m)
781 return log_oom();
782
783 m->destination = destination;
784 m->source = upper;
785 m->lower = lower;
786 m->read_only = c == ARG_OVERLAY_RO;
787
788 upper = destination = NULL;
789 lower = NULL;
790
791 break;
792 }
793
794 case ARG_SETENV: {
795 char **n;
796
797 if (!env_assignment_is_valid(optarg)) {
798 log_error("Environment variable assignment '%s' is not valid.", optarg);
799 return -EINVAL;
800 }
801
802 n = strv_env_set(arg_setenv, optarg);
803 if (!n)
804 return log_oom();
805
806 strv_free(arg_setenv);
807 arg_setenv = n;
808 break;
809 }
810
811 case 'q':
812 arg_quiet = true;
813 break;
814
815 case ARG_SHARE_SYSTEM:
816 arg_share_system = true;
817 break;
818
819 case ARG_REGISTER:
820 r = parse_boolean(optarg);
821 if (r < 0) {
822 log_error("Failed to parse --register= argument: %s", optarg);
823 return r;
824 }
825
826 arg_register = r;
827 break;
828
829 case ARG_KEEP_UNIT:
830 arg_keep_unit = true;
831 break;
832
833 case ARG_PERSONALITY:
834
835 arg_personality = personality_from_string(optarg);
836 if (arg_personality == PERSONALITY_INVALID) {
837 log_error("Unknown or unsupported personality '%s'.", optarg);
838 return -EINVAL;
839 }
840
841 break;
842
843 case ARG_VOLATILE:
844
845 if (!optarg)
846 arg_volatile = VOLATILE_YES;
847 else {
848 r = parse_boolean(optarg);
849 if (r < 0) {
850 if (streq(optarg, "state"))
851 arg_volatile = VOLATILE_STATE;
852 else {
853 log_error("Failed to parse --volatile= argument: %s", optarg);
854 return r;
855 }
856 } else
857 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
858 }
859
860 break;
861
862 case 'p': {
863 const char *split, *e;
864 uint16_t container_port, host_port;
865 int protocol;
866 ExposePort *p;
867
868 if ((e = startswith(optarg, "tcp:")))
869 protocol = IPPROTO_TCP;
870 else if ((e = startswith(optarg, "udp:")))
871 protocol = IPPROTO_UDP;
872 else {
873 e = optarg;
874 protocol = IPPROTO_TCP;
875 }
876
877 split = strchr(e, ':');
878 if (split) {
879 char v[split - e + 1];
880
881 memcpy(v, e, split - e);
882 v[split - e] = 0;
883
884 r = safe_atou16(v, &host_port);
885 if (r < 0 || host_port <= 0) {
886 log_error("Failed to parse host port: %s", optarg);
887 return -EINVAL;
888 }
889
890 r = safe_atou16(split + 1, &container_port);
891 } else {
892 r = safe_atou16(e, &container_port);
893 host_port = container_port;
894 }
895
896 if (r < 0 || container_port <= 0) {
897 log_error("Failed to parse host port: %s", optarg);
898 return -EINVAL;
899 }
900
901 LIST_FOREACH(ports, p, arg_expose_ports) {
902 if (p->protocol == protocol && p->host_port == host_port) {
903 log_error("Duplicate port specification: %s", optarg);
904 return -EINVAL;
905 }
906 }
907
908 p = new(ExposePort, 1);
909 if (!p)
910 return log_oom();
911
912 p->protocol = protocol;
913 p->host_port = host_port;
914 p->container_port = container_port;
915
916 LIST_PREPEND(ports, arg_expose_ports, p);
917
918 break;
919 }
920
921 case ARG_PROPERTY:
922 if (strv_extend(&arg_property, optarg) < 0)
923 return log_oom();
924
925 break;
926
927 case ARG_PRIVATE_USERS:
928 if (optarg) {
929 _cleanup_free_ char *buffer = NULL;
930 const char *range, *shift;
931
932 range = strchr(optarg, ':');
933 if (range) {
934 buffer = strndup(optarg, range - optarg);
935 if (!buffer)
936 return log_oom();
937 shift = buffer;
938
939 range++;
940 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
941 log_error("Failed to parse UID range: %s", range);
942 return -EINVAL;
943 }
944 } else
945 shift = optarg;
946
947 if (parse_uid(shift, &arg_uid_shift) < 0) {
948 log_error("Failed to parse UID: %s", optarg);
949 return -EINVAL;
950 }
951 }
952
953 arg_userns = true;
954 break;
955
956 case ARG_KILL_SIGNAL:
957 arg_kill_signal = signal_from_string_try_harder(optarg);
958 if (arg_kill_signal < 0) {
959 log_error("Cannot parse signal: %s", optarg);
960 return -EINVAL;
961 }
962
963 break;
964
965 case '?':
966 return -EINVAL;
967
968 default:
969 assert_not_reached("Unhandled option");
970 }
971
972 if (arg_share_system)
973 arg_register = false;
974
975 if (arg_boot && arg_share_system) {
976 log_error("--boot and --share-system may not be combined.");
977 return -EINVAL;
978 }
979
980 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
981 log_error("--keep-unit may not be used when invoked from a user session.");
982 return -EINVAL;
983 }
984
985 if (arg_directory && arg_image) {
986 log_error("--directory= and --image= may not be combined.");
987 return -EINVAL;
988 }
989
990 if (arg_template && arg_image) {
991 log_error("--template= and --image= may not be combined.");
992 return -EINVAL;
993 }
994
995 if (arg_template && !(arg_directory || arg_machine)) {
996 log_error("--template= needs --directory= or --machine=.");
997 return -EINVAL;
998 }
999
1000 if (arg_ephemeral && arg_template) {
1001 log_error("--ephemeral and --template= may not be combined.");
1002 return -EINVAL;
1003 }
1004
1005 if (arg_ephemeral && arg_image) {
1006 log_error("--ephemeral and --image= may not be combined.");
1007 return -EINVAL;
1008 }
1009
1010 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1011 log_error("--ephemeral and --link-journal= may not be combined.");
1012 return -EINVAL;
1013 }
1014
1015 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1016 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1017 return -EINVAL;
1018 }
1019
1020 if (arg_expose_ports && !arg_private_network) {
1021 log_error("Cannot use --port= without private networking.");
1022 return -EINVAL;
1023 }
1024
1025 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1026 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1027
1028 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1029
1030 if (arg_boot && arg_kill_signal <= 0)
1031 arg_kill_signal = SIGRTMIN+3;
1032
1033 return 1;
1034 }
1035
1036 static int tmpfs_patch_options(const char *options, char **ret) {
1037 char *buf = NULL;
1038
1039 if (arg_userns && arg_uid_shift != 0) {
1040 assert(arg_uid_shift != UID_INVALID);
1041
1042 if (options)
1043 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1044 else
1045 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1046 if (!buf)
1047 return -ENOMEM;
1048
1049 options = buf;
1050 }
1051
1052 #ifdef HAVE_SELINUX
1053 if (arg_selinux_apifs_context) {
1054 char *t;
1055
1056 if (options)
1057 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1058 else
1059 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1060 if (!t) {
1061 free(buf);
1062 return -ENOMEM;
1063 }
1064
1065 free(buf);
1066 buf = t;
1067 }
1068 #endif
1069
1070 *ret = buf;
1071 return !!buf;
1072 }
1073
1074 static int mount_all(const char *dest, bool userns) {
1075
1076 typedef struct MountPoint {
1077 const char *what;
1078 const char *where;
1079 const char *type;
1080 const char *options;
1081 unsigned long flags;
1082 bool fatal;
1083 bool userns;
1084 } MountPoint;
1085
1086 static const MountPoint mount_table[] = {
1087 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1088 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1089 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1090 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1091 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1092 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1093 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1094 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1095 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1096 #ifdef HAVE_SELINUX
1097 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1098 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
1099 #endif
1100 };
1101
1102 unsigned k;
1103 int r;
1104
1105 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1106 _cleanup_free_ char *where = NULL, *options = NULL;
1107 const char *o;
1108
1109 if (userns != mount_table[k].userns)
1110 continue;
1111
1112 where = prefix_root(dest, mount_table[k].where);
1113 if (!where)
1114 return log_oom();
1115
1116 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1117 if (r < 0 && r != -ENOENT)
1118 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1119
1120 /* Skip this entry if it is not a remount. */
1121 if (mount_table[k].what && r > 0)
1122 continue;
1123
1124 r = mkdir_p(where, 0755);
1125 if (r < 0) {
1126 if (mount_table[k].fatal)
1127 return log_error_errno(r, "Failed to create directory %s: %m", where);
1128
1129 log_warning_errno(r, "Failed to create directory %s: %m", where);
1130 continue;
1131 }
1132
1133 o = mount_table[k].options;
1134 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1135 r = tmpfs_patch_options(o, &options);
1136 if (r < 0)
1137 return log_oom();
1138 if (r > 0)
1139 o = options;
1140 }
1141
1142 if (mount(mount_table[k].what,
1143 where,
1144 mount_table[k].type,
1145 mount_table[k].flags,
1146 o) < 0) {
1147
1148 if (mount_table[k].fatal)
1149 return log_error_errno(errno, "mount(%s) failed: %m", where);
1150
1151 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1152 }
1153 }
1154
1155 return 0;
1156 }
1157
1158 static int mount_bind(const char *dest, CustomMount *m) {
1159 struct stat source_st, dest_st;
1160 const char *where;
1161 int r;
1162
1163 assert(m);
1164
1165 if (stat(m->source, &source_st) < 0)
1166 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1167
1168 where = prefix_roota(dest, m->destination);
1169
1170 if (stat(where, &dest_st) >= 0) {
1171 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1172 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1173 return -EINVAL;
1174 }
1175
1176 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1177 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1178 return -EINVAL;
1179 }
1180
1181 } else if (errno == ENOENT) {
1182 r = mkdir_parents_label(where, 0755);
1183 if (r < 0)
1184 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1185 } else {
1186 log_error_errno(errno, "Failed to stat %s: %m", where);
1187 return -errno;
1188 }
1189
1190 /* Create the mount point. Any non-directory file can be
1191 * mounted on any non-directory file (regular, fifo, socket,
1192 * char, block).
1193 */
1194 if (S_ISDIR(source_st.st_mode))
1195 r = mkdir_label(where, 0755);
1196 else
1197 r = touch(where);
1198 if (r < 0 && r != -EEXIST)
1199 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1200
1201 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1202 return log_error_errno(errno, "mount(%s) failed: %m", where);
1203
1204 if (m->read_only) {
1205 r = bind_remount_recursive(where, true);
1206 if (r < 0)
1207 return log_error_errno(r, "Read-only bind mount failed: %m");
1208 }
1209
1210 return 0;
1211 }
1212
1213 static int mount_tmpfs(const char *dest, CustomMount *m) {
1214 const char *where, *options;
1215 _cleanup_free_ char *buf = NULL;
1216 int r;
1217
1218 assert(dest);
1219 assert(m);
1220
1221 where = prefix_roota(dest, m->destination);
1222
1223 r = mkdir_p_label(where, 0755);
1224 if (r < 0 && r != -EEXIST)
1225 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1226
1227 r = tmpfs_patch_options(m->options, &buf);
1228 if (r < 0)
1229 return log_oom();
1230 options = r > 0 ? buf : m->options;
1231
1232 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1233 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1234
1235 return 0;
1236 }
1237
1238 static char *joined_and_escaped_lower_dirs(char * const *lower) {
1239 _cleanup_strv_free_ char **sv = NULL;
1240
1241 sv = strv_copy(lower);
1242 if (!sv)
1243 return NULL;
1244
1245 strv_reverse(sv);
1246
1247 if (!strv_shell_escape(sv, ",:"))
1248 return NULL;
1249
1250 return strv_join(sv, ":");
1251 }
1252
1253 static int mount_overlay(const char *dest, CustomMount *m) {
1254 _cleanup_free_ char *lower = NULL;
1255 const char *where, *options;
1256 int r;
1257
1258 assert(dest);
1259 assert(m);
1260
1261 where = prefix_roota(dest, m->destination);
1262
1263 r = mkdir_label(where, 0755);
1264 if (r < 0 && r != -EEXIST)
1265 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1266
1267 (void) mkdir_p_label(m->source, 0755);
1268
1269 lower = joined_and_escaped_lower_dirs(m->lower);
1270 if (!lower)
1271 return log_oom();
1272
1273 if (m->read_only) {
1274 _cleanup_free_ char *escaped_source = NULL;
1275
1276 escaped_source = shell_escape(m->source, ",:");
1277 if (!escaped_source)
1278 return log_oom();
1279
1280 options = strjoina("lowerdir=", escaped_source, ":", lower);
1281 } else {
1282 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
1283
1284 assert(m->work_dir);
1285 (void) mkdir_label(m->work_dir, 0700);
1286
1287 escaped_source = shell_escape(m->source, ",:");
1288 if (!escaped_source)
1289 return log_oom();
1290 escaped_work_dir = shell_escape(m->work_dir, ",:");
1291 if (!escaped_work_dir)
1292 return log_oom();
1293
1294 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
1295 }
1296
1297 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1298 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1299
1300 return 0;
1301 }
1302
1303 static int mount_custom(const char *dest) {
1304 unsigned i;
1305 int r;
1306
1307 assert(dest);
1308
1309 for (i = 0; i < arg_n_custom_mounts; i++) {
1310 CustomMount *m = &arg_custom_mounts[i];
1311
1312 switch (m->type) {
1313
1314 case CUSTOM_MOUNT_BIND:
1315 r = mount_bind(dest, m);
1316 break;
1317
1318 case CUSTOM_MOUNT_TMPFS:
1319 r = mount_tmpfs(dest, m);
1320 break;
1321
1322 case CUSTOM_MOUNT_OVERLAY:
1323 r = mount_overlay(dest, m);
1324 break;
1325
1326 default:
1327 assert_not_reached("Unknown custom mount type");
1328 }
1329
1330 if (r < 0)
1331 return r;
1332 }
1333
1334 return 0;
1335 }
1336
1337 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1338 char *to;
1339 int r;
1340
1341 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1342
1343 r = path_is_mount_point(to, 0);
1344 if (r < 0 && r != -ENOENT)
1345 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1346 if (r > 0)
1347 return 0;
1348
1349 mkdir_p(to, 0755);
1350
1351 /* The superblock mount options of the mount point need to be
1352 * identical to the hosts', and hence writable... */
1353 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1354 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1355
1356 /* ... hence let's only make the bind mount read-only, not the
1357 * superblock. */
1358 if (read_only) {
1359 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1360 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1361 }
1362 return 1;
1363 }
1364
1365 static int mount_cgroup(const char *dest) {
1366 _cleanup_set_free_free_ Set *controllers = NULL;
1367 const char *cgroup_root;
1368 int r;
1369
1370 controllers = set_new(&string_hash_ops);
1371 if (!controllers)
1372 return log_oom();
1373
1374 r = cg_kernel_controllers(controllers);
1375 if (r < 0)
1376 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1377
1378 for (;;) {
1379 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1380
1381 controller = set_steal_first(controllers);
1382 if (!controller)
1383 break;
1384
1385 origin = prefix_root("/sys/fs/cgroup/", controller);
1386 if (!origin)
1387 return log_oom();
1388
1389 r = readlink_malloc(origin, &combined);
1390 if (r == -EINVAL) {
1391 /* Not a symbolic link, but directly a single cgroup hierarchy */
1392
1393 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1394 if (r < 0)
1395 return r;
1396
1397 } else if (r < 0)
1398 return log_error_errno(r, "Failed to read link %s: %m", origin);
1399 else {
1400 _cleanup_free_ char *target = NULL;
1401
1402 target = prefix_root(dest, origin);
1403 if (!target)
1404 return log_oom();
1405
1406 /* A symbolic link, a combination of controllers in one hierarchy */
1407
1408 if (!filename_is_valid(combined)) {
1409 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1410 continue;
1411 }
1412
1413 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1414 if (r < 0)
1415 return r;
1416
1417 r = symlink_idempotent(combined, target);
1418 if (r == -EINVAL) {
1419 log_error("Invalid existing symlink for combined hierarchy");
1420 return r;
1421 }
1422 if (r < 0)
1423 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1424 }
1425 }
1426
1427 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1428 if (r < 0)
1429 return r;
1430
1431 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1432 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1433 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1434
1435 return 0;
1436 }
1437
1438 static int mount_systemd_cgroup_writable(const char *dest) {
1439 _cleanup_free_ char *own_cgroup_path = NULL;
1440 const char *systemd_root, *systemd_own;
1441 int r;
1442
1443 assert(dest);
1444
1445 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1446 if (r < 0)
1447 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1448
1449 /* Make our own cgroup a (writable) bind mount */
1450 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1451 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1452 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1453
1454 /* And then remount the systemd cgroup root read-only */
1455 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1456 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1457 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1458
1459 return 0;
1460 }
1461
1462 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1463 assert(p);
1464
1465 if (!arg_userns)
1466 return 0;
1467
1468 if (uid == UID_INVALID && gid == GID_INVALID)
1469 return 0;
1470
1471 if (uid != UID_INVALID) {
1472 uid += arg_uid_shift;
1473
1474 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1475 return -EOVERFLOW;
1476 }
1477
1478 if (gid != GID_INVALID) {
1479 gid += (gid_t) arg_uid_shift;
1480
1481 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1482 return -EOVERFLOW;
1483 }
1484
1485 if (lchown(p, uid, gid) < 0)
1486 return -errno;
1487
1488 return 0;
1489 }
1490
/* Creates directory "path" below "root" with the given mode and, if it was
 * newly created, hands it to userns_lchown() for ownership adjustment.
 *
 * Returns 0 if the directory already exists, -errno if mkdir() fails for any
 * other reason, and otherwise the result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1503
1504 static int setup_timezone(const char *dest) {
1505 _cleanup_free_ char *p = NULL, *q = NULL;
1506 const char *where, *check, *what;
1507 char *z, *y;
1508 int r;
1509
1510 assert(dest);
1511
1512 /* Fix the timezone, if possible */
1513 r = readlink_malloc("/etc/localtime", &p);
1514 if (r < 0) {
1515 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1516 return 0;
1517 }
1518
1519 z = path_startswith(p, "../usr/share/zoneinfo/");
1520 if (!z)
1521 z = path_startswith(p, "/usr/share/zoneinfo/");
1522 if (!z) {
1523 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1524 return 0;
1525 }
1526
1527 where = prefix_roota(dest, "/etc/localtime");
1528 r = readlink_malloc(where, &q);
1529 if (r >= 0) {
1530 y = path_startswith(q, "../usr/share/zoneinfo/");
1531 if (!y)
1532 y = path_startswith(q, "/usr/share/zoneinfo/");
1533
1534 /* Already pointing to the right place? Then do nothing .. */
1535 if (y && streq(y, z))
1536 return 0;
1537 }
1538
1539 check = strjoina("/usr/share/zoneinfo/", z);
1540 check = prefix_root(dest, check);
1541 if (laccess(check, F_OK) < 0) {
1542 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1543 return 0;
1544 }
1545
1546 r = unlink(where);
1547 if (r < 0 && errno != ENOENT) {
1548 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1549 return 0;
1550 }
1551
1552 what = strjoina("../usr/share/zoneinfo/", z);
1553 if (symlink(what, where) < 0) {
1554 log_error_errno(errno, "Failed to correct timezone of container: %m");
1555 return 0;
1556 }
1557
1558 r = userns_lchown(where, 0, 0);
1559 if (r < 0)
1560 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1561
1562 return 0;
1563 }
1564
1565 static int setup_resolv_conf(const char *dest) {
1566 const char *where = NULL;
1567 int r;
1568
1569 assert(dest);
1570
1571 if (arg_private_network)
1572 return 0;
1573
1574 /* Fix resolv.conf, if possible */
1575 where = prefix_roota(dest, "/etc/resolv.conf");
1576
1577 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1578 if (r < 0) {
1579 /* If the file already exists as symlink, let's
1580 * suppress the warning, under the assumption that
1581 * resolved or something similar runs inside and the
1582 * symlink points there.
1583 *
1584 * If the disk image is read-only, there's also no
1585 * point in complaining.
1586 */
1587 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1588 "Failed to copy /etc/resolv.conf to %s: %m", where);
1589 return 0;
1590 }
1591
1592 r = userns_lchown(where, 0, 0);
1593 if (r < 0)
1594 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1595
1596 return 0;
1597 }
1598
1599 static int setup_volatile_state(const char *directory) {
1600 _cleanup_free_ char *buf = NULL;
1601 const char *p, *options;
1602 int r;
1603
1604 assert(directory);
1605
1606 if (arg_volatile != VOLATILE_STATE)
1607 return 0;
1608
1609 /* --volatile=state means we simply overmount /var
1610 with a tmpfs, and the rest read-only. */
1611
1612 r = bind_remount_recursive(directory, true);
1613 if (r < 0)
1614 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1615
1616 p = prefix_roota(directory, "/var");
1617 r = mkdir(p, 0755);
1618 if (r < 0 && errno != EEXIST)
1619 return log_error_errno(errno, "Failed to create %s: %m", directory);
1620
1621 options = "mode=755";
1622 r = tmpfs_patch_options(options, &buf);
1623 if (r < 0)
1624 return log_oom();
1625 if (r > 0)
1626 options = buf;
1627
1628 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1629 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1630
1631 return 0;
1632 }
1633
1634 static int setup_volatile(const char *directory) {
1635 bool tmpfs_mounted = false, bind_mounted = false;
1636 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1637 _cleanup_free_ char *buf = NULL;
1638 const char *f, *t, *options;
1639 int r;
1640
1641 assert(directory);
1642
1643 if (arg_volatile != VOLATILE_YES)
1644 return 0;
1645
1646 /* --volatile=yes means we mount a tmpfs to the root dir, and
1647 the original /usr to use inside it, and that read-only. */
1648
1649 if (!mkdtemp(template))
1650 return log_error_errno(errno, "Failed to create temporary directory: %m");
1651
1652 options = "mode=755";
1653 r = tmpfs_patch_options(options, &buf);
1654 if (r < 0)
1655 return log_oom();
1656 if (r > 0)
1657 options = buf;
1658
1659 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1660 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1661 goto fail;
1662 }
1663
1664 tmpfs_mounted = true;
1665
1666 f = prefix_roota(directory, "/usr");
1667 t = prefix_roota(template, "/usr");
1668
1669 r = mkdir(t, 0755);
1670 if (r < 0 && errno != EEXIST) {
1671 r = log_error_errno(errno, "Failed to create %s: %m", t);
1672 goto fail;
1673 }
1674
1675 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1676 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1677 goto fail;
1678 }
1679
1680 bind_mounted = true;
1681
1682 r = bind_remount_recursive(t, true);
1683 if (r < 0) {
1684 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1685 goto fail;
1686 }
1687
1688 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1689 r = log_error_errno(errno, "Failed to move root mount: %m");
1690 goto fail;
1691 }
1692
1693 (void) rmdir(template);
1694
1695 return 0;
1696
1697 fail:
1698 if (bind_mounted)
1699 (void) umount(t);
1700
1701 if (tmpfs_mounted)
1702 (void) umount(template);
1703 (void) rmdir(template);
1704 return r;
1705 }
1706
1707 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1708 assert(s);
1709
1710 snprintf(s, 37,
1711 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1712 SD_ID128_FORMAT_VAL(id));
1713
1714 return s;
1715 }
1716
1717 static int setup_boot_id(const char *dest) {
1718 const char *from, *to;
1719 sd_id128_t rnd = {};
1720 char as_uuid[37];
1721 int r;
1722
1723 if (arg_share_system)
1724 return 0;
1725
1726 /* Generate a new randomized boot ID, so that each boot-up of
1727 * the container gets a new one */
1728
1729 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1730 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1731
1732 r = sd_id128_randomize(&rnd);
1733 if (r < 0)
1734 return log_error_errno(r, "Failed to generate random boot id: %m");
1735
1736 id128_format_as_uuid(rnd, as_uuid);
1737
1738 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1739 if (r < 0)
1740 return log_error_errno(r, "Failed to write boot id: %m");
1741
1742 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1743 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1744 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1745 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1746
1747 unlink(from);
1748 return r;
1749 }
1750
1751 static int copy_devnodes(const char *dest) {
1752
1753 static const char devnodes[] =
1754 "null\0"
1755 "zero\0"
1756 "full\0"
1757 "random\0"
1758 "urandom\0"
1759 "tty\0"
1760 "net/tun\0";
1761
1762 const char *d;
1763 int r = 0;
1764 _cleanup_umask_ mode_t u;
1765
1766 assert(dest);
1767
1768 u = umask(0000);
1769
1770 /* Create /dev/net, so that we can create /dev/net/tun in it */
1771 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1772 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1773
1774 NULSTR_FOREACH(d, devnodes) {
1775 _cleanup_free_ char *from = NULL, *to = NULL;
1776 struct stat st;
1777
1778 from = strappend("/dev/", d);
1779 to = prefix_root(dest, from);
1780
1781 if (stat(from, &st) < 0) {
1782
1783 if (errno != ENOENT)
1784 return log_error_errno(errno, "Failed to stat %s: %m", from);
1785
1786 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1787
1788 log_error("%s is not a char or block device, cannot copy.", from);
1789 return -EIO;
1790
1791 } else {
1792 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1793 if (errno != EPERM)
1794 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1795
1796 /* Some systems abusively restrict mknod but
1797 * allow bind mounts. */
1798 r = touch(to);
1799 if (r < 0)
1800 return log_error_errno(r, "touch (%s) failed: %m", to);
1801 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1802 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1803 }
1804
1805 r = userns_lchown(to, 0, 0);
1806 if (r < 0)
1807 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1808 }
1809 }
1810
1811 return r;
1812 }
1813
1814 static int setup_pts(const char *dest) {
1815 _cleanup_free_ char *options = NULL;
1816 const char *p;
1817
1818 #ifdef HAVE_SELINUX
1819 if (arg_selinux_apifs_context)
1820 (void) asprintf(&options,
1821 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1822 arg_uid_shift + TTY_GID,
1823 arg_selinux_apifs_context);
1824 else
1825 #endif
1826 (void) asprintf(&options,
1827 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1828 arg_uid_shift + TTY_GID);
1829
1830 if (!options)
1831 return log_oom();
1832
1833 /* Mount /dev/pts itself */
1834 p = prefix_roota(dest, "/dev/pts");
1835 if (mkdir(p, 0755) < 0)
1836 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1837 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1838 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1839 if (userns_lchown(p, 0, 0) < 0)
1840 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1841
1842 /* Create /dev/ptmx symlink */
1843 p = prefix_roota(dest, "/dev/ptmx");
1844 if (symlink("pts/ptmx", p) < 0)
1845 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1846 if (userns_lchown(p, 0, 0) < 0)
1847 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1848
1849 /* And fix /dev/pts/ptmx ownership */
1850 p = prefix_roota(dest, "/dev/pts/ptmx");
1851 if (userns_lchown(p, 0, 0) < 0)
1852 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1853
1854 return 0;
1855 }
1856
1857 static int setup_dev_console(const char *dest, const char *console) {
1858 _cleanup_umask_ mode_t u;
1859 const char *to;
1860 int r;
1861
1862 assert(dest);
1863 assert(console);
1864
1865 u = umask(0000);
1866
1867 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1868 if (r < 0)
1869 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1870
1871 /* We need to bind mount the right tty to /dev/console since
1872 * ptys can only exist on pts file systems. To have something
1873 * to bind mount things on we create a empty regular file. */
1874
1875 to = prefix_roota(dest, "/dev/console");
1876 r = touch(to);
1877 if (r < 0)
1878 return log_error_errno(r, "touch() for /dev/console failed: %m");
1879
1880 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1881 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1882
1883 return 0;
1884 }
1885
1886 static int setup_kmsg(const char *dest, int kmsg_socket) {
1887 const char *from, *to;
1888 _cleanup_umask_ mode_t u;
1889 int fd, k;
1890 union {
1891 struct cmsghdr cmsghdr;
1892 uint8_t buf[CMSG_SPACE(sizeof(int))];
1893 } control = {};
1894 struct msghdr mh = {
1895 .msg_control = &control,
1896 .msg_controllen = sizeof(control),
1897 };
1898 struct cmsghdr *cmsg;
1899
1900 assert(kmsg_socket >= 0);
1901
1902 u = umask(0000);
1903
1904 /* We create the kmsg FIFO as /run/kmsg, but immediately
1905 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1906 * on the reading side behave very similar to /proc/kmsg,
1907 * their writing side behaves differently from /dev/kmsg in
1908 * that writing blocks when nothing is reading. In order to
1909 * avoid any problems with containers deadlocking due to this
1910 * we simply make /dev/kmsg unavailable to the container. */
1911 from = prefix_roota(dest, "/run/kmsg");
1912 to = prefix_roota(dest, "/proc/kmsg");
1913
1914 if (mkfifo(from, 0600) < 0)
1915 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1916 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1917 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1918
1919 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1920 if (fd < 0)
1921 return log_error_errno(errno, "Failed to open fifo: %m");
1922
1923 cmsg = CMSG_FIRSTHDR(&mh);
1924 cmsg->cmsg_level = SOL_SOCKET;
1925 cmsg->cmsg_type = SCM_RIGHTS;
1926 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1927 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1928
1929 mh.msg_controllen = cmsg->cmsg_len;
1930
1931 /* Store away the fd in the socket, so that it stays open as
1932 * long as we run the child */
1933 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1934 safe_close(fd);
1935
1936 if (k < 0)
1937 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1938
1939 /* And now make the FIFO unavailable as /run/kmsg... */
1940 (void) unlink(from);
1941
1942 return 0;
1943 }
1944
1945 static int send_rtnl(int send_fd) {
1946 union {
1947 struct cmsghdr cmsghdr;
1948 uint8_t buf[CMSG_SPACE(sizeof(int))];
1949 } control = {};
1950 struct msghdr mh = {
1951 .msg_control = &control,
1952 .msg_controllen = sizeof(control),
1953 };
1954 struct cmsghdr *cmsg;
1955 _cleanup_close_ int fd = -1;
1956 ssize_t k;
1957
1958 assert(send_fd >= 0);
1959
1960 if (!arg_expose_ports)
1961 return 0;
1962
1963 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1964 if (fd < 0)
1965 return log_error_errno(errno, "Failed to allocate container netlink: %m");
1966
1967 cmsg = CMSG_FIRSTHDR(&mh);
1968 cmsg->cmsg_level = SOL_SOCKET;
1969 cmsg->cmsg_type = SCM_RIGHTS;
1970 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1971 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1972
1973 mh.msg_controllen = cmsg->cmsg_len;
1974
1975 /* Store away the fd in the socket, so that it stays open as
1976 * long as we run the child */
1977 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1978 if (k < 0)
1979 return log_error_errno(errno, "Failed to send netlink fd: %m");
1980
1981 return 0;
1982 }
1983
1984 static int flush_ports(union in_addr_union *exposed) {
1985 ExposePort *p;
1986 int r, af = AF_INET;
1987
1988 assert(exposed);
1989
1990 if (!arg_expose_ports)
1991 return 0;
1992
1993 if (in_addr_is_null(af, exposed))
1994 return 0;
1995
1996 log_debug("Lost IP address.");
1997
1998 LIST_FOREACH(ports, p, arg_expose_ports) {
1999 r = fw_add_local_dnat(false,
2000 af,
2001 p->protocol,
2002 NULL,
2003 NULL, 0,
2004 NULL, 0,
2005 p->host_port,
2006 exposed,
2007 p->container_port,
2008 NULL);
2009 if (r < 0)
2010 log_warning_errno(r, "Failed to modify firewall: %m");
2011 }
2012
2013 *exposed = IN_ADDR_NULL;
2014 return 0;
2015 }
2016
2017 static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
2018 _cleanup_free_ struct local_address *addresses = NULL;
2019 _cleanup_free_ char *pretty = NULL;
2020 union in_addr_union new_exposed;
2021 ExposePort *p;
2022 bool add;
2023 int af = AF_INET, r;
2024
2025 assert(exposed);
2026
2027 /* Invoked each time an address is added or removed inside the
2028 * container */
2029
2030 if (!arg_expose_ports)
2031 return 0;
2032
2033 r = local_addresses(rtnl, 0, af, &addresses);
2034 if (r < 0)
2035 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2036
2037 add = r > 0 &&
2038 addresses[0].family == af &&
2039 addresses[0].scope < RT_SCOPE_LINK;
2040
2041 if (!add)
2042 return flush_ports(exposed);
2043
2044 new_exposed = addresses[0].address;
2045 if (in_addr_equal(af, exposed, &new_exposed))
2046 return 0;
2047
2048 in_addr_to_string(af, &new_exposed, &pretty);
2049 log_debug("New container IP is %s.", strna(pretty));
2050
2051 LIST_FOREACH(ports, p, arg_expose_ports) {
2052
2053 r = fw_add_local_dnat(true,
2054 af,
2055 p->protocol,
2056 NULL,
2057 NULL, 0,
2058 NULL, 0,
2059 p->host_port,
2060 &new_exposed,
2061 p->container_port,
2062 in_addr_is_null(af, exposed) ? NULL : exposed);
2063 if (r < 0)
2064 log_warning_errno(r, "Failed to modify firewall: %m");
2065 }
2066
2067 *exposed = new_exposed;
2068 return 0;
2069 }
2070
2071 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2072 union in_addr_union *exposed = userdata;
2073
2074 assert(rtnl);
2075 assert(m);
2076 assert(exposed);
2077
2078 expose_ports(rtnl, exposed);
2079 return 0;
2080 }
2081
2082 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
2083 union {
2084 struct cmsghdr cmsghdr;
2085 uint8_t buf[CMSG_SPACE(sizeof(int))];
2086 } control = {};
2087 struct msghdr mh = {
2088 .msg_control = &control,
2089 .msg_controllen = sizeof(control),
2090 };
2091 struct cmsghdr *cmsg;
2092 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2093 int fd, r;
2094 ssize_t k;
2095
2096 assert(event);
2097 assert(recv_fd >= 0);
2098 assert(ret);
2099
2100 if (!arg_expose_ports)
2101 return 0;
2102
2103 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2104 if (k < 0)
2105 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2106
2107 cmsg = CMSG_FIRSTHDR(&mh);
2108 assert(cmsg->cmsg_level == SOL_SOCKET);
2109 assert(cmsg->cmsg_type == SCM_RIGHTS);
2110 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2111 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2112
2113 r = sd_netlink_open_fd(&rtnl, fd);
2114 if (r < 0) {
2115 safe_close(fd);
2116 return log_error_errno(r, "Failed to create rtnl object: %m");
2117 }
2118
2119 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2120 if (r < 0)
2121 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2122
2123 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2124 if (r < 0)
2125 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2126
2127 r = sd_netlink_attach_event(rtnl, event, 0);
2128 if (r < 0)
2129 return log_error_errno(r, "Failed to add to even loop: %m");
2130
2131 *ret = rtnl;
2132 rtnl = NULL;
2133
2134 return 0;
2135 }
2136
2137 static int setup_hostname(void) {
2138
2139 if (arg_share_system)
2140 return 0;
2141
2142 if (sethostname_idempotent(arg_machine) < 0)
2143 return -errno;
2144
2145 return 0;
2146 }
2147
2148 static int setup_journal(const char *directory) {
2149 sd_id128_t machine_id, this_id;
2150 _cleanup_free_ char *b = NULL, *d = NULL;
2151 const char *etc_machine_id, *p, *q;
2152 char *id;
2153 int r;
2154
2155 /* Don't link journals in ephemeral mode */
2156 if (arg_ephemeral)
2157 return 0;
2158
2159 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2160
2161 r = read_one_line_file(etc_machine_id, &b);
2162 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2163 return 0;
2164 else if (r < 0)
2165 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2166
2167 id = strstrip(b);
2168 if (isempty(id) && arg_link_journal == LINK_AUTO)
2169 return 0;
2170
2171 /* Verify validity */
2172 r = sd_id128_from_string(id, &machine_id);
2173 if (r < 0)
2174 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2175
2176 r = sd_id128_get_machine(&this_id);
2177 if (r < 0)
2178 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2179
2180 if (sd_id128_equal(machine_id, this_id)) {
2181 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2182 "Host and machine ids are equal (%s): refusing to link journals", id);
2183 if (arg_link_journal == LINK_AUTO)
2184 return 0;
2185 return -EEXIST;
2186 }
2187
2188 if (arg_link_journal == LINK_NO)
2189 return 0;
2190
2191 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2192 if (r < 0)
2193 return log_error_errno(r, "Failed to create /var: %m");
2194
2195 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2196 if (r < 0)
2197 return log_error_errno(r, "Failed to create /var/log: %m");
2198
2199 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2200 if (r < 0)
2201 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2202
2203 p = strjoina("/var/log/journal/", id);
2204 q = prefix_roota(directory, p);
2205
2206 if (path_is_mount_point(p, 0) > 0) {
2207 if (arg_link_journal != LINK_AUTO) {
2208 log_error("%s: already a mount point, refusing to use for journal", p);
2209 return -EEXIST;
2210 }
2211
2212 return 0;
2213 }
2214
2215 if (path_is_mount_point(q, 0) > 0) {
2216 if (arg_link_journal != LINK_AUTO) {
2217 log_error("%s: already a mount point, refusing to use for journal", q);
2218 return -EEXIST;
2219 }
2220
2221 return 0;
2222 }
2223
2224 r = readlink_and_make_absolute(p, &d);
2225 if (r >= 0) {
2226 if ((arg_link_journal == LINK_GUEST ||
2227 arg_link_journal == LINK_AUTO) &&
2228 path_equal(d, q)) {
2229
2230 r = userns_mkdir(directory, p, 0755, 0, 0);
2231 if (r < 0)
2232 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2233 return 0;
2234 }
2235
2236 if (unlink(p) < 0)
2237 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2238 } else if (r == -EINVAL) {
2239
2240 if (arg_link_journal == LINK_GUEST &&
2241 rmdir(p) < 0) {
2242
2243 if (errno == ENOTDIR) {
2244 log_error("%s already exists and is neither a symlink nor a directory", p);
2245 return r;
2246 } else {
2247 log_error_errno(errno, "Failed to remove %s: %m", p);
2248 return -errno;
2249 }
2250 }
2251 } else if (r != -ENOENT) {
2252 log_error_errno(errno, "readlink(%s) failed: %m", p);
2253 return r;
2254 }
2255
2256 if (arg_link_journal == LINK_GUEST) {
2257
2258 if (symlink(q, p) < 0) {
2259 if (arg_link_journal_try) {
2260 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2261 return 0;
2262 } else {
2263 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2264 return -errno;
2265 }
2266 }
2267
2268 r = userns_mkdir(directory, p, 0755, 0, 0);
2269 if (r < 0)
2270 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2271 return 0;
2272 }
2273
2274 if (arg_link_journal == LINK_HOST) {
2275 /* don't create parents here -- if the host doesn't have
2276 * permanent journal set up, don't force it here */
2277 r = mkdir(p, 0755);
2278 if (r < 0) {
2279 if (arg_link_journal_try) {
2280 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2281 return 0;
2282 } else {
2283 log_error_errno(errno, "Failed to create %s: %m", p);
2284 return r;
2285 }
2286 }
2287
2288 } else if (access(p, F_OK) < 0)
2289 return 0;
2290
2291 if (dir_is_empty(q) == 0)
2292 log_warning("%s is not empty, proceeding anyway.", q);
2293
2294 r = userns_mkdir(directory, p, 0755, 0, 0);
2295 if (r < 0) {
2296 log_error_errno(errno, "Failed to create %s: %m", q);
2297 return r;
2298 }
2299
2300 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2301 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2302
2303 return 0;
2304 }
2305
2306 static int drop_capabilities(void) {
2307 return capability_bounding_set_drop(~arg_retain, false);
2308 }
2309
2310 static int register_machine(pid_t pid, int local_ifindex) {
2311 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2312 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2313 int r;
2314
2315 if (!arg_register)
2316 return 0;
2317
2318 r = sd_bus_default_system(&bus);
2319 if (r < 0)
2320 return log_error_errno(r, "Failed to open system bus: %m");
2321
2322 if (arg_keep_unit) {
2323 r = sd_bus_call_method(
2324 bus,
2325 "org.freedesktop.machine1",
2326 "/org/freedesktop/machine1",
2327 "org.freedesktop.machine1.Manager",
2328 "RegisterMachineWithNetwork",
2329 &error,
2330 NULL,
2331 "sayssusai",
2332 arg_machine,
2333 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2334 "nspawn",
2335 "container",
2336 (uint32_t) pid,
2337 strempty(arg_directory),
2338 local_ifindex > 0 ? 1 : 0, local_ifindex);
2339 } else {
2340 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2341 char **i;
2342 unsigned j;
2343
2344 r = sd_bus_message_new_method_call(
2345 bus,
2346 &m,
2347 "org.freedesktop.machine1",
2348 "/org/freedesktop/machine1",
2349 "org.freedesktop.machine1.Manager",
2350 "CreateMachineWithNetwork");
2351 if (r < 0)
2352 return bus_log_create_error(r);
2353
2354 r = sd_bus_message_append(
2355 m,
2356 "sayssusai",
2357 arg_machine,
2358 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2359 "nspawn",
2360 "container",
2361 (uint32_t) pid,
2362 strempty(arg_directory),
2363 local_ifindex > 0 ? 1 : 0, local_ifindex);
2364 if (r < 0)
2365 return bus_log_create_error(r);
2366
2367 r = sd_bus_message_open_container(m, 'a', "(sv)");
2368 if (r < 0)
2369 return bus_log_create_error(r);
2370
2371 if (!isempty(arg_slice)) {
2372 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2373 if (r < 0)
2374 return bus_log_create_error(r);
2375 }
2376
2377 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2378 if (r < 0)
2379 return bus_log_create_error(r);
2380
2381 /* If you make changes here, also make sure to update
2382 * systemd-nspawn@.service, to keep the device
2383 * policies in sync regardless if we are run with or
2384 * without the --keep-unit switch. */
2385 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2386 /* Allow the container to
2387 * access and create the API
2388 * device nodes, so that
2389 * PrivateDevices= in the
2390 * container can work
2391 * fine */
2392 "/dev/null", "rwm",
2393 "/dev/zero", "rwm",
2394 "/dev/full", "rwm",
2395 "/dev/random", "rwm",
2396 "/dev/urandom", "rwm",
2397 "/dev/tty", "rwm",
2398 "/dev/net/tun", "rwm",
2399 /* Allow the container
2400 * access to ptys. However,
2401 * do not permit the
2402 * container to ever create
2403 * these device nodes. */
2404 "/dev/pts/ptmx", "rw",
2405 "char-pts", "rw");
2406 if (r < 0)
2407 return bus_log_create_error(r);
2408
2409 for (j = 0; j < arg_n_custom_mounts; j++) {
2410 CustomMount *cm = &arg_custom_mounts[j];
2411
2412 if (cm->type != CUSTOM_MOUNT_BIND)
2413 continue;
2414
2415 r = is_device_node(cm->source);
2416 if (r < 0)
2417 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2418
2419 if (r) {
2420 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2421 cm->source, cm->read_only ? "r" : "rw");
2422 if (r < 0)
2423 return log_error_errno(r, "Failed to append message arguments: %m");
2424 }
2425 }
2426
2427 if (arg_kill_signal != 0) {
2428 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2429 if (r < 0)
2430 return bus_log_create_error(r);
2431
2432 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2433 if (r < 0)
2434 return bus_log_create_error(r);
2435 }
2436
2437 STRV_FOREACH(i, arg_property) {
2438 r = sd_bus_message_open_container(m, 'r', "sv");
2439 if (r < 0)
2440 return bus_log_create_error(r);
2441
2442 r = bus_append_unit_property_assignment(m, *i);
2443 if (r < 0)
2444 return r;
2445
2446 r = sd_bus_message_close_container(m);
2447 if (r < 0)
2448 return bus_log_create_error(r);
2449 }
2450
2451 r = sd_bus_message_close_container(m);
2452 if (r < 0)
2453 return bus_log_create_error(r);
2454
2455 r = sd_bus_call(bus, m, 0, &error, NULL);
2456 }
2457
2458 if (r < 0) {
2459 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2460 return r;
2461 }
2462
2463 return 0;
2464 }
2465
2466 static int terminate_machine(pid_t pid) {
2467 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2468 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2469 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2470 const char *path;
2471 int r;
2472
2473 if (!arg_register)
2474 return 0;
2475
2476 /* If we are reusing the unit, then just exit, systemd will do
2477 * the right thing when we exit. */
2478 if (arg_keep_unit)
2479 return 0;
2480
2481 r = sd_bus_default_system(&bus);
2482 if (r < 0)
2483 return log_error_errno(r, "Failed to open system bus: %m");
2484
2485 r = sd_bus_call_method(
2486 bus,
2487 "org.freedesktop.machine1",
2488 "/org/freedesktop/machine1",
2489 "org.freedesktop.machine1.Manager",
2490 "GetMachineByPID",
2491 &error,
2492 &reply,
2493 "u",
2494 (uint32_t) pid);
2495 if (r < 0) {
2496 /* Note that the machine might already have been
2497 * cleaned up automatically, hence don't consider it a
2498 * failure if we cannot get the machine object. */
2499 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2500 return 0;
2501 }
2502
2503 r = sd_bus_message_read(reply, "o", &path);
2504 if (r < 0)
2505 return bus_log_parse_error(r);
2506
2507 r = sd_bus_call_method(
2508 bus,
2509 "org.freedesktop.machine1",
2510 path,
2511 "org.freedesktop.machine1.Machine",
2512 "Terminate",
2513 &error,
2514 NULL,
2515 NULL);
2516 if (r < 0) {
2517 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2518 return 0;
2519 }
2520
2521 return 0;
2522 }
2523
2524 static int reset_audit_loginuid(void) {
2525 _cleanup_free_ char *p = NULL;
2526 int r;
2527
2528 if (arg_share_system)
2529 return 0;
2530
2531 r = read_one_line_file("/proc/self/loginuid", &p);
2532 if (r == -ENOENT)
2533 return 0;
2534 if (r < 0)
2535 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2536
2537 /* Already reset? */
2538 if (streq(p, "4294967295"))
2539 return 0;
2540
2541 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
2542 if (r < 0) {
2543 log_error_errno(r,
2544 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2545 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2546 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2547 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2548 "using systemd-nspawn. Sleeping for 5s... (%m)");
2549
2550 sleep(5);
2551 }
2552
2553 return 0;
2554 }
2555
2556 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2557 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2558 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2559
2560 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2561 uint8_t result[8];
2562 size_t l, sz;
2563 uint8_t *v, *i;
2564 int r;
2565
2566 l = strlen(arg_machine);
2567 sz = sizeof(sd_id128_t) + l;
2568 if (idx > 0)
2569 sz += sizeof(idx);
2570
2571 v = alloca(sz);
2572
2573 /* fetch some persistent data unique to the host */
2574 r = sd_id128_get_machine((sd_id128_t*) v);
2575 if (r < 0)
2576 return r;
2577
2578 /* combine with some data unique (on this host) to this
2579 * container instance */
2580 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2581 if (idx > 0) {
2582 idx = htole64(idx);
2583 memcpy(i, &idx, sizeof(idx));
2584 }
2585
2586 /* Let's hash the host machine ID plus the container name. We
2587 * use a fixed, but originally randomly created hash key here. */
2588 siphash24(result, v, sz, hash_key.bytes);
2589
2590 assert_cc(ETH_ALEN <= sizeof(result));
2591 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2592
2593 /* see eth_random_addr in the kernel */
2594 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2595 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2596
2597 return 0;
2598 }
2599
2600 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2601 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2602 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2603 struct ether_addr mac_host, mac_container;
2604 int r, i;
2605
2606 if (!arg_private_network)
2607 return 0;
2608
2609 if (!arg_network_veth)
2610 return 0;
2611
2612 /* Use two different interface name prefixes depending whether
2613 * we are in bridge mode or not. */
2614 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2615 arg_network_bridge ? "vb" : "ve", arg_machine);
2616
2617 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2618 if (r < 0)
2619 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2620
2621 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2622 if (r < 0)
2623 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2624
2625 r = sd_netlink_open(&rtnl);
2626 if (r < 0)
2627 return log_error_errno(r, "Failed to connect to netlink: %m");
2628
2629 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2630 if (r < 0)
2631 return log_error_errno(r, "Failed to allocate netlink message: %m");
2632
2633 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
2634 if (r < 0)
2635 return log_error_errno(r, "Failed to add netlink interface name: %m");
2636
2637 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2638 if (r < 0)
2639 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2640
2641 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2642 if (r < 0)
2643 return log_error_errno(r, "Failed to open netlink container: %m");
2644
2645 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2646 if (r < 0)
2647 return log_error_errno(r, "Failed to open netlink container: %m");
2648
2649 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
2650 if (r < 0)
2651 return log_error_errno(r, "Failed to open netlink container: %m");
2652
2653 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
2654 if (r < 0)
2655 return log_error_errno(r, "Failed to add netlink interface name: %m");
2656
2657 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2658 if (r < 0)
2659 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2660
2661 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2662 if (r < 0)
2663 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2664
2665 r = sd_netlink_message_close_container(m);
2666 if (r < 0)
2667 return log_error_errno(r, "Failed to close netlink container: %m");
2668
2669 r = sd_netlink_message_close_container(m);
2670 if (r < 0)
2671 return log_error_errno(r, "Failed to close netlink container: %m");
2672
2673 r = sd_netlink_message_close_container(m);
2674 if (r < 0)
2675 return log_error_errno(r, "Failed to close netlink container: %m");
2676
2677 r = sd_netlink_call(rtnl, m, 0, NULL);
2678 if (r < 0)
2679 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2680
2681 i = (int) if_nametoindex(iface_name);
2682 if (i <= 0)
2683 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2684
2685 *ifi = i;
2686
2687 return 0;
2688 }
2689
2690 static int setup_bridge(const char veth_name[], int *ifi) {
2691 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2692 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2693 int r, bridge;
2694
2695 if (!arg_private_network)
2696 return 0;
2697
2698 if (!arg_network_veth)
2699 return 0;
2700
2701 if (!arg_network_bridge)
2702 return 0;
2703
2704 bridge = (int) if_nametoindex(arg_network_bridge);
2705 if (bridge <= 0)
2706 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2707
2708 *ifi = bridge;
2709
2710 r = sd_netlink_open(&rtnl);
2711 if (r < 0)
2712 return log_error_errno(r, "Failed to connect to netlink: %m");
2713
2714 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2715 if (r < 0)
2716 return log_error_errno(r, "Failed to allocate netlink message: %m");
2717
2718 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2719 if (r < 0)
2720 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2721
2722 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
2723 if (r < 0)
2724 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2725
2726 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
2727 if (r < 0)
2728 return log_error_errno(r, "Failed to add netlink master field: %m");
2729
2730 r = sd_netlink_call(rtnl, m, 0, NULL);
2731 if (r < 0)
2732 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2733
2734 return 0;
2735 }
2736
2737 static int parse_interface(struct udev *udev, const char *name) {
2738 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2739 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2740 int ifi;
2741
2742 ifi = (int) if_nametoindex(name);
2743 if (ifi <= 0)
2744 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2745
2746 sprintf(ifi_str, "n%i", ifi);
2747 d = udev_device_new_from_device_id(udev, ifi_str);
2748 if (!d)
2749 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2750
2751 if (udev_device_get_is_initialized(d) <= 0) {
2752 log_error("Network interface %s is not initialized yet.", name);
2753 return -EBUSY;
2754 }
2755
2756 return ifi;
2757 }
2758
2759 static int move_network_interfaces(pid_t pid) {
2760 _cleanup_udev_unref_ struct udev *udev = NULL;
2761 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2762 char **i;
2763 int r;
2764
2765 if (!arg_private_network)
2766 return 0;
2767
2768 if (strv_isempty(arg_network_interfaces))
2769 return 0;
2770
2771 r = sd_netlink_open(&rtnl);
2772 if (r < 0)
2773 return log_error_errno(r, "Failed to connect to netlink: %m");
2774
2775 udev = udev_new();
2776 if (!udev) {
2777 log_error("Failed to connect to udev.");
2778 return -ENOMEM;
2779 }
2780
2781 STRV_FOREACH(i, arg_network_interfaces) {
2782 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2783 int ifi;
2784
2785 ifi = parse_interface(udev, *i);
2786 if (ifi < 0)
2787 return ifi;
2788
2789 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2790 if (r < 0)
2791 return log_error_errno(r, "Failed to allocate netlink message: %m");
2792
2793 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2794 if (r < 0)
2795 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2796
2797 r = sd_netlink_call(rtnl, m, 0, NULL);
2798 if (r < 0)
2799 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2800 }
2801
2802 return 0;
2803 }
2804
2805 static int setup_macvlan(pid_t pid) {
2806 _cleanup_udev_unref_ struct udev *udev = NULL;
2807 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2808 unsigned idx = 0;
2809 char **i;
2810 int r;
2811
2812 if (!arg_private_network)
2813 return 0;
2814
2815 if (strv_isempty(arg_network_macvlan))
2816 return 0;
2817
2818 r = sd_netlink_open(&rtnl);
2819 if (r < 0)
2820 return log_error_errno(r, "Failed to connect to netlink: %m");
2821
2822 udev = udev_new();
2823 if (!udev) {
2824 log_error("Failed to connect to udev.");
2825 return -ENOMEM;
2826 }
2827
2828 STRV_FOREACH(i, arg_network_macvlan) {
2829 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2830 _cleanup_free_ char *n = NULL;
2831 struct ether_addr mac;
2832 int ifi;
2833
2834 ifi = parse_interface(udev, *i);
2835 if (ifi < 0)
2836 return ifi;
2837
2838 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2839 if (r < 0)
2840 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2841
2842 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2843 if (r < 0)
2844 return log_error_errno(r, "Failed to allocate netlink message: %m");
2845
2846 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2847 if (r < 0)
2848 return log_error_errno(r, "Failed to add netlink interface index: %m");
2849
2850 n = strappend("mv-", *i);
2851 if (!n)
2852 return log_oom();
2853
2854 strshorten(n, IFNAMSIZ-1);
2855
2856 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2857 if (r < 0)
2858 return log_error_errno(r, "Failed to add netlink interface name: %m");
2859
2860 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2861 if (r < 0)
2862 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2863
2864 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2865 if (r < 0)
2866 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2867
2868 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2869 if (r < 0)
2870 return log_error_errno(r, "Failed to open netlink container: %m");
2871
2872 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2873 if (r < 0)
2874 return log_error_errno(r, "Failed to open netlink container: %m");
2875
2876 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2877 if (r < 0)
2878 return log_error_errno(r, "Failed to append macvlan mode: %m");
2879
2880 r = sd_netlink_message_close_container(m);
2881 if (r < 0)
2882 return log_error_errno(r, "Failed to close netlink container: %m");
2883
2884 r = sd_netlink_message_close_container(m);
2885 if (r < 0)
2886 return log_error_errno(r, "Failed to close netlink container: %m");
2887
2888 r = sd_netlink_call(rtnl, m, 0, NULL);
2889 if (r < 0)
2890 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2891 }
2892
2893 return 0;
2894 }
2895
2896 static int setup_ipvlan(pid_t pid) {
2897 _cleanup_udev_unref_ struct udev *udev = NULL;
2898 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2899 char **i;
2900 int r;
2901
2902 if (!arg_private_network)
2903 return 0;
2904
2905 if (strv_isempty(arg_network_ipvlan))
2906 return 0;
2907
2908 r = sd_netlink_open(&rtnl);
2909 if (r < 0)
2910 return log_error_errno(r, "Failed to connect to netlink: %m");
2911
2912 udev = udev_new();
2913 if (!udev) {
2914 log_error("Failed to connect to udev.");
2915 return -ENOMEM;
2916 }
2917
2918 STRV_FOREACH(i, arg_network_ipvlan) {
2919 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2920 _cleanup_free_ char *n = NULL;
2921 int ifi;
2922
2923 ifi = parse_interface(udev, *i);
2924 if (ifi < 0)
2925 return ifi;
2926
2927 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2928 if (r < 0)
2929 return log_error_errno(r, "Failed to allocate netlink message: %m");
2930
2931 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2932 if (r < 0)
2933 return log_error_errno(r, "Failed to add netlink interface index: %m");
2934
2935 n = strappend("iv-", *i);
2936 if (!n)
2937 return log_oom();
2938
2939 strshorten(n, IFNAMSIZ-1);
2940
2941 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2942 if (r < 0)
2943 return log_error_errno(r, "Failed to add netlink interface name: %m");
2944
2945 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2946 if (r < 0)
2947 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2948
2949 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2950 if (r < 0)
2951 return log_error_errno(r, "Failed to open netlink container: %m");
2952
2953 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2954 if (r < 0)
2955 return log_error_errno(r, "Failed to open netlink container: %m");
2956
2957 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2958 if (r < 0)
2959 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2960
2961 r = sd_netlink_message_close_container(m);
2962 if (r < 0)
2963 return log_error_errno(r, "Failed to close netlink container: %m");
2964
2965 r = sd_netlink_message_close_container(m);
2966 if (r < 0)
2967 return log_error_errno(r, "Failed to close netlink container: %m");
2968
2969 r = sd_netlink_call(rtnl, m, 0, NULL);
2970 if (r < 0)
2971 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2972 }
2973
2974 return 0;
2975 }
2976
/* Installs a seccomp filter that allows everything by default, except:
 *
 *  - the system calls in the table below, each of which fails with EPERM
 *    unless the capability listed next to it is part of arg_retain;
 *  - socket(AF_NETLINK, *, NETLINK_AUDIT), which fails with EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op returning 0. An -EINVAL from
 * seccomp_load() is logged at debug level and treated as success. Any
 * other failure is logged and returned as a negative value. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {
                /* A retained capability keeps its system calls usable. */
                if (arg_retain & (1ULL << blacklist[n].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[n].syscall_num, 0);

                /* -EFAULT means the syscall is unknown; skip it quietly. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit
         * hookup fails when run inside one. We don't care and simply turn
         * off creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as an indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3071
3072 static int setup_propagate(const char *root) {
3073 const char *p, *q;
3074
3075 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3076 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3077 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3078 (void) mkdir_p(p, 0600);
3079
3080 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3081 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3082
3083 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3084 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3085
3086 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3087 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3088
3089 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3090 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3091 return log_error_errno(errno, "Failed to install propagation bind mount.");
3092
3093 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3094 return log_error_errno(errno, "Failed to make propagation mount read-only");
3095
3096 return 0;
3097 }
3098
3099 static int setup_image(char **device_path, int *loop_nr) {
3100 struct loop_info64 info = {
3101 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3102 };
3103 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3104 _cleanup_free_ char* loopdev = NULL;
3105 struct stat st;
3106 int r, nr;
3107
3108 assert(device_path);
3109 assert(loop_nr);
3110 assert(arg_image);
3111
3112 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3113 if (fd < 0)
3114 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3115
3116 if (fstat(fd, &st) < 0)
3117 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3118
3119 if (S_ISBLK(st.st_mode)) {
3120 char *p;
3121
3122 p = strdup(arg_image);
3123 if (!p)
3124 return log_oom();
3125
3126 *device_path = p;
3127
3128 *loop_nr = -1;
3129
3130 r = fd;
3131 fd = -1;
3132
3133 return r;
3134 }
3135
3136 if (!S_ISREG(st.st_mode)) {
3137 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3138 return -EINVAL;
3139 }
3140
3141 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3142 if (control < 0)
3143 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3144
3145 nr = ioctl(control, LOOP_CTL_GET_FREE);
3146 if (nr < 0)
3147 return log_error_errno(errno, "Failed to allocate loop device: %m");
3148
3149 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3150 return log_oom();
3151
3152 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3153 if (loop < 0)
3154 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3155
3156 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3157 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3158
3159 if (arg_read_only)
3160 info.lo_flags |= LO_FLAGS_READ_ONLY;
3161
3162 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3163 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3164
3165 *device_path = loopdev;
3166 loopdev = NULL;
3167
3168 *loop_nr = nr;
3169
3170 r = loop;
3171 loop = -1;
3172
3173 return r;
3174 }
3175
/* Explanatory text appended to the partition table error messages logged
 * by dissect_image(). */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
3182
3183 static int dissect_image(
3184 int fd,
3185 char **root_device, bool *root_device_rw,
3186 char **home_device, bool *home_device_rw,
3187 char **srv_device, bool *srv_device_rw,
3188 bool *secondary) {
3189
3190 #ifdef HAVE_BLKID
3191 int home_nr = -1, srv_nr = -1;
3192 #ifdef GPT_ROOT_NATIVE
3193 int root_nr = -1;
3194 #endif
3195 #ifdef GPT_ROOT_SECONDARY
3196 int secondary_root_nr = -1;
3197 #endif
3198 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
3199 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
3200 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
3201 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3202 _cleanup_udev_unref_ struct udev *udev = NULL;
3203 struct udev_list_entry *first, *item;
3204 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
3205 bool is_gpt, is_mbr, multiple_generic = false;
3206 const char *pttype = NULL;
3207 blkid_partlist pl;
3208 struct stat st;
3209 unsigned i;
3210 int r;
3211
3212 assert(fd >= 0);
3213 assert(root_device);
3214 assert(home_device);
3215 assert(srv_device);
3216 assert(secondary);
3217 assert(arg_image);
3218
3219 b = blkid_new_probe();
3220 if (!b)
3221 return log_oom();
3222
3223 errno = 0;
3224 r = blkid_probe_set_device(b, fd, 0, 0);
3225 if (r != 0) {
3226 if (errno == 0)
3227 return log_oom();
3228
3229 log_error_errno(errno, "Failed to set device on blkid probe: %m");
3230 return -errno;
3231 }
3232
3233 blkid_probe_enable_partitions(b, 1);
3234 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
3235
3236 errno = 0;
3237 r = blkid_do_safeprobe(b);
3238 if (r == -2 || r == 1) {
3239 log_error("Failed to identify any partition table on\n"
3240 " %s\n"
3241 PARTITION_TABLE_BLURB, arg_image);
3242 return -EINVAL;
3243 } else if (r != 0) {
3244 if (errno == 0)
3245 errno = EIO;
3246 log_error_errno(errno, "Failed to probe: %m");
3247 return -errno;
3248 }
3249
3250 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
3251
3252 is_gpt = streq_ptr(pttype, "gpt");
3253 is_mbr = streq_ptr(pttype, "dos");
3254
3255 if (!is_gpt && !is_mbr) {
3256 log_error("No GPT or MBR partition table discovered on\n"
3257 " %s\n"
3258 PARTITION_TABLE_BLURB, arg_image);
3259 return -EINVAL;
3260 }
3261
3262 errno = 0;
3263 pl = blkid_probe_get_partitions(b);
3264 if (!pl) {
3265 if (errno == 0)
3266 return log_oom();
3267
3268 log_error("Failed to list partitions of %s", arg_image);
3269 return -errno;
3270 }
3271
3272 udev = udev_new();
3273 if (!udev)
3274 return log_oom();
3275
3276 if (fstat(fd, &st) < 0)
3277 return log_error_errno(errno, "Failed to stat block device: %m");
3278
3279 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
3280 if (!d)
3281 return log_oom();
3282
3283 for (i = 0;; i++) {
3284 int n, m;
3285
3286 if (i >= 10) {
3287 log_error("Kernel partitions never appeared.");
3288 return -ENXIO;
3289 }
3290
3291 e = udev_enumerate_new(udev);
3292 if (!e)
3293 return log_oom();
3294
3295 r = udev_enumerate_add_match_parent(e, d);
3296 if (r < 0)
3297 return log_oom();
3298
3299 r = udev_enumerate_scan_devices(e);
3300 if (r < 0)
3301 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
3302
3303 /* Count the partitions enumerated by the kernel */
3304 n = 0;
3305 first = udev_enumerate_get_list_entry(e);
3306 udev_list_entry_foreach(item, first)
3307 n++;
3308
3309 /* Count the partitions enumerated by blkid */
3310 m = blkid_partlist_numof_partitions(pl);
3311 if (n == m + 1)
3312 break;
3313 if (n > m + 1) {
3314 log_error("blkid and kernel partition list do not match.");
3315 return -EIO;
3316 }
3317 if (n < m + 1) {
3318 unsigned j;
3319
3320 /* The kernel has probed fewer partitions than
3321 * blkid? Maybe the kernel prober is still
3322 * running or it got EBUSY because udev
3323 * already opened the device. Let's reprobe
3324 * the device, which is a synchronous call
3325 * that waits until probing is complete. */
3326
3327 for (j = 0; j < 20; j++) {
3328
3329 r = ioctl(fd, BLKRRPART, 0);
3330 if (r < 0)
3331 r = -errno;
3332 if (r >= 0 || r != -EBUSY)
3333 break;
3334
3335 /* If something else has the device
3336 * open, such as an udev rule, the
3337 * ioctl will return EBUSY. Since
3338 * there's no way to wait until it
3339 * isn't busy anymore, let's just wait
3340 * a bit, and try again.
3341 *
3342 * This is really something they
3343 * should fix in the kernel! */
3344
3345 usleep(50 * USEC_PER_MSEC);
3346 }
3347
3348 if (r < 0)
3349 return log_error_errno(r, "Failed to reread partition table: %m");
3350 }
3351
3352 e = udev_enumerate_unref(e);
3353 }
3354
3355 first = udev_enumerate_get_list_entry(e);
3356 udev_list_entry_foreach(item, first) {
3357 _cleanup_udev_device_unref_ struct udev_device *q;
3358 const char *node;
3359 unsigned long long flags;
3360 blkid_partition pp;
3361 dev_t qn;
3362 int nr;
3363
3364 errno = 0;
3365 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
3366 if (!q) {
3367 if (!errno)
3368 errno = ENOMEM;
3369
3370 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
3371 return -errno;
3372 }
3373
3374 qn = udev_device_get_devnum(q);
3375 if (major(qn) == 0)
3376 continue;
3377
3378 if (st.st_rdev == qn)
3379 continue;
3380
3381 node = udev_device_get_devnode(q);
3382 if (!node)
3383 continue;
3384
3385 pp = blkid_partlist_devno_to_partition(pl, qn);
3386 if (!pp)
3387 continue;
3388
3389 flags = blkid_partition_get_flags(pp);
3390
3391 nr = blkid_partition_get_partno(pp);
3392 if (nr < 0)
3393 continue;
3394
3395 if (is_gpt) {
3396 sd_id128_t type_id;
3397 const char *stype;
3398
3399 if (flags & GPT_FLAG_NO_AUTO)
3400 continue;
3401
3402 stype = blkid_partition_get_type_string(pp);
3403 if (!stype)
3404 continue;
3405
3406 if (sd_id128_from_string(stype, &type_id) < 0)
3407 continue;
3408
3409 if (sd_id128_equal(type_id, GPT_HOME)) {
3410
3411 if (home && nr >= home_nr)
3412 continue;
3413
3414 home_nr = nr;
3415 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3416
3417 r = free_and_strdup(&home, node);
3418 if (r < 0)
3419 return log_oom();
3420
3421 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3422
3423 if (srv && nr >= srv_nr)
3424 continue;
3425
3426 srv_nr = nr;
3427 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3428
3429 r = free_and_strdup(&srv, node);
3430 if (r < 0)
3431 return log_oom();
3432 }
3433 #ifdef GPT_ROOT_NATIVE
3434 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3435
3436 if (root && nr >= root_nr)
3437 continue;
3438
3439 root_nr = nr;
3440 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3441
3442 r = free_and_strdup(&root, node);
3443 if (r < 0)
3444 return log_oom();
3445 }
3446 #endif
3447 #ifdef GPT_ROOT_SECONDARY
3448 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3449
3450 if (secondary_root && nr >= secondary_root_nr)
3451 continue;
3452
3453 secondary_root_nr = nr;
3454 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3455
3456 r = free_and_strdup(&secondary_root, node);
3457 if (r < 0)
3458 return log_oom();
3459 }
3460 #endif
3461 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3462
3463 if (generic)
3464 multiple_generic = true;
3465 else {
3466 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3467
3468 r = free_and_strdup(&generic, node);
3469 if (r < 0)
3470 return log_oom();
3471 }
3472 }
3473
3474 } else if (is_mbr) {
3475 int type;
3476
3477 if (flags != 0x80) /* Bootable flag */
3478 continue;
3479
3480 type = blkid_partition_get_type(pp);
3481 if (type != 0x83) /* Linux partition */
3482 continue;
3483
3484 if (generic)
3485 multiple_generic = true;
3486 else {
3487 generic_rw = true;
3488
3489 r = free_and_strdup(&root, node);
3490 if (r < 0)
3491 return log_oom();
3492 }
3493 }
3494 }
3495
3496 if (root) {
3497 *root_device = root;
3498 root = NULL;
3499
3500 *root_device_rw = root_rw;
3501 *secondary = false;
3502 } else if (secondary_root) {
3503 *root_device = secondary_root;
3504 secondary_root = NULL;
3505
3506 *root_device_rw = secondary_root_rw;
3507 *secondary = true;
3508 } else if (generic) {
3509
3510 /* There were no partitions with precise meanings
3511 * around, but we found generic partitions. In this
3512 * case, if there's only one, we can go ahead and boot
3513 * it, otherwise we bail out, because we really cannot
3514 * make any sense of it. */
3515
3516 if (multiple_generic) {
3517 log_error("Identified multiple bootable Linux partitions on\n"
3518 " %s\n"
3519 PARTITION_TABLE_BLURB, arg_image);
3520 return -EINVAL;
3521 }
3522
3523 *root_device = generic;
3524 generic = NULL;
3525
3526 *root_device_rw = generic_rw;
3527 *secondary = false;
3528 } else {
3529 log_error("Failed to identify root partition in disk image\n"
3530 " %s\n"
3531 PARTITION_TABLE_BLURB, arg_image);
3532 return -EINVAL;
3533 }
3534
3535 if (home) {
3536 *home_device = home;
3537 home = NULL;
3538
3539 *home_device_rw = home_rw;
3540 }
3541
3542 if (srv) {
3543 *srv_device = srv;
3544 srv = NULL;
3545
3546 *srv_device_rw = srv_rw;
3547 }
3548
3549 return 0;
3550 #else
3551 log_error("--image= is not supported, compiled without blkid support.");
3552 return -EOPNOTSUPP;
3553 #endif
3554 }
3555
/* Probes the file system type of block device "what" with libblkid and
 * mounts it on "where", or on "where" + "directory" when a sub-directory
 * is given. The mount is always MS_NODEV, and additionally MS_RDONLY
 * when "rw" is false or the global arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style value on failure. Without
 * blkid support the function always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *type, *target;
        unsigned long mount_flags = MS_NODEV;
        int k;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition flag */
        rw = rw && !arg_read_only;
        if (!rw)
                mount_flags |= MS_RDONLY;

        /* Work out the actual mount point */
        target = where;
        if (directory)
                target = strjoina(where, directory);

        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                /* No errno set: treated as an allocation failure */
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        /* We are only interested in the superblock's type string */
        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        k = blkid_do_safeprobe(probe);
        switch (k) {

        case 0:
                break;

        case -1:
        case 1:
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;

        default:
                /* Any other result is reported as a probing failure */
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &type, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(type, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, target, type, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3619
/* Mounts the partitions found in a disk image below "where": the root
 * partition on "where" itself, the home partition on "where"/home and
 * the server data partition on "where"/srv. Any device that is NULL is
 * skipped. The *_rw flags are passed through to mount_device().
 *
 * Returns 0 on success, or the negative error of the first failing
 * mount. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the directory we were handed rather than
         * reaching for the global arg_directory, so that the "where"
         * parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3649
3650 static void loop_remove(int nr, int *image_fd) {
3651 _cleanup_close_ int control = -1;
3652 int r;
3653
3654 if (nr < 0)
3655 return;
3656
3657 if (image_fd && *image_fd >= 0) {
3658 r = ioctl(*image_fd, LOOP_CLR_FD);
3659 if (r < 0)
3660 log_debug_errno(errno, "Failed to close loop image: %m");
3661 *image_fd = safe_close(*image_fd);
3662 }
3663
3664 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3665 if (control < 0) {
3666 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3667 return;
3668 }
3669
3670 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3671 if (r < 0)
3672 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3673 }
3674
3675 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3676 int pipe_fds[2];
3677 pid_t pid;
3678
3679 assert(database);
3680 assert(key);
3681 assert(rpid);
3682
3683 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3684 return log_error_errno(errno, "Failed to allocate pipe: %m");
3685
3686 pid = fork();
3687 if (pid < 0)
3688 return log_error_errno(errno, "Failed to fork getent child: %m");
3689 else if (pid == 0) {
3690 int nullfd;
3691 char *empty_env = NULL;
3692
3693 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3694 _exit(EXIT_FAILURE);
3695
3696 if (pipe_fds[0] > 2)
3697 safe_close(pipe_fds[0]);
3698 if (pipe_fds[1] > 2)
3699 safe_close(pipe_fds[1]);
3700
3701 nullfd = open("/dev/null", O_RDWR);
3702 if (nullfd < 0)
3703 _exit(EXIT_FAILURE);
3704
3705 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3706 _exit(EXIT_FAILURE);
3707
3708 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3709 _exit(EXIT_FAILURE);
3710
3711 if (nullfd > 2)
3712 safe_close(nullfd);
3713
3714 (void) reset_all_signal_handlers();
3715 (void) reset_signal_mask();
3716 close_all_fds(NULL, 0);
3717
3718 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3719 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3720 _exit(EXIT_FAILURE);
3721 }
3722
3723 pipe_fds[1] = safe_close(pipe_fds[1]);
3724
3725 *rpid = pid;
3726
3727 return pipe_fds[0];
3728 }
3729
3730 static int change_uid_gid(char **_home) {
3731 char line[LINE_MAX], *x, *u, *g, *h;
3732 const char *word, *state;
3733 _cleanup_free_ uid_t *uids = NULL;
3734 _cleanup_free_ char *home = NULL;
3735 _cleanup_fclose_ FILE *f = NULL;
3736 _cleanup_close_ int fd = -1;
3737 unsigned n_uids = 0;
3738 size_t sz = 0, l;
3739 uid_t uid;
3740 gid_t gid;
3741 pid_t pid;
3742 int r;
3743
3744 assert(_home);
3745
3746 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3747 /* Reset everything fully to 0, just in case */
3748
3749 r = reset_uid_gid();
3750 if (r < 0)
3751 return log_error_errno(r, "Failed to become root: %m");
3752
3753 *_home = NULL;
3754 return 0;
3755 }
3756
3757 /* First, get user credentials */
3758 fd = spawn_getent("passwd", arg_user, &pid);
3759 if (fd < 0)
3760 return fd;
3761
3762 f = fdopen(fd, "r");
3763 if (!f)
3764 return log_oom();
3765 fd = -1;
3766
3767 if (!fgets(line, sizeof(line), f)) {
3768
3769 if (!ferror(f)) {
3770 log_error("Failed to resolve user %s.", arg_user);
3771 return -ESRCH;
3772 }
3773
3774 log_error_errno(errno, "Failed to read from getent: %m");
3775 return -errno;
3776 }
3777
3778 truncate_nl(line);
3779
3780 wait_for_terminate_and_warn("getent passwd", pid, true);
3781
3782 x = strchr(line, ':');
3783 if (!x) {
3784 log_error("/etc/passwd entry has invalid user field.");
3785 return -EIO;
3786 }
3787
3788 u = strchr(x+1, ':');
3789 if (!u) {
3790 log_error("/etc/passwd entry has invalid password field.");
3791 return -EIO;
3792 }
3793
3794 u++;
3795 g = strchr(u, ':');
3796 if (!g) {
3797 log_error("/etc/passwd entry has invalid UID field.");
3798 return -EIO;
3799 }
3800
3801 *g = 0;
3802 g++;
3803 x = strchr(g, ':');
3804 if (!x) {
3805 log_error("/etc/passwd entry has invalid GID field.");
3806 return -EIO;
3807 }
3808
3809 *x = 0;
3810 h = strchr(x+1, ':');
3811 if (!h) {
3812 log_error("/etc/passwd entry has invalid GECOS field.");
3813 return -EIO;
3814 }
3815
3816 h++;
3817 x = strchr(h, ':');
3818 if (!x) {
3819 log_error("/etc/passwd entry has invalid home directory field.");
3820 return -EIO;
3821 }
3822
3823 *x = 0;
3824
3825 r = parse_uid(u, &uid);
3826 if (r < 0) {
3827 log_error("Failed to parse UID of user.");
3828 return -EIO;
3829 }
3830
3831 r = parse_gid(g, &gid);
3832 if (r < 0) {
3833 log_error("Failed to parse GID of user.");
3834 return -EIO;
3835 }
3836
3837 home = strdup(h);
3838 if (!home)
3839 return log_oom();
3840
3841 /* Second, get group memberships */
3842 fd = spawn_getent("initgroups", arg_user, &pid);
3843 if (fd < 0)
3844 return fd;
3845
3846 fclose(f);
3847 f = fdopen(fd, "r");
3848 if (!f)
3849 return log_oom();
3850 fd = -1;
3851
3852 if (!fgets(line, sizeof(line), f)) {
3853 if (!ferror(f)) {
3854 log_error("Failed to resolve user %s.", arg_user);
3855 return -ESRCH;
3856 }
3857
3858 log_error_errno(errno, "Failed to read from getent: %m");
3859 return -errno;
3860 }
3861
3862 truncate_nl(line);
3863
3864 wait_for_terminate_and_warn("getent initgroups", pid, true);
3865
3866 /* Skip over the username and subsequent separator whitespace */
3867 x = line;
3868 x += strcspn(x, WHITESPACE);
3869 x += strspn(x, WHITESPACE);
3870
3871 FOREACH_WORD(word, l, x, state) {
3872 char c[l+1];
3873
3874 memcpy(c, word, l);
3875 c[l] = 0;
3876
3877 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3878 return log_oom();
3879
3880 r = parse_uid(c, &uids[n_uids++]);
3881 if (r < 0) {
3882 log_error("Failed to parse group data from getent.");
3883 return -EIO;
3884 }
3885 }
3886
3887 r = mkdir_parents(home, 0775);
3888 if (r < 0)
3889 return log_error_errno(r, "Failed to make home root directory: %m");
3890
3891 r = mkdir_safe(home, 0755, uid, gid);
3892 if (r < 0 && r != -EEXIST)
3893 return log_error_errno(r, "Failed to make home directory: %m");
3894
3895 (void) fchown(STDIN_FILENO, uid, gid);
3896 (void) fchown(STDOUT_FILENO, uid, gid);
3897 (void) fchown(STDERR_FILENO, uid, gid);
3898
3899 if (setgroups(n_uids, uids) < 0)
3900 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3901
3902 if (setresgid(gid, gid, gid) < 0)
3903 return log_error_errno(errno, "setregid() failed: %m");
3904
3905 if (setresuid(uid, uid, uid) < 0)
3906 return log_error_errno(errno, "setreuid() failed: %m");
3907
3908 if (_home) {
3909 *_home = home;
3910 home = NULL;
3911 }
3912
3913 return 0;
3914 }
3915
3916 /*
3917 * Return values:
3918 * < 0 : wait_for_terminate() failed to get the state of the
3919 * container, the container was terminated by a signal, or
3920 * failed for an unknown reason. No change is made to the
3921 * container argument.
3922 * > 0 : The program executed in the container terminated with an
3923 * error. The exit code of the program executed in the
3924 * container is returned. The container argument has been set
3925 * to CONTAINER_TERMINATED.
3926 * 0 : The container is being rebooted, has been shut down or exited
3927 * successfully. The container argument has been set to either
3928 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3929 *
3930 * That is, success is indicated by a return value of zero, and an
3931 * error is indicated by a non-zero value.
3932 */
3933 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3934 siginfo_t status;
3935 int r;
3936
3937 r = wait_for_terminate(pid, &status);
3938 if (r < 0)
3939 return log_warning_errno(r, "Failed to wait for container: %m");
3940
3941 switch (status.si_code) {
3942
3943 case CLD_EXITED:
3944 if (status.si_status == 0) {
3945 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3946
3947 } else
3948 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3949
3950 *container = CONTAINER_TERMINATED;
3951 return status.si_status;
3952
3953 case CLD_KILLED:
3954 if (status.si_status == SIGINT) {
3955
3956 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3957 *container = CONTAINER_TERMINATED;
3958 return 0;
3959
3960 } else if (status.si_status == SIGHUP) {
3961
3962 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3963 *container = CONTAINER_REBOOTED;
3964 return 0;
3965 }
3966
3967 /* CLD_KILLED fallthrough */
3968
3969 case CLD_DUMPED:
3970 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3971 return -EIO;
3972
3973 default:
3974 log_error("Container %s failed due to unknown reason.", arg_machine);
3975 return -EIO;
3976 }
3977
3978 return r;
3979 }
3980
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
3982
3983 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3984 pid_t pid;
3985
3986 pid = PTR_TO_UINT32(userdata);
3987 if (pid > 0) {
3988 if (kill(pid, arg_kill_signal) >= 0) {
3989 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3990 sd_event_source_set_userdata(s, NULL);
3991 return 0;
3992 }
3993 }
3994
3995 sd_event_exit(sd_event_source_get_event(s), 0);
3996 return 0;
3997 }
3998
3999 static int determine_names(void) {
4000 int r;
4001
4002 if (!arg_image && !arg_directory) {
4003 if (arg_machine) {
4004 _cleanup_(image_unrefp) Image *i = NULL;
4005
4006 r = image_find(arg_machine, &i);
4007 if (r < 0)
4008 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
4009 else if (r == 0) {
4010 log_error("No image for machine '%s': %m", arg_machine);
4011 return -ENOENT;
4012 }
4013
4014 if (i->type == IMAGE_RAW)
4015 r = set_sanitized_path(&arg_image, i->path);
4016 else
4017 r = set_sanitized_path(&arg_directory, i->path);
4018 if (r < 0)
4019 return log_error_errno(r, "Invalid image directory: %m");
4020
4021 if (!arg_ephemeral)
4022 arg_read_only = arg_read_only || i->read_only;
4023 } else
4024 arg_directory = get_current_dir_name();
4025
4026 if (!arg_directory && !arg_machine) {
4027 log_error("Failed to determine path, please use -D or -i.");
4028 return -EINVAL;
4029 }
4030 }
4031
4032 if (!arg_machine) {
4033 if (arg_directory && path_equal(arg_directory, "/"))
4034 arg_machine = gethostname_malloc();
4035 else
4036 arg_machine = strdup(basename(arg_image ?: arg_directory));
4037
4038 if (!arg_machine)
4039 return log_oom();
4040
4041 hostname_cleanup(arg_machine);
4042 if (!machine_name_is_valid(arg_machine)) {
4043 log_error("Failed to determine machine name automatically, please use -M.");
4044 return -EINVAL;
4045 }
4046
4047 if (arg_ephemeral) {
4048 char *b;
4049
4050 /* Add a random suffix when this is an
4051 * ephemeral machine, so that we can run many
4052 * instances at once without manually having
4053 * to specify -M each time. */
4054
4055 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4056 return log_oom();
4057
4058 free(arg_machine);
4059 arg_machine = b;
4060 }
4061 }
4062
4063 return 0;
4064 }
4065
4066 static int determine_uid_shift(const char *directory) {
4067 int r;
4068
4069 if (!arg_userns) {
4070 arg_uid_shift = 0;
4071 return 0;
4072 }
4073
4074 if (arg_uid_shift == UID_INVALID) {
4075 struct stat st;
4076
4077 r = stat(directory, &st);
4078 if (r < 0)
4079 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4080
4081 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4082
4083 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4084 log_error("UID and GID base of %s don't match.", directory);
4085 return -EINVAL;
4086 }
4087
4088 arg_uid_range = UINT32_C(0x10000);
4089 }
4090
4091 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4092 log_error("UID base too high for UID range.");
4093 return -EINVAL;
4094 }
4095
4096 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4097 return 0;
4098 }
4099
4100 static int inner_child(
4101 Barrier *barrier,
4102 const char *directory,
4103 bool secondary,
4104 int kmsg_socket,
4105 int rtnl_socket,
4106 FDSet *fds,
4107 int argc,
4108 char *argv[]) {
4109
4110 _cleanup_free_ char *home = NULL;
4111 unsigned n_env = 2;
4112 const char *envp[] = {
4113 "PATH=" DEFAULT_PATH_SPLIT_USR,
4114 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4115 NULL, /* TERM */
4116 NULL, /* HOME */
4117 NULL, /* USER */
4118 NULL, /* LOGNAME */
4119 NULL, /* container_uuid */
4120 NULL, /* LISTEN_FDS */
4121 NULL, /* LISTEN_PID */
4122 NULL
4123 };
4124
4125 _cleanup_strv_free_ char **env_use = NULL;
4126 int r;
4127
4128 assert(barrier);
4129 assert(directory);
4130 assert(kmsg_socket >= 0);
4131
4132 if (arg_userns) {
4133 /* Tell the parent, that it now can write the UID map. */
4134 (void) barrier_place(barrier); /* #1 */
4135
4136 /* Wait until the parent wrote the UID map */
4137 if (!barrier_place_and_sync(barrier)) { /* #2 */
4138 log_error("Parent died too early");
4139 return -ESRCH;
4140 }
4141 }
4142
4143 r = mount_all(NULL, true);
4144 if (r < 0)
4145 return r;
4146
4147 /* Wait until we are cgroup-ified, so that we
4148 * can mount the right cgroup path writable */
4149 if (!barrier_place_and_sync(barrier)) { /* #3 */
4150 log_error("Parent died too early");
4151 return -ESRCH;
4152 }
4153
4154 r = mount_systemd_cgroup_writable("");
4155 if (r < 0)
4156 return r;
4157
4158 r = reset_uid_gid();
4159 if (r < 0)
4160 return log_error_errno(r, "Couldn't become new root: %m");
4161
4162 r = setup_boot_id(NULL);
4163 if (r < 0)
4164 return r;
4165
4166 r = setup_kmsg(NULL, kmsg_socket);
4167 if (r < 0)
4168 return r;
4169 kmsg_socket = safe_close(kmsg_socket);
4170
4171 umask(0022);
4172
4173 if (setsid() < 0)
4174 return log_error_errno(errno, "setsid() failed: %m");
4175
4176 if (arg_private_network)
4177 loopback_setup();
4178
4179 r = send_rtnl(rtnl_socket);
4180 if (r < 0)
4181 return r;
4182 rtnl_socket = safe_close(rtnl_socket);
4183
4184 if (drop_capabilities() < 0)
4185 return log_error_errno(errno, "drop_capabilities() failed: %m");
4186
4187 setup_hostname();
4188
4189 if (arg_personality != PERSONALITY_INVALID) {
4190 if (personality(arg_personality) < 0)
4191 return log_error_errno(errno, "personality() failed: %m");
4192 } else if (secondary) {
4193 if (personality(PER_LINUX32) < 0)
4194 return log_error_errno(errno, "personality() failed: %m");
4195 }
4196
4197 #ifdef HAVE_SELINUX
4198 if (arg_selinux_context)
4199 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4200 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4201 #endif
4202
4203 r = change_uid_gid(&home);
4204 if (r < 0)
4205 return r;
4206
4207 envp[n_env] = strv_find_prefix(environ, "TERM=");
4208 if (envp[n_env])
4209 n_env ++;
4210
4211 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4212 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4213 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4214 return log_oom();
4215
4216 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4217 char as_uuid[37];
4218
4219 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4220 return log_oom();
4221 }
4222
4223 if (fdset_size(fds) > 0) {
4224 r = fdset_cloexec(fds, false);
4225 if (r < 0)
4226 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4227
4228 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4229 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4230 return log_oom();
4231 }
4232
4233 env_use = strv_env_merge(2, envp, arg_setenv);
4234 if (!env_use)
4235 return log_oom();
4236
4237 /* Let the parent know that we are ready and
4238 * wait until the parent is ready with the
4239 * setup, too... */
4240 if (!barrier_place_and_sync(barrier)) { /* #4 */
4241 log_error("Parent died too early");
4242 return -ESRCH;
4243 }
4244
4245 /* Now, explicitly close the log, so that we
4246 * then can close all remaining fds. Closing
4247 * the log explicitly first has the benefit
4248 * that the logging subsystem knows about it,
4249 * and is thus ready to be reopened should we
4250 * need it again. Note that the other fds
4251 * closed here are at least the locking and
4252 * barrier fds. */
4253 log_close();
4254 (void) fdset_close_others(fds);
4255
4256 if (arg_boot) {
4257 char **a;
4258 size_t m;
4259
4260 /* Automatically search for the init system */
4261
4262 m = 1 + argc - optind;
4263 a = newa(char*, m + 1);
4264 memcpy(a + 1, argv + optind, m * sizeof(char*));
4265
4266 a[0] = (char*) "/usr/lib/systemd/systemd";
4267 execve(a[0], a, env_use);
4268
4269 a[0] = (char*) "/lib/systemd/systemd";
4270 execve(a[0], a, env_use);
4271
4272 a[0] = (char*) "/sbin/init";
4273 execve(a[0], a, env_use);
4274 } else if (argc > optind)
4275 execvpe(argv[optind], argv + optind, env_use);
4276 else {
4277 chdir(home ? home : "/root");
4278 execle("/bin/bash", "-bash", NULL, env_use);
4279 execle("/bin/sh", "-sh", NULL, env_use);
4280 }
4281
4282 (void) log_open();
4283 return log_error_errno(errno, "execv() failed: %m");
4284 }
4285
4286 static int outer_child(
4287 Barrier *barrier,
4288 const char *directory,
4289 const char *console,
4290 const char *root_device, bool root_device_rw,
4291 const char *home_device, bool home_device_rw,
4292 const char *srv_device, bool srv_device_rw,
4293 bool interactive,
4294 bool secondary,
4295 int pid_socket,
4296 int kmsg_socket,
4297 int rtnl_socket,
4298 int uid_shift_socket,
4299 FDSet *fds,
4300 int argc,
4301 char *argv[]) {
4302
4303 pid_t pid;
4304 ssize_t l;
4305 int r;
4306
4307 assert(barrier);
4308 assert(directory);
4309 assert(console);
4310 assert(pid_socket >= 0);
4311 assert(kmsg_socket >= 0);
4312
4313 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4314 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4315
4316 if (interactive) {
4317 close_nointr(STDIN_FILENO);
4318 close_nointr(STDOUT_FILENO);
4319 close_nointr(STDERR_FILENO);
4320
4321 r = open_terminal(console, O_RDWR);
4322 if (r != STDIN_FILENO) {
4323 if (r >= 0) {
4324 safe_close(r);
4325 r = -EINVAL;
4326 }
4327
4328 return log_error_errno(r, "Failed to open console: %m");
4329 }
4330
4331 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4332 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4333 return log_error_errno(errno, "Failed to duplicate console: %m");
4334 }
4335
4336 r = reset_audit_loginuid();
4337 if (r < 0)
4338 return r;
4339
4340 /* Mark everything as slave, so that we still
4341 * receive mounts from the real root, but don't
4342 * propagate mounts to the real root. */
4343 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4344 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4345
4346 r = mount_devices(directory,
4347 root_device, root_device_rw,
4348 home_device, home_device_rw,
4349 srv_device, srv_device_rw);
4350 if (r < 0)
4351 return r;
4352
4353 r = determine_uid_shift(directory);
4354 if (r < 0)
4355 return r;
4356
4357 if (arg_userns) {
4358 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4359 if (l < 0)
4360 return log_error_errno(errno, "Failed to send UID shift: %m");
4361 if (l != sizeof(arg_uid_shift)) {
4362 log_error("Short write while sending UID shift.");
4363 return -EIO;
4364 }
4365 }
4366
4367 /* Turn directory into bind mount */
4368 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4369 return log_error_errno(errno, "Failed to make bind mount: %m");
4370
4371 r = setup_volatile(directory);
4372 if (r < 0)
4373 return r;
4374
4375 r = setup_volatile_state(directory);
4376 if (r < 0)
4377 return r;
4378
4379 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4380 if (r < 0)
4381 return r;
4382
4383 if (arg_read_only) {
4384 r = bind_remount_recursive(directory, true);
4385 if (r < 0)
4386 return log_error_errno(r, "Failed to make tree read-only: %m");
4387 }
4388
4389 r = mount_all(directory, false);
4390 if (r < 0)
4391 return r;
4392
4393 if (copy_devnodes(directory) < 0)
4394 return r;
4395
4396 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4397
4398 if (setup_pts(directory) < 0)
4399 return r;
4400
4401 r = setup_propagate(directory);
4402 if (r < 0)
4403 return r;
4404
4405 r = setup_dev_console(directory, console);
4406 if (r < 0)
4407 return r;
4408
4409 r = setup_seccomp();
4410 if (r < 0)
4411 return r;
4412
4413 r = setup_timezone(directory);
4414 if (r < 0)
4415 return r;
4416
4417 r = setup_resolv_conf(directory);
4418 if (r < 0)
4419 return r;
4420
4421 r = setup_journal(directory);
4422 if (r < 0)
4423 return r;
4424
4425 r = mount_custom(directory);
4426 if (r < 0)
4427 return r;
4428
4429 r = mount_cgroup(directory);
4430 if (r < 0)
4431 return r;
4432
4433 r = mount_move_root(directory);
4434 if (r < 0)
4435 return log_error_errno(r, "Failed to move root directory: %m");
4436
4437 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4438 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4439 (arg_private_network ? CLONE_NEWNET : 0) |
4440 (arg_userns ? CLONE_NEWUSER : 0),
4441 NULL);
4442 if (pid < 0)
4443 return log_error_errno(errno, "Failed to fork inner child: %m");
4444
4445 if (pid == 0) {
4446 pid_socket = safe_close(pid_socket);
4447 uid_shift_socket = safe_close(uid_shift_socket);
4448
4449 /* The inner child has all namespaces that are
4450 * requested, so that we all are owned by the user if
4451 * user namespaces are turned on. */
4452
4453 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4454 if (r < 0)
4455 _exit(EXIT_FAILURE);
4456
4457 _exit(EXIT_SUCCESS);
4458 }
4459
4460 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4461 if (l < 0)
4462 return log_error_errno(errno, "Failed to send PID: %m");
4463 if (l != sizeof(pid)) {
4464 log_error("Short write while sending PID.");
4465 return -EIO;
4466 }
4467
4468 pid_socket = safe_close(pid_socket);
4469
4470 return 0;
4471 }
4472
4473 static int setup_uid_map(pid_t pid) {
4474 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4475 int r;
4476
4477 assert(pid > 1);
4478
4479 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4480 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4481 r = write_string_file(uid_map, line, 0);
4482 if (r < 0)
4483 return log_error_errno(r, "Failed to write UID map: %m");
4484
4485 /* We always assign the same UID and GID ranges */
4486 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4487 r = write_string_file(uid_map, line, 0);
4488 if (r < 0)
4489 return log_error_errno(r, "Failed to write GID map: %m");
4490
4491 return 0;
4492 }
4493
4494 static int chown_cgroup(pid_t pid) {
4495 _cleanup_free_ char *path = NULL, *fs = NULL;
4496 _cleanup_close_ int fd = -1;
4497 const char *fn;
4498 int r;
4499
4500 r = cg_pid_get_path(NULL, pid, &path);
4501 if (r < 0)
4502 return log_error_errno(r, "Failed to get container cgroup path: %m");
4503
4504 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4505 if (r < 0)
4506 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4507
4508 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4509 if (fd < 0)
4510 return log_error_errno(errno, "Failed to open %s: %m", fs);
4511
4512 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4513 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4514 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4515
4516 return 0;
4517 }
4518
4519 int main(int argc, char *argv[]) {
4520
4521 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4522 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4523 _cleanup_close_ int master = -1, image_fd = -1;
4524 _cleanup_fdset_free_ FDSet *fds = NULL;
4525 int r, n_fd_passed, loop_nr = -1;
4526 char veth_name[IFNAMSIZ];
4527 bool secondary = false, remove_subvol = false;
4528 sigset_t mask_chld;
4529 pid_t pid = 0;
4530 int ret = EXIT_SUCCESS;
4531 union in_addr_union exposed = {};
4532 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4533 bool interactive;
4534
4535 log_parse_environment();
4536 log_open();
4537
4538 r = parse_argv(argc, argv);
4539 if (r <= 0)
4540 goto finish;
4541
4542 r = determine_names();
4543 if (r < 0)
4544 goto finish;
4545
4546 if (geteuid() != 0) {
4547 log_error("Need to be root.");
4548 r = -EPERM;
4549 goto finish;
4550 }
4551
4552 n_fd_passed = sd_listen_fds(false);
4553 if (n_fd_passed > 0) {
4554 r = fdset_new_listen_fds(&fds, false);
4555 if (r < 0) {
4556 log_error_errno(r, "Failed to collect file descriptors: %m");
4557 goto finish;
4558 }
4559 }
4560
4561 if (arg_directory) {
4562 assert(!arg_image);
4563
4564 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4565 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4566 r = -EINVAL;
4567 goto finish;
4568 }
4569
4570 if (arg_ephemeral) {
4571 _cleanup_free_ char *np = NULL;
4572
4573 /* If the specified path is a mount point we
4574 * generate the new snapshot immediately
4575 * inside it under a random name. However if
4576 * the specified is not a mount point we
4577 * create the new snapshot in the parent
4578 * directory, just next to it. */
4579 r = path_is_mount_point(arg_directory, 0);
4580 if (r < 0) {
4581 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4582 goto finish;
4583 }
4584 if (r > 0)
4585 r = tempfn_random_child(arg_directory, "machine.", &np);
4586 else
4587 r = tempfn_random(arg_directory, "machine.", &np);
4588 if (r < 0) {
4589 log_error_errno(r, "Failed to generate name for snapshot: %m");
4590 goto finish;
4591 }
4592
4593 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4594 if (r < 0) {
4595 log_error_errno(r, "Failed to lock %s: %m", np);
4596 goto finish;
4597 }
4598
4599 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4600 if (r < 0) {
4601 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4602 goto finish;
4603 }
4604
4605 free(arg_directory);
4606 arg_directory = np;
4607 np = NULL;
4608
4609 remove_subvol = true;
4610
4611 } else {
4612 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4613 if (r == -EBUSY) {
4614 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4615 goto finish;
4616 }
4617 if (r < 0) {
4618 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4619 return r;
4620 }
4621
4622 if (arg_template) {
4623 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4624 if (r == -EEXIST) {
4625 if (!arg_quiet)
4626 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4627 } else if (r < 0) {
4628 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4629 goto finish;
4630 } else {
4631 if (!arg_quiet)
4632 log_info("Populated %s from template %s.", arg_directory, arg_template);
4633 }
4634 }
4635 }
4636
4637 if (arg_boot) {
4638 if (path_is_os_tree(arg_directory) <= 0) {
4639 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4640 r = -EINVAL;
4641 goto finish;
4642 }
4643 } else {
4644 const char *p;
4645
4646 p = strjoina(arg_directory,
4647 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4648 if (access(p, F_OK) < 0) {
4649 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
4650 r = -EINVAL;
4651 goto finish;
4652 }
4653 }
4654
4655 } else {
4656 char template[] = "/tmp/nspawn-root-XXXXXX";
4657
4658 assert(arg_image);
4659 assert(!arg_template);
4660
4661 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4662 if (r == -EBUSY) {
4663 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4664 goto finish;
4665 }
4666 if (r < 0) {
4667 r = log_error_errno(r, "Failed to create image lock: %m");
4668 goto finish;
4669 }
4670
4671 if (!mkdtemp(template)) {
4672 log_error_errno(errno, "Failed to create temporary directory: %m");
4673 r = -errno;
4674 goto finish;
4675 }
4676
4677 arg_directory = strdup(template);
4678 if (!arg_directory) {
4679 r = log_oom();
4680 goto finish;
4681 }
4682
4683 image_fd = setup_image(&device_path, &loop_nr);
4684 if (image_fd < 0) {
4685 r = image_fd;
4686 goto finish;
4687 }
4688
4689 r = dissect_image(image_fd,
4690 &root_device, &root_device_rw,
4691 &home_device, &home_device_rw,
4692 &srv_device, &srv_device_rw,
4693 &secondary);
4694 if (r < 0)
4695 goto finish;
4696 }
4697
4698 r = custom_mounts_prepare();
4699 if (r < 0)
4700 goto finish;
4701
4702 interactive =
4703 isatty(STDIN_FILENO) > 0 &&
4704 isatty(STDOUT_FILENO) > 0;
4705
4706 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4707 if (master < 0) {
4708 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4709 goto finish;
4710 }
4711
4712 r = ptsname_malloc(master, &console);
4713 if (r < 0) {
4714 r = log_error_errno(r, "Failed to determine tty name: %m");
4715 goto finish;
4716 }
4717
4718 if (unlockpt(master) < 0) {
4719 r = log_error_errno(errno, "Failed to unlock tty: %m");
4720 goto finish;
4721 }
4722
4723 if (!arg_quiet)
4724 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4725 arg_machine, arg_image ?: arg_directory);
4726
4727 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4728
4729 assert_se(sigemptyset(&mask_chld) == 0);
4730 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4731
4732 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4733 r = log_error_errno(errno, "Failed to become subreaper: %m");
4734 goto finish;
4735 }
4736
4737 for (;;) {
4738 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4739 uid_shift_socket_pair[2] = { -1, -1 };
4740 ContainerStatus container_status;
4741 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4742 static const struct sigaction sa = {
4743 .sa_handler = nop_handler,
4744 .sa_flags = SA_NOCLDSTOP,
4745 };
4746 int ifi = 0;
4747 ssize_t l;
4748 _cleanup_event_unref_ sd_event *event = NULL;
4749 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4750 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4751 char last_char = 0;
4752
4753 r = barrier_create(&barrier);
4754 if (r < 0) {
4755 log_error_errno(r, "Cannot initialize IPC barrier: %m");
4756 goto finish;
4757 }
4758
4759 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4760 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4761 goto finish;
4762 }
4763
4764 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4765 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4766 goto finish;
4767 }
4768
4769 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4770 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4771 goto finish;
4772 }
4773
4774 if (arg_userns)
4775 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4776 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4777 goto finish;
4778 }
4779
4780 /* Child can be killed before execv(), so handle SIGCHLD
4781 * in order to interrupt parent's blocking calls and
4782 * give it a chance to call wait() and terminate. */
4783 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4784 if (r < 0) {
4785 r = log_error_errno(errno, "Failed to change the signal mask: %m");
4786 goto finish;
4787 }
4788
4789 r = sigaction(SIGCHLD, &sa, NULL);
4790 if (r < 0) {
4791 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4792 goto finish;
4793 }
4794
4795 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
4796 if (pid < 0) {
4797 if (errno == EINVAL)
4798 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4799 else
4800 r = log_error_errno(errno, "clone() failed: %m");
4801
4802 goto finish;
4803 }
4804
4805 if (pid == 0) {
4806 /* The outer child only has a file system namespace. */
4807 barrier_set_role(&barrier, BARRIER_CHILD);
4808
4809 master = safe_close(master);
4810
4811 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4812 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4813 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4814 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4815
4816 (void) reset_all_signal_handlers();
4817 (void) reset_signal_mask();
4818
4819 r = outer_child(&barrier,
4820 arg_directory,
4821 console,
4822 root_device, root_device_rw,
4823 home_device, home_device_rw,
4824 srv_device, srv_device_rw,
4825 interactive,
4826 secondary,
4827 pid_socket_pair[1],
4828 kmsg_socket_pair[1],
4829 rtnl_socket_pair[1],
4830 uid_shift_socket_pair[1],
4831 fds,
4832 argc, argv);
4833 if (r < 0)
4834 _exit(EXIT_FAILURE);
4835
4836 _exit(EXIT_SUCCESS);
4837 }
4838
4839 barrier_set_role(&barrier, BARRIER_PARENT);
4840
4841 fdset_free(fds);
4842 fds = NULL;
4843
4844 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4845 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4846 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4847
4848 /* Wait for the outer child. */
4849 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4850 if (r < 0)
4851 goto finish;
4852 if (r != 0) {
4853 r = -EIO;
4854 goto finish;
4855 }
4856 pid = 0;
4857
4858 /* And now retrieve the PID of the inner child. */
4859 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4860 if (l < 0) {
4861 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4862 goto finish;
4863 }
4864 if (l != sizeof(pid)) {
4865 log_error("Short read while reading inner child PID: %m");
4866 r = EIO;
4867 goto finish;
4868 }
4869
4870 log_debug("Init process invoked as PID " PID_FMT, pid);
4871
4872 if (arg_userns) {
4873 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4874 log_error("Child died too early.");
4875 r = -ESRCH;
4876 goto finish;
4877 }
4878
4879 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4880 if (l < 0) {
4881 r = log_error_errno(errno, "Failed to read UID shift: %m");
4882 goto finish;
4883 }
4884 if (l != sizeof(arg_uid_shift)) {
4885 log_error("Short read while reading UID shift: %m");
4886 r = EIO;
4887 goto finish;
4888 }
4889
4890 r = setup_uid_map(pid);
4891 if (r < 0)
4892 goto finish;
4893
4894 (void) barrier_place(&barrier); /* #2 */
4895 }
4896
4897 r = move_network_interfaces(pid);
4898 if (r < 0)
4899 goto finish;
4900
4901 r = setup_veth(pid, veth_name, &ifi);
4902 if (r < 0)
4903 goto finish;
4904
4905 r = setup_bridge(veth_name, &ifi);
4906 if (r < 0)
4907 goto finish;
4908
4909 r = setup_macvlan(pid);
4910 if (r < 0)
4911 goto finish;
4912
4913 r = setup_ipvlan(pid);
4914 if (r < 0)
4915 goto finish;
4916
4917 r = register_machine(pid, ifi);
4918 if (r < 0)
4919 goto finish;
4920
4921 r = chown_cgroup(pid);
4922 if (r < 0)
4923 goto finish;
4924
4925 /* Notify the child that the parent is ready with all
4926 * its setup (including cgroup-ification), and that
4927 * the child can now hand over control to the code to
4928 * run inside the container. */
4929 (void) barrier_place(&barrier); /* #3 */
4930
4931 /* Block SIGCHLD here, before notifying child.
4932 * process_pty() will handle it with the other signals. */
4933 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4934
4935 /* Reset signal to default */
4936 r = default_signals(SIGCHLD, -1);
4937 if (r < 0) {
4938 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4939 goto finish;
4940 }
4941
4942 /* Let the child know that we are ready and wait that the child is completely ready now. */
4943 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4944 log_error("Client died too early.");
4945 r = -ESRCH;
4946 goto finish;
4947 }
4948
4949 sd_notifyf(false,
4950 "READY=1\n"
4951 "STATUS=Container running.\n"
4952 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4953
4954 r = sd_event_new(&event);
4955 if (r < 0) {
4956 log_error_errno(r, "Failed to get default event source: %m");
4957 goto finish;
4958 }
4959
4960 if (arg_kill_signal > 0) {
4961 /* Try to kill the init system on SIGINT or SIGTERM */
4962 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4963 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4964 } else {
4965 /* Immediately exit */
4966 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4967 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4968 }
4969
4970 /* simply exit on sigchld */
4971 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4972
4973 if (arg_expose_ports) {
4974 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4975 if (r < 0)
4976 goto finish;
4977
4978 (void) expose_ports(rtnl, &exposed);
4979 }
4980
4981 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4982
4983 r = pty_forward_new(event, master, true, !interactive, &forward);
4984 if (r < 0) {
4985 log_error_errno(r, "Failed to create PTY forwarder: %m");
4986 goto finish;
4987 }
4988
4989 r = sd_event_loop(event);
4990 if (r < 0) {
4991 log_error_errno(r, "Failed to run event loop: %m");
4992 goto finish;
4993 }
4994
4995 pty_forward_get_last_char(forward, &last_char);
4996
4997 forward = pty_forward_free(forward);
4998
4999 if (!arg_quiet && last_char != '\n')
5000 putc('\n', stdout);
5001
5002 /* Kill if it is not dead yet anyway */
5003 terminate_machine(pid);
5004
5005 /* Normally redundant, but better safe than sorry */
5006 kill(pid, SIGKILL);
5007
5008 r = wait_for_container(pid, &container_status);
5009 pid = 0;
5010
5011 if (r < 0)
5012 /* We failed to wait for the container, or the
5013 * container exited abnormally */
5014 goto finish;
5015 else if (r > 0 || container_status == CONTAINER_TERMINATED){
5016 /* The container exited with a non-zero
5017 * status, or with zero status and no reboot
5018 * was requested. */
5019 ret = r;
5020 break;
5021 }
5022
5023 /* CONTAINER_REBOOTED, loop again */
5024
5025 if (arg_keep_unit) {
5026 /* Special handling if we are running as a
5027 * service: instead of simply restarting the
5028 * machine we want to restart the entire
5029 * service, so let's inform systemd about this
5030 * with the special exit code 133. The service
5031 * file uses RestartForceExitStatus=133 so
5032 * that this results in a full nspawn
5033 * restart. This is necessary since we might
5034 * have cgroup parameters set we want to have
5035 * flushed out. */
5036 ret = 133;
5037 r = 0;
5038 break;
5039 }
5040
5041 flush_ports(&exposed);
5042 }
5043
5044 finish:
5045 sd_notify(false,
5046 "STOPPING=1\n"
5047 "STATUS=Terminating...");
5048
5049 if (pid > 0)
5050 kill(pid, SIGKILL);
5051
5052 /* Try to flush whatever is still queued in the pty */
5053 if (master >= 0)
5054 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5055
5056 loop_remove(loop_nr, &image_fd);
5057
5058 if (remove_subvol && arg_directory) {
5059 int k;
5060
5061 k = btrfs_subvol_remove(arg_directory, true);
5062 if (k < 0)
5063 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5064 }
5065
5066 if (arg_machine) {
5067 const char *p;
5068
5069 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5070 (void) rm_rf(p, REMOVE_ROOT);
5071 }
5072
5073 free(arg_directory);
5074 free(arg_template);
5075 free(arg_image);
5076 free(arg_machine);
5077 free(arg_user);
5078 strv_free(arg_setenv);
5079 strv_free(arg_network_interfaces);
5080 strv_free(arg_network_macvlan);
5081 strv_free(arg_network_ipvlan);
5082 custom_mount_free_all();
5083
5084 flush_ports(&exposed);
5085
5086 while (arg_expose_ports) {
5087 ExposePort *p = arg_expose_ports;
5088 LIST_REMOVE(ports, arg_expose_ports, p);
5089 free(p);
5090 }
5091
5092 return r < 0 ? EXIT_FAILURE : ret;
5093 }