]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
nspawn: don't try to extract quotes from option string, glibc doesn't do that either
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* How a container run ended.
 * NOTE(review): the consumers of this enum are outside this part of the
 * file; the meaning is taken from the constant names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
116
/* Journal linking mode, selected with --link-journal= (or -j). The "try-"
 * variants of the option map to LINK_GUEST/LINK_HOST with
 * arg_link_journal_try set. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
123
/* Value of the --volatile[=MODE] switch. */
typedef enum Volatile {
        VOLATILE_NO,            /* default, or an explicit false boolean */
        VOLATILE_YES,           /* no argument, or a true boolean */
        VOLATILE_STATE,         /* --volatile=state */
} Volatile;
129
/* Kind of user-requested mount. The numeric order matters: it is the
 * tie-breaker in custom_mount_compare() for equal destinations. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND,      /* --bind= / --bind-ro= */
        CUSTOM_MOUNT_TMPFS,     /* --tmpfs= */
        CUSTOM_MOUNT_OVERLAY,   /* --overlay= / --overlay-ro= */
} CustomMountType;
135
136 typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144 } CustomMount;
145
146 static char *arg_directory = NULL;
147 static char *arg_template = NULL;
148 static char *arg_user = NULL;
149 static sd_id128_t arg_uuid = {};
150 static char *arg_machine = NULL;
151 static const char *arg_selinux_context = NULL;
152 static const char *arg_selinux_apifs_context = NULL;
153 static const char *arg_slice = NULL;
154 static bool arg_private_network = false;
155 static bool arg_read_only = false;
156 static bool arg_boot = false;
157 static bool arg_ephemeral = false;
158 static LinkJournal arg_link_journal = LINK_AUTO;
159 static bool arg_link_journal_try = false;
160 static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
182 (1ULL << CAP_SYS_RESOURCE) |
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
187 static CustomMount *arg_custom_mounts = NULL;
188 static unsigned arg_n_custom_mounts = 0;
189 static char **arg_setenv = NULL;
190 static bool arg_quiet = false;
191 static bool arg_share_system = false;
192 static bool arg_register = true;
193 static bool arg_keep_unit = false;
194 static char **arg_network_interfaces = NULL;
195 static char **arg_network_macvlan = NULL;
196 static char **arg_network_ipvlan = NULL;
197 static bool arg_network_veth = false;
198 static const char *arg_network_bridge = NULL;
199 static unsigned long arg_personality = PERSONALITY_INVALID;
200 static char *arg_image = NULL;
201 static Volatile arg_volatile = VOLATILE_NO;
202 static ExposePort *arg_expose_ports = NULL;
203 static char **arg_property = NULL;
204 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205 static bool arg_userns = false;
206 static int arg_kill_signal = 0;
207
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
239 " and container\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH[:OPTIONS]]\n"
261 " Bind mount a file or directory from the host into\n"
262 " the container\n"
263 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
264 " Similar, but creates a read-only bind mount\n"
265 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
266 " --overlay=PATH[:PATH...]:PATH\n"
267 " Create an overlay mount from the host to \n"
268 " the container\n"
269 " --overlay-ro=PATH[:PATH...]:PATH\n"
270 " Similar, but creates a read-only overlay mount\n"
271 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
272 " --share-system Share system namespaces with host\n"
273 " --register=BOOLEAN Register container as machine\n"
274 " --keep-unit Do not register a scope for the machine, reuse\n"
275 " the service unit nspawn is running in\n"
276 " --volatile[=MODE] Run the system in volatile mode\n"
277 , program_invocation_short_name);
278 }
279
280 static CustomMount* custom_mount_add(CustomMountType t) {
281 CustomMount *c, *ret;
282
283 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
284 if (!c)
285 return NULL;
286
287 arg_custom_mounts = c;
288 ret = arg_custom_mounts + arg_n_custom_mounts;
289 arg_n_custom_mounts++;
290
291 *ret = (CustomMount) { .type = t };
292
293 return ret;
294 }
295
296 static void custom_mount_free_all(void) {
297 unsigned i;
298
299 for (i = 0; i < arg_n_custom_mounts; i++) {
300 CustomMount *m = &arg_custom_mounts[i];
301
302 free(m->source);
303 free(m->destination);
304 free(m->options);
305
306 if (m->work_dir) {
307 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
308 free(m->work_dir);
309 }
310
311 strv_free(m->lower);
312 }
313
314 arg_custom_mounts = mfree(arg_custom_mounts);
315 arg_n_custom_mounts = 0;
316 }
317
318 static int custom_mount_compare(const void *a, const void *b) {
319 const CustomMount *x = a, *y = b;
320 int r;
321
322 r = path_compare(x->destination, y->destination);
323 if (r != 0)
324 return r;
325
326 if (x->type < y->type)
327 return -1;
328 if (x->type > y->type)
329 return 1;
330
331 return 0;
332 }
333
334 static int custom_mounts_prepare(void) {
335 unsigned i;
336 int r;
337
338 /* Ensure the mounts are applied prefix first. */
339 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
340
341 /* Allocate working directories for the overlay file systems that need it */
342 for (i = 0; i < arg_n_custom_mounts; i++) {
343 CustomMount *m = &arg_custom_mounts[i];
344
345 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
346 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
347 return -EINVAL;
348 }
349
350 if (m->type != CUSTOM_MOUNT_OVERLAY)
351 continue;
352
353 if (m->work_dir)
354 continue;
355
356 if (m->read_only)
357 continue;
358
359 r = tempfn_random(m->source, NULL, &m->work_dir);
360 if (r < 0)
361 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
362 }
363
364 return 0;
365 }
366
/* Replaces *b with a cleaned-up, newly allocated version of path.
 *
 * The path is resolved with canonicalize_file_name(). If that fails because
 * the path does not exist (ENOENT), it is made absolute relative to the
 * current working directory instead. The previous value of *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason
 * other than ENOENT, or -ENOMEM if the fallback allocation fails. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
387
388 static int parse_argv(int argc, char *argv[]) {
389
390 enum {
391 ARG_VERSION = 0x100,
392 ARG_PRIVATE_NETWORK,
393 ARG_UUID,
394 ARG_READ_ONLY,
395 ARG_CAPABILITY,
396 ARG_DROP_CAPABILITY,
397 ARG_LINK_JOURNAL,
398 ARG_BIND,
399 ARG_BIND_RO,
400 ARG_TMPFS,
401 ARG_OVERLAY,
402 ARG_OVERLAY_RO,
403 ARG_SETENV,
404 ARG_SHARE_SYSTEM,
405 ARG_REGISTER,
406 ARG_KEEP_UNIT,
407 ARG_NETWORK_INTERFACE,
408 ARG_NETWORK_MACVLAN,
409 ARG_NETWORK_IPVLAN,
410 ARG_NETWORK_BRIDGE,
411 ARG_PERSONALITY,
412 ARG_VOLATILE,
413 ARG_TEMPLATE,
414 ARG_PROPERTY,
415 ARG_PRIVATE_USERS,
416 ARG_KILL_SIGNAL,
417 };
418
419 static const struct option options[] = {
420 { "help", no_argument, NULL, 'h' },
421 { "version", no_argument, NULL, ARG_VERSION },
422 { "directory", required_argument, NULL, 'D' },
423 { "template", required_argument, NULL, ARG_TEMPLATE },
424 { "ephemeral", no_argument, NULL, 'x' },
425 { "user", required_argument, NULL, 'u' },
426 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
427 { "boot", no_argument, NULL, 'b' },
428 { "uuid", required_argument, NULL, ARG_UUID },
429 { "read-only", no_argument, NULL, ARG_READ_ONLY },
430 { "capability", required_argument, NULL, ARG_CAPABILITY },
431 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
432 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
433 { "bind", required_argument, NULL, ARG_BIND },
434 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
435 { "tmpfs", required_argument, NULL, ARG_TMPFS },
436 { "overlay", required_argument, NULL, ARG_OVERLAY },
437 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
438 { "machine", required_argument, NULL, 'M' },
439 { "slice", required_argument, NULL, 'S' },
440 { "setenv", required_argument, NULL, ARG_SETENV },
441 { "selinux-context", required_argument, NULL, 'Z' },
442 { "selinux-apifs-context", required_argument, NULL, 'L' },
443 { "quiet", no_argument, NULL, 'q' },
444 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
445 { "register", required_argument, NULL, ARG_REGISTER },
446 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
447 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
448 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
449 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
450 { "network-veth", no_argument, NULL, 'n' },
451 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
452 { "personality", required_argument, NULL, ARG_PERSONALITY },
453 { "image", required_argument, NULL, 'i' },
454 { "volatile", optional_argument, NULL, ARG_VOLATILE },
455 { "port", required_argument, NULL, 'p' },
456 { "property", required_argument, NULL, ARG_PROPERTY },
457 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
458 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
459 {}
460 };
461
462 int c, r;
463 uint64_t plus = 0, minus = 0;
464
465 assert(argc >= 0);
466 assert(argv);
467
468 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
469
470 switch (c) {
471
472 case 'h':
473 help();
474 return 0;
475
476 case ARG_VERSION:
477 puts(PACKAGE_STRING);
478 puts(SYSTEMD_FEATURES);
479 return 0;
480
481 case 'D':
482 r = set_sanitized_path(&arg_directory, optarg);
483 if (r < 0)
484 return log_error_errno(r, "Invalid root directory: %m");
485
486 break;
487
488 case ARG_TEMPLATE:
489 r = set_sanitized_path(&arg_template, optarg);
490 if (r < 0)
491 return log_error_errno(r, "Invalid template directory: %m");
492
493 break;
494
495 case 'i':
496 r = set_sanitized_path(&arg_image, optarg);
497 if (r < 0)
498 return log_error_errno(r, "Invalid image path: %m");
499
500 break;
501
502 case 'x':
503 arg_ephemeral = true;
504 break;
505
506 case 'u':
507 r = free_and_strdup(&arg_user, optarg);
508 if (r < 0)
509 return log_oom();
510
511 break;
512
513 case ARG_NETWORK_BRIDGE:
514 arg_network_bridge = optarg;
515
516 /* fall through */
517
518 case 'n':
519 arg_network_veth = true;
520 arg_private_network = true;
521 break;
522
523 case ARG_NETWORK_INTERFACE:
524 if (strv_extend(&arg_network_interfaces, optarg) < 0)
525 return log_oom();
526
527 arg_private_network = true;
528 break;
529
530 case ARG_NETWORK_MACVLAN:
531 if (strv_extend(&arg_network_macvlan, optarg) < 0)
532 return log_oom();
533
534 arg_private_network = true;
535 break;
536
537 case ARG_NETWORK_IPVLAN:
538 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
539 return log_oom();
540
541 /* fall through */
542
543 case ARG_PRIVATE_NETWORK:
544 arg_private_network = true;
545 break;
546
547 case 'b':
548 arg_boot = true;
549 break;
550
551 case ARG_UUID:
552 r = sd_id128_from_string(optarg, &arg_uuid);
553 if (r < 0) {
554 log_error("Invalid UUID: %s", optarg);
555 return r;
556 }
557 break;
558
559 case 'S':
560 arg_slice = optarg;
561 break;
562
563 case 'M':
564 if (isempty(optarg))
565 arg_machine = mfree(arg_machine);
566 else {
567 if (!machine_name_is_valid(optarg)) {
568 log_error("Invalid machine name: %s", optarg);
569 return -EINVAL;
570 }
571
572 r = free_and_strdup(&arg_machine, optarg);
573 if (r < 0)
574 return log_oom();
575
576 break;
577 }
578
579 case 'Z':
580 arg_selinux_context = optarg;
581 break;
582
583 case 'L':
584 arg_selinux_apifs_context = optarg;
585 break;
586
587 case ARG_READ_ONLY:
588 arg_read_only = true;
589 break;
590
591 case ARG_CAPABILITY:
592 case ARG_DROP_CAPABILITY: {
593 const char *state, *word;
594 size_t length;
595
596 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
597 _cleanup_free_ char *t;
598
599 t = strndup(word, length);
600 if (!t)
601 return log_oom();
602
603 if (streq(t, "all")) {
604 if (c == ARG_CAPABILITY)
605 plus = (uint64_t) -1;
606 else
607 minus = (uint64_t) -1;
608 } else {
609 int cap;
610
611 cap = capability_from_name(t);
612 if (cap < 0) {
613 log_error("Failed to parse capability %s.", t);
614 return -EINVAL;
615 }
616
617 if (c == ARG_CAPABILITY)
618 plus |= 1ULL << (uint64_t) cap;
619 else
620 minus |= 1ULL << (uint64_t) cap;
621 }
622 }
623
624 break;
625 }
626
627 case 'j':
628 arg_link_journal = LINK_GUEST;
629 arg_link_journal_try = true;
630 break;
631
632 case ARG_LINK_JOURNAL:
633 if (streq(optarg, "auto")) {
634 arg_link_journal = LINK_AUTO;
635 arg_link_journal_try = false;
636 } else if (streq(optarg, "no")) {
637 arg_link_journal = LINK_NO;
638 arg_link_journal_try = false;
639 } else if (streq(optarg, "guest")) {
640 arg_link_journal = LINK_GUEST;
641 arg_link_journal_try = false;
642 } else if (streq(optarg, "host")) {
643 arg_link_journal = LINK_HOST;
644 arg_link_journal_try = false;
645 } else if (streq(optarg, "try-guest")) {
646 arg_link_journal = LINK_GUEST;
647 arg_link_journal_try = true;
648 } else if (streq(optarg, "try-host")) {
649 arg_link_journal = LINK_HOST;
650 arg_link_journal_try = true;
651 } else {
652 log_error("Failed to parse link journal mode %s", optarg);
653 return -EINVAL;
654 }
655
656 break;
657
658 case ARG_BIND:
659 case ARG_BIND_RO: {
660 const char *current = optarg;
661 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
662 CustomMount *m;
663
664 r = extract_many_words(&current, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, &opts, NULL);
665 switch (r) {
666 case 1:
667 destination = strdup(source);
668 case 2:
669 case 3:
670 break;
671 case -ENOMEM:
672 return log_oom();
673 default:
674 log_error("Invalid bind mount specification: %s", optarg);
675 return -EINVAL;
676 }
677
678 if (!source || !destination)
679 return log_oom();
680
681 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
682 log_error("Invalid bind mount specification: %s", optarg);
683 return -EINVAL;
684 }
685
686 m = custom_mount_add(CUSTOM_MOUNT_BIND);
687 if (!m)
688 return log_oom();
689
690 m->source = source;
691 m->destination = destination;
692 m->read_only = c == ARG_BIND_RO;
693 m->options = opts;
694
695 source = destination = opts = NULL;
696
697 break;
698 }
699
700 case ARG_TMPFS: {
701 const char *current = optarg;
702 _cleanup_free_ char *path = NULL, *opts = NULL;
703 CustomMount *m;
704
705 r = extract_first_word(&current, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
706 if (r == -ENOMEM)
707 return log_oom();
708 else if (r < 0) {
709 log_error("Invalid tmpfs specification: %s", optarg);
710 return r;
711 }
712 if (r)
713 opts = strdup(current);
714 else
715 opts = strdup("mode=0755");
716
717 if (!path || !opts)
718 return log_oom();
719
720 if (!path_is_absolute(path)) {
721 log_error("Invalid tmpfs specification: %s", optarg);
722 return -EINVAL;
723 }
724
725 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
726 if (!m)
727 return log_oom();
728
729 m->destination = path;
730 m->options = opts;
731
732 path = opts = NULL;
733
734 break;
735 }
736
737 case ARG_OVERLAY:
738 case ARG_OVERLAY_RO: {
739 _cleanup_free_ char *upper = NULL, *destination = NULL;
740 _cleanup_strv_free_ char **lower = NULL;
741 CustomMount *m;
742 unsigned n = 0;
743 char **i;
744
745 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
746 if (r == -ENOMEM)
747 return log_oom();
748 else if (r < 0) {
749 log_error("Invalid overlay specification: %s", optarg);
750 return r;
751 }
752
753 STRV_FOREACH(i, lower) {
754 if (!path_is_absolute(*i)) {
755 log_error("Overlay path %s is not absolute.", *i);
756 return -EINVAL;
757 }
758
759 n++;
760 }
761
762 if (n < 2) {
763 log_error("--overlay= needs at least two colon-separated directories specified.");
764 return -EINVAL;
765 }
766
767 if (n == 2) {
768 /* If two parameters are specified,
769 * the first one is the lower, the
770 * second one the upper directory. And
771 * we'll also define the destination
772 * mount point the same as the upper. */
773 upper = lower[1];
774 lower[1] = NULL;
775
776 destination = strdup(upper);
777 if (!destination)
778 return log_oom();
779
780 } else {
781 upper = lower[n - 2];
782 destination = lower[n - 1];
783 lower[n - 2] = NULL;
784 }
785
786 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
787 if (!m)
788 return log_oom();
789
790 m->destination = destination;
791 m->source = upper;
792 m->lower = lower;
793 m->read_only = c == ARG_OVERLAY_RO;
794
795 upper = destination = NULL;
796 lower = NULL;
797
798 break;
799 }
800
801 case ARG_SETENV: {
802 char **n;
803
804 if (!env_assignment_is_valid(optarg)) {
805 log_error("Environment variable assignment '%s' is not valid.", optarg);
806 return -EINVAL;
807 }
808
809 n = strv_env_set(arg_setenv, optarg);
810 if (!n)
811 return log_oom();
812
813 strv_free(arg_setenv);
814 arg_setenv = n;
815 break;
816 }
817
818 case 'q':
819 arg_quiet = true;
820 break;
821
822 case ARG_SHARE_SYSTEM:
823 arg_share_system = true;
824 break;
825
826 case ARG_REGISTER:
827 r = parse_boolean(optarg);
828 if (r < 0) {
829 log_error("Failed to parse --register= argument: %s", optarg);
830 return r;
831 }
832
833 arg_register = r;
834 break;
835
836 case ARG_KEEP_UNIT:
837 arg_keep_unit = true;
838 break;
839
840 case ARG_PERSONALITY:
841
842 arg_personality = personality_from_string(optarg);
843 if (arg_personality == PERSONALITY_INVALID) {
844 log_error("Unknown or unsupported personality '%s'.", optarg);
845 return -EINVAL;
846 }
847
848 break;
849
850 case ARG_VOLATILE:
851
852 if (!optarg)
853 arg_volatile = VOLATILE_YES;
854 else {
855 r = parse_boolean(optarg);
856 if (r < 0) {
857 if (streq(optarg, "state"))
858 arg_volatile = VOLATILE_STATE;
859 else {
860 log_error("Failed to parse --volatile= argument: %s", optarg);
861 return r;
862 }
863 } else
864 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
865 }
866
867 break;
868
869 case 'p': {
870 const char *split, *e;
871 uint16_t container_port, host_port;
872 int protocol;
873 ExposePort *p;
874
875 if ((e = startswith(optarg, "tcp:")))
876 protocol = IPPROTO_TCP;
877 else if ((e = startswith(optarg, "udp:")))
878 protocol = IPPROTO_UDP;
879 else {
880 e = optarg;
881 protocol = IPPROTO_TCP;
882 }
883
884 split = strchr(e, ':');
885 if (split) {
886 char v[split - e + 1];
887
888 memcpy(v, e, split - e);
889 v[split - e] = 0;
890
891 r = safe_atou16(v, &host_port);
892 if (r < 0 || host_port <= 0) {
893 log_error("Failed to parse host port: %s", optarg);
894 return -EINVAL;
895 }
896
897 r = safe_atou16(split + 1, &container_port);
898 } else {
899 r = safe_atou16(e, &container_port);
900 host_port = container_port;
901 }
902
903 if (r < 0 || container_port <= 0) {
904 log_error("Failed to parse host port: %s", optarg);
905 return -EINVAL;
906 }
907
908 LIST_FOREACH(ports, p, arg_expose_ports) {
909 if (p->protocol == protocol && p->host_port == host_port) {
910 log_error("Duplicate port specification: %s", optarg);
911 return -EINVAL;
912 }
913 }
914
915 p = new(ExposePort, 1);
916 if (!p)
917 return log_oom();
918
919 p->protocol = protocol;
920 p->host_port = host_port;
921 p->container_port = container_port;
922
923 LIST_PREPEND(ports, arg_expose_ports, p);
924
925 break;
926 }
927
928 case ARG_PROPERTY:
929 if (strv_extend(&arg_property, optarg) < 0)
930 return log_oom();
931
932 break;
933
934 case ARG_PRIVATE_USERS:
935 if (optarg) {
936 _cleanup_free_ char *buffer = NULL;
937 const char *range, *shift;
938
939 range = strchr(optarg, ':');
940 if (range) {
941 buffer = strndup(optarg, range - optarg);
942 if (!buffer)
943 return log_oom();
944 shift = buffer;
945
946 range++;
947 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
948 log_error("Failed to parse UID range: %s", range);
949 return -EINVAL;
950 }
951 } else
952 shift = optarg;
953
954 if (parse_uid(shift, &arg_uid_shift) < 0) {
955 log_error("Failed to parse UID: %s", optarg);
956 return -EINVAL;
957 }
958 }
959
960 arg_userns = true;
961 break;
962
963 case ARG_KILL_SIGNAL:
964 arg_kill_signal = signal_from_string_try_harder(optarg);
965 if (arg_kill_signal < 0) {
966 log_error("Cannot parse signal: %s", optarg);
967 return -EINVAL;
968 }
969
970 break;
971
972 case '?':
973 return -EINVAL;
974
975 default:
976 assert_not_reached("Unhandled option");
977 }
978
979 if (arg_share_system)
980 arg_register = false;
981
982 if (arg_boot && arg_share_system) {
983 log_error("--boot and --share-system may not be combined.");
984 return -EINVAL;
985 }
986
987 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
988 log_error("--keep-unit may not be used when invoked from a user session.");
989 return -EINVAL;
990 }
991
992 if (arg_directory && arg_image) {
993 log_error("--directory= and --image= may not be combined.");
994 return -EINVAL;
995 }
996
997 if (arg_template && arg_image) {
998 log_error("--template= and --image= may not be combined.");
999 return -EINVAL;
1000 }
1001
1002 if (arg_template && !(arg_directory || arg_machine)) {
1003 log_error("--template= needs --directory= or --machine=.");
1004 return -EINVAL;
1005 }
1006
1007 if (arg_ephemeral && arg_template) {
1008 log_error("--ephemeral and --template= may not be combined.");
1009 return -EINVAL;
1010 }
1011
1012 if (arg_ephemeral && arg_image) {
1013 log_error("--ephemeral and --image= may not be combined.");
1014 return -EINVAL;
1015 }
1016
1017 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1018 log_error("--ephemeral and --link-journal= may not be combined.");
1019 return -EINVAL;
1020 }
1021
1022 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1023 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1024 return -EINVAL;
1025 }
1026
1027 if (arg_expose_ports && !arg_private_network) {
1028 log_error("Cannot use --port= without private networking.");
1029 return -EINVAL;
1030 }
1031
1032 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1033 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1034
1035 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1036
1037 if (arg_boot && arg_kill_signal <= 0)
1038 arg_kill_signal = SIGRTMIN+3;
1039
1040 return 1;
1041 }
1042
1043 static int tmpfs_patch_options(const char *options, char **ret) {
1044 char *buf = NULL;
1045
1046 if (arg_userns && arg_uid_shift != 0) {
1047 assert(arg_uid_shift != UID_INVALID);
1048
1049 if (options)
1050 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1051 else
1052 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1053 if (!buf)
1054 return -ENOMEM;
1055
1056 options = buf;
1057 }
1058
1059 #ifdef HAVE_SELINUX
1060 if (arg_selinux_apifs_context) {
1061 char *t;
1062
1063 if (options)
1064 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1065 else
1066 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1067 if (!t) {
1068 free(buf);
1069 return -ENOMEM;
1070 }
1071
1072 free(buf);
1073 buf = t;
1074 }
1075 #endif
1076
1077 *ret = buf;
1078 return !!buf;
1079 }
1080
1081 static int mount_all(const char *dest, bool userns) {
1082
1083 typedef struct MountPoint {
1084 const char *what;
1085 const char *where;
1086 const char *type;
1087 const char *options;
1088 unsigned long flags;
1089 bool fatal;
1090 bool userns;
1091 } MountPoint;
1092
1093 static const MountPoint mount_table[] = {
1094 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1095 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1096 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1097 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1098 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1099 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1100 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1101 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1102 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1103 #ifdef HAVE_SELINUX
1104 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1105 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
1106 #endif
1107 };
1108
1109 unsigned k;
1110 int r;
1111
1112 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1113 _cleanup_free_ char *where = NULL, *options = NULL;
1114 const char *o;
1115
1116 if (userns != mount_table[k].userns)
1117 continue;
1118
1119 where = prefix_root(dest, mount_table[k].where);
1120 if (!where)
1121 return log_oom();
1122
1123 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1124 if (r < 0 && r != -ENOENT)
1125 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1126
1127 /* Skip this entry if it is not a remount. */
1128 if (mount_table[k].what && r > 0)
1129 continue;
1130
1131 r = mkdir_p(where, 0755);
1132 if (r < 0) {
1133 if (mount_table[k].fatal)
1134 return log_error_errno(r, "Failed to create directory %s: %m", where);
1135
1136 log_warning_errno(r, "Failed to create directory %s: %m", where);
1137 continue;
1138 }
1139
1140 o = mount_table[k].options;
1141 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1142 r = tmpfs_patch_options(o, &options);
1143 if (r < 0)
1144 return log_oom();
1145 if (r > 0)
1146 o = options;
1147 }
1148
1149 if (mount(mount_table[k].what,
1150 where,
1151 mount_table[k].type,
1152 mount_table[k].flags,
1153 o) < 0) {
1154
1155 if (mount_table[k].fatal)
1156 return log_error_errno(errno, "mount(%s) failed: %m", where);
1157
1158 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1159 }
1160 }
1161
1162 return 0;
1163 }
1164
1165 static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
1166 const char *p = options;
1167 unsigned long flags = *mount_flags;
1168 char *opts = NULL;
1169
1170 assert(options);
1171
1172 for (;;) {
1173 _cleanup_free_ char *word = NULL;
1174 int r = extract_first_word(&p, &word, ",", 0);
1175 if (r < 0)
1176 return log_error_errno(r, "Failed to extract mount option: %m");
1177 if (r == 0)
1178 break;
1179
1180 if (streq(word, "rbind"))
1181 flags |= MS_REC;
1182 else if (streq(word, "norbind"))
1183 flags &= ~MS_REC;
1184 else {
1185 log_error("Invalid bind mount option: %s", word);
1186 return -EINVAL;
1187 }
1188 }
1189
1190 *mount_flags = flags;
1191 /* in the future mount_opts will hold string options for mount(2) */
1192 *mount_opts = opts;
1193
1194 return 0;
1195 }
1196
1197 static int mount_bind(const char *dest, CustomMount *m) {
1198 struct stat source_st, dest_st;
1199 const char *where;
1200 unsigned long mount_flags = MS_BIND | MS_REC;
1201 _cleanup_free_ char *mount_opts = NULL;
1202 int r;
1203
1204 assert(m);
1205
1206 if (m->options) {
1207 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
1208 if (r < 0)
1209 return r;
1210 }
1211
1212 if (stat(m->source, &source_st) < 0)
1213 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1214
1215 where = prefix_roota(dest, m->destination);
1216
1217 if (stat(where, &dest_st) >= 0) {
1218 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1219 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1220 return -EINVAL;
1221 }
1222
1223 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1224 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1225 return -EINVAL;
1226 }
1227
1228 } else if (errno == ENOENT) {
1229 r = mkdir_parents_label(where, 0755);
1230 if (r < 0)
1231 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1232 } else {
1233 log_error_errno(errno, "Failed to stat %s: %m", where);
1234 return -errno;
1235 }
1236
1237 /* Create the mount point. Any non-directory file can be
1238 * mounted on any non-directory file (regular, fifo, socket,
1239 * char, block).
1240 */
1241 if (S_ISDIR(source_st.st_mode))
1242 r = mkdir_label(where, 0755);
1243 else
1244 r = touch(where);
1245 if (r < 0 && r != -EEXIST)
1246 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1247
1248 if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
1249 return log_error_errno(errno, "mount(%s) failed: %m", where);
1250
1251 if (m->read_only) {
1252 r = bind_remount_recursive(where, true);
1253 if (r < 0)
1254 return log_error_errno(r, "Read-only bind mount failed: %m");
1255 }
1256
1257 return 0;
1258 }
1259
1260 static int mount_tmpfs(const char *dest, CustomMount *m) {
1261 const char *where, *options;
1262 _cleanup_free_ char *buf = NULL;
1263 int r;
1264
1265 assert(dest);
1266 assert(m);
1267
1268 where = prefix_roota(dest, m->destination);
1269
1270 r = mkdir_p_label(where, 0755);
1271 if (r < 0 && r != -EEXIST)
1272 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1273
1274 r = tmpfs_patch_options(m->options, &buf);
1275 if (r < 0)
1276 return log_oom();
1277 options = r > 0 ? buf : m->options;
1278
1279 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1280 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1281
1282 return 0;
1283 }
1284
1285 static char *joined_and_escaped_lower_dirs(char * const *lower) {
1286 _cleanup_strv_free_ char **sv = NULL;
1287
1288 sv = strv_copy(lower);
1289 if (!sv)
1290 return NULL;
1291
1292 strv_reverse(sv);
1293
1294 if (!strv_shell_escape(sv, ",:"))
1295 return NULL;
1296
1297 return strv_join(sv, ":");
1298 }
1299
1300 static int mount_overlay(const char *dest, CustomMount *m) {
1301 _cleanup_free_ char *lower = NULL;
1302 const char *where, *options;
1303 int r;
1304
1305 assert(dest);
1306 assert(m);
1307
1308 where = prefix_roota(dest, m->destination);
1309
1310 r = mkdir_label(where, 0755);
1311 if (r < 0 && r != -EEXIST)
1312 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1313
1314 (void) mkdir_p_label(m->source, 0755);
1315
1316 lower = joined_and_escaped_lower_dirs(m->lower);
1317 if (!lower)
1318 return log_oom();
1319
1320 if (m->read_only) {
1321 _cleanup_free_ char *escaped_source = NULL;
1322
1323 escaped_source = shell_escape(m->source, ",:");
1324 if (!escaped_source)
1325 return log_oom();
1326
1327 options = strjoina("lowerdir=", escaped_source, ":", lower);
1328 } else {
1329 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
1330
1331 assert(m->work_dir);
1332 (void) mkdir_label(m->work_dir, 0700);
1333
1334 escaped_source = shell_escape(m->source, ",:");
1335 if (!escaped_source)
1336 return log_oom();
1337 escaped_work_dir = shell_escape(m->work_dir, ",:");
1338 if (!escaped_work_dir)
1339 return log_oom();
1340
1341 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
1342 }
1343
1344 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1345 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1346
1347 return 0;
1348 }
1349
1350 static int mount_custom(const char *dest) {
1351 unsigned i;
1352 int r;
1353
1354 assert(dest);
1355
1356 for (i = 0; i < arg_n_custom_mounts; i++) {
1357 CustomMount *m = &arg_custom_mounts[i];
1358
1359 switch (m->type) {
1360
1361 case CUSTOM_MOUNT_BIND:
1362 r = mount_bind(dest, m);
1363 break;
1364
1365 case CUSTOM_MOUNT_TMPFS:
1366 r = mount_tmpfs(dest, m);
1367 break;
1368
1369 case CUSTOM_MOUNT_OVERLAY:
1370 r = mount_overlay(dest, m);
1371 break;
1372
1373 default:
1374 assert_not_reached("Unknown custom mount type");
1375 }
1376
1377 if (r < 0)
1378 return r;
1379 }
1380
1381 return 0;
1382 }
1383
1384 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1385 char *to;
1386 int r;
1387
1388 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1389
1390 r = path_is_mount_point(to, 0);
1391 if (r < 0 && r != -ENOENT)
1392 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1393 if (r > 0)
1394 return 0;
1395
1396 mkdir_p(to, 0755);
1397
1398 /* The superblock mount options of the mount point need to be
1399 * identical to the hosts', and hence writable... */
1400 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1401 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1402
1403 /* ... hence let's only make the bind mount read-only, not the
1404 * superblock. */
1405 if (read_only) {
1406 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1407 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1408 }
1409 return 1;
1410 }
1411
1412 static int mount_cgroup(const char *dest) {
1413 _cleanup_set_free_free_ Set *controllers = NULL;
1414 const char *cgroup_root;
1415 int r;
1416
1417 controllers = set_new(&string_hash_ops);
1418 if (!controllers)
1419 return log_oom();
1420
1421 r = cg_kernel_controllers(controllers);
1422 if (r < 0)
1423 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1424
1425 for (;;) {
1426 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1427
1428 controller = set_steal_first(controllers);
1429 if (!controller)
1430 break;
1431
1432 origin = prefix_root("/sys/fs/cgroup/", controller);
1433 if (!origin)
1434 return log_oom();
1435
1436 r = readlink_malloc(origin, &combined);
1437 if (r == -EINVAL) {
1438 /* Not a symbolic link, but directly a single cgroup hierarchy */
1439
1440 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1441 if (r < 0)
1442 return r;
1443
1444 } else if (r < 0)
1445 return log_error_errno(r, "Failed to read link %s: %m", origin);
1446 else {
1447 _cleanup_free_ char *target = NULL;
1448
1449 target = prefix_root(dest, origin);
1450 if (!target)
1451 return log_oom();
1452
1453 /* A symbolic link, a combination of controllers in one hierarchy */
1454
1455 if (!filename_is_valid(combined)) {
1456 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1457 continue;
1458 }
1459
1460 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1461 if (r < 0)
1462 return r;
1463
1464 r = symlink_idempotent(combined, target);
1465 if (r == -EINVAL) {
1466 log_error("Invalid existing symlink for combined hierarchy");
1467 return r;
1468 }
1469 if (r < 0)
1470 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1471 }
1472 }
1473
1474 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1475 if (r < 0)
1476 return r;
1477
1478 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1479 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1480 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1481
1482 return 0;
1483 }
1484
1485 static int mount_systemd_cgroup_writable(const char *dest) {
1486 _cleanup_free_ char *own_cgroup_path = NULL;
1487 const char *systemd_root, *systemd_own;
1488 int r;
1489
1490 assert(dest);
1491
1492 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1493 if (r < 0)
1494 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1495
1496 /* Make our own cgroup a (writable) bind mount */
1497 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1498 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1499 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1500
1501 /* And then remount the systemd cgroup root read-only */
1502 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1503 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1504 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1505
1506 return 0;
1507 }
1508
1509 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1510 assert(p);
1511
1512 if (!arg_userns)
1513 return 0;
1514
1515 if (uid == UID_INVALID && gid == GID_INVALID)
1516 return 0;
1517
1518 if (uid != UID_INVALID) {
1519 uid += arg_uid_shift;
1520
1521 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1522 return -EOVERFLOW;
1523 }
1524
1525 if (gid != GID_INVALID) {
1526 gid += (gid_t) arg_uid_shift;
1527
1528 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1529 return -EOVERFLOW;
1530 }
1531
1532 if (lchown(p, uid, gid) < 0)
1533 return -errno;
1534
1535 return 0;
1536 }
1537
/* Creates directory "path" below "root" with the given mode and, if it was
 * newly created, hands it to userns_lchown() with uid/gid.
 *
 * An already existing path counts as success and its ownership is left
 * untouched. Returns 0 on success or a negative errno-style value. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1550
1551 static int setup_timezone(const char *dest) {
1552 _cleanup_free_ char *p = NULL, *q = NULL;
1553 const char *where, *check, *what;
1554 char *z, *y;
1555 int r;
1556
1557 assert(dest);
1558
1559 /* Fix the timezone, if possible */
1560 r = readlink_malloc("/etc/localtime", &p);
1561 if (r < 0) {
1562 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1563 return 0;
1564 }
1565
1566 z = path_startswith(p, "../usr/share/zoneinfo/");
1567 if (!z)
1568 z = path_startswith(p, "/usr/share/zoneinfo/");
1569 if (!z) {
1570 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1571 return 0;
1572 }
1573
1574 where = prefix_roota(dest, "/etc/localtime");
1575 r = readlink_malloc(where, &q);
1576 if (r >= 0) {
1577 y = path_startswith(q, "../usr/share/zoneinfo/");
1578 if (!y)
1579 y = path_startswith(q, "/usr/share/zoneinfo/");
1580
1581 /* Already pointing to the right place? Then do nothing .. */
1582 if (y && streq(y, z))
1583 return 0;
1584 }
1585
1586 check = strjoina("/usr/share/zoneinfo/", z);
1587 check = prefix_root(dest, check);
1588 if (laccess(check, F_OK) < 0) {
1589 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1590 return 0;
1591 }
1592
1593 r = unlink(where);
1594 if (r < 0 && errno != ENOENT) {
1595 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1596 return 0;
1597 }
1598
1599 what = strjoina("../usr/share/zoneinfo/", z);
1600 if (symlink(what, where) < 0) {
1601 log_error_errno(errno, "Failed to correct timezone of container: %m");
1602 return 0;
1603 }
1604
1605 r = userns_lchown(where, 0, 0);
1606 if (r < 0)
1607 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1608
1609 return 0;
1610 }
1611
1612 static int setup_resolv_conf(const char *dest) {
1613 const char *where = NULL;
1614 int r;
1615
1616 assert(dest);
1617
1618 if (arg_private_network)
1619 return 0;
1620
1621 /* Fix resolv.conf, if possible */
1622 where = prefix_roota(dest, "/etc/resolv.conf");
1623
1624 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1625 if (r < 0) {
1626 /* If the file already exists as symlink, let's
1627 * suppress the warning, under the assumption that
1628 * resolved or something similar runs inside and the
1629 * symlink points there.
1630 *
1631 * If the disk image is read-only, there's also no
1632 * point in complaining.
1633 */
1634 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1635 "Failed to copy /etc/resolv.conf to %s: %m", where);
1636 return 0;
1637 }
1638
1639 r = userns_lchown(where, 0, 0);
1640 if (r < 0)
1641 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1642
1643 return 0;
1644 }
1645
1646 static int setup_volatile_state(const char *directory) {
1647 _cleanup_free_ char *buf = NULL;
1648 const char *p, *options;
1649 int r;
1650
1651 assert(directory);
1652
1653 if (arg_volatile != VOLATILE_STATE)
1654 return 0;
1655
1656 /* --volatile=state means we simply overmount /var
1657 with a tmpfs, and the rest read-only. */
1658
1659 r = bind_remount_recursive(directory, true);
1660 if (r < 0)
1661 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1662
1663 p = prefix_roota(directory, "/var");
1664 r = mkdir(p, 0755);
1665 if (r < 0 && errno != EEXIST)
1666 return log_error_errno(errno, "Failed to create %s: %m", directory);
1667
1668 options = "mode=755";
1669 r = tmpfs_patch_options(options, &buf);
1670 if (r < 0)
1671 return log_oom();
1672 if (r > 0)
1673 options = buf;
1674
1675 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1676 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1677
1678 return 0;
1679 }
1680
1681 static int setup_volatile(const char *directory) {
1682 bool tmpfs_mounted = false, bind_mounted = false;
1683 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1684 _cleanup_free_ char *buf = NULL;
1685 const char *f, *t, *options;
1686 int r;
1687
1688 assert(directory);
1689
1690 if (arg_volatile != VOLATILE_YES)
1691 return 0;
1692
1693 /* --volatile=yes means we mount a tmpfs to the root dir, and
1694 the original /usr to use inside it, and that read-only. */
1695
1696 if (!mkdtemp(template))
1697 return log_error_errno(errno, "Failed to create temporary directory: %m");
1698
1699 options = "mode=755";
1700 r = tmpfs_patch_options(options, &buf);
1701 if (r < 0)
1702 return log_oom();
1703 if (r > 0)
1704 options = buf;
1705
1706 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1707 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1708 goto fail;
1709 }
1710
1711 tmpfs_mounted = true;
1712
1713 f = prefix_roota(directory, "/usr");
1714 t = prefix_roota(template, "/usr");
1715
1716 r = mkdir(t, 0755);
1717 if (r < 0 && errno != EEXIST) {
1718 r = log_error_errno(errno, "Failed to create %s: %m", t);
1719 goto fail;
1720 }
1721
1722 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1723 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1724 goto fail;
1725 }
1726
1727 bind_mounted = true;
1728
1729 r = bind_remount_recursive(t, true);
1730 if (r < 0) {
1731 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1732 goto fail;
1733 }
1734
1735 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1736 r = log_error_errno(errno, "Failed to move root mount: %m");
1737 goto fail;
1738 }
1739
1740 (void) rmdir(template);
1741
1742 return 0;
1743
1744 fail:
1745 if (bind_mounted)
1746 (void) umount(t);
1747
1748 if (tmpfs_mounted)
1749 (void) umount(template);
1750 (void) rmdir(template);
1751 return r;
1752 }
1753
1754 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1755 assert(s);
1756
1757 snprintf(s, 37,
1758 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1759 SD_ID128_FORMAT_VAL(id));
1760
1761 return s;
1762 }
1763
1764 static int setup_boot_id(const char *dest) {
1765 const char *from, *to;
1766 sd_id128_t rnd = {};
1767 char as_uuid[37];
1768 int r;
1769
1770 if (arg_share_system)
1771 return 0;
1772
1773 /* Generate a new randomized boot ID, so that each boot-up of
1774 * the container gets a new one */
1775
1776 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1777 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1778
1779 r = sd_id128_randomize(&rnd);
1780 if (r < 0)
1781 return log_error_errno(r, "Failed to generate random boot id: %m");
1782
1783 id128_format_as_uuid(rnd, as_uuid);
1784
1785 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1786 if (r < 0)
1787 return log_error_errno(r, "Failed to write boot id: %m");
1788
1789 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1790 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1791 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1792 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1793
1794 unlink(from);
1795 return r;
1796 }
1797
1798 static int copy_devnodes(const char *dest) {
1799
1800 static const char devnodes[] =
1801 "null\0"
1802 "zero\0"
1803 "full\0"
1804 "random\0"
1805 "urandom\0"
1806 "tty\0"
1807 "net/tun\0";
1808
1809 const char *d;
1810 int r = 0;
1811 _cleanup_umask_ mode_t u;
1812
1813 assert(dest);
1814
1815 u = umask(0000);
1816
1817 /* Create /dev/net, so that we can create /dev/net/tun in it */
1818 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1819 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1820
1821 NULSTR_FOREACH(d, devnodes) {
1822 _cleanup_free_ char *from = NULL, *to = NULL;
1823 struct stat st;
1824
1825 from = strappend("/dev/", d);
1826 to = prefix_root(dest, from);
1827
1828 if (stat(from, &st) < 0) {
1829
1830 if (errno != ENOENT)
1831 return log_error_errno(errno, "Failed to stat %s: %m", from);
1832
1833 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1834
1835 log_error("%s is not a char or block device, cannot copy.", from);
1836 return -EIO;
1837
1838 } else {
1839 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1840 if (errno != EPERM)
1841 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1842
1843 /* Some systems abusively restrict mknod but
1844 * allow bind mounts. */
1845 r = touch(to);
1846 if (r < 0)
1847 return log_error_errno(r, "touch (%s) failed: %m", to);
1848 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1849 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1850 }
1851
1852 r = userns_lchown(to, 0, 0);
1853 if (r < 0)
1854 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1855 }
1856 }
1857
1858 return r;
1859 }
1860
1861 static int setup_pts(const char *dest) {
1862 _cleanup_free_ char *options = NULL;
1863 const char *p;
1864
1865 #ifdef HAVE_SELINUX
1866 if (arg_selinux_apifs_context)
1867 (void) asprintf(&options,
1868 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1869 arg_uid_shift + TTY_GID,
1870 arg_selinux_apifs_context);
1871 else
1872 #endif
1873 (void) asprintf(&options,
1874 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1875 arg_uid_shift + TTY_GID);
1876
1877 if (!options)
1878 return log_oom();
1879
1880 /* Mount /dev/pts itself */
1881 p = prefix_roota(dest, "/dev/pts");
1882 if (mkdir(p, 0755) < 0)
1883 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1884 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1885 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1886 if (userns_lchown(p, 0, 0) < 0)
1887 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1888
1889 /* Create /dev/ptmx symlink */
1890 p = prefix_roota(dest, "/dev/ptmx");
1891 if (symlink("pts/ptmx", p) < 0)
1892 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1893 if (userns_lchown(p, 0, 0) < 0)
1894 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1895
1896 /* And fix /dev/pts/ptmx ownership */
1897 p = prefix_roota(dest, "/dev/pts/ptmx");
1898 if (userns_lchown(p, 0, 0) < 0)
1899 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1900
1901 return 0;
1902 }
1903
1904 static int setup_dev_console(const char *dest, const char *console) {
1905 _cleanup_umask_ mode_t u;
1906 const char *to;
1907 int r;
1908
1909 assert(dest);
1910 assert(console);
1911
1912 u = umask(0000);
1913
1914 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1915 if (r < 0)
1916 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1917
1918 /* We need to bind mount the right tty to /dev/console since
1919 * ptys can only exist on pts file systems. To have something
1920 * to bind mount things on we create a empty regular file. */
1921
1922 to = prefix_roota(dest, "/dev/console");
1923 r = touch(to);
1924 if (r < 0)
1925 return log_error_errno(r, "touch() for /dev/console failed: %m");
1926
1927 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1928 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1929
1930 return 0;
1931 }
1932
1933 static int setup_kmsg(const char *dest, int kmsg_socket) {
1934 const char *from, *to;
1935 _cleanup_umask_ mode_t u;
1936 int fd, k;
1937 union {
1938 struct cmsghdr cmsghdr;
1939 uint8_t buf[CMSG_SPACE(sizeof(int))];
1940 } control = {};
1941 struct msghdr mh = {
1942 .msg_control = &control,
1943 .msg_controllen = sizeof(control),
1944 };
1945 struct cmsghdr *cmsg;
1946
1947 assert(kmsg_socket >= 0);
1948
1949 u = umask(0000);
1950
1951 /* We create the kmsg FIFO as /run/kmsg, but immediately
1952 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1953 * on the reading side behave very similar to /proc/kmsg,
1954 * their writing side behaves differently from /dev/kmsg in
1955 * that writing blocks when nothing is reading. In order to
1956 * avoid any problems with containers deadlocking due to this
1957 * we simply make /dev/kmsg unavailable to the container. */
1958 from = prefix_roota(dest, "/run/kmsg");
1959 to = prefix_roota(dest, "/proc/kmsg");
1960
1961 if (mkfifo(from, 0600) < 0)
1962 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1963 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1964 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1965
1966 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1967 if (fd < 0)
1968 return log_error_errno(errno, "Failed to open fifo: %m");
1969
1970 cmsg = CMSG_FIRSTHDR(&mh);
1971 cmsg->cmsg_level = SOL_SOCKET;
1972 cmsg->cmsg_type = SCM_RIGHTS;
1973 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1974 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1975
1976 mh.msg_controllen = cmsg->cmsg_len;
1977
1978 /* Store away the fd in the socket, so that it stays open as
1979 * long as we run the child */
1980 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1981 safe_close(fd);
1982
1983 if (k < 0)
1984 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1985
1986 /* And now make the FIFO unavailable as /run/kmsg... */
1987 (void) unlink(from);
1988
1989 return 0;
1990 }
1991
1992 static int send_rtnl(int send_fd) {
1993 union {
1994 struct cmsghdr cmsghdr;
1995 uint8_t buf[CMSG_SPACE(sizeof(int))];
1996 } control = {};
1997 struct msghdr mh = {
1998 .msg_control = &control,
1999 .msg_controllen = sizeof(control),
2000 };
2001 struct cmsghdr *cmsg;
2002 _cleanup_close_ int fd = -1;
2003 ssize_t k;
2004
2005 assert(send_fd >= 0);
2006
2007 if (!arg_expose_ports)
2008 return 0;
2009
2010 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
2011 if (fd < 0)
2012 return log_error_errno(errno, "Failed to allocate container netlink: %m");
2013
2014 cmsg = CMSG_FIRSTHDR(&mh);
2015 cmsg->cmsg_level = SOL_SOCKET;
2016 cmsg->cmsg_type = SCM_RIGHTS;
2017 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
2018 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
2019
2020 mh.msg_controllen = cmsg->cmsg_len;
2021
2022 /* Store away the fd in the socket, so that it stays open as
2023 * long as we run the child */
2024 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
2025 if (k < 0)
2026 return log_error_errno(errno, "Failed to send netlink fd: %m");
2027
2028 return 0;
2029 }
2030
2031 static int flush_ports(union in_addr_union *exposed) {
2032 ExposePort *p;
2033 int r, af = AF_INET;
2034
2035 assert(exposed);
2036
2037 if (!arg_expose_ports)
2038 return 0;
2039
2040 if (in_addr_is_null(af, exposed))
2041 return 0;
2042
2043 log_debug("Lost IP address.");
2044
2045 LIST_FOREACH(ports, p, arg_expose_ports) {
2046 r = fw_add_local_dnat(false,
2047 af,
2048 p->protocol,
2049 NULL,
2050 NULL, 0,
2051 NULL, 0,
2052 p->host_port,
2053 exposed,
2054 p->container_port,
2055 NULL);
2056 if (r < 0)
2057 log_warning_errno(r, "Failed to modify firewall: %m");
2058 }
2059
2060 *exposed = IN_ADDR_NULL;
2061 return 0;
2062 }
2063
2064 static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
2065 _cleanup_free_ struct local_address *addresses = NULL;
2066 _cleanup_free_ char *pretty = NULL;
2067 union in_addr_union new_exposed;
2068 ExposePort *p;
2069 bool add;
2070 int af = AF_INET, r;
2071
2072 assert(exposed);
2073
2074 /* Invoked each time an address is added or removed inside the
2075 * container */
2076
2077 if (!arg_expose_ports)
2078 return 0;
2079
2080 r = local_addresses(rtnl, 0, af, &addresses);
2081 if (r < 0)
2082 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2083
2084 add = r > 0 &&
2085 addresses[0].family == af &&
2086 addresses[0].scope < RT_SCOPE_LINK;
2087
2088 if (!add)
2089 return flush_ports(exposed);
2090
2091 new_exposed = addresses[0].address;
2092 if (in_addr_equal(af, exposed, &new_exposed))
2093 return 0;
2094
2095 in_addr_to_string(af, &new_exposed, &pretty);
2096 log_debug("New container IP is %s.", strna(pretty));
2097
2098 LIST_FOREACH(ports, p, arg_expose_ports) {
2099
2100 r = fw_add_local_dnat(true,
2101 af,
2102 p->protocol,
2103 NULL,
2104 NULL, 0,
2105 NULL, 0,
2106 p->host_port,
2107 &new_exposed,
2108 p->container_port,
2109 in_addr_is_null(af, exposed) ? NULL : exposed);
2110 if (r < 0)
2111 log_warning_errno(r, "Failed to modify firewall: %m");
2112 }
2113
2114 *exposed = new_exposed;
2115 return 0;
2116 }
2117
2118 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2119 union in_addr_union *exposed = userdata;
2120
2121 assert(rtnl);
2122 assert(m);
2123 assert(exposed);
2124
2125 expose_ports(rtnl, exposed);
2126 return 0;
2127 }
2128
2129 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
2130 union {
2131 struct cmsghdr cmsghdr;
2132 uint8_t buf[CMSG_SPACE(sizeof(int))];
2133 } control = {};
2134 struct msghdr mh = {
2135 .msg_control = &control,
2136 .msg_controllen = sizeof(control),
2137 };
2138 struct cmsghdr *cmsg;
2139 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2140 int fd, r;
2141 ssize_t k;
2142
2143 assert(event);
2144 assert(recv_fd >= 0);
2145 assert(ret);
2146
2147 if (!arg_expose_ports)
2148 return 0;
2149
2150 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2151 if (k < 0)
2152 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2153
2154 cmsg = CMSG_FIRSTHDR(&mh);
2155 assert(cmsg->cmsg_level == SOL_SOCKET);
2156 assert(cmsg->cmsg_type == SCM_RIGHTS);
2157 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2158 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2159
2160 r = sd_netlink_open_fd(&rtnl, fd);
2161 if (r < 0) {
2162 safe_close(fd);
2163 return log_error_errno(r, "Failed to create rtnl object: %m");
2164 }
2165
2166 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2167 if (r < 0)
2168 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2169
2170 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2171 if (r < 0)
2172 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2173
2174 r = sd_netlink_attach_event(rtnl, event, 0);
2175 if (r < 0)
2176 return log_error_errno(r, "Failed to add to even loop: %m");
2177
2178 *ret = rtnl;
2179 rtnl = NULL;
2180
2181 return 0;
2182 }
2183
2184 static int setup_hostname(void) {
2185
2186 if (arg_share_system)
2187 return 0;
2188
2189 if (sethostname_idempotent(arg_machine) < 0)
2190 return -errno;
2191
2192 return 0;
2193 }
2194
2195 static int setup_journal(const char *directory) {
2196 sd_id128_t machine_id, this_id;
2197 _cleanup_free_ char *b = NULL, *d = NULL;
2198 const char *etc_machine_id, *p, *q;
2199 char *id;
2200 int r;
2201
2202 /* Don't link journals in ephemeral mode */
2203 if (arg_ephemeral)
2204 return 0;
2205
2206 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2207
2208 r = read_one_line_file(etc_machine_id, &b);
2209 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2210 return 0;
2211 else if (r < 0)
2212 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2213
2214 id = strstrip(b);
2215 if (isempty(id) && arg_link_journal == LINK_AUTO)
2216 return 0;
2217
2218 /* Verify validity */
2219 r = sd_id128_from_string(id, &machine_id);
2220 if (r < 0)
2221 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2222
2223 r = sd_id128_get_machine(&this_id);
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2226
2227 if (sd_id128_equal(machine_id, this_id)) {
2228 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2229 "Host and machine ids are equal (%s): refusing to link journals", id);
2230 if (arg_link_journal == LINK_AUTO)
2231 return 0;
2232 return -EEXIST;
2233 }
2234
2235 if (arg_link_journal == LINK_NO)
2236 return 0;
2237
2238 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2239 if (r < 0)
2240 return log_error_errno(r, "Failed to create /var: %m");
2241
2242 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2243 if (r < 0)
2244 return log_error_errno(r, "Failed to create /var/log: %m");
2245
2246 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2247 if (r < 0)
2248 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2249
2250 p = strjoina("/var/log/journal/", id);
2251 q = prefix_roota(directory, p);
2252
2253 if (path_is_mount_point(p, 0) > 0) {
2254 if (arg_link_journal != LINK_AUTO) {
2255 log_error("%s: already a mount point, refusing to use for journal", p);
2256 return -EEXIST;
2257 }
2258
2259 return 0;
2260 }
2261
2262 if (path_is_mount_point(q, 0) > 0) {
2263 if (arg_link_journal != LINK_AUTO) {
2264 log_error("%s: already a mount point, refusing to use for journal", q);
2265 return -EEXIST;
2266 }
2267
2268 return 0;
2269 }
2270
2271 r = readlink_and_make_absolute(p, &d);
2272 if (r >= 0) {
2273 if ((arg_link_journal == LINK_GUEST ||
2274 arg_link_journal == LINK_AUTO) &&
2275 path_equal(d, q)) {
2276
2277 r = userns_mkdir(directory, p, 0755, 0, 0);
2278 if (r < 0)
2279 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2280 return 0;
2281 }
2282
2283 if (unlink(p) < 0)
2284 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2285 } else if (r == -EINVAL) {
2286
2287 if (arg_link_journal == LINK_GUEST &&
2288 rmdir(p) < 0) {
2289
2290 if (errno == ENOTDIR) {
2291 log_error("%s already exists and is neither a symlink nor a directory", p);
2292 return r;
2293 } else {
2294 log_error_errno(errno, "Failed to remove %s: %m", p);
2295 return -errno;
2296 }
2297 }
2298 } else if (r != -ENOENT) {
2299 log_error_errno(errno, "readlink(%s) failed: %m", p);
2300 return r;
2301 }
2302
2303 if (arg_link_journal == LINK_GUEST) {
2304
2305 if (symlink(q, p) < 0) {
2306 if (arg_link_journal_try) {
2307 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2308 return 0;
2309 } else {
2310 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2311 return -errno;
2312 }
2313 }
2314
2315 r = userns_mkdir(directory, p, 0755, 0, 0);
2316 if (r < 0)
2317 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2318 return 0;
2319 }
2320
2321 if (arg_link_journal == LINK_HOST) {
2322 /* don't create parents here -- if the host doesn't have
2323 * permanent journal set up, don't force it here */
2324 r = mkdir(p, 0755);
2325 if (r < 0) {
2326 if (arg_link_journal_try) {
2327 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2328 return 0;
2329 } else {
2330 log_error_errno(errno, "Failed to create %s: %m", p);
2331 return r;
2332 }
2333 }
2334
2335 } else if (access(p, F_OK) < 0)
2336 return 0;
2337
2338 if (dir_is_empty(q) == 0)
2339 log_warning("%s is not empty, proceeding anyway.", q);
2340
2341 r = userns_mkdir(directory, p, 0755, 0, 0);
2342 if (r < 0) {
2343 log_error_errno(errno, "Failed to create %s: %m", q);
2344 return r;
2345 }
2346
2347 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2348 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2349
2350 return 0;
2351 }
2352
2353 static int drop_capabilities(void) {
2354 return capability_bounding_set_drop(~arg_retain, false);
2355 }
2356
2357 static int register_machine(pid_t pid, int local_ifindex) {
2358 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2359 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2360 int r;
2361
2362 if (!arg_register)
2363 return 0;
2364
2365 r = sd_bus_default_system(&bus);
2366 if (r < 0)
2367 return log_error_errno(r, "Failed to open system bus: %m");
2368
2369 if (arg_keep_unit) {
2370 r = sd_bus_call_method(
2371 bus,
2372 "org.freedesktop.machine1",
2373 "/org/freedesktop/machine1",
2374 "org.freedesktop.machine1.Manager",
2375 "RegisterMachineWithNetwork",
2376 &error,
2377 NULL,
2378 "sayssusai",
2379 arg_machine,
2380 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2381 "nspawn",
2382 "container",
2383 (uint32_t) pid,
2384 strempty(arg_directory),
2385 local_ifindex > 0 ? 1 : 0, local_ifindex);
2386 } else {
2387 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2388 char **i;
2389 unsigned j;
2390
2391 r = sd_bus_message_new_method_call(
2392 bus,
2393 &m,
2394 "org.freedesktop.machine1",
2395 "/org/freedesktop/machine1",
2396 "org.freedesktop.machine1.Manager",
2397 "CreateMachineWithNetwork");
2398 if (r < 0)
2399 return bus_log_create_error(r);
2400
2401 r = sd_bus_message_append(
2402 m,
2403 "sayssusai",
2404 arg_machine,
2405 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2406 "nspawn",
2407 "container",
2408 (uint32_t) pid,
2409 strempty(arg_directory),
2410 local_ifindex > 0 ? 1 : 0, local_ifindex);
2411 if (r < 0)
2412 return bus_log_create_error(r);
2413
2414 r = sd_bus_message_open_container(m, 'a', "(sv)");
2415 if (r < 0)
2416 return bus_log_create_error(r);
2417
2418 if (!isempty(arg_slice)) {
2419 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2420 if (r < 0)
2421 return bus_log_create_error(r);
2422 }
2423
2424 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2425 if (r < 0)
2426 return bus_log_create_error(r);
2427
2428 /* If you make changes here, also make sure to update
2429 * systemd-nspawn@.service, to keep the device
2430 * policies in sync regardless if we are run with or
2431 * without the --keep-unit switch. */
2432 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2433 /* Allow the container to
2434 * access and create the API
2435 * device nodes, so that
2436 * PrivateDevices= in the
2437 * container can work
2438 * fine */
2439 "/dev/null", "rwm",
2440 "/dev/zero", "rwm",
2441 "/dev/full", "rwm",
2442 "/dev/random", "rwm",
2443 "/dev/urandom", "rwm",
2444 "/dev/tty", "rwm",
2445 "/dev/net/tun", "rwm",
2446 /* Allow the container
2447 * access to ptys. However,
2448 * do not permit the
2449 * container to ever create
2450 * these device nodes. */
2451 "/dev/pts/ptmx", "rw",
2452 "char-pts", "rw");
2453 if (r < 0)
2454 return bus_log_create_error(r);
2455
2456 for (j = 0; j < arg_n_custom_mounts; j++) {
2457 CustomMount *cm = &arg_custom_mounts[j];
2458
2459 if (cm->type != CUSTOM_MOUNT_BIND)
2460 continue;
2461
2462 r = is_device_node(cm->source);
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2465
2466 if (r) {
2467 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2468 cm->source, cm->read_only ? "r" : "rw");
2469 if (r < 0)
2470 return log_error_errno(r, "Failed to append message arguments: %m");
2471 }
2472 }
2473
2474 if (arg_kill_signal != 0) {
2475 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2476 if (r < 0)
2477 return bus_log_create_error(r);
2478
2479 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2480 if (r < 0)
2481 return bus_log_create_error(r);
2482 }
2483
2484 STRV_FOREACH(i, arg_property) {
2485 r = sd_bus_message_open_container(m, 'r', "sv");
2486 if (r < 0)
2487 return bus_log_create_error(r);
2488
2489 r = bus_append_unit_property_assignment(m, *i);
2490 if (r < 0)
2491 return r;
2492
2493 r = sd_bus_message_close_container(m);
2494 if (r < 0)
2495 return bus_log_create_error(r);
2496 }
2497
2498 r = sd_bus_message_close_container(m);
2499 if (r < 0)
2500 return bus_log_create_error(r);
2501
2502 r = sd_bus_call(bus, m, 0, &error, NULL);
2503 }
2504
2505 if (r < 0) {
2506 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2507 return r;
2508 }
2509
2510 return 0;
2511 }
2512
2513 static int terminate_machine(pid_t pid) {
2514 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2515 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2516 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2517 const char *path;
2518 int r;
2519
2520 if (!arg_register)
2521 return 0;
2522
2523 /* If we are reusing the unit, then just exit, systemd will do
2524 * the right thing when we exit. */
2525 if (arg_keep_unit)
2526 return 0;
2527
2528 r = sd_bus_default_system(&bus);
2529 if (r < 0)
2530 return log_error_errno(r, "Failed to open system bus: %m");
2531
2532 r = sd_bus_call_method(
2533 bus,
2534 "org.freedesktop.machine1",
2535 "/org/freedesktop/machine1",
2536 "org.freedesktop.machine1.Manager",
2537 "GetMachineByPID",
2538 &error,
2539 &reply,
2540 "u",
2541 (uint32_t) pid);
2542 if (r < 0) {
2543 /* Note that the machine might already have been
2544 * cleaned up automatically, hence don't consider it a
2545 * failure if we cannot get the machine object. */
2546 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2547 return 0;
2548 }
2549
2550 r = sd_bus_message_read(reply, "o", &path);
2551 if (r < 0)
2552 return bus_log_parse_error(r);
2553
2554 r = sd_bus_call_method(
2555 bus,
2556 "org.freedesktop.machine1",
2557 path,
2558 "org.freedesktop.machine1.Machine",
2559 "Terminate",
2560 &error,
2561 NULL,
2562 NULL);
2563 if (r < 0) {
2564 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2565 return 0;
2566 }
2567
2568 return 0;
2569 }
2570
2571 static int reset_audit_loginuid(void) {
2572 _cleanup_free_ char *p = NULL;
2573 int r;
2574
2575 if (arg_share_system)
2576 return 0;
2577
2578 r = read_one_line_file("/proc/self/loginuid", &p);
2579 if (r == -ENOENT)
2580 return 0;
2581 if (r < 0)
2582 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2583
2584 /* Already reset? */
2585 if (streq(p, "4294967295"))
2586 return 0;
2587
2588 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
2589 if (r < 0) {
2590 log_error_errno(r,
2591 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2592 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2593 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2594 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2595 "using systemd-nspawn. Sleeping for 5s... (%m)");
2596
2597 sleep(5);
2598 }
2599
2600 return 0;
2601 }
2602
2603 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2604 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2605 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2606
2607 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2608 uint8_t result[8];
2609 size_t l, sz;
2610 uint8_t *v, *i;
2611 int r;
2612
2613 l = strlen(arg_machine);
2614 sz = sizeof(sd_id128_t) + l;
2615 if (idx > 0)
2616 sz += sizeof(idx);
2617
2618 v = alloca(sz);
2619
2620 /* fetch some persistent data unique to the host */
2621 r = sd_id128_get_machine((sd_id128_t*) v);
2622 if (r < 0)
2623 return r;
2624
2625 /* combine with some data unique (on this host) to this
2626 * container instance */
2627 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2628 if (idx > 0) {
2629 idx = htole64(idx);
2630 memcpy(i, &idx, sizeof(idx));
2631 }
2632
2633 /* Let's hash the host machine ID plus the container name. We
2634 * use a fixed, but originally randomly created hash key here. */
2635 siphash24(result, v, sz, hash_key.bytes);
2636
2637 assert_cc(ETH_ALEN <= sizeof(result));
2638 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2639
2640 /* see eth_random_addr in the kernel */
2641 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2642 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2643
2644 return 0;
2645 }
2646
2647 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2648 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2649 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2650 struct ether_addr mac_host, mac_container;
2651 int r, i;
2652
2653 if (!arg_private_network)
2654 return 0;
2655
2656 if (!arg_network_veth)
2657 return 0;
2658
2659 /* Use two different interface name prefixes depending whether
2660 * we are in bridge mode or not. */
2661 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2662 arg_network_bridge ? "vb" : "ve", arg_machine);
2663
2664 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2665 if (r < 0)
2666 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2667
2668 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2669 if (r < 0)
2670 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2671
2672 r = sd_netlink_open(&rtnl);
2673 if (r < 0)
2674 return log_error_errno(r, "Failed to connect to netlink: %m");
2675
2676 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2677 if (r < 0)
2678 return log_error_errno(r, "Failed to allocate netlink message: %m");
2679
2680 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
2681 if (r < 0)
2682 return log_error_errno(r, "Failed to add netlink interface name: %m");
2683
2684 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2685 if (r < 0)
2686 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2687
2688 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2689 if (r < 0)
2690 return log_error_errno(r, "Failed to open netlink container: %m");
2691
2692 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2693 if (r < 0)
2694 return log_error_errno(r, "Failed to open netlink container: %m");
2695
2696 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
2697 if (r < 0)
2698 return log_error_errno(r, "Failed to open netlink container: %m");
2699
2700 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
2701 if (r < 0)
2702 return log_error_errno(r, "Failed to add netlink interface name: %m");
2703
2704 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2705 if (r < 0)
2706 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2707
2708 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2709 if (r < 0)
2710 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2711
2712 r = sd_netlink_message_close_container(m);
2713 if (r < 0)
2714 return log_error_errno(r, "Failed to close netlink container: %m");
2715
2716 r = sd_netlink_message_close_container(m);
2717 if (r < 0)
2718 return log_error_errno(r, "Failed to close netlink container: %m");
2719
2720 r = sd_netlink_message_close_container(m);
2721 if (r < 0)
2722 return log_error_errno(r, "Failed to close netlink container: %m");
2723
2724 r = sd_netlink_call(rtnl, m, 0, NULL);
2725 if (r < 0)
2726 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2727
2728 i = (int) if_nametoindex(iface_name);
2729 if (i <= 0)
2730 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2731
2732 *ifi = i;
2733
2734 return 0;
2735 }
2736
2737 static int setup_bridge(const char veth_name[], int *ifi) {
2738 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2739 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2740 int r, bridge;
2741
2742 if (!arg_private_network)
2743 return 0;
2744
2745 if (!arg_network_veth)
2746 return 0;
2747
2748 if (!arg_network_bridge)
2749 return 0;
2750
2751 bridge = (int) if_nametoindex(arg_network_bridge);
2752 if (bridge <= 0)
2753 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2754
2755 *ifi = bridge;
2756
2757 r = sd_netlink_open(&rtnl);
2758 if (r < 0)
2759 return log_error_errno(r, "Failed to connect to netlink: %m");
2760
2761 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2762 if (r < 0)
2763 return log_error_errno(r, "Failed to allocate netlink message: %m");
2764
2765 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2766 if (r < 0)
2767 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2768
2769 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
2770 if (r < 0)
2771 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2772
2773 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
2774 if (r < 0)
2775 return log_error_errno(r, "Failed to add netlink master field: %m");
2776
2777 r = sd_netlink_call(rtnl, m, 0, NULL);
2778 if (r < 0)
2779 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2780
2781 return 0;
2782 }
2783
2784 static int parse_interface(struct udev *udev, const char *name) {
2785 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2786 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2787 int ifi;
2788
2789 ifi = (int) if_nametoindex(name);
2790 if (ifi <= 0)
2791 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2792
2793 sprintf(ifi_str, "n%i", ifi);
2794 d = udev_device_new_from_device_id(udev, ifi_str);
2795 if (!d)
2796 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2797
2798 if (udev_device_get_is_initialized(d) <= 0) {
2799 log_error("Network interface %s is not initialized yet.", name);
2800 return -EBUSY;
2801 }
2802
2803 return ifi;
2804 }
2805
2806 static int move_network_interfaces(pid_t pid) {
2807 _cleanup_udev_unref_ struct udev *udev = NULL;
2808 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2809 char **i;
2810 int r;
2811
2812 if (!arg_private_network)
2813 return 0;
2814
2815 if (strv_isempty(arg_network_interfaces))
2816 return 0;
2817
2818 r = sd_netlink_open(&rtnl);
2819 if (r < 0)
2820 return log_error_errno(r, "Failed to connect to netlink: %m");
2821
2822 udev = udev_new();
2823 if (!udev) {
2824 log_error("Failed to connect to udev.");
2825 return -ENOMEM;
2826 }
2827
2828 STRV_FOREACH(i, arg_network_interfaces) {
2829 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2830 int ifi;
2831
2832 ifi = parse_interface(udev, *i);
2833 if (ifi < 0)
2834 return ifi;
2835
2836 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2837 if (r < 0)
2838 return log_error_errno(r, "Failed to allocate netlink message: %m");
2839
2840 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2841 if (r < 0)
2842 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2843
2844 r = sd_netlink_call(rtnl, m, 0, NULL);
2845 if (r < 0)
2846 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2847 }
2848
2849 return 0;
2850 }
2851
2852 static int setup_macvlan(pid_t pid) {
2853 _cleanup_udev_unref_ struct udev *udev = NULL;
2854 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2855 unsigned idx = 0;
2856 char **i;
2857 int r;
2858
2859 if (!arg_private_network)
2860 return 0;
2861
2862 if (strv_isempty(arg_network_macvlan))
2863 return 0;
2864
2865 r = sd_netlink_open(&rtnl);
2866 if (r < 0)
2867 return log_error_errno(r, "Failed to connect to netlink: %m");
2868
2869 udev = udev_new();
2870 if (!udev) {
2871 log_error("Failed to connect to udev.");
2872 return -ENOMEM;
2873 }
2874
2875 STRV_FOREACH(i, arg_network_macvlan) {
2876 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2877 _cleanup_free_ char *n = NULL;
2878 struct ether_addr mac;
2879 int ifi;
2880
2881 ifi = parse_interface(udev, *i);
2882 if (ifi < 0)
2883 return ifi;
2884
2885 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2886 if (r < 0)
2887 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2888
2889 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2890 if (r < 0)
2891 return log_error_errno(r, "Failed to allocate netlink message: %m");
2892
2893 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2894 if (r < 0)
2895 return log_error_errno(r, "Failed to add netlink interface index: %m");
2896
2897 n = strappend("mv-", *i);
2898 if (!n)
2899 return log_oom();
2900
2901 strshorten(n, IFNAMSIZ-1);
2902
2903 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2904 if (r < 0)
2905 return log_error_errno(r, "Failed to add netlink interface name: %m");
2906
2907 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2908 if (r < 0)
2909 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2910
2911 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2912 if (r < 0)
2913 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2914
2915 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2916 if (r < 0)
2917 return log_error_errno(r, "Failed to open netlink container: %m");
2918
2919 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2920 if (r < 0)
2921 return log_error_errno(r, "Failed to open netlink container: %m");
2922
2923 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2924 if (r < 0)
2925 return log_error_errno(r, "Failed to append macvlan mode: %m");
2926
2927 r = sd_netlink_message_close_container(m);
2928 if (r < 0)
2929 return log_error_errno(r, "Failed to close netlink container: %m");
2930
2931 r = sd_netlink_message_close_container(m);
2932 if (r < 0)
2933 return log_error_errno(r, "Failed to close netlink container: %m");
2934
2935 r = sd_netlink_call(rtnl, m, 0, NULL);
2936 if (r < 0)
2937 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2938 }
2939
2940 return 0;
2941 }
2942
2943 static int setup_ipvlan(pid_t pid) {
2944 _cleanup_udev_unref_ struct udev *udev = NULL;
2945 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2946 char **i;
2947 int r;
2948
2949 if (!arg_private_network)
2950 return 0;
2951
2952 if (strv_isempty(arg_network_ipvlan))
2953 return 0;
2954
2955 r = sd_netlink_open(&rtnl);
2956 if (r < 0)
2957 return log_error_errno(r, "Failed to connect to netlink: %m");
2958
2959 udev = udev_new();
2960 if (!udev) {
2961 log_error("Failed to connect to udev.");
2962 return -ENOMEM;
2963 }
2964
2965 STRV_FOREACH(i, arg_network_ipvlan) {
2966 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2967 _cleanup_free_ char *n = NULL;
2968 int ifi;
2969
2970 ifi = parse_interface(udev, *i);
2971 if (ifi < 0)
2972 return ifi;
2973
2974 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2975 if (r < 0)
2976 return log_error_errno(r, "Failed to allocate netlink message: %m");
2977
2978 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2979 if (r < 0)
2980 return log_error_errno(r, "Failed to add netlink interface index: %m");
2981
2982 n = strappend("iv-", *i);
2983 if (!n)
2984 return log_oom();
2985
2986 strshorten(n, IFNAMSIZ-1);
2987
2988 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2989 if (r < 0)
2990 return log_error_errno(r, "Failed to add netlink interface name: %m");
2991
2992 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2993 if (r < 0)
2994 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2995
2996 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2997 if (r < 0)
2998 return log_error_errno(r, "Failed to open netlink container: %m");
2999
3000 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
3001 if (r < 0)
3002 return log_error_errno(r, "Failed to open netlink container: %m");
3003
3004 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
3005 if (r < 0)
3006 return log_error_errno(r, "Failed to add ipvlan mode: %m");
3007
3008 r = sd_netlink_message_close_container(m);
3009 if (r < 0)
3010 return log_error_errno(r, "Failed to close netlink container: %m");
3011
3012 r = sd_netlink_message_close_container(m);
3013 if (r < 0)
3014 return log_error_errno(r, "Failed to close netlink container: %m");
3015
3016 r = sd_netlink_call(rtnl, m, 0, NULL);
3017 if (r < 0)
3018 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
3019 }
3020
3021 return 0;
3022 }
3023
/* Builds and loads a seccomp filter (default action: allow) that
 *
 *   - makes each system call in the table below fail with EPERM, unless
 *     the capability listed next to it is part of arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * The filter is loaded with the NO_NEW_PRIVS attribute turned off. When
 * seccomp_load() returns -EINVAL this is only logged at debug level and
 * treated as success. Without HAVE_SECCOMP this function is a no-op.
 *
 * Returns 0 (or whatever non-negative value seccomp_load() returned) on
 * success, a negative error value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                /* Retained capability: leave the system call alone */
                if ((arg_retain & (1ULL << blacklist[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                /* -EFAULT means "unknown syscall" and is skipped silently */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit
         * hookup fails when run inside of one. We don't care and simply
         * turn off creation of audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) will hence fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3118
3119 static int setup_propagate(const char *root) {
3120 const char *p, *q;
3121
3122 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3123 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3124 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3125 (void) mkdir_p(p, 0600);
3126
3127 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3128 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3129
3130 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3131 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3132
3133 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3134 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3135
3136 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3137 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3138 return log_error_errno(errno, "Failed to install propagation bind mount.");
3139
3140 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3141 return log_error_errno(errno, "Failed to make propagation mount read-only");
3142
3143 return 0;
3144 }
3145
3146 static int setup_image(char **device_path, int *loop_nr) {
3147 struct loop_info64 info = {
3148 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3149 };
3150 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3151 _cleanup_free_ char* loopdev = NULL;
3152 struct stat st;
3153 int r, nr;
3154
3155 assert(device_path);
3156 assert(loop_nr);
3157 assert(arg_image);
3158
3159 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3160 if (fd < 0)
3161 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3162
3163 if (fstat(fd, &st) < 0)
3164 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3165
3166 if (S_ISBLK(st.st_mode)) {
3167 char *p;
3168
3169 p = strdup(arg_image);
3170 if (!p)
3171 return log_oom();
3172
3173 *device_path = p;
3174
3175 *loop_nr = -1;
3176
3177 r = fd;
3178 fd = -1;
3179
3180 return r;
3181 }
3182
3183 if (!S_ISREG(st.st_mode)) {
3184 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3185 return -EINVAL;
3186 }
3187
3188 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3189 if (control < 0)
3190 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3191
3192 nr = ioctl(control, LOOP_CTL_GET_FREE);
3193 if (nr < 0)
3194 return log_error_errno(errno, "Failed to allocate loop device: %m");
3195
3196 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3197 return log_oom();
3198
3199 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3200 if (loop < 0)
3201 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3202
3203 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3204 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3205
3206 if (arg_read_only)
3207 info.lo_flags |= LO_FLAGS_READ_ONLY;
3208
3209 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3210 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3211
3212 *device_path = loopdev;
3213 loopdev = NULL;
3214
3215 *loop_nr = nr;
3216
3217 r = loop;
3218 loop = -1;
3219
3220 return r;
3221 }
3222
/* Explanatory text appended to error messages about disk images whose
 * partition table cannot be used. One string literal per output line. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
3229
3230 static int dissect_image(
3231 int fd,
3232 char **root_device, bool *root_device_rw,
3233 char **home_device, bool *home_device_rw,
3234 char **srv_device, bool *srv_device_rw,
3235 bool *secondary) {
3236
3237 #ifdef HAVE_BLKID
3238 int home_nr = -1, srv_nr = -1;
3239 #ifdef GPT_ROOT_NATIVE
3240 int root_nr = -1;
3241 #endif
3242 #ifdef GPT_ROOT_SECONDARY
3243 int secondary_root_nr = -1;
3244 #endif
3245 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
3246 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
3247 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
3248 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3249 _cleanup_udev_unref_ struct udev *udev = NULL;
3250 struct udev_list_entry *first, *item;
3251 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
3252 bool is_gpt, is_mbr, multiple_generic = false;
3253 const char *pttype = NULL;
3254 blkid_partlist pl;
3255 struct stat st;
3256 unsigned i;
3257 int r;
3258
3259 assert(fd >= 0);
3260 assert(root_device);
3261 assert(home_device);
3262 assert(srv_device);
3263 assert(secondary);
3264 assert(arg_image);
3265
3266 b = blkid_new_probe();
3267 if (!b)
3268 return log_oom();
3269
3270 errno = 0;
3271 r = blkid_probe_set_device(b, fd, 0, 0);
3272 if (r != 0) {
3273 if (errno == 0)
3274 return log_oom();
3275
3276 log_error_errno(errno, "Failed to set device on blkid probe: %m");
3277 return -errno;
3278 }
3279
3280 blkid_probe_enable_partitions(b, 1);
3281 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
3282
3283 errno = 0;
3284 r = blkid_do_safeprobe(b);
3285 if (r == -2 || r == 1) {
3286 log_error("Failed to identify any partition table on\n"
3287 " %s\n"
3288 PARTITION_TABLE_BLURB, arg_image);
3289 return -EINVAL;
3290 } else if (r != 0) {
3291 if (errno == 0)
3292 errno = EIO;
3293 log_error_errno(errno, "Failed to probe: %m");
3294 return -errno;
3295 }
3296
3297 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
3298
3299 is_gpt = streq_ptr(pttype, "gpt");
3300 is_mbr = streq_ptr(pttype, "dos");
3301
3302 if (!is_gpt && !is_mbr) {
3303 log_error("No GPT or MBR partition table discovered on\n"
3304 " %s\n"
3305 PARTITION_TABLE_BLURB, arg_image);
3306 return -EINVAL;
3307 }
3308
3309 errno = 0;
3310 pl = blkid_probe_get_partitions(b);
3311 if (!pl) {
3312 if (errno == 0)
3313 return log_oom();
3314
3315 log_error("Failed to list partitions of %s", arg_image);
3316 return -errno;
3317 }
3318
3319 udev = udev_new();
3320 if (!udev)
3321 return log_oom();
3322
3323 if (fstat(fd, &st) < 0)
3324 return log_error_errno(errno, "Failed to stat block device: %m");
3325
3326 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
3327 if (!d)
3328 return log_oom();
3329
3330 for (i = 0;; i++) {
3331 int n, m;
3332
3333 if (i >= 10) {
3334 log_error("Kernel partitions never appeared.");
3335 return -ENXIO;
3336 }
3337
3338 e = udev_enumerate_new(udev);
3339 if (!e)
3340 return log_oom();
3341
3342 r = udev_enumerate_add_match_parent(e, d);
3343 if (r < 0)
3344 return log_oom();
3345
3346 r = udev_enumerate_scan_devices(e);
3347 if (r < 0)
3348 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
3349
3350 /* Count the partitions enumerated by the kernel */
3351 n = 0;
3352 first = udev_enumerate_get_list_entry(e);
3353 udev_list_entry_foreach(item, first)
3354 n++;
3355
3356 /* Count the partitions enumerated by blkid */
3357 m = blkid_partlist_numof_partitions(pl);
3358 if (n == m + 1)
3359 break;
3360 if (n > m + 1) {
3361 log_error("blkid and kernel partition list do not match.");
3362 return -EIO;
3363 }
3364 if (n < m + 1) {
3365 unsigned j;
3366
3367 /* The kernel has probed fewer partitions than
3368 * blkid? Maybe the kernel prober is still
3369 * running or it got EBUSY because udev
3370 * already opened the device. Let's reprobe
3371 * the device, which is a synchronous call
3372 * that waits until probing is complete. */
3373
3374 for (j = 0; j < 20; j++) {
3375
3376 r = ioctl(fd, BLKRRPART, 0);
3377 if (r < 0)
3378 r = -errno;
3379 if (r >= 0 || r != -EBUSY)
3380 break;
3381
3382 /* If something else has the device
3383 * open, such as an udev rule, the
3384 * ioctl will return EBUSY. Since
3385 * there's no way to wait until it
3386 * isn't busy anymore, let's just wait
3387 * a bit, and try again.
3388 *
3389 * This is really something they
3390 * should fix in the kernel! */
3391
3392 usleep(50 * USEC_PER_MSEC);
3393 }
3394
3395 if (r < 0)
3396 return log_error_errno(r, "Failed to reread partition table: %m");
3397 }
3398
3399 e = udev_enumerate_unref(e);
3400 }
3401
3402 first = udev_enumerate_get_list_entry(e);
3403 udev_list_entry_foreach(item, first) {
3404 _cleanup_udev_device_unref_ struct udev_device *q;
3405 const char *node;
3406 unsigned long long flags;
3407 blkid_partition pp;
3408 dev_t qn;
3409 int nr;
3410
3411 errno = 0;
3412 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
3413 if (!q) {
3414 if (!errno)
3415 errno = ENOMEM;
3416
3417 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
3418 return -errno;
3419 }
3420
3421 qn = udev_device_get_devnum(q);
3422 if (major(qn) == 0)
3423 continue;
3424
3425 if (st.st_rdev == qn)
3426 continue;
3427
3428 node = udev_device_get_devnode(q);
3429 if (!node)
3430 continue;
3431
3432 pp = blkid_partlist_devno_to_partition(pl, qn);
3433 if (!pp)
3434 continue;
3435
3436 flags = blkid_partition_get_flags(pp);
3437
3438 nr = blkid_partition_get_partno(pp);
3439 if (nr < 0)
3440 continue;
3441
3442 if (is_gpt) {
3443 sd_id128_t type_id;
3444 const char *stype;
3445
3446 if (flags & GPT_FLAG_NO_AUTO)
3447 continue;
3448
3449 stype = blkid_partition_get_type_string(pp);
3450 if (!stype)
3451 continue;
3452
3453 if (sd_id128_from_string(stype, &type_id) < 0)
3454 continue;
3455
3456 if (sd_id128_equal(type_id, GPT_HOME)) {
3457
3458 if (home && nr >= home_nr)
3459 continue;
3460
3461 home_nr = nr;
3462 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3463
3464 r = free_and_strdup(&home, node);
3465 if (r < 0)
3466 return log_oom();
3467
3468 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3469
3470 if (srv && nr >= srv_nr)
3471 continue;
3472
3473 srv_nr = nr;
3474 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3475
3476 r = free_and_strdup(&srv, node);
3477 if (r < 0)
3478 return log_oom();
3479 }
3480 #ifdef GPT_ROOT_NATIVE
3481 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3482
3483 if (root && nr >= root_nr)
3484 continue;
3485
3486 root_nr = nr;
3487 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3488
3489 r = free_and_strdup(&root, node);
3490 if (r < 0)
3491 return log_oom();
3492 }
3493 #endif
3494 #ifdef GPT_ROOT_SECONDARY
3495 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3496
3497 if (secondary_root && nr >= secondary_root_nr)
3498 continue;
3499
3500 secondary_root_nr = nr;
3501 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3502
3503 r = free_and_strdup(&secondary_root, node);
3504 if (r < 0)
3505 return log_oom();
3506 }
3507 #endif
3508 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3509
3510 if (generic)
3511 multiple_generic = true;
3512 else {
3513 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3514
3515 r = free_and_strdup(&generic, node);
3516 if (r < 0)
3517 return log_oom();
3518 }
3519 }
3520
3521 } else if (is_mbr) {
3522 int type;
3523
3524 if (flags != 0x80) /* Bootable flag */
3525 continue;
3526
3527 type = blkid_partition_get_type(pp);
3528 if (type != 0x83) /* Linux partition */
3529 continue;
3530
3531 if (generic)
3532 multiple_generic = true;
3533 else {
3534 generic_rw = true;
3535
3536 r = free_and_strdup(&root, node);
3537 if (r < 0)
3538 return log_oom();
3539 }
3540 }
3541 }
3542
3543 if (root) {
3544 *root_device = root;
3545 root = NULL;
3546
3547 *root_device_rw = root_rw;
3548 *secondary = false;
3549 } else if (secondary_root) {
3550 *root_device = secondary_root;
3551 secondary_root = NULL;
3552
3553 *root_device_rw = secondary_root_rw;
3554 *secondary = true;
3555 } else if (generic) {
3556
3557 /* There were no partitions with precise meanings
3558 * around, but we found generic partitions. In this
3559 * case, if there's only one, we can go ahead and boot
3560 * it, otherwise we bail out, because we really cannot
3561 * make any sense of it. */
3562
3563 if (multiple_generic) {
3564 log_error("Identified multiple bootable Linux partitions on\n"
3565 " %s\n"
3566 PARTITION_TABLE_BLURB, arg_image);
3567 return -EINVAL;
3568 }
3569
3570 *root_device = generic;
3571 generic = NULL;
3572
3573 *root_device_rw = generic_rw;
3574 *secondary = false;
3575 } else {
3576 log_error("Failed to identify root partition in disk image\n"
3577 " %s\n"
3578 PARTITION_TABLE_BLURB, arg_image);
3579 return -EINVAL;
3580 }
3581
3582 if (home) {
3583 *home_device = home;
3584 home = NULL;
3585
3586 *home_device_rw = home_rw;
3587 }
3588
3589 if (srv) {
3590 *srv_device = srv;
3591 srv = NULL;
3592
3593 *srv_device_rw = srv_rw;
3594 }
3595
3596 return 0;
3597 #else
3598 log_error("--image= is not supported, compiled without blkid support.");
3599 return -EOPNOTSUPP;
3600 #endif
3601 }
3602
/* Probes the file system type of the block device "what" with libblkid and
 * mounts it with MS_NODEV on "where", or on "where" with "directory" appended
 * if "directory" is non-NULL. The mount is read-only unless "rw" is true and
 * arg_read_only is unset. Devices whose type is "crypto_LUKS" are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure. When built
 * without HAVE_BLKID this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A plain -1
                 * is a real probing error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the return value before logging, since logging
                 * might modify errno */
                r = errno != 0 ? -errno : -EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3666
/* Mounts the given partitions below "where": the root device on "where"
 * itself, the home device on "/home" and the server data device on "/srv"
 * beneath it. Any device that is NULL is skipped. The *_rw flags are passed
 * on to mount_device().
 *
 * Returns 0 on success, or the negative error of the first failing mount. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Use the directory the caller handed in rather than the global
         * arg_directory, so that the parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3696
3697 static void loop_remove(int nr, int *image_fd) {
3698 _cleanup_close_ int control = -1;
3699 int r;
3700
3701 if (nr < 0)
3702 return;
3703
3704 if (image_fd && *image_fd >= 0) {
3705 r = ioctl(*image_fd, LOOP_CLR_FD);
3706 if (r < 0)
3707 log_debug_errno(errno, "Failed to close loop image: %m");
3708 *image_fd = safe_close(*image_fd);
3709 }
3710
3711 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3712 if (control < 0) {
3713 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3714 return;
3715 }
3716
3717 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3718 if (r < 0)
3719 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3720 }
3721
3722 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3723 int pipe_fds[2];
3724 pid_t pid;
3725
3726 assert(database);
3727 assert(key);
3728 assert(rpid);
3729
3730 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3731 return log_error_errno(errno, "Failed to allocate pipe: %m");
3732
3733 pid = fork();
3734 if (pid < 0)
3735 return log_error_errno(errno, "Failed to fork getent child: %m");
3736 else if (pid == 0) {
3737 int nullfd;
3738 char *empty_env = NULL;
3739
3740 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3741 _exit(EXIT_FAILURE);
3742
3743 if (pipe_fds[0] > 2)
3744 safe_close(pipe_fds[0]);
3745 if (pipe_fds[1] > 2)
3746 safe_close(pipe_fds[1]);
3747
3748 nullfd = open("/dev/null", O_RDWR);
3749 if (nullfd < 0)
3750 _exit(EXIT_FAILURE);
3751
3752 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3753 _exit(EXIT_FAILURE);
3754
3755 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3756 _exit(EXIT_FAILURE);
3757
3758 if (nullfd > 2)
3759 safe_close(nullfd);
3760
3761 (void) reset_all_signal_handlers();
3762 (void) reset_signal_mask();
3763 close_all_fds(NULL, 0);
3764
3765 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3766 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3767 _exit(EXIT_FAILURE);
3768 }
3769
3770 pipe_fds[1] = safe_close(pipe_fds[1]);
3771
3772 *rpid = pid;
3773
3774 return pipe_fds[0];
3775 }
3776
3777 static int change_uid_gid(char **_home) {
3778 char line[LINE_MAX], *x, *u, *g, *h;
3779 const char *word, *state;
3780 _cleanup_free_ uid_t *uids = NULL;
3781 _cleanup_free_ char *home = NULL;
3782 _cleanup_fclose_ FILE *f = NULL;
3783 _cleanup_close_ int fd = -1;
3784 unsigned n_uids = 0;
3785 size_t sz = 0, l;
3786 uid_t uid;
3787 gid_t gid;
3788 pid_t pid;
3789 int r;
3790
3791 assert(_home);
3792
3793 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3794 /* Reset everything fully to 0, just in case */
3795
3796 r = reset_uid_gid();
3797 if (r < 0)
3798 return log_error_errno(r, "Failed to become root: %m");
3799
3800 *_home = NULL;
3801 return 0;
3802 }
3803
3804 /* First, get user credentials */
3805 fd = spawn_getent("passwd", arg_user, &pid);
3806 if (fd < 0)
3807 return fd;
3808
3809 f = fdopen(fd, "r");
3810 if (!f)
3811 return log_oom();
3812 fd = -1;
3813
3814 if (!fgets(line, sizeof(line), f)) {
3815
3816 if (!ferror(f)) {
3817 log_error("Failed to resolve user %s.", arg_user);
3818 return -ESRCH;
3819 }
3820
3821 log_error_errno(errno, "Failed to read from getent: %m");
3822 return -errno;
3823 }
3824
3825 truncate_nl(line);
3826
3827 wait_for_terminate_and_warn("getent passwd", pid, true);
3828
3829 x = strchr(line, ':');
3830 if (!x) {
3831 log_error("/etc/passwd entry has invalid user field.");
3832 return -EIO;
3833 }
3834
3835 u = strchr(x+1, ':');
3836 if (!u) {
3837 log_error("/etc/passwd entry has invalid password field.");
3838 return -EIO;
3839 }
3840
3841 u++;
3842 g = strchr(u, ':');
3843 if (!g) {
3844 log_error("/etc/passwd entry has invalid UID field.");
3845 return -EIO;
3846 }
3847
3848 *g = 0;
3849 g++;
3850 x = strchr(g, ':');
3851 if (!x) {
3852 log_error("/etc/passwd entry has invalid GID field.");
3853 return -EIO;
3854 }
3855
3856 *x = 0;
3857 h = strchr(x+1, ':');
3858 if (!h) {
3859 log_error("/etc/passwd entry has invalid GECOS field.");
3860 return -EIO;
3861 }
3862
3863 h++;
3864 x = strchr(h, ':');
3865 if (!x) {
3866 log_error("/etc/passwd entry has invalid home directory field.");
3867 return -EIO;
3868 }
3869
3870 *x = 0;
3871
3872 r = parse_uid(u, &uid);
3873 if (r < 0) {
3874 log_error("Failed to parse UID of user.");
3875 return -EIO;
3876 }
3877
3878 r = parse_gid(g, &gid);
3879 if (r < 0) {
3880 log_error("Failed to parse GID of user.");
3881 return -EIO;
3882 }
3883
3884 home = strdup(h);
3885 if (!home)
3886 return log_oom();
3887
3888 /* Second, get group memberships */
3889 fd = spawn_getent("initgroups", arg_user, &pid);
3890 if (fd < 0)
3891 return fd;
3892
3893 fclose(f);
3894 f = fdopen(fd, "r");
3895 if (!f)
3896 return log_oom();
3897 fd = -1;
3898
3899 if (!fgets(line, sizeof(line), f)) {
3900 if (!ferror(f)) {
3901 log_error("Failed to resolve user %s.", arg_user);
3902 return -ESRCH;
3903 }
3904
3905 log_error_errno(errno, "Failed to read from getent: %m");
3906 return -errno;
3907 }
3908
3909 truncate_nl(line);
3910
3911 wait_for_terminate_and_warn("getent initgroups", pid, true);
3912
3913 /* Skip over the username and subsequent separator whitespace */
3914 x = line;
3915 x += strcspn(x, WHITESPACE);
3916 x += strspn(x, WHITESPACE);
3917
3918 FOREACH_WORD(word, l, x, state) {
3919 char c[l+1];
3920
3921 memcpy(c, word, l);
3922 c[l] = 0;
3923
3924 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3925 return log_oom();
3926
3927 r = parse_uid(c, &uids[n_uids++]);
3928 if (r < 0) {
3929 log_error("Failed to parse group data from getent.");
3930 return -EIO;
3931 }
3932 }
3933
3934 r = mkdir_parents(home, 0775);
3935 if (r < 0)
3936 return log_error_errno(r, "Failed to make home root directory: %m");
3937
3938 r = mkdir_safe(home, 0755, uid, gid);
3939 if (r < 0 && r != -EEXIST)
3940 return log_error_errno(r, "Failed to make home directory: %m");
3941
3942 (void) fchown(STDIN_FILENO, uid, gid);
3943 (void) fchown(STDOUT_FILENO, uid, gid);
3944 (void) fchown(STDERR_FILENO, uid, gid);
3945
3946 if (setgroups(n_uids, uids) < 0)
3947 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3948
3949 if (setresgid(gid, gid, gid) < 0)
3950 return log_error_errno(errno, "setregid() failed: %m");
3951
3952 if (setresuid(uid, uid, uid) < 0)
3953 return log_error_errno(errno, "setreuid() failed: %m");
3954
3955 if (_home) {
3956 *_home = home;
3957 home = NULL;
3958 }
3959
3960 return 0;
3961 }
3962
3963 /*
3964 * Return values:
3965 * < 0 : wait_for_terminate() failed to get the state of the
3966 * container, the container was terminated by a signal, or
3967 * failed for an unknown reason. No change is made to the
3968 * container argument.
3969 * > 0 : The program executed in the container terminated with an
3970 * error. The exit code of the program executed in the
3971 * container is returned. The container argument has been set
3972 * to CONTAINER_TERMINATED.
3973 * 0 : The container is being rebooted, has been shut down or exited
3974 * successfully. The container argument has been set to either
3975 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3976 *
3977 * That is, success is indicated by a return value of zero, and an
3978 * error is indicated by a non-zero value.
3979 */
3980 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3981 siginfo_t status;
3982 int r;
3983
3984 r = wait_for_terminate(pid, &status);
3985 if (r < 0)
3986 return log_warning_errno(r, "Failed to wait for container: %m");
3987
3988 switch (status.si_code) {
3989
3990 case CLD_EXITED:
3991 if (status.si_status == 0) {
3992 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3993
3994 } else
3995 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3996
3997 *container = CONTAINER_TERMINATED;
3998 return status.si_status;
3999
4000 case CLD_KILLED:
4001 if (status.si_status == SIGINT) {
4002
4003 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
4004 *container = CONTAINER_TERMINATED;
4005 return 0;
4006
4007 } else if (status.si_status == SIGHUP) {
4008
4009 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
4010 *container = CONTAINER_REBOOTED;
4011 return 0;
4012 }
4013
4014 /* CLD_KILLED fallthrough */
4015
4016 case CLD_DUMPED:
4017 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
4018 return -EIO;
4019
4020 default:
4021 log_error("Container %s failed due to unknown reason.", arg_machine);
4022 return -EIO;
4023 }
4024
4025 return r;
4026 }
4027
/* Signal handler that deliberately does nothing with the signal it gets. */
static void nop_handler(int sig) {
        /* intentionally empty */
}
4029
4030 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
4031 pid_t pid;
4032
4033 pid = PTR_TO_UINT32(userdata);
4034 if (pid > 0) {
4035 if (kill(pid, arg_kill_signal) >= 0) {
4036 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
4037 sd_event_source_set_userdata(s, NULL);
4038 return 0;
4039 }
4040 }
4041
4042 sd_event_exit(sd_event_source_get_event(s), 0);
4043 return 0;
4044 }
4045
4046 static int determine_names(void) {
4047 int r;
4048
4049 if (arg_template && !arg_directory && arg_machine) {
4050
4051 /* If --template= was specified then we should not
4052 * search for a machine, but instead create a new one
4053 * in /var/lib/machine. */
4054
4055 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
4056 if (!arg_directory)
4057 return log_oom();
4058 }
4059
4060 if (!arg_image && !arg_directory) {
4061 if (arg_machine) {
4062 _cleanup_(image_unrefp) Image *i = NULL;
4063
4064 r = image_find(arg_machine, &i);
4065 if (r < 0)
4066 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
4067 else if (r == 0) {
4068 log_error("No image for machine '%s': %m", arg_machine);
4069 return -ENOENT;
4070 }
4071
4072 if (i->type == IMAGE_RAW)
4073 r = set_sanitized_path(&arg_image, i->path);
4074 else
4075 r = set_sanitized_path(&arg_directory, i->path);
4076 if (r < 0)
4077 return log_error_errno(r, "Invalid image directory: %m");
4078
4079 if (!arg_ephemeral)
4080 arg_read_only = arg_read_only || i->read_only;
4081 } else
4082 arg_directory = get_current_dir_name();
4083
4084 if (!arg_directory && !arg_machine) {
4085 log_error("Failed to determine path, please use -D or -i.");
4086 return -EINVAL;
4087 }
4088 }
4089
4090 if (!arg_machine) {
4091 if (arg_directory && path_equal(arg_directory, "/"))
4092 arg_machine = gethostname_malloc();
4093 else
4094 arg_machine = strdup(basename(arg_image ?: arg_directory));
4095
4096 if (!arg_machine)
4097 return log_oom();
4098
4099 hostname_cleanup(arg_machine);
4100 if (!machine_name_is_valid(arg_machine)) {
4101 log_error("Failed to determine machine name automatically, please use -M.");
4102 return -EINVAL;
4103 }
4104
4105 if (arg_ephemeral) {
4106 char *b;
4107
4108 /* Add a random suffix when this is an
4109 * ephemeral machine, so that we can run many
4110 * instances at once without manually having
4111 * to specify -M each time. */
4112
4113 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4114 return log_oom();
4115
4116 free(arg_machine);
4117 arg_machine = b;
4118 }
4119 }
4120
4121 return 0;
4122 }
4123
4124 static int determine_uid_shift(const char *directory) {
4125 int r;
4126
4127 if (!arg_userns) {
4128 arg_uid_shift = 0;
4129 return 0;
4130 }
4131
4132 if (arg_uid_shift == UID_INVALID) {
4133 struct stat st;
4134
4135 r = stat(directory, &st);
4136 if (r < 0)
4137 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4138
4139 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4140
4141 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4142 log_error("UID and GID base of %s don't match.", directory);
4143 return -EINVAL;
4144 }
4145
4146 arg_uid_range = UINT32_C(0x10000);
4147 }
4148
4149 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4150 log_error("UID base too high for UID range.");
4151 return -EINVAL;
4152 }
4153
4154 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4155 return 0;
4156 }
4157
4158 static int inner_child(
4159 Barrier *barrier,
4160 const char *directory,
4161 bool secondary,
4162 int kmsg_socket,
4163 int rtnl_socket,
4164 FDSet *fds,
4165 int argc,
4166 char *argv[]) {
4167
4168 _cleanup_free_ char *home = NULL;
4169 unsigned n_env = 2;
4170 const char *envp[] = {
4171 "PATH=" DEFAULT_PATH_SPLIT_USR,
4172 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4173 NULL, /* TERM */
4174 NULL, /* HOME */
4175 NULL, /* USER */
4176 NULL, /* LOGNAME */
4177 NULL, /* container_uuid */
4178 NULL, /* LISTEN_FDS */
4179 NULL, /* LISTEN_PID */
4180 NULL
4181 };
4182
4183 _cleanup_strv_free_ char **env_use = NULL;
4184 int r;
4185
4186 assert(barrier);
4187 assert(directory);
4188 assert(kmsg_socket >= 0);
4189
4190 if (arg_userns) {
4191 /* Tell the parent, that it now can write the UID map. */
4192 (void) barrier_place(barrier); /* #1 */
4193
4194 /* Wait until the parent wrote the UID map */
4195 if (!barrier_place_and_sync(barrier)) { /* #2 */
4196 log_error("Parent died too early");
4197 return -ESRCH;
4198 }
4199 }
4200
4201 r = mount_all(NULL, true);
4202 if (r < 0)
4203 return r;
4204
4205 /* Wait until we are cgroup-ified, so that we
4206 * can mount the right cgroup path writable */
4207 if (!barrier_place_and_sync(barrier)) { /* #3 */
4208 log_error("Parent died too early");
4209 return -ESRCH;
4210 }
4211
4212 r = mount_systemd_cgroup_writable("");
4213 if (r < 0)
4214 return r;
4215
4216 r = reset_uid_gid();
4217 if (r < 0)
4218 return log_error_errno(r, "Couldn't become new root: %m");
4219
4220 r = setup_boot_id(NULL);
4221 if (r < 0)
4222 return r;
4223
4224 r = setup_kmsg(NULL, kmsg_socket);
4225 if (r < 0)
4226 return r;
4227 kmsg_socket = safe_close(kmsg_socket);
4228
4229 umask(0022);
4230
4231 if (setsid() < 0)
4232 return log_error_errno(errno, "setsid() failed: %m");
4233
4234 if (arg_private_network)
4235 loopback_setup();
4236
4237 r = send_rtnl(rtnl_socket);
4238 if (r < 0)
4239 return r;
4240 rtnl_socket = safe_close(rtnl_socket);
4241
4242 if (drop_capabilities() < 0)
4243 return log_error_errno(errno, "drop_capabilities() failed: %m");
4244
4245 setup_hostname();
4246
4247 if (arg_personality != PERSONALITY_INVALID) {
4248 if (personality(arg_personality) < 0)
4249 return log_error_errno(errno, "personality() failed: %m");
4250 } else if (secondary) {
4251 if (personality(PER_LINUX32) < 0)
4252 return log_error_errno(errno, "personality() failed: %m");
4253 }
4254
4255 #ifdef HAVE_SELINUX
4256 if (arg_selinux_context)
4257 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4258 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4259 #endif
4260
4261 r = change_uid_gid(&home);
4262 if (r < 0)
4263 return r;
4264
4265 envp[n_env] = strv_find_prefix(environ, "TERM=");
4266 if (envp[n_env])
4267 n_env ++;
4268
4269 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4270 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4271 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4272 return log_oom();
4273
4274 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4275 char as_uuid[37];
4276
4277 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4278 return log_oom();
4279 }
4280
4281 if (fdset_size(fds) > 0) {
4282 r = fdset_cloexec(fds, false);
4283 if (r < 0)
4284 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4285
4286 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4287 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4288 return log_oom();
4289 }
4290
4291 env_use = strv_env_merge(2, envp, arg_setenv);
4292 if (!env_use)
4293 return log_oom();
4294
4295 /* Let the parent know that we are ready and
4296 * wait until the parent is ready with the
4297 * setup, too... */
4298 if (!barrier_place_and_sync(barrier)) { /* #4 */
4299 log_error("Parent died too early");
4300 return -ESRCH;
4301 }
4302
4303 /* Now, explicitly close the log, so that we
4304 * then can close all remaining fds. Closing
4305 * the log explicitly first has the benefit
4306 * that the logging subsystem knows about it,
4307 * and is thus ready to be reopened should we
4308 * need it again. Note that the other fds
4309 * closed here are at least the locking and
4310 * barrier fds. */
4311 log_close();
4312 (void) fdset_close_others(fds);
4313
4314 if (arg_boot) {
4315 char **a;
4316 size_t m;
4317
4318 /* Automatically search for the init system */
4319
4320 m = 1 + argc - optind;
4321 a = newa(char*, m + 1);
4322 memcpy(a + 1, argv + optind, m * sizeof(char*));
4323
4324 a[0] = (char*) "/usr/lib/systemd/systemd";
4325 execve(a[0], a, env_use);
4326
4327 a[0] = (char*) "/lib/systemd/systemd";
4328 execve(a[0], a, env_use);
4329
4330 a[0] = (char*) "/sbin/init";
4331 execve(a[0], a, env_use);
4332 } else if (argc > optind)
4333 execvpe(argv[optind], argv + optind, env_use);
4334 else {
4335 chdir(home ? home : "/root");
4336 execle("/bin/bash", "-bash", NULL, env_use);
4337 execle("/bin/sh", "-sh", NULL, env_use);
4338 }
4339
4340 (void) log_open();
4341 return log_error_errno(errno, "execv() failed: %m");
4342 }
4343
4344 static int outer_child(
4345 Barrier *barrier,
4346 const char *directory,
4347 const char *console,
4348 const char *root_device, bool root_device_rw,
4349 const char *home_device, bool home_device_rw,
4350 const char *srv_device, bool srv_device_rw,
4351 bool interactive,
4352 bool secondary,
4353 int pid_socket,
4354 int kmsg_socket,
4355 int rtnl_socket,
4356 int uid_shift_socket,
4357 FDSet *fds,
4358 int argc,
4359 char *argv[]) {
4360
4361 pid_t pid;
4362 ssize_t l;
4363 int r;
4364
4365 assert(barrier);
4366 assert(directory);
4367 assert(console);
4368 assert(pid_socket >= 0);
4369 assert(kmsg_socket >= 0);
4370
4371 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4372 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4373
4374 if (interactive) {
4375 close_nointr(STDIN_FILENO);
4376 close_nointr(STDOUT_FILENO);
4377 close_nointr(STDERR_FILENO);
4378
4379 r = open_terminal(console, O_RDWR);
4380 if (r != STDIN_FILENO) {
4381 if (r >= 0) {
4382 safe_close(r);
4383 r = -EINVAL;
4384 }
4385
4386 return log_error_errno(r, "Failed to open console: %m");
4387 }
4388
4389 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4390 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4391 return log_error_errno(errno, "Failed to duplicate console: %m");
4392 }
4393
4394 r = reset_audit_loginuid();
4395 if (r < 0)
4396 return r;
4397
4398 /* Mark everything as slave, so that we still
4399 * receive mounts from the real root, but don't
4400 * propagate mounts to the real root. */
4401 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4402 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4403
4404 r = mount_devices(directory,
4405 root_device, root_device_rw,
4406 home_device, home_device_rw,
4407 srv_device, srv_device_rw);
4408 if (r < 0)
4409 return r;
4410
4411 r = determine_uid_shift(directory);
4412 if (r < 0)
4413 return r;
4414
4415 if (arg_userns) {
4416 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4417 if (l < 0)
4418 return log_error_errno(errno, "Failed to send UID shift: %m");
4419 if (l != sizeof(arg_uid_shift)) {
4420 log_error("Short write while sending UID shift.");
4421 return -EIO;
4422 }
4423 }
4424
4425 /* Turn directory into bind mount */
4426 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4427 return log_error_errno(errno, "Failed to make bind mount: %m");
4428
4429 r = setup_volatile(directory);
4430 if (r < 0)
4431 return r;
4432
4433 r = setup_volatile_state(directory);
4434 if (r < 0)
4435 return r;
4436
4437 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4438 if (r < 0)
4439 return r;
4440
4441 if (arg_read_only) {
4442 r = bind_remount_recursive(directory, true);
4443 if (r < 0)
4444 return log_error_errno(r, "Failed to make tree read-only: %m");
4445 }
4446
4447 r = mount_all(directory, false);
4448 if (r < 0)
4449 return r;
4450
4451 if (copy_devnodes(directory) < 0)
4452 return r;
4453
4454 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4455
4456 if (setup_pts(directory) < 0)
4457 return r;
4458
4459 r = setup_propagate(directory);
4460 if (r < 0)
4461 return r;
4462
4463 r = setup_dev_console(directory, console);
4464 if (r < 0)
4465 return r;
4466
4467 r = setup_seccomp();
4468 if (r < 0)
4469 return r;
4470
4471 r = setup_timezone(directory);
4472 if (r < 0)
4473 return r;
4474
4475 r = setup_resolv_conf(directory);
4476 if (r < 0)
4477 return r;
4478
4479 r = setup_journal(directory);
4480 if (r < 0)
4481 return r;
4482
4483 r = mount_custom(directory);
4484 if (r < 0)
4485 return r;
4486
4487 r = mount_cgroup(directory);
4488 if (r < 0)
4489 return r;
4490
4491 r = mount_move_root(directory);
4492 if (r < 0)
4493 return log_error_errno(r, "Failed to move root directory: %m");
4494
4495 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4496 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4497 (arg_private_network ? CLONE_NEWNET : 0) |
4498 (arg_userns ? CLONE_NEWUSER : 0),
4499 NULL);
4500 if (pid < 0)
4501 return log_error_errno(errno, "Failed to fork inner child: %m");
4502
4503 if (pid == 0) {
4504 pid_socket = safe_close(pid_socket);
4505 uid_shift_socket = safe_close(uid_shift_socket);
4506
4507 /* The inner child has all namespaces that are
4508 * requested, so that we all are owned by the user if
4509 * user namespaces are turned on. */
4510
4511 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4512 if (r < 0)
4513 _exit(EXIT_FAILURE);
4514
4515 _exit(EXIT_SUCCESS);
4516 }
4517
4518 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4519 if (l < 0)
4520 return log_error_errno(errno, "Failed to send PID: %m");
4521 if (l != sizeof(pid)) {
4522 log_error("Short write while sending PID.");
4523 return -EIO;
4524 }
4525
4526 pid_socket = safe_close(pid_socket);
4527
4528 return 0;
4529 }
4530
4531 static int setup_uid_map(pid_t pid) {
4532 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4533 int r;
4534
4535 assert(pid > 1);
4536
4537 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4538 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4539 r = write_string_file(uid_map, line, 0);
4540 if (r < 0)
4541 return log_error_errno(r, "Failed to write UID map: %m");
4542
4543 /* We always assign the same UID and GID ranges */
4544 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4545 r = write_string_file(uid_map, line, 0);
4546 if (r < 0)
4547 return log_error_errno(r, "Failed to write GID map: %m");
4548
4549 return 0;
4550 }
4551
4552 static int chown_cgroup(pid_t pid) {
4553 _cleanup_free_ char *path = NULL, *fs = NULL;
4554 _cleanup_close_ int fd = -1;
4555 const char *fn;
4556 int r;
4557
4558 r = cg_pid_get_path(NULL, pid, &path);
4559 if (r < 0)
4560 return log_error_errno(r, "Failed to get container cgroup path: %m");
4561
4562 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4563 if (r < 0)
4564 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4565
4566 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4567 if (fd < 0)
4568 return log_error_errno(errno, "Failed to open %s: %m", fs);
4569
4570 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4571 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4572 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4573
4574 return 0;
4575 }
4576
4577 int main(int argc, char *argv[]) {
4578
4579 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4580 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4581 _cleanup_close_ int master = -1, image_fd = -1;
4582 _cleanup_fdset_free_ FDSet *fds = NULL;
4583 int r, n_fd_passed, loop_nr = -1;
4584 char veth_name[IFNAMSIZ];
4585 bool secondary = false, remove_subvol = false;
4586 sigset_t mask_chld;
4587 pid_t pid = 0;
4588 int ret = EXIT_SUCCESS;
4589 union in_addr_union exposed = {};
4590 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4591 bool interactive;
4592
4593 log_parse_environment();
4594 log_open();
4595
4596 r = parse_argv(argc, argv);
4597 if (r <= 0)
4598 goto finish;
4599
4600 r = determine_names();
4601 if (r < 0)
4602 goto finish;
4603
4604 if (geteuid() != 0) {
4605 log_error("Need to be root.");
4606 r = -EPERM;
4607 goto finish;
4608 }
4609
4610 n_fd_passed = sd_listen_fds(false);
4611 if (n_fd_passed > 0) {
4612 r = fdset_new_listen_fds(&fds, false);
4613 if (r < 0) {
4614 log_error_errno(r, "Failed to collect file descriptors: %m");
4615 goto finish;
4616 }
4617 }
4618
4619 if (arg_directory) {
4620 assert(!arg_image);
4621
4622 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4623 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4624 r = -EINVAL;
4625 goto finish;
4626 }
4627
4628 if (arg_ephemeral) {
4629 _cleanup_free_ char *np = NULL;
4630
4631 /* If the specified path is a mount point we
4632 * generate the new snapshot immediately
4633 * inside it under a random name. However if
4634 * the specified is not a mount point we
4635 * create the new snapshot in the parent
4636 * directory, just next to it. */
4637 r = path_is_mount_point(arg_directory, 0);
4638 if (r < 0) {
4639 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4640 goto finish;
4641 }
4642 if (r > 0)
4643 r = tempfn_random_child(arg_directory, "machine.", &np);
4644 else
4645 r = tempfn_random(arg_directory, "machine.", &np);
4646 if (r < 0) {
4647 log_error_errno(r, "Failed to generate name for snapshot: %m");
4648 goto finish;
4649 }
4650
4651 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4652 if (r < 0) {
4653 log_error_errno(r, "Failed to lock %s: %m", np);
4654 goto finish;
4655 }
4656
4657 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4658 if (r < 0) {
4659 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4660 goto finish;
4661 }
4662
4663 free(arg_directory);
4664 arg_directory = np;
4665 np = NULL;
4666
4667 remove_subvol = true;
4668
4669 } else {
4670 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4671 if (r == -EBUSY) {
4672 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4673 goto finish;
4674 }
4675 if (r < 0) {
4676 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4677 return r;
4678 }
4679
4680 if (arg_template) {
4681 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4682 if (r == -EEXIST) {
4683 if (!arg_quiet)
4684 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4685 } else if (r < 0) {
4686 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4687 goto finish;
4688 } else {
4689 if (!arg_quiet)
4690 log_info("Populated %s from template %s.", arg_directory, arg_template);
4691 }
4692 }
4693 }
4694
4695 if (arg_boot) {
4696 if (path_is_os_tree(arg_directory) <= 0) {
4697 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4698 r = -EINVAL;
4699 goto finish;
4700 }
4701 } else {
4702 const char *p;
4703
4704 p = strjoina(arg_directory,
4705 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4706 if (access(p, F_OK) < 0) {
4707 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
4708 r = -EINVAL;
4709 goto finish;
4710 }
4711 }
4712
4713 } else {
4714 char template[] = "/tmp/nspawn-root-XXXXXX";
4715
4716 assert(arg_image);
4717 assert(!arg_template);
4718
4719 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4720 if (r == -EBUSY) {
4721 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4722 goto finish;
4723 }
4724 if (r < 0) {
4725 r = log_error_errno(r, "Failed to create image lock: %m");
4726 goto finish;
4727 }
4728
4729 if (!mkdtemp(template)) {
4730 log_error_errno(errno, "Failed to create temporary directory: %m");
4731 r = -errno;
4732 goto finish;
4733 }
4734
4735 arg_directory = strdup(template);
4736 if (!arg_directory) {
4737 r = log_oom();
4738 goto finish;
4739 }
4740
4741 image_fd = setup_image(&device_path, &loop_nr);
4742 if (image_fd < 0) {
4743 r = image_fd;
4744 goto finish;
4745 }
4746
4747 r = dissect_image(image_fd,
4748 &root_device, &root_device_rw,
4749 &home_device, &home_device_rw,
4750 &srv_device, &srv_device_rw,
4751 &secondary);
4752 if (r < 0)
4753 goto finish;
4754 }
4755
4756 r = custom_mounts_prepare();
4757 if (r < 0)
4758 goto finish;
4759
4760 interactive =
4761 isatty(STDIN_FILENO) > 0 &&
4762 isatty(STDOUT_FILENO) > 0;
4763
4764 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4765 if (master < 0) {
4766 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4767 goto finish;
4768 }
4769
4770 r = ptsname_malloc(master, &console);
4771 if (r < 0) {
4772 r = log_error_errno(r, "Failed to determine tty name: %m");
4773 goto finish;
4774 }
4775
4776 if (unlockpt(master) < 0) {
4777 r = log_error_errno(errno, "Failed to unlock tty: %m");
4778 goto finish;
4779 }
4780
4781 if (!arg_quiet)
4782 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4783 arg_machine, arg_image ?: arg_directory);
4784
4785 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4786
4787 assert_se(sigemptyset(&mask_chld) == 0);
4788 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4789
4790 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4791 r = log_error_errno(errno, "Failed to become subreaper: %m");
4792 goto finish;
4793 }
4794
4795 for (;;) {
4796 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4797 uid_shift_socket_pair[2] = { -1, -1 };
4798 ContainerStatus container_status;
4799 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4800 static const struct sigaction sa = {
4801 .sa_handler = nop_handler,
4802 .sa_flags = SA_NOCLDSTOP,
4803 };
4804 int ifi = 0;
4805 ssize_t l;
4806 _cleanup_event_unref_ sd_event *event = NULL;
4807 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4808 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4809 char last_char = 0;
4810
4811 r = barrier_create(&barrier);
4812 if (r < 0) {
4813 log_error_errno(r, "Cannot initialize IPC barrier: %m");
4814 goto finish;
4815 }
4816
4817 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4818 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4819 goto finish;
4820 }
4821
4822 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4823 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4824 goto finish;
4825 }
4826
4827 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4828 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4829 goto finish;
4830 }
4831
4832 if (arg_userns)
4833 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4834 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4835 goto finish;
4836 }
4837
4838 /* Child can be killed before execv(), so handle SIGCHLD
4839 * in order to interrupt parent's blocking calls and
4840 * give it a chance to call wait() and terminate. */
4841 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4842 if (r < 0) {
4843 r = log_error_errno(errno, "Failed to change the signal mask: %m");
4844 goto finish;
4845 }
4846
4847 r = sigaction(SIGCHLD, &sa, NULL);
4848 if (r < 0) {
4849 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4850 goto finish;
4851 }
4852
4853 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
4854 if (pid < 0) {
4855 if (errno == EINVAL)
4856 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4857 else
4858 r = log_error_errno(errno, "clone() failed: %m");
4859
4860 goto finish;
4861 }
4862
4863 if (pid == 0) {
4864 /* The outer child only has a file system namespace. */
4865 barrier_set_role(&barrier, BARRIER_CHILD);
4866
4867 master = safe_close(master);
4868
4869 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4870 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4871 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4872 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4873
4874 (void) reset_all_signal_handlers();
4875 (void) reset_signal_mask();
4876
4877 r = outer_child(&barrier,
4878 arg_directory,
4879 console,
4880 root_device, root_device_rw,
4881 home_device, home_device_rw,
4882 srv_device, srv_device_rw,
4883 interactive,
4884 secondary,
4885 pid_socket_pair[1],
4886 kmsg_socket_pair[1],
4887 rtnl_socket_pair[1],
4888 uid_shift_socket_pair[1],
4889 fds,
4890 argc, argv);
4891 if (r < 0)
4892 _exit(EXIT_FAILURE);
4893
4894 _exit(EXIT_SUCCESS);
4895 }
4896
4897 barrier_set_role(&barrier, BARRIER_PARENT);
4898
4899 fdset_free(fds);
4900 fds = NULL;
4901
4902 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4903 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4904 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4905
4906 /* Wait for the outer child. */
4907 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4908 if (r < 0)
4909 goto finish;
4910 if (r != 0) {
4911 r = -EIO;
4912 goto finish;
4913 }
4914 pid = 0;
4915
4916 /* And now retrieve the PID of the inner child. */
4917 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4918 if (l < 0) {
4919 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4920 goto finish;
4921 }
4922 if (l != sizeof(pid)) {
4923 log_error("Short read while reading inner child PID: %m");
4924 r = EIO;
4925 goto finish;
4926 }
4927
4928 log_debug("Init process invoked as PID " PID_FMT, pid);
4929
4930 if (arg_userns) {
4931 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4932 log_error("Child died too early.");
4933 r = -ESRCH;
4934 goto finish;
4935 }
4936
4937 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4938 if (l < 0) {
4939 r = log_error_errno(errno, "Failed to read UID shift: %m");
4940 goto finish;
4941 }
4942 if (l != sizeof(arg_uid_shift)) {
4943 log_error("Short read while reading UID shift: %m");
4944 r = EIO;
4945 goto finish;
4946 }
4947
4948 r = setup_uid_map(pid);
4949 if (r < 0)
4950 goto finish;
4951
4952 (void) barrier_place(&barrier); /* #2 */
4953 }
4954
4955 r = move_network_interfaces(pid);
4956 if (r < 0)
4957 goto finish;
4958
4959 r = setup_veth(pid, veth_name, &ifi);
4960 if (r < 0)
4961 goto finish;
4962
4963 r = setup_bridge(veth_name, &ifi);
4964 if (r < 0)
4965 goto finish;
4966
4967 r = setup_macvlan(pid);
4968 if (r < 0)
4969 goto finish;
4970
4971 r = setup_ipvlan(pid);
4972 if (r < 0)
4973 goto finish;
4974
4975 r = register_machine(pid, ifi);
4976 if (r < 0)
4977 goto finish;
4978
4979 r = chown_cgroup(pid);
4980 if (r < 0)
4981 goto finish;
4982
4983 /* Notify the child that the parent is ready with all
4984 * its setup (including cgroup-ification), and that
4985 * the child can now hand over control to the code to
4986 * run inside the container. */
4987 (void) barrier_place(&barrier); /* #3 */
4988
4989 /* Block SIGCHLD here, before notifying child.
4990 * process_pty() will handle it with the other signals. */
4991 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4992
4993 /* Reset signal to default */
4994 r = default_signals(SIGCHLD, -1);
4995 if (r < 0) {
4996 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4997 goto finish;
4998 }
4999
5000 /* Let the child know that we are ready and wait that the child is completely ready now. */
5001 if (!barrier_place_and_sync(&barrier)) { /* #5 */
5002 log_error("Client died too early.");
5003 r = -ESRCH;
5004 goto finish;
5005 }
5006
5007 sd_notifyf(false,
5008 "READY=1\n"
5009 "STATUS=Container running.\n"
5010 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
5011
5012 r = sd_event_new(&event);
5013 if (r < 0) {
5014 log_error_errno(r, "Failed to get default event source: %m");
5015 goto finish;
5016 }
5017
5018 if (arg_kill_signal > 0) {
5019 /* Try to kill the init system on SIGINT or SIGTERM */
5020 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
5021 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
5022 } else {
5023 /* Immediately exit */
5024 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5025 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5026 }
5027
5028 /* simply exit on sigchld */
5029 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
5030
5031 if (arg_expose_ports) {
5032 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
5033 if (r < 0)
5034 goto finish;
5035
5036 (void) expose_ports(rtnl, &exposed);
5037 }
5038
5039 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
5040
5041 r = pty_forward_new(event, master, true, !interactive, &forward);
5042 if (r < 0) {
5043 log_error_errno(r, "Failed to create PTY forwarder: %m");
5044 goto finish;
5045 }
5046
5047 r = sd_event_loop(event);
5048 if (r < 0) {
5049 log_error_errno(r, "Failed to run event loop: %m");
5050 goto finish;
5051 }
5052
5053 pty_forward_get_last_char(forward, &last_char);
5054
5055 forward = pty_forward_free(forward);
5056
5057 if (!arg_quiet && last_char != '\n')
5058 putc('\n', stdout);
5059
5060 /* Kill if it is not dead yet anyway */
5061 terminate_machine(pid);
5062
5063 /* Normally redundant, but better safe than sorry */
5064 kill(pid, SIGKILL);
5065
5066 r = wait_for_container(pid, &container_status);
5067 pid = 0;
5068
5069 if (r < 0)
5070 /* We failed to wait for the container, or the
5071 * container exited abnormally */
5072 goto finish;
5073 else if (r > 0 || container_status == CONTAINER_TERMINATED){
5074 /* The container exited with a non-zero
5075 * status, or with zero status and no reboot
5076 * was requested. */
5077 ret = r;
5078 break;
5079 }
5080
5081 /* CONTAINER_REBOOTED, loop again */
5082
5083 if (arg_keep_unit) {
5084 /* Special handling if we are running as a
5085 * service: instead of simply restarting the
5086 * machine we want to restart the entire
5087 * service, so let's inform systemd about this
5088 * with the special exit code 133. The service
5089 * file uses RestartForceExitStatus=133 so
5090 * that this results in a full nspawn
5091 * restart. This is necessary since we might
5092 * have cgroup parameters set we want to have
5093 * flushed out. */
5094 ret = 133;
5095 r = 0;
5096 break;
5097 }
5098
5099 flush_ports(&exposed);
5100 }
5101
5102 finish:
5103 sd_notify(false,
5104 "STOPPING=1\n"
5105 "STATUS=Terminating...");
5106
5107 if (pid > 0)
5108 kill(pid, SIGKILL);
5109
5110 /* Try to flush whatever is still queued in the pty */
5111 if (master >= 0)
5112 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5113
5114 loop_remove(loop_nr, &image_fd);
5115
5116 if (remove_subvol && arg_directory) {
5117 int k;
5118
5119 k = btrfs_subvol_remove(arg_directory, true);
5120 if (k < 0)
5121 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5122 }
5123
5124 if (arg_machine) {
5125 const char *p;
5126
5127 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5128 (void) rm_rf(p, REMOVE_ROOT);
5129 }
5130
5131 free(arg_directory);
5132 free(arg_template);
5133 free(arg_image);
5134 free(arg_machine);
5135 free(arg_user);
5136 strv_free(arg_setenv);
5137 strv_free(arg_network_interfaces);
5138 strv_free(arg_network_macvlan);
5139 strv_free(arg_network_ipvlan);
5140 custom_mount_free_all();
5141
5142 flush_ports(&exposed);
5143
5144 while (arg_expose_ports) {
5145 ExposePort *p = arg_expose_ports;
5146 LIST_REMOVE(ports, arg_expose_ports, p);
5147 free(p);
5148 }
5149
5150 return r < 0 ? EXIT_FAILURE : ret;
5151 }