]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #953 from poettering/ebadf
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
/* One port forwarding requested with -p/--port=. Entries are chained into the
 * arg_expose_ports list through the "ports" list fields. */
105 typedef struct ExposePort {
/* IPPROTO_TCP or IPPROTO_UDP, as selected by the optional "tcp:"/"udp:" prefix */
106 int protocol;
/* Port on the host side; parse_argv() rejects 0 */
107 uint16_t host_port;
/* Port inside the container; parse_argv() rejects 0 */
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* NOTE(review): not referenced in this part of the file; presumably describes
 * how a container run ended (plain exit vs. reboot request) -- confirm against
 * the code that consumes it. */
112 typedef enum ContainerStatus {
113 CONTAINER_TERMINATED,
114 CONTAINER_REBOOTED
115 } ContainerStatus;
116
/* Journal linking mode selected with --link-journal= (or -j). The "try-guest"
 * and "try-host" spellings map to LINK_GUEST/LINK_HOST with
 * arg_link_journal_try set in addition. */
117 typedef enum LinkJournal {
118 LINK_NO,
119 LINK_AUTO,
120 LINK_HOST,
121 LINK_GUEST
122 } LinkJournal;
123
/* Mode selected with --volatile[=MODE]: a bare --volatile or a true boolean
 * yields VOLATILE_YES, a false boolean VOLATILE_NO, and "state"
 * VOLATILE_STATE. */
124 typedef enum Volatile {
125 VOLATILE_NO,
126 VOLATILE_YES,
127 VOLATILE_STATE,
128 } Volatile;
129
/* Kind of a user-requested mount (--bind=/--bind-ro=, --tmpfs=,
 * --overlay=/--overlay-ro=). The numeric order matters: custom_mount_compare()
 * uses it as tie-breaker for mounts that share a destination. */
130 typedef enum CustomMountType {
131 CUSTOM_MOUNT_BIND,
132 CUSTOM_MOUNT_TMPFS,
133 CUSTOM_MOUNT_OVERLAY,
134 } CustomMountType;
135
/* One user-requested mount, stored in the arg_custom_mounts array. All string
 * members are heap-allocated and released by custom_mount_free_all(). */
136 typedef struct CustomMount {
137 CustomMountType type;
/* Set for --bind-ro= and --overlay-ro= */
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
/* Absolute path of the mount point, relative to the container root */
140 char *destination;
/* Mount options; filled in for tmpfs mounts */
141 char *options;
/* Overlay work directory; generated by custom_mounts_prepare() for writable
 * overlays and removed from disk again by custom_mount_free_all() */
142 char *work_dir;
/* NULL-terminated list of overlay lower directories */
143 char **lower;
144 } CustomMount;
145
146 static char *arg_directory = NULL;
147 static char *arg_template = NULL;
148 static char *arg_user = NULL;
149 static sd_id128_t arg_uuid = {};
150 static char *arg_machine = NULL;
151 static const char *arg_selinux_context = NULL;
152 static const char *arg_selinux_apifs_context = NULL;
153 static const char *arg_slice = NULL;
154 static bool arg_private_network = false;
155 static bool arg_read_only = false;
156 static bool arg_boot = false;
157 static bool arg_ephemeral = false;
158 static LinkJournal arg_link_journal = LINK_AUTO;
159 static bool arg_link_journal_try = false;
160 static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
182 (1ULL << CAP_SYS_RESOURCE) |
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
187 static CustomMount *arg_custom_mounts = NULL;
188 static unsigned arg_n_custom_mounts = 0;
189 static char **arg_setenv = NULL;
190 static bool arg_quiet = false;
191 static bool arg_share_system = false;
192 static bool arg_register = true;
193 static bool arg_keep_unit = false;
194 static char **arg_network_interfaces = NULL;
195 static char **arg_network_macvlan = NULL;
196 static char **arg_network_ipvlan = NULL;
197 static bool arg_network_veth = false;
198 static const char *arg_network_bridge = NULL;
199 static unsigned long arg_personality = PERSONALITY_INVALID;
200 static char *arg_image = NULL;
201 static Volatile arg_volatile = VOLATILE_NO;
202 static ExposePort *arg_expose_ports = NULL;
203 static char **arg_property = NULL;
204 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205 static bool arg_userns = false;
206 static int arg_kill_signal = 0;
207
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
239 " and container\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
261 " the container\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
266 " the container\n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name);
276 }
277
278 static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292 }
293
294 static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 arg_custom_mounts = mfree(arg_custom_mounts);
313 arg_n_custom_mounts = 0;
314 }
315
316 static int custom_mount_compare(const void *a, const void *b) {
317 const CustomMount *x = a, *y = b;
318 int r;
319
320 r = path_compare(x->destination, y->destination);
321 if (r != 0)
322 return r;
323
324 if (x->type < y->type)
325 return -1;
326 if (x->type > y->type)
327 return 1;
328
329 return 0;
330 }
331
332 static int custom_mounts_prepare(void) {
333 unsigned i;
334 int r;
335
336 /* Ensure the mounts are applied prefix first. */
337 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
338
339 /* Allocate working directories for the overlay file systems that need it */
340 for (i = 0; i < arg_n_custom_mounts; i++) {
341 CustomMount *m = &arg_custom_mounts[i];
342
343 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
344 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
345 return -EINVAL;
346 }
347
348 if (m->type != CUSTOM_MOUNT_OVERLAY)
349 continue;
350
351 if (m->work_dir)
352 continue;
353
354 if (m->read_only)
355 continue;
356
357 r = tempfn_random(m->source, NULL, &m->work_dir);
358 if (r < 0)
359 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
360 }
361
362 return 0;
363 }
364
/* Replaces *b with a cleaned-up version of path.
 *
 * The path is canonicalized if it exists; if it does not exist (ENOENT) it is
 * merely made absolute relative to the current working directory. The result
 * is passed through path_kill_slashes() before being stored, and the previous
 * value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM if making the path absolute failed, or the
 * negated errno of canonicalize_file_name() for any failure other than
 * ENOENT. *b is left untouched on failure. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Not there (yet): fall back to a purely lexical absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
385
386 static int parse_argv(int argc, char *argv[]) {
387
388 enum {
389 ARG_VERSION = 0x100,
390 ARG_PRIVATE_NETWORK,
391 ARG_UUID,
392 ARG_READ_ONLY,
393 ARG_CAPABILITY,
394 ARG_DROP_CAPABILITY,
395 ARG_LINK_JOURNAL,
396 ARG_BIND,
397 ARG_BIND_RO,
398 ARG_TMPFS,
399 ARG_OVERLAY,
400 ARG_OVERLAY_RO,
401 ARG_SETENV,
402 ARG_SHARE_SYSTEM,
403 ARG_REGISTER,
404 ARG_KEEP_UNIT,
405 ARG_NETWORK_INTERFACE,
406 ARG_NETWORK_MACVLAN,
407 ARG_NETWORK_IPVLAN,
408 ARG_NETWORK_BRIDGE,
409 ARG_PERSONALITY,
410 ARG_VOLATILE,
411 ARG_TEMPLATE,
412 ARG_PROPERTY,
413 ARG_PRIVATE_USERS,
414 ARG_KILL_SIGNAL,
415 };
416
417 static const struct option options[] = {
418 { "help", no_argument, NULL, 'h' },
419 { "version", no_argument, NULL, ARG_VERSION },
420 { "directory", required_argument, NULL, 'D' },
421 { "template", required_argument, NULL, ARG_TEMPLATE },
422 { "ephemeral", no_argument, NULL, 'x' },
423 { "user", required_argument, NULL, 'u' },
424 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
425 { "boot", no_argument, NULL, 'b' },
426 { "uuid", required_argument, NULL, ARG_UUID },
427 { "read-only", no_argument, NULL, ARG_READ_ONLY },
428 { "capability", required_argument, NULL, ARG_CAPABILITY },
429 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
430 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
431 { "bind", required_argument, NULL, ARG_BIND },
432 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
433 { "tmpfs", required_argument, NULL, ARG_TMPFS },
434 { "overlay", required_argument, NULL, ARG_OVERLAY },
435 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
436 { "machine", required_argument, NULL, 'M' },
437 { "slice", required_argument, NULL, 'S' },
438 { "setenv", required_argument, NULL, ARG_SETENV },
439 { "selinux-context", required_argument, NULL, 'Z' },
440 { "selinux-apifs-context", required_argument, NULL, 'L' },
441 { "quiet", no_argument, NULL, 'q' },
442 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
443 { "register", required_argument, NULL, ARG_REGISTER },
444 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
445 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
446 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
447 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
448 { "network-veth", no_argument, NULL, 'n' },
449 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
450 { "personality", required_argument, NULL, ARG_PERSONALITY },
451 { "image", required_argument, NULL, 'i' },
452 { "volatile", optional_argument, NULL, ARG_VOLATILE },
453 { "port", required_argument, NULL, 'p' },
454 { "property", required_argument, NULL, ARG_PROPERTY },
455 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
456 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
457 {}
458 };
459
460 int c, r;
461 uint64_t plus = 0, minus = 0;
462
463 assert(argc >= 0);
464 assert(argv);
465
466 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
467
468 switch (c) {
469
470 case 'h':
471 help();
472 return 0;
473
474 case ARG_VERSION:
475 puts(PACKAGE_STRING);
476 puts(SYSTEMD_FEATURES);
477 return 0;
478
479 case 'D':
480 r = set_sanitized_path(&arg_directory, optarg);
481 if (r < 0)
482 return log_error_errno(r, "Invalid root directory: %m");
483
484 break;
485
486 case ARG_TEMPLATE:
487 r = set_sanitized_path(&arg_template, optarg);
488 if (r < 0)
489 return log_error_errno(r, "Invalid template directory: %m");
490
491 break;
492
493 case 'i':
494 r = set_sanitized_path(&arg_image, optarg);
495 if (r < 0)
496 return log_error_errno(r, "Invalid image path: %m");
497
498 break;
499
500 case 'x':
501 arg_ephemeral = true;
502 break;
503
504 case 'u':
505 r = free_and_strdup(&arg_user, optarg);
506 if (r < 0)
507 return log_oom();
508
509 break;
510
511 case ARG_NETWORK_BRIDGE:
512 arg_network_bridge = optarg;
513
514 /* fall through */
515
516 case 'n':
517 arg_network_veth = true;
518 arg_private_network = true;
519 break;
520
521 case ARG_NETWORK_INTERFACE:
522 if (strv_extend(&arg_network_interfaces, optarg) < 0)
523 return log_oom();
524
525 arg_private_network = true;
526 break;
527
528 case ARG_NETWORK_MACVLAN:
529 if (strv_extend(&arg_network_macvlan, optarg) < 0)
530 return log_oom();
531
532 arg_private_network = true;
533 break;
534
535 case ARG_NETWORK_IPVLAN:
536 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
537 return log_oom();
538
539 /* fall through */
540
541 case ARG_PRIVATE_NETWORK:
542 arg_private_network = true;
543 break;
544
545 case 'b':
546 arg_boot = true;
547 break;
548
549 case ARG_UUID:
550 r = sd_id128_from_string(optarg, &arg_uuid);
551 if (r < 0) {
552 log_error("Invalid UUID: %s", optarg);
553 return r;
554 }
555 break;
556
557 case 'S':
558 arg_slice = optarg;
559 break;
560
561 case 'M':
562 if (isempty(optarg)) {
563 arg_machine = mfree(arg_machine);
564 } else {
565 if (!machine_name_is_valid(optarg)) {
566 log_error("Invalid machine name: %s", optarg);
567 return -EINVAL;
568 }
569
570 r = free_and_strdup(&arg_machine, optarg);
571 if (r < 0)
572 return log_oom();
573
574 break;
575 }
576
577 case 'Z':
578 arg_selinux_context = optarg;
579 break;
580
581 case 'L':
582 arg_selinux_apifs_context = optarg;
583 break;
584
585 case ARG_READ_ONLY:
586 arg_read_only = true;
587 break;
588
589 case ARG_CAPABILITY:
590 case ARG_DROP_CAPABILITY: {
591 const char *state, *word;
592 size_t length;
593
594 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
595 _cleanup_free_ char *t;
596
597 t = strndup(word, length);
598 if (!t)
599 return log_oom();
600
601 if (streq(t, "all")) {
602 if (c == ARG_CAPABILITY)
603 plus = (uint64_t) -1;
604 else
605 minus = (uint64_t) -1;
606 } else {
607 int cap;
608
609 cap = capability_from_name(t);
610 if (cap < 0) {
611 log_error("Failed to parse capability %s.", t);
612 return -EINVAL;
613 }
614
615 if (c == ARG_CAPABILITY)
616 plus |= 1ULL << (uint64_t) cap;
617 else
618 minus |= 1ULL << (uint64_t) cap;
619 }
620 }
621
622 break;
623 }
624
625 case 'j':
626 arg_link_journal = LINK_GUEST;
627 arg_link_journal_try = true;
628 break;
629
630 case ARG_LINK_JOURNAL:
631 if (streq(optarg, "auto")) {
632 arg_link_journal = LINK_AUTO;
633 arg_link_journal_try = false;
634 } else if (streq(optarg, "no")) {
635 arg_link_journal = LINK_NO;
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "guest")) {
638 arg_link_journal = LINK_GUEST;
639 arg_link_journal_try = false;
640 } else if (streq(optarg, "host")) {
641 arg_link_journal = LINK_HOST;
642 arg_link_journal_try = false;
643 } else if (streq(optarg, "try-guest")) {
644 arg_link_journal = LINK_GUEST;
645 arg_link_journal_try = true;
646 } else if (streq(optarg, "try-host")) {
647 arg_link_journal = LINK_HOST;
648 arg_link_journal_try = true;
649 } else {
650 log_error("Failed to parse link journal mode %s", optarg);
651 return -EINVAL;
652 }
653
654 break;
655
656 case ARG_BIND:
657 case ARG_BIND_RO: {
658 const char *current = optarg;
659 _cleanup_free_ char *source = NULL, *destination = NULL;
660 CustomMount *m;
661 _cleanup_strv_free_ char **strv = NULL;
662
663 r = extract_many_words(&current, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
664 switch (r) {
665 case 1:
666 destination = strdup(source);
667 case 2:
668 break;
669 case -ENOMEM:
670 return log_oom();
671 default:
672 log_error("Invalid bind mount specification: %s", optarg);
673 return -EINVAL;
674 }
675
676 if (!source || !destination)
677 return log_oom();
678
679 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
680 log_error("Invalid bind mount specification: %s", optarg);
681 return -EINVAL;
682 }
683
684 m = custom_mount_add(CUSTOM_MOUNT_BIND);
685 if (!m)
686 return log_oom();
687
688 m->source = source;
689 m->destination = destination;
690 m->read_only = c == ARG_BIND_RO;
691
692 source = destination = NULL;
693
694 break;
695 }
696
697 case ARG_TMPFS: {
698 const char *current = optarg;
699 _cleanup_free_ char *path = NULL, *opts = NULL;
700 CustomMount *m;
701
702 r = extract_first_word(&current, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
703 if (r == -ENOMEM)
704 return log_oom();
705 else if (r < 0) {
706 log_error("Invalid tmpfs specification: %s", optarg);
707 return r;
708 }
709 if (r)
710 opts = strdup(current);
711 else
712 opts = strdup("mode=0755");
713
714 if (!path || !opts)
715 return log_oom();
716
717 if (!path_is_absolute(path)) {
718 log_error("Invalid tmpfs specification: %s", optarg);
719 return -EINVAL;
720 }
721
722 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
723 if (!m)
724 return log_oom();
725
726 m->destination = path;
727 m->options = opts;
728
729 path = opts = NULL;
730
731 break;
732 }
733
734 case ARG_OVERLAY:
735 case ARG_OVERLAY_RO: {
736 _cleanup_free_ char *upper = NULL, *destination = NULL;
737 _cleanup_strv_free_ char **lower = NULL;
738 CustomMount *m;
739 unsigned n = 0;
740 char **i;
741
742 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
743 if (r == -ENOMEM)
744 return log_oom();
745 else if (r < 0) {
746 log_error("Invalid overlay specification: %s", optarg);
747 return r;
748 }
749
750 STRV_FOREACH(i, lower) {
751 if (!path_is_absolute(*i)) {
752 log_error("Overlay path %s is not absolute.", *i);
753 return -EINVAL;
754 }
755
756 n++;
757 }
758
759 if (n < 2) {
760 log_error("--overlay= needs at least two colon-separated directories specified.");
761 return -EINVAL;
762 }
763
764 if (n == 2) {
765 /* If two parameters are specified,
766 * the first one is the lower, the
767 * second one the upper directory. And
768 * we'll also define the destination
769 * mount point the same as the upper. */
770 upper = lower[1];
771 lower[1] = NULL;
772
773 destination = strdup(upper);
774 if (!destination)
775 return log_oom();
776
777 } else {
778 upper = lower[n - 2];
779 destination = lower[n - 1];
780 lower[n - 2] = NULL;
781 }
782
783 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
784 if (!m)
785 return log_oom();
786
787 m->destination = destination;
788 m->source = upper;
789 m->lower = lower;
790 m->read_only = c == ARG_OVERLAY_RO;
791
792 upper = destination = NULL;
793 lower = NULL;
794
795 break;
796 }
797
798 case ARG_SETENV: {
799 char **n;
800
801 if (!env_assignment_is_valid(optarg)) {
802 log_error("Environment variable assignment '%s' is not valid.", optarg);
803 return -EINVAL;
804 }
805
806 n = strv_env_set(arg_setenv, optarg);
807 if (!n)
808 return log_oom();
809
810 strv_free(arg_setenv);
811 arg_setenv = n;
812 break;
813 }
814
815 case 'q':
816 arg_quiet = true;
817 break;
818
819 case ARG_SHARE_SYSTEM:
820 arg_share_system = true;
821 break;
822
823 case ARG_REGISTER:
824 r = parse_boolean(optarg);
825 if (r < 0) {
826 log_error("Failed to parse --register= argument: %s", optarg);
827 return r;
828 }
829
830 arg_register = r;
831 break;
832
833 case ARG_KEEP_UNIT:
834 arg_keep_unit = true;
835 break;
836
837 case ARG_PERSONALITY:
838
839 arg_personality = personality_from_string(optarg);
840 if (arg_personality == PERSONALITY_INVALID) {
841 log_error("Unknown or unsupported personality '%s'.", optarg);
842 return -EINVAL;
843 }
844
845 break;
846
847 case ARG_VOLATILE:
848
849 if (!optarg)
850 arg_volatile = VOLATILE_YES;
851 else {
852 r = parse_boolean(optarg);
853 if (r < 0) {
854 if (streq(optarg, "state"))
855 arg_volatile = VOLATILE_STATE;
856 else {
857 log_error("Failed to parse --volatile= argument: %s", optarg);
858 return r;
859 }
860 } else
861 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
862 }
863
864 break;
865
866 case 'p': {
867 const char *split, *e;
868 uint16_t container_port, host_port;
869 int protocol;
870 ExposePort *p;
871
872 if ((e = startswith(optarg, "tcp:")))
873 protocol = IPPROTO_TCP;
874 else if ((e = startswith(optarg, "udp:")))
875 protocol = IPPROTO_UDP;
876 else {
877 e = optarg;
878 protocol = IPPROTO_TCP;
879 }
880
881 split = strchr(e, ':');
882 if (split) {
883 char v[split - e + 1];
884
885 memcpy(v, e, split - e);
886 v[split - e] = 0;
887
888 r = safe_atou16(v, &host_port);
889 if (r < 0 || host_port <= 0) {
890 log_error("Failed to parse host port: %s", optarg);
891 return -EINVAL;
892 }
893
894 r = safe_atou16(split + 1, &container_port);
895 } else {
896 r = safe_atou16(e, &container_port);
897 host_port = container_port;
898 }
899
900 if (r < 0 || container_port <= 0) {
901 log_error("Failed to parse host port: %s", optarg);
902 return -EINVAL;
903 }
904
905 LIST_FOREACH(ports, p, arg_expose_ports) {
906 if (p->protocol == protocol && p->host_port == host_port) {
907 log_error("Duplicate port specification: %s", optarg);
908 return -EINVAL;
909 }
910 }
911
912 p = new(ExposePort, 1);
913 if (!p)
914 return log_oom();
915
916 p->protocol = protocol;
917 p->host_port = host_port;
918 p->container_port = container_port;
919
920 LIST_PREPEND(ports, arg_expose_ports, p);
921
922 break;
923 }
924
925 case ARG_PROPERTY:
926 if (strv_extend(&arg_property, optarg) < 0)
927 return log_oom();
928
929 break;
930
931 case ARG_PRIVATE_USERS:
932 if (optarg) {
933 _cleanup_free_ char *buffer = NULL;
934 const char *range, *shift;
935
936 range = strchr(optarg, ':');
937 if (range) {
938 buffer = strndup(optarg, range - optarg);
939 if (!buffer)
940 return log_oom();
941 shift = buffer;
942
943 range++;
944 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
945 log_error("Failed to parse UID range: %s", range);
946 return -EINVAL;
947 }
948 } else
949 shift = optarg;
950
951 if (parse_uid(shift, &arg_uid_shift) < 0) {
952 log_error("Failed to parse UID: %s", optarg);
953 return -EINVAL;
954 }
955 }
956
957 arg_userns = true;
958 break;
959
960 case ARG_KILL_SIGNAL:
961 arg_kill_signal = signal_from_string_try_harder(optarg);
962 if (arg_kill_signal < 0) {
963 log_error("Cannot parse signal: %s", optarg);
964 return -EINVAL;
965 }
966
967 break;
968
969 case '?':
970 return -EINVAL;
971
972 default:
973 assert_not_reached("Unhandled option");
974 }
975
976 if (arg_share_system)
977 arg_register = false;
978
979 if (arg_boot && arg_share_system) {
980 log_error("--boot and --share-system may not be combined.");
981 return -EINVAL;
982 }
983
984 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
985 log_error("--keep-unit may not be used when invoked from a user session.");
986 return -EINVAL;
987 }
988
989 if (arg_directory && arg_image) {
990 log_error("--directory= and --image= may not be combined.");
991 return -EINVAL;
992 }
993
994 if (arg_template && arg_image) {
995 log_error("--template= and --image= may not be combined.");
996 return -EINVAL;
997 }
998
999 if (arg_template && !(arg_directory || arg_machine)) {
1000 log_error("--template= needs --directory= or --machine=.");
1001 return -EINVAL;
1002 }
1003
1004 if (arg_ephemeral && arg_template) {
1005 log_error("--ephemeral and --template= may not be combined.");
1006 return -EINVAL;
1007 }
1008
1009 if (arg_ephemeral && arg_image) {
1010 log_error("--ephemeral and --image= may not be combined.");
1011 return -EINVAL;
1012 }
1013
1014 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1015 log_error("--ephemeral and --link-journal= may not be combined.");
1016 return -EINVAL;
1017 }
1018
1019 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1020 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1021 return -EINVAL;
1022 }
1023
1024 if (arg_expose_ports && !arg_private_network) {
1025 log_error("Cannot use --port= without private networking.");
1026 return -EINVAL;
1027 }
1028
1029 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1030 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1031
1032 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1033
1034 if (arg_boot && arg_kill_signal <= 0)
1035 arg_kill_signal = SIGRTMIN+3;
1036
1037 return 1;
1038 }
1039
1040 static int tmpfs_patch_options(const char *options, char **ret) {
1041 char *buf = NULL;
1042
1043 if (arg_userns && arg_uid_shift != 0) {
1044 assert(arg_uid_shift != UID_INVALID);
1045
1046 if (options)
1047 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1048 else
1049 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1050 if (!buf)
1051 return -ENOMEM;
1052
1053 options = buf;
1054 }
1055
1056 #ifdef HAVE_SELINUX
1057 if (arg_selinux_apifs_context) {
1058 char *t;
1059
1060 if (options)
1061 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1062 else
1063 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1064 if (!t) {
1065 free(buf);
1066 return -ENOMEM;
1067 }
1068
1069 free(buf);
1070 buf = t;
1071 }
1072 #endif
1073
1074 *ret = buf;
1075 return !!buf;
1076 }
1077
1078 static int mount_all(const char *dest, bool userns) {
1079
1080 typedef struct MountPoint {
1081 const char *what;
1082 const char *where;
1083 const char *type;
1084 const char *options;
1085 unsigned long flags;
1086 bool fatal;
1087 bool userns;
1088 } MountPoint;
1089
1090 static const MountPoint mount_table[] = {
1091 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1092 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1093 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1094 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1095 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1096 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1097 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1098 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1099 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1100 #ifdef HAVE_SELINUX
1101 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1102 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
1103 #endif
1104 };
1105
1106 unsigned k;
1107 int r;
1108
1109 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1110 _cleanup_free_ char *where = NULL, *options = NULL;
1111 const char *o;
1112
1113 if (userns != mount_table[k].userns)
1114 continue;
1115
1116 where = prefix_root(dest, mount_table[k].where);
1117 if (!where)
1118 return log_oom();
1119
1120 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1121 if (r < 0 && r != -ENOENT)
1122 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1123
1124 /* Skip this entry if it is not a remount. */
1125 if (mount_table[k].what && r > 0)
1126 continue;
1127
1128 r = mkdir_p(where, 0755);
1129 if (r < 0) {
1130 if (mount_table[k].fatal)
1131 return log_error_errno(r, "Failed to create directory %s: %m", where);
1132
1133 log_warning_errno(r, "Failed to create directory %s: %m", where);
1134 continue;
1135 }
1136
1137 o = mount_table[k].options;
1138 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1139 r = tmpfs_patch_options(o, &options);
1140 if (r < 0)
1141 return log_oom();
1142 if (r > 0)
1143 o = options;
1144 }
1145
1146 if (mount(mount_table[k].what,
1147 where,
1148 mount_table[k].type,
1149 mount_table[k].flags,
1150 o) < 0) {
1151
1152 if (mount_table[k].fatal)
1153 return log_error_errno(errno, "mount(%s) failed: %m", where);
1154
1155 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1156 }
1157 }
1158
1159 return 0;
1160 }
1161
1162 static int mount_bind(const char *dest, CustomMount *m) {
1163 struct stat source_st, dest_st;
1164 const char *where;
1165 int r;
1166
1167 assert(m);
1168
1169 if (stat(m->source, &source_st) < 0)
1170 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1171
1172 where = prefix_roota(dest, m->destination);
1173
1174 if (stat(where, &dest_st) >= 0) {
1175 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1176 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1177 return -EINVAL;
1178 }
1179
1180 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1181 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1182 return -EINVAL;
1183 }
1184
1185 } else if (errno == ENOENT) {
1186 r = mkdir_parents_label(where, 0755);
1187 if (r < 0)
1188 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1189 } else {
1190 log_error_errno(errno, "Failed to stat %s: %m", where);
1191 return -errno;
1192 }
1193
1194 /* Create the mount point. Any non-directory file can be
1195 * mounted on any non-directory file (regular, fifo, socket,
1196 * char, block).
1197 */
1198 if (S_ISDIR(source_st.st_mode))
1199 r = mkdir_label(where, 0755);
1200 else
1201 r = touch(where);
1202 if (r < 0 && r != -EEXIST)
1203 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1204
1205 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1206 return log_error_errno(errno, "mount(%s) failed: %m", where);
1207
1208 if (m->read_only) {
1209 r = bind_remount_recursive(where, true);
1210 if (r < 0)
1211 return log_error_errno(r, "Read-only bind mount failed: %m");
1212 }
1213
1214 return 0;
1215 }
1216
1217 static int mount_tmpfs(const char *dest, CustomMount *m) {
1218 const char *where, *options;
1219 _cleanup_free_ char *buf = NULL;
1220 int r;
1221
1222 assert(dest);
1223 assert(m);
1224
1225 where = prefix_roota(dest, m->destination);
1226
1227 r = mkdir_p_label(where, 0755);
1228 if (r < 0 && r != -EEXIST)
1229 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1230
1231 r = tmpfs_patch_options(m->options, &buf);
1232 if (r < 0)
1233 return log_oom();
1234 options = r > 0 ? buf : m->options;
1235
1236 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1237 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1238
1239 return 0;
1240 }
1241
1242 static char *joined_and_escaped_lower_dirs(char * const *lower) {
1243 _cleanup_strv_free_ char **sv = NULL;
1244
1245 sv = strv_copy(lower);
1246 if (!sv)
1247 return NULL;
1248
1249 strv_reverse(sv);
1250
1251 if (!strv_shell_escape(sv, ",:"))
1252 return NULL;
1253
1254 return strv_join(sv, ":");
1255 }
1256
1257 static int mount_overlay(const char *dest, CustomMount *m) {
1258 _cleanup_free_ char *lower = NULL;
1259 const char *where, *options;
1260 int r;
1261
1262 assert(dest);
1263 assert(m);
1264
1265 where = prefix_roota(dest, m->destination);
1266
1267 r = mkdir_label(where, 0755);
1268 if (r < 0 && r != -EEXIST)
1269 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1270
1271 (void) mkdir_p_label(m->source, 0755);
1272
1273 lower = joined_and_escaped_lower_dirs(m->lower);
1274 if (!lower)
1275 return log_oom();
1276
1277 if (m->read_only) {
1278 _cleanup_free_ char *escaped_source = NULL;
1279
1280 escaped_source = shell_escape(m->source, ",:");
1281 if (!escaped_source)
1282 return log_oom();
1283
1284 options = strjoina("lowerdir=", escaped_source, ":", lower);
1285 } else {
1286 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
1287
1288 assert(m->work_dir);
1289 (void) mkdir_label(m->work_dir, 0700);
1290
1291 escaped_source = shell_escape(m->source, ",:");
1292 if (!escaped_source)
1293 return log_oom();
1294 escaped_work_dir = shell_escape(m->work_dir, ",:");
1295 if (!escaped_work_dir)
1296 return log_oom();
1297
1298 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
1299 }
1300
1301 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1302 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1303
1304 return 0;
1305 }
1306
1307 static int mount_custom(const char *dest) {
1308 unsigned i;
1309 int r;
1310
1311 assert(dest);
1312
1313 for (i = 0; i < arg_n_custom_mounts; i++) {
1314 CustomMount *m = &arg_custom_mounts[i];
1315
1316 switch (m->type) {
1317
1318 case CUSTOM_MOUNT_BIND:
1319 r = mount_bind(dest, m);
1320 break;
1321
1322 case CUSTOM_MOUNT_TMPFS:
1323 r = mount_tmpfs(dest, m);
1324 break;
1325
1326 case CUSTOM_MOUNT_OVERLAY:
1327 r = mount_overlay(dest, m);
1328 break;
1329
1330 default:
1331 assert_not_reached("Unknown custom mount type");
1332 }
1333
1334 if (r < 0)
1335 return r;
1336 }
1337
1338 return 0;
1339 }
1340
/* Mounts the cgroup hierarchy selected by the mount option string
 * "controller" on <dest>/sys/fs/cgroup/<hierarchy>, unless that path is
 * a mount point already.
 *
 * Returns 0 if the path was a mount point already, 1 if the hierarchy
 * was freshly mounted, negative errno-style value on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        char *target;
        int r;

        target = strjoina(dest, "/sys/fs/cgroup/", hierarchy);

        r = path_is_mount_point(target, 0);
        if (r > 0)
                return 0;
        if (r < 0 && r != -ENOENT)
                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", target);

        /* Result intentionally not checked; a failure surfaces in the
         * mount() call right below. */
        (void) mkdir_p(target, 0755);

        /* The superblock mount options of the mount point need to be
         * identical to the hosts', and hence writable... */
        if (mount("cgroup", target, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", target);

        if (!read_only)
                return 1;

        /* ... hence let's only make the bind mount read-only, not the
         * superblock. */
        if (mount(NULL, target, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
                return log_error_errno(errno, "Failed to remount %s read-only: %m", target);

        return 1;
}
1368
1369 static int mount_cgroup(const char *dest) {
1370 _cleanup_set_free_free_ Set *controllers = NULL;
1371 const char *cgroup_root;
1372 int r;
1373
1374 controllers = set_new(&string_hash_ops);
1375 if (!controllers)
1376 return log_oom();
1377
1378 r = cg_kernel_controllers(controllers);
1379 if (r < 0)
1380 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1381
1382 for (;;) {
1383 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1384
1385 controller = set_steal_first(controllers);
1386 if (!controller)
1387 break;
1388
1389 origin = prefix_root("/sys/fs/cgroup/", controller);
1390 if (!origin)
1391 return log_oom();
1392
1393 r = readlink_malloc(origin, &combined);
1394 if (r == -EINVAL) {
1395 /* Not a symbolic link, but directly a single cgroup hierarchy */
1396
1397 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1398 if (r < 0)
1399 return r;
1400
1401 } else if (r < 0)
1402 return log_error_errno(r, "Failed to read link %s: %m", origin);
1403 else {
1404 _cleanup_free_ char *target = NULL;
1405
1406 target = prefix_root(dest, origin);
1407 if (!target)
1408 return log_oom();
1409
1410 /* A symbolic link, a combination of controllers in one hierarchy */
1411
1412 if (!filename_is_valid(combined)) {
1413 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1414 continue;
1415 }
1416
1417 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1418 if (r < 0)
1419 return r;
1420
1421 r = symlink_idempotent(combined, target);
1422 if (r == -EINVAL) {
1423 log_error("Invalid existing symlink for combined hierarchy");
1424 return r;
1425 }
1426 if (r < 0)
1427 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1428 }
1429 }
1430
1431 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1432 if (r < 0)
1433 return r;
1434
1435 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1436 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1437 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1438
1439 return 0;
1440 }
1441
1442 static int mount_systemd_cgroup_writable(const char *dest) {
1443 _cleanup_free_ char *own_cgroup_path = NULL;
1444 const char *systemd_root, *systemd_own;
1445 int r;
1446
1447 assert(dest);
1448
1449 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1450 if (r < 0)
1451 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1452
1453 /* Make our own cgroup a (writable) bind mount */
1454 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1455 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1456 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1457
1458 /* And then remount the systemd cgroup root read-only */
1459 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1460 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1461 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1462
1463 return 0;
1464 }
1465
1466 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1467 assert(p);
1468
1469 if (!arg_userns)
1470 return 0;
1471
1472 if (uid == UID_INVALID && gid == GID_INVALID)
1473 return 0;
1474
1475 if (uid != UID_INVALID) {
1476 uid += arg_uid_shift;
1477
1478 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1479 return -EOVERFLOW;
1480 }
1481
1482 if (gid != GID_INVALID) {
1483 gid += (gid_t) arg_uid_shift;
1484
1485 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1486 return -EOVERFLOW;
1487 }
1488
1489 if (lchown(p, uid, gid) < 0)
1490 return -errno;
1491
1492 return 0;
1493 }
1494
/* Creates the directory "path" below "root" with the given mode and, if
 * it was newly created, hands it to userns_lchown() for the ownership
 * fix-up. An already existing path is left untouched and reported as
 * success.
 *
 * Returns 0 on success, negative errno-style value on failure. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1507
1508 static int setup_timezone(const char *dest) {
1509 _cleanup_free_ char *p = NULL, *q = NULL;
1510 const char *where, *check, *what;
1511 char *z, *y;
1512 int r;
1513
1514 assert(dest);
1515
1516 /* Fix the timezone, if possible */
1517 r = readlink_malloc("/etc/localtime", &p);
1518 if (r < 0) {
1519 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1520 return 0;
1521 }
1522
1523 z = path_startswith(p, "../usr/share/zoneinfo/");
1524 if (!z)
1525 z = path_startswith(p, "/usr/share/zoneinfo/");
1526 if (!z) {
1527 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1528 return 0;
1529 }
1530
1531 where = prefix_roota(dest, "/etc/localtime");
1532 r = readlink_malloc(where, &q);
1533 if (r >= 0) {
1534 y = path_startswith(q, "../usr/share/zoneinfo/");
1535 if (!y)
1536 y = path_startswith(q, "/usr/share/zoneinfo/");
1537
1538 /* Already pointing to the right place? Then do nothing .. */
1539 if (y && streq(y, z))
1540 return 0;
1541 }
1542
1543 check = strjoina("/usr/share/zoneinfo/", z);
1544 check = prefix_root(dest, check);
1545 if (laccess(check, F_OK) < 0) {
1546 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1547 return 0;
1548 }
1549
1550 r = unlink(where);
1551 if (r < 0 && errno != ENOENT) {
1552 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1553 return 0;
1554 }
1555
1556 what = strjoina("../usr/share/zoneinfo/", z);
1557 if (symlink(what, where) < 0) {
1558 log_error_errno(errno, "Failed to correct timezone of container: %m");
1559 return 0;
1560 }
1561
1562 r = userns_lchown(where, 0, 0);
1563 if (r < 0)
1564 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1565
1566 return 0;
1567 }
1568
1569 static int setup_resolv_conf(const char *dest) {
1570 const char *where = NULL;
1571 int r;
1572
1573 assert(dest);
1574
1575 if (arg_private_network)
1576 return 0;
1577
1578 /* Fix resolv.conf, if possible */
1579 where = prefix_roota(dest, "/etc/resolv.conf");
1580
1581 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1582 if (r < 0) {
1583 /* If the file already exists as symlink, let's
1584 * suppress the warning, under the assumption that
1585 * resolved or something similar runs inside and the
1586 * symlink points there.
1587 *
1588 * If the disk image is read-only, there's also no
1589 * point in complaining.
1590 */
1591 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1592 "Failed to copy /etc/resolv.conf to %s: %m", where);
1593 return 0;
1594 }
1595
1596 r = userns_lchown(where, 0, 0);
1597 if (r < 0)
1598 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1599
1600 return 0;
1601 }
1602
1603 static int setup_volatile_state(const char *directory) {
1604 _cleanup_free_ char *buf = NULL;
1605 const char *p, *options;
1606 int r;
1607
1608 assert(directory);
1609
1610 if (arg_volatile != VOLATILE_STATE)
1611 return 0;
1612
1613 /* --volatile=state means we simply overmount /var
1614 with a tmpfs, and the rest read-only. */
1615
1616 r = bind_remount_recursive(directory, true);
1617 if (r < 0)
1618 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1619
1620 p = prefix_roota(directory, "/var");
1621 r = mkdir(p, 0755);
1622 if (r < 0 && errno != EEXIST)
1623 return log_error_errno(errno, "Failed to create %s: %m", directory);
1624
1625 options = "mode=755";
1626 r = tmpfs_patch_options(options, &buf);
1627 if (r < 0)
1628 return log_oom();
1629 if (r > 0)
1630 options = buf;
1631
1632 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1633 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1634
1635 return 0;
1636 }
1637
1638 static int setup_volatile(const char *directory) {
1639 bool tmpfs_mounted = false, bind_mounted = false;
1640 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1641 _cleanup_free_ char *buf = NULL;
1642 const char *f, *t, *options;
1643 int r;
1644
1645 assert(directory);
1646
1647 if (arg_volatile != VOLATILE_YES)
1648 return 0;
1649
1650 /* --volatile=yes means we mount a tmpfs to the root dir, and
1651 the original /usr to use inside it, and that read-only. */
1652
1653 if (!mkdtemp(template))
1654 return log_error_errno(errno, "Failed to create temporary directory: %m");
1655
1656 options = "mode=755";
1657 r = tmpfs_patch_options(options, &buf);
1658 if (r < 0)
1659 return log_oom();
1660 if (r > 0)
1661 options = buf;
1662
1663 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1664 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1665 goto fail;
1666 }
1667
1668 tmpfs_mounted = true;
1669
1670 f = prefix_roota(directory, "/usr");
1671 t = prefix_roota(template, "/usr");
1672
1673 r = mkdir(t, 0755);
1674 if (r < 0 && errno != EEXIST) {
1675 r = log_error_errno(errno, "Failed to create %s: %m", t);
1676 goto fail;
1677 }
1678
1679 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1680 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1681 goto fail;
1682 }
1683
1684 bind_mounted = true;
1685
1686 r = bind_remount_recursive(t, true);
1687 if (r < 0) {
1688 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1689 goto fail;
1690 }
1691
1692 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1693 r = log_error_errno(errno, "Failed to move root mount: %m");
1694 goto fail;
1695 }
1696
1697 (void) rmdir(template);
1698
1699 return 0;
1700
1701 fail:
1702 if (bind_mounted)
1703 (void) umount(t);
1704
1705 if (tmpfs_mounted)
1706 (void) umount(template);
1707 (void) rmdir(template);
1708 return r;
1709 }
1710
1711 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1712 assert(s);
1713
1714 snprintf(s, 37,
1715 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1716 SD_ID128_FORMAT_VAL(id));
1717
1718 return s;
1719 }
1720
1721 static int setup_boot_id(const char *dest) {
1722 const char *from, *to;
1723 sd_id128_t rnd = {};
1724 char as_uuid[37];
1725 int r;
1726
1727 if (arg_share_system)
1728 return 0;
1729
1730 /* Generate a new randomized boot ID, so that each boot-up of
1731 * the container gets a new one */
1732
1733 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1734 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1735
1736 r = sd_id128_randomize(&rnd);
1737 if (r < 0)
1738 return log_error_errno(r, "Failed to generate random boot id: %m");
1739
1740 id128_format_as_uuid(rnd, as_uuid);
1741
1742 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1743 if (r < 0)
1744 return log_error_errno(r, "Failed to write boot id: %m");
1745
1746 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1747 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1748 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1749 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1750
1751 unlink(from);
1752 return r;
1753 }
1754
1755 static int copy_devnodes(const char *dest) {
1756
1757 static const char devnodes[] =
1758 "null\0"
1759 "zero\0"
1760 "full\0"
1761 "random\0"
1762 "urandom\0"
1763 "tty\0"
1764 "net/tun\0";
1765
1766 const char *d;
1767 int r = 0;
1768 _cleanup_umask_ mode_t u;
1769
1770 assert(dest);
1771
1772 u = umask(0000);
1773
1774 /* Create /dev/net, so that we can create /dev/net/tun in it */
1775 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1776 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1777
1778 NULSTR_FOREACH(d, devnodes) {
1779 _cleanup_free_ char *from = NULL, *to = NULL;
1780 struct stat st;
1781
1782 from = strappend("/dev/", d);
1783 to = prefix_root(dest, from);
1784
1785 if (stat(from, &st) < 0) {
1786
1787 if (errno != ENOENT)
1788 return log_error_errno(errno, "Failed to stat %s: %m", from);
1789
1790 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1791
1792 log_error("%s is not a char or block device, cannot copy.", from);
1793 return -EIO;
1794
1795 } else {
1796 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1797 if (errno != EPERM)
1798 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1799
1800 /* Some systems abusively restrict mknod but
1801 * allow bind mounts. */
1802 r = touch(to);
1803 if (r < 0)
1804 return log_error_errno(r, "touch (%s) failed: %m", to);
1805 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1806 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1807 }
1808
1809 r = userns_lchown(to, 0, 0);
1810 if (r < 0)
1811 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1812 }
1813 }
1814
1815 return r;
1816 }
1817
1818 static int setup_pts(const char *dest) {
1819 _cleanup_free_ char *options = NULL;
1820 const char *p;
1821
1822 #ifdef HAVE_SELINUX
1823 if (arg_selinux_apifs_context)
1824 (void) asprintf(&options,
1825 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1826 arg_uid_shift + TTY_GID,
1827 arg_selinux_apifs_context);
1828 else
1829 #endif
1830 (void) asprintf(&options,
1831 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1832 arg_uid_shift + TTY_GID);
1833
1834 if (!options)
1835 return log_oom();
1836
1837 /* Mount /dev/pts itself */
1838 p = prefix_roota(dest, "/dev/pts");
1839 if (mkdir(p, 0755) < 0)
1840 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1841 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1842 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1843 if (userns_lchown(p, 0, 0) < 0)
1844 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1845
1846 /* Create /dev/ptmx symlink */
1847 p = prefix_roota(dest, "/dev/ptmx");
1848 if (symlink("pts/ptmx", p) < 0)
1849 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1850 if (userns_lchown(p, 0, 0) < 0)
1851 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1852
1853 /* And fix /dev/pts/ptmx ownership */
1854 p = prefix_roota(dest, "/dev/pts/ptmx");
1855 if (userns_lchown(p, 0, 0) < 0)
1856 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1857
1858 return 0;
1859 }
1860
1861 static int setup_dev_console(const char *dest, const char *console) {
1862 _cleanup_umask_ mode_t u;
1863 const char *to;
1864 int r;
1865
1866 assert(dest);
1867 assert(console);
1868
1869 u = umask(0000);
1870
1871 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1872 if (r < 0)
1873 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1874
1875 /* We need to bind mount the right tty to /dev/console since
1876 * ptys can only exist on pts file systems. To have something
1877 * to bind mount things on we create a empty regular file. */
1878
1879 to = prefix_roota(dest, "/dev/console");
1880 r = touch(to);
1881 if (r < 0)
1882 return log_error_errno(r, "touch() for /dev/console failed: %m");
1883
1884 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1885 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1886
1887 return 0;
1888 }
1889
1890 static int setup_kmsg(const char *dest, int kmsg_socket) {
1891 const char *from, *to;
1892 _cleanup_umask_ mode_t u;
1893 int fd, k;
1894 union {
1895 struct cmsghdr cmsghdr;
1896 uint8_t buf[CMSG_SPACE(sizeof(int))];
1897 } control = {};
1898 struct msghdr mh = {
1899 .msg_control = &control,
1900 .msg_controllen = sizeof(control),
1901 };
1902 struct cmsghdr *cmsg;
1903
1904 assert(kmsg_socket >= 0);
1905
1906 u = umask(0000);
1907
1908 /* We create the kmsg FIFO as /run/kmsg, but immediately
1909 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1910 * on the reading side behave very similar to /proc/kmsg,
1911 * their writing side behaves differently from /dev/kmsg in
1912 * that writing blocks when nothing is reading. In order to
1913 * avoid any problems with containers deadlocking due to this
1914 * we simply make /dev/kmsg unavailable to the container. */
1915 from = prefix_roota(dest, "/run/kmsg");
1916 to = prefix_roota(dest, "/proc/kmsg");
1917
1918 if (mkfifo(from, 0600) < 0)
1919 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1920 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1921 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1922
1923 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1924 if (fd < 0)
1925 return log_error_errno(errno, "Failed to open fifo: %m");
1926
1927 cmsg = CMSG_FIRSTHDR(&mh);
1928 cmsg->cmsg_level = SOL_SOCKET;
1929 cmsg->cmsg_type = SCM_RIGHTS;
1930 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1931 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1932
1933 mh.msg_controllen = cmsg->cmsg_len;
1934
1935 /* Store away the fd in the socket, so that it stays open as
1936 * long as we run the child */
1937 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1938 safe_close(fd);
1939
1940 if (k < 0)
1941 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1942
1943 /* And now make the FIFO unavailable as /run/kmsg... */
1944 (void) unlink(from);
1945
1946 return 0;
1947 }
1948
1949 static int send_rtnl(int send_fd) {
1950 union {
1951 struct cmsghdr cmsghdr;
1952 uint8_t buf[CMSG_SPACE(sizeof(int))];
1953 } control = {};
1954 struct msghdr mh = {
1955 .msg_control = &control,
1956 .msg_controllen = sizeof(control),
1957 };
1958 struct cmsghdr *cmsg;
1959 _cleanup_close_ int fd = -1;
1960 ssize_t k;
1961
1962 assert(send_fd >= 0);
1963
1964 if (!arg_expose_ports)
1965 return 0;
1966
1967 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1968 if (fd < 0)
1969 return log_error_errno(errno, "Failed to allocate container netlink: %m");
1970
1971 cmsg = CMSG_FIRSTHDR(&mh);
1972 cmsg->cmsg_level = SOL_SOCKET;
1973 cmsg->cmsg_type = SCM_RIGHTS;
1974 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1975 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1976
1977 mh.msg_controllen = cmsg->cmsg_len;
1978
1979 /* Store away the fd in the socket, so that it stays open as
1980 * long as we run the child */
1981 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1982 if (k < 0)
1983 return log_error_errno(errno, "Failed to send netlink fd: %m");
1984
1985 return 0;
1986 }
1987
1988 static int flush_ports(union in_addr_union *exposed) {
1989 ExposePort *p;
1990 int r, af = AF_INET;
1991
1992 assert(exposed);
1993
1994 if (!arg_expose_ports)
1995 return 0;
1996
1997 if (in_addr_is_null(af, exposed))
1998 return 0;
1999
2000 log_debug("Lost IP address.");
2001
2002 LIST_FOREACH(ports, p, arg_expose_ports) {
2003 r = fw_add_local_dnat(false,
2004 af,
2005 p->protocol,
2006 NULL,
2007 NULL, 0,
2008 NULL, 0,
2009 p->host_port,
2010 exposed,
2011 p->container_port,
2012 NULL);
2013 if (r < 0)
2014 log_warning_errno(r, "Failed to modify firewall: %m");
2015 }
2016
2017 *exposed = IN_ADDR_NULL;
2018 return 0;
2019 }
2020
2021 static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
2022 _cleanup_free_ struct local_address *addresses = NULL;
2023 _cleanup_free_ char *pretty = NULL;
2024 union in_addr_union new_exposed;
2025 ExposePort *p;
2026 bool add;
2027 int af = AF_INET, r;
2028
2029 assert(exposed);
2030
2031 /* Invoked each time an address is added or removed inside the
2032 * container */
2033
2034 if (!arg_expose_ports)
2035 return 0;
2036
2037 r = local_addresses(rtnl, 0, af, &addresses);
2038 if (r < 0)
2039 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2040
2041 add = r > 0 &&
2042 addresses[0].family == af &&
2043 addresses[0].scope < RT_SCOPE_LINK;
2044
2045 if (!add)
2046 return flush_ports(exposed);
2047
2048 new_exposed = addresses[0].address;
2049 if (in_addr_equal(af, exposed, &new_exposed))
2050 return 0;
2051
2052 in_addr_to_string(af, &new_exposed, &pretty);
2053 log_debug("New container IP is %s.", strna(pretty));
2054
2055 LIST_FOREACH(ports, p, arg_expose_ports) {
2056
2057 r = fw_add_local_dnat(true,
2058 af,
2059 p->protocol,
2060 NULL,
2061 NULL, 0,
2062 NULL, 0,
2063 p->host_port,
2064 &new_exposed,
2065 p->container_port,
2066 in_addr_is_null(af, exposed) ? NULL : exposed);
2067 if (r < 0)
2068 log_warning_errno(r, "Failed to modify firewall: %m");
2069 }
2070
2071 *exposed = new_exposed;
2072 return 0;
2073 }
2074
2075 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2076 union in_addr_union *exposed = userdata;
2077
2078 assert(rtnl);
2079 assert(m);
2080 assert(exposed);
2081
2082 expose_ports(rtnl, exposed);
2083 return 0;
2084 }
2085
2086 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
2087 union {
2088 struct cmsghdr cmsghdr;
2089 uint8_t buf[CMSG_SPACE(sizeof(int))];
2090 } control = {};
2091 struct msghdr mh = {
2092 .msg_control = &control,
2093 .msg_controllen = sizeof(control),
2094 };
2095 struct cmsghdr *cmsg;
2096 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2097 int fd, r;
2098 ssize_t k;
2099
2100 assert(event);
2101 assert(recv_fd >= 0);
2102 assert(ret);
2103
2104 if (!arg_expose_ports)
2105 return 0;
2106
2107 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2108 if (k < 0)
2109 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2110
2111 cmsg = CMSG_FIRSTHDR(&mh);
2112 assert(cmsg->cmsg_level == SOL_SOCKET);
2113 assert(cmsg->cmsg_type == SCM_RIGHTS);
2114 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2115 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2116
2117 r = sd_netlink_open_fd(&rtnl, fd);
2118 if (r < 0) {
2119 safe_close(fd);
2120 return log_error_errno(r, "Failed to create rtnl object: %m");
2121 }
2122
2123 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2124 if (r < 0)
2125 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2126
2127 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2128 if (r < 0)
2129 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2130
2131 r = sd_netlink_attach_event(rtnl, event, 0);
2132 if (r < 0)
2133 return log_error_errno(r, "Failed to add to even loop: %m");
2134
2135 *ret = rtnl;
2136 rtnl = NULL;
2137
2138 return 0;
2139 }
2140
2141 static int setup_hostname(void) {
2142
2143 if (arg_share_system)
2144 return 0;
2145
2146 if (sethostname_idempotent(arg_machine) < 0)
2147 return -errno;
2148
2149 return 0;
2150 }
2151
2152 static int setup_journal(const char *directory) {
2153 sd_id128_t machine_id, this_id;
2154 _cleanup_free_ char *b = NULL, *d = NULL;
2155 const char *etc_machine_id, *p, *q;
2156 char *id;
2157 int r;
2158
2159 /* Don't link journals in ephemeral mode */
2160 if (arg_ephemeral)
2161 return 0;
2162
2163 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2164
2165 r = read_one_line_file(etc_machine_id, &b);
2166 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2167 return 0;
2168 else if (r < 0)
2169 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2170
2171 id = strstrip(b);
2172 if (isempty(id) && arg_link_journal == LINK_AUTO)
2173 return 0;
2174
2175 /* Verify validity */
2176 r = sd_id128_from_string(id, &machine_id);
2177 if (r < 0)
2178 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2179
2180 r = sd_id128_get_machine(&this_id);
2181 if (r < 0)
2182 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2183
2184 if (sd_id128_equal(machine_id, this_id)) {
2185 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2186 "Host and machine ids are equal (%s): refusing to link journals", id);
2187 if (arg_link_journal == LINK_AUTO)
2188 return 0;
2189 return -EEXIST;
2190 }
2191
2192 if (arg_link_journal == LINK_NO)
2193 return 0;
2194
2195 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2196 if (r < 0)
2197 return log_error_errno(r, "Failed to create /var: %m");
2198
2199 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2200 if (r < 0)
2201 return log_error_errno(r, "Failed to create /var/log: %m");
2202
2203 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2204 if (r < 0)
2205 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2206
2207 p = strjoina("/var/log/journal/", id);
2208 q = prefix_roota(directory, p);
2209
2210 if (path_is_mount_point(p, 0) > 0) {
2211 if (arg_link_journal != LINK_AUTO) {
2212 log_error("%s: already a mount point, refusing to use for journal", p);
2213 return -EEXIST;
2214 }
2215
2216 return 0;
2217 }
2218
2219 if (path_is_mount_point(q, 0) > 0) {
2220 if (arg_link_journal != LINK_AUTO) {
2221 log_error("%s: already a mount point, refusing to use for journal", q);
2222 return -EEXIST;
2223 }
2224
2225 return 0;
2226 }
2227
2228 r = readlink_and_make_absolute(p, &d);
2229 if (r >= 0) {
2230 if ((arg_link_journal == LINK_GUEST ||
2231 arg_link_journal == LINK_AUTO) &&
2232 path_equal(d, q)) {
2233
2234 r = userns_mkdir(directory, p, 0755, 0, 0);
2235 if (r < 0)
2236 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2237 return 0;
2238 }
2239
2240 if (unlink(p) < 0)
2241 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2242 } else if (r == -EINVAL) {
2243
2244 if (arg_link_journal == LINK_GUEST &&
2245 rmdir(p) < 0) {
2246
2247 if (errno == ENOTDIR) {
2248 log_error("%s already exists and is neither a symlink nor a directory", p);
2249 return r;
2250 } else {
2251 log_error_errno(errno, "Failed to remove %s: %m", p);
2252 return -errno;
2253 }
2254 }
2255 } else if (r != -ENOENT) {
2256 log_error_errno(errno, "readlink(%s) failed: %m", p);
2257 return r;
2258 }
2259
2260 if (arg_link_journal == LINK_GUEST) {
2261
2262 if (symlink(q, p) < 0) {
2263 if (arg_link_journal_try) {
2264 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2265 return 0;
2266 } else {
2267 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2268 return -errno;
2269 }
2270 }
2271
2272 r = userns_mkdir(directory, p, 0755, 0, 0);
2273 if (r < 0)
2274 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2275 return 0;
2276 }
2277
2278 if (arg_link_journal == LINK_HOST) {
2279 /* don't create parents here -- if the host doesn't have
2280 * permanent journal set up, don't force it here */
2281 r = mkdir(p, 0755);
2282 if (r < 0) {
2283 if (arg_link_journal_try) {
2284 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2285 return 0;
2286 } else {
2287 log_error_errno(errno, "Failed to create %s: %m", p);
2288 return r;
2289 }
2290 }
2291
2292 } else if (access(p, F_OK) < 0)
2293 return 0;
2294
2295 if (dir_is_empty(q) == 0)
2296 log_warning("%s is not empty, proceeding anyway.", q);
2297
2298 r = userns_mkdir(directory, p, 0755, 0, 0);
2299 if (r < 0) {
2300 log_error_errno(errno, "Failed to create %s: %m", q);
2301 return r;
2302 }
2303
2304 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2305 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2306
2307 return 0;
2308 }
2309
2310 static int drop_capabilities(void) {
2311 return capability_bounding_set_drop(~arg_retain, false);
2312 }
2313
2314 static int register_machine(pid_t pid, int local_ifindex) {
2315 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2316 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2317 int r;
2318
2319 if (!arg_register)
2320 return 0;
2321
2322 r = sd_bus_default_system(&bus);
2323 if (r < 0)
2324 return log_error_errno(r, "Failed to open system bus: %m");
2325
2326 if (arg_keep_unit) {
2327 r = sd_bus_call_method(
2328 bus,
2329 "org.freedesktop.machine1",
2330 "/org/freedesktop/machine1",
2331 "org.freedesktop.machine1.Manager",
2332 "RegisterMachineWithNetwork",
2333 &error,
2334 NULL,
2335 "sayssusai",
2336 arg_machine,
2337 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2338 "nspawn",
2339 "container",
2340 (uint32_t) pid,
2341 strempty(arg_directory),
2342 local_ifindex > 0 ? 1 : 0, local_ifindex);
2343 } else {
2344 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2345 char **i;
2346 unsigned j;
2347
2348 r = sd_bus_message_new_method_call(
2349 bus,
2350 &m,
2351 "org.freedesktop.machine1",
2352 "/org/freedesktop/machine1",
2353 "org.freedesktop.machine1.Manager",
2354 "CreateMachineWithNetwork");
2355 if (r < 0)
2356 return bus_log_create_error(r);
2357
2358 r = sd_bus_message_append(
2359 m,
2360 "sayssusai",
2361 arg_machine,
2362 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2363 "nspawn",
2364 "container",
2365 (uint32_t) pid,
2366 strempty(arg_directory),
2367 local_ifindex > 0 ? 1 : 0, local_ifindex);
2368 if (r < 0)
2369 return bus_log_create_error(r);
2370
2371 r = sd_bus_message_open_container(m, 'a', "(sv)");
2372 if (r < 0)
2373 return bus_log_create_error(r);
2374
2375 if (!isempty(arg_slice)) {
2376 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2377 if (r < 0)
2378 return bus_log_create_error(r);
2379 }
2380
2381 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2382 if (r < 0)
2383 return bus_log_create_error(r);
2384
2385 /* If you make changes here, also make sure to update
2386 * systemd-nspawn@.service, to keep the device
2387 * policies in sync regardless if we are run with or
2388 * without the --keep-unit switch. */
2389 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2390 /* Allow the container to
2391 * access and create the API
2392 * device nodes, so that
2393 * PrivateDevices= in the
2394 * container can work
2395 * fine */
2396 "/dev/null", "rwm",
2397 "/dev/zero", "rwm",
2398 "/dev/full", "rwm",
2399 "/dev/random", "rwm",
2400 "/dev/urandom", "rwm",
2401 "/dev/tty", "rwm",
2402 "/dev/net/tun", "rwm",
2403 /* Allow the container
2404 * access to ptys. However,
2405 * do not permit the
2406 * container to ever create
2407 * these device nodes. */
2408 "/dev/pts/ptmx", "rw",
2409 "char-pts", "rw");
2410 if (r < 0)
2411 return bus_log_create_error(r);
2412
2413 for (j = 0; j < arg_n_custom_mounts; j++) {
2414 CustomMount *cm = &arg_custom_mounts[j];
2415
2416 if (cm->type != CUSTOM_MOUNT_BIND)
2417 continue;
2418
2419 r = is_device_node(cm->source);
2420 if (r < 0)
2421 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2422
2423 if (r) {
2424 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2425 cm->source, cm->read_only ? "r" : "rw");
2426 if (r < 0)
2427 return log_error_errno(r, "Failed to append message arguments: %m");
2428 }
2429 }
2430
2431 if (arg_kill_signal != 0) {
2432 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2433 if (r < 0)
2434 return bus_log_create_error(r);
2435
2436 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2437 if (r < 0)
2438 return bus_log_create_error(r);
2439 }
2440
2441 STRV_FOREACH(i, arg_property) {
2442 r = sd_bus_message_open_container(m, 'r', "sv");
2443 if (r < 0)
2444 return bus_log_create_error(r);
2445
2446 r = bus_append_unit_property_assignment(m, *i);
2447 if (r < 0)
2448 return r;
2449
2450 r = sd_bus_message_close_container(m);
2451 if (r < 0)
2452 return bus_log_create_error(r);
2453 }
2454
2455 r = sd_bus_message_close_container(m);
2456 if (r < 0)
2457 return bus_log_create_error(r);
2458
2459 r = sd_bus_call(bus, m, 0, &error, NULL);
2460 }
2461
2462 if (r < 0) {
2463 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2464 return r;
2465 }
2466
2467 return 0;
2468 }
2469
2470 static int terminate_machine(pid_t pid) {
2471 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2472 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2473 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2474 const char *path;
2475 int r;
2476
2477 if (!arg_register)
2478 return 0;
2479
2480 /* If we are reusing the unit, then just exit, systemd will do
2481 * the right thing when we exit. */
2482 if (arg_keep_unit)
2483 return 0;
2484
2485 r = sd_bus_default_system(&bus);
2486 if (r < 0)
2487 return log_error_errno(r, "Failed to open system bus: %m");
2488
2489 r = sd_bus_call_method(
2490 bus,
2491 "org.freedesktop.machine1",
2492 "/org/freedesktop/machine1",
2493 "org.freedesktop.machine1.Manager",
2494 "GetMachineByPID",
2495 &error,
2496 &reply,
2497 "u",
2498 (uint32_t) pid);
2499 if (r < 0) {
2500 /* Note that the machine might already have been
2501 * cleaned up automatically, hence don't consider it a
2502 * failure if we cannot get the machine object. */
2503 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2504 return 0;
2505 }
2506
2507 r = sd_bus_message_read(reply, "o", &path);
2508 if (r < 0)
2509 return bus_log_parse_error(r);
2510
2511 r = sd_bus_call_method(
2512 bus,
2513 "org.freedesktop.machine1",
2514 path,
2515 "org.freedesktop.machine1.Machine",
2516 "Terminate",
2517 &error,
2518 NULL,
2519 NULL);
2520 if (r < 0) {
2521 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2522 return 0;
2523 }
2524
2525 return 0;
2526 }
2527
2528 static int reset_audit_loginuid(void) {
2529 _cleanup_free_ char *p = NULL;
2530 int r;
2531
2532 if (arg_share_system)
2533 return 0;
2534
2535 r = read_one_line_file("/proc/self/loginuid", &p);
2536 if (r == -ENOENT)
2537 return 0;
2538 if (r < 0)
2539 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2540
2541 /* Already reset? */
2542 if (streq(p, "4294967295"))
2543 return 0;
2544
2545 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
2546 if (r < 0) {
2547 log_error_errno(r,
2548 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2549 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2550 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2551 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2552 "using systemd-nspawn. Sleeping for 5s... (%m)");
2553
2554 sleep(5);
2555 }
2556
2557 return 0;
2558 }
2559
2560 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2561 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2562 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2563
2564 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2565 uint8_t result[8];
2566 size_t l, sz;
2567 uint8_t *v, *i;
2568 int r;
2569
2570 l = strlen(arg_machine);
2571 sz = sizeof(sd_id128_t) + l;
2572 if (idx > 0)
2573 sz += sizeof(idx);
2574
2575 v = alloca(sz);
2576
2577 /* fetch some persistent data unique to the host */
2578 r = sd_id128_get_machine((sd_id128_t*) v);
2579 if (r < 0)
2580 return r;
2581
2582 /* combine with some data unique (on this host) to this
2583 * container instance */
2584 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2585 if (idx > 0) {
2586 idx = htole64(idx);
2587 memcpy(i, &idx, sizeof(idx));
2588 }
2589
2590 /* Let's hash the host machine ID plus the container name. We
2591 * use a fixed, but originally randomly created hash key here. */
2592 siphash24(result, v, sz, hash_key.bytes);
2593
2594 assert_cc(ETH_ALEN <= sizeof(result));
2595 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2596
2597 /* see eth_random_addr in the kernel */
2598 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2599 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2600
2601 return 0;
2602 }
2603
2604 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2605 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2606 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2607 struct ether_addr mac_host, mac_container;
2608 int r, i;
2609
2610 if (!arg_private_network)
2611 return 0;
2612
2613 if (!arg_network_veth)
2614 return 0;
2615
2616 /* Use two different interface name prefixes depending whether
2617 * we are in bridge mode or not. */
2618 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2619 arg_network_bridge ? "vb" : "ve", arg_machine);
2620
2621 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2622 if (r < 0)
2623 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2624
2625 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2626 if (r < 0)
2627 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2628
2629 r = sd_netlink_open(&rtnl);
2630 if (r < 0)
2631 return log_error_errno(r, "Failed to connect to netlink: %m");
2632
2633 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2634 if (r < 0)
2635 return log_error_errno(r, "Failed to allocate netlink message: %m");
2636
2637 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
2638 if (r < 0)
2639 return log_error_errno(r, "Failed to add netlink interface name: %m");
2640
2641 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2642 if (r < 0)
2643 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2644
2645 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2646 if (r < 0)
2647 return log_error_errno(r, "Failed to open netlink container: %m");
2648
2649 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2650 if (r < 0)
2651 return log_error_errno(r, "Failed to open netlink container: %m");
2652
2653 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
2654 if (r < 0)
2655 return log_error_errno(r, "Failed to open netlink container: %m");
2656
2657 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
2658 if (r < 0)
2659 return log_error_errno(r, "Failed to add netlink interface name: %m");
2660
2661 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2662 if (r < 0)
2663 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2664
2665 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2666 if (r < 0)
2667 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2668
2669 r = sd_netlink_message_close_container(m);
2670 if (r < 0)
2671 return log_error_errno(r, "Failed to close netlink container: %m");
2672
2673 r = sd_netlink_message_close_container(m);
2674 if (r < 0)
2675 return log_error_errno(r, "Failed to close netlink container: %m");
2676
2677 r = sd_netlink_message_close_container(m);
2678 if (r < 0)
2679 return log_error_errno(r, "Failed to close netlink container: %m");
2680
2681 r = sd_netlink_call(rtnl, m, 0, NULL);
2682 if (r < 0)
2683 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2684
2685 i = (int) if_nametoindex(iface_name);
2686 if (i <= 0)
2687 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2688
2689 *ifi = i;
2690
2691 return 0;
2692 }
2693
2694 static int setup_bridge(const char veth_name[], int *ifi) {
2695 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2696 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2697 int r, bridge;
2698
2699 if (!arg_private_network)
2700 return 0;
2701
2702 if (!arg_network_veth)
2703 return 0;
2704
2705 if (!arg_network_bridge)
2706 return 0;
2707
2708 bridge = (int) if_nametoindex(arg_network_bridge);
2709 if (bridge <= 0)
2710 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2711
2712 *ifi = bridge;
2713
2714 r = sd_netlink_open(&rtnl);
2715 if (r < 0)
2716 return log_error_errno(r, "Failed to connect to netlink: %m");
2717
2718 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2719 if (r < 0)
2720 return log_error_errno(r, "Failed to allocate netlink message: %m");
2721
2722 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2723 if (r < 0)
2724 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2725
2726 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
2727 if (r < 0)
2728 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2729
2730 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
2731 if (r < 0)
2732 return log_error_errno(r, "Failed to add netlink master field: %m");
2733
2734 r = sd_netlink_call(rtnl, m, 0, NULL);
2735 if (r < 0)
2736 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2737
2738 return 0;
2739 }
2740
2741 static int parse_interface(struct udev *udev, const char *name) {
2742 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2743 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2744 int ifi;
2745
2746 ifi = (int) if_nametoindex(name);
2747 if (ifi <= 0)
2748 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2749
2750 sprintf(ifi_str, "n%i", ifi);
2751 d = udev_device_new_from_device_id(udev, ifi_str);
2752 if (!d)
2753 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2754
2755 if (udev_device_get_is_initialized(d) <= 0) {
2756 log_error("Network interface %s is not initialized yet.", name);
2757 return -EBUSY;
2758 }
2759
2760 return ifi;
2761 }
2762
2763 static int move_network_interfaces(pid_t pid) {
2764 _cleanup_udev_unref_ struct udev *udev = NULL;
2765 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2766 char **i;
2767 int r;
2768
2769 if (!arg_private_network)
2770 return 0;
2771
2772 if (strv_isempty(arg_network_interfaces))
2773 return 0;
2774
2775 r = sd_netlink_open(&rtnl);
2776 if (r < 0)
2777 return log_error_errno(r, "Failed to connect to netlink: %m");
2778
2779 udev = udev_new();
2780 if (!udev) {
2781 log_error("Failed to connect to udev.");
2782 return -ENOMEM;
2783 }
2784
2785 STRV_FOREACH(i, arg_network_interfaces) {
2786 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2787 int ifi;
2788
2789 ifi = parse_interface(udev, *i);
2790 if (ifi < 0)
2791 return ifi;
2792
2793 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2794 if (r < 0)
2795 return log_error_errno(r, "Failed to allocate netlink message: %m");
2796
2797 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2798 if (r < 0)
2799 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2800
2801 r = sd_netlink_call(rtnl, m, 0, NULL);
2802 if (r < 0)
2803 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2804 }
2805
2806 return 0;
2807 }
2808
2809 static int setup_macvlan(pid_t pid) {
2810 _cleanup_udev_unref_ struct udev *udev = NULL;
2811 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2812 unsigned idx = 0;
2813 char **i;
2814 int r;
2815
2816 if (!arg_private_network)
2817 return 0;
2818
2819 if (strv_isempty(arg_network_macvlan))
2820 return 0;
2821
2822 r = sd_netlink_open(&rtnl);
2823 if (r < 0)
2824 return log_error_errno(r, "Failed to connect to netlink: %m");
2825
2826 udev = udev_new();
2827 if (!udev) {
2828 log_error("Failed to connect to udev.");
2829 return -ENOMEM;
2830 }
2831
2832 STRV_FOREACH(i, arg_network_macvlan) {
2833 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2834 _cleanup_free_ char *n = NULL;
2835 struct ether_addr mac;
2836 int ifi;
2837
2838 ifi = parse_interface(udev, *i);
2839 if (ifi < 0)
2840 return ifi;
2841
2842 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2843 if (r < 0)
2844 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2845
2846 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2847 if (r < 0)
2848 return log_error_errno(r, "Failed to allocate netlink message: %m");
2849
2850 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2851 if (r < 0)
2852 return log_error_errno(r, "Failed to add netlink interface index: %m");
2853
2854 n = strappend("mv-", *i);
2855 if (!n)
2856 return log_oom();
2857
2858 strshorten(n, IFNAMSIZ-1);
2859
2860 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2861 if (r < 0)
2862 return log_error_errno(r, "Failed to add netlink interface name: %m");
2863
2864 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2865 if (r < 0)
2866 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2867
2868 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2869 if (r < 0)
2870 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2871
2872 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2873 if (r < 0)
2874 return log_error_errno(r, "Failed to open netlink container: %m");
2875
2876 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2877 if (r < 0)
2878 return log_error_errno(r, "Failed to open netlink container: %m");
2879
2880 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2881 if (r < 0)
2882 return log_error_errno(r, "Failed to append macvlan mode: %m");
2883
2884 r = sd_netlink_message_close_container(m);
2885 if (r < 0)
2886 return log_error_errno(r, "Failed to close netlink container: %m");
2887
2888 r = sd_netlink_message_close_container(m);
2889 if (r < 0)
2890 return log_error_errno(r, "Failed to close netlink container: %m");
2891
2892 r = sd_netlink_call(rtnl, m, 0, NULL);
2893 if (r < 0)
2894 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2895 }
2896
2897 return 0;
2898 }
2899
2900 static int setup_ipvlan(pid_t pid) {
2901 _cleanup_udev_unref_ struct udev *udev = NULL;
2902 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2903 char **i;
2904 int r;
2905
2906 if (!arg_private_network)
2907 return 0;
2908
2909 if (strv_isempty(arg_network_ipvlan))
2910 return 0;
2911
2912 r = sd_netlink_open(&rtnl);
2913 if (r < 0)
2914 return log_error_errno(r, "Failed to connect to netlink: %m");
2915
2916 udev = udev_new();
2917 if (!udev) {
2918 log_error("Failed to connect to udev.");
2919 return -ENOMEM;
2920 }
2921
2922 STRV_FOREACH(i, arg_network_ipvlan) {
2923 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2924 _cleanup_free_ char *n = NULL;
2925 int ifi;
2926
2927 ifi = parse_interface(udev, *i);
2928 if (ifi < 0)
2929 return ifi;
2930
2931 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2932 if (r < 0)
2933 return log_error_errno(r, "Failed to allocate netlink message: %m");
2934
2935 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2936 if (r < 0)
2937 return log_error_errno(r, "Failed to add netlink interface index: %m");
2938
2939 n = strappend("iv-", *i);
2940 if (!n)
2941 return log_oom();
2942
2943 strshorten(n, IFNAMSIZ-1);
2944
2945 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2946 if (r < 0)
2947 return log_error_errno(r, "Failed to add netlink interface name: %m");
2948
2949 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2950 if (r < 0)
2951 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2952
2953 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2954 if (r < 0)
2955 return log_error_errno(r, "Failed to open netlink container: %m");
2956
2957 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2958 if (r < 0)
2959 return log_error_errno(r, "Failed to open netlink container: %m");
2960
2961 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2962 if (r < 0)
2963 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2964
2965 r = sd_netlink_message_close_container(m);
2966 if (r < 0)
2967 return log_error_errno(r, "Failed to close netlink container: %m");
2968
2969 r = sd_netlink_message_close_container(m);
2970 if (r < 0)
2971 return log_error_errno(r, "Failed to close netlink container: %m");
2972
2973 r = sd_netlink_call(rtnl, m, 0, NULL);
2974 if (r < 0)
2975 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2976 }
2977
2978 return 0;
2979 }
2980
/* Installs the seccomp filter for the container payload.
 *
 * The filter allows everything by default and then
 *  - makes a fixed set of system calls fail with EPERM, unless the
 *    capability associated with the call is part of arg_retain, and
 *  - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op. Returns 0 on success (also when
 * loading the filter fails with EINVAL), a negative error code otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } filtered[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(filtered); k++) {
                /* Calls guarded by a retained capability stay available. */
                if (arg_retain & (1ULL << filtered[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), filtered[k].syscall_num, 0);
                if (r < 0 && r != -EFAULT) { /* -EFAULT: unknown syscall, skip it */
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace audit
         * hookup fails when run inside one. We don't care and simply turn
         * off the creation of audit sockets: socket(AF_NETLINK, *,
         * NETLINK_AUDIT) then fails with EAFNOSUPPORT, which audit
         * userspace takes as a sign that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3075
3076 static int setup_propagate(const char *root) {
3077 const char *p, *q;
3078
3079 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3080 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3081 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3082 (void) mkdir_p(p, 0600);
3083
3084 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3085 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3086
3087 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3088 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3089
3090 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3091 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3092
3093 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3094 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3095 return log_error_errno(errno, "Failed to install propagation bind mount.");
3096
3097 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3098 return log_error_errno(errno, "Failed to make propagation mount read-only");
3099
3100 return 0;
3101 }
3102
3103 static int setup_image(char **device_path, int *loop_nr) {
3104 struct loop_info64 info = {
3105 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3106 };
3107 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3108 _cleanup_free_ char* loopdev = NULL;
3109 struct stat st;
3110 int r, nr;
3111
3112 assert(device_path);
3113 assert(loop_nr);
3114 assert(arg_image);
3115
3116 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3117 if (fd < 0)
3118 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3119
3120 if (fstat(fd, &st) < 0)
3121 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3122
3123 if (S_ISBLK(st.st_mode)) {
3124 char *p;
3125
3126 p = strdup(arg_image);
3127 if (!p)
3128 return log_oom();
3129
3130 *device_path = p;
3131
3132 *loop_nr = -1;
3133
3134 r = fd;
3135 fd = -1;
3136
3137 return r;
3138 }
3139
3140 if (!S_ISREG(st.st_mode)) {
3141 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3142 return -EINVAL;
3143 }
3144
3145 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3146 if (control < 0)
3147 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3148
3149 nr = ioctl(control, LOOP_CTL_GET_FREE);
3150 if (nr < 0)
3151 return log_error_errno(errno, "Failed to allocate loop device: %m");
3152
3153 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3154 return log_oom();
3155
3156 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3157 if (loop < 0)
3158 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3159
3160 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3161 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3162
3163 if (arg_read_only)
3164 info.lo_flags |= LO_FLAGS_READ_ONLY;
3165
3166 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3167 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3168
3169 *device_path = loopdev;
3170 loopdev = NULL;
3171
3172 *loop_nr = nr;
3173
3174 r = loop;
3175 loop = -1;
3176
3177 return r;
3178 }
3179
/* Explanatory text about the partition layouts accepted for disk images.
 * dissect_image() concatenates it directly onto the format string of its
 * "no partition table" error messages. */
3180 #define PARTITION_TABLE_BLURB \
3181 "Note that the disk image needs to either contain only a single MBR partition of\n" \
3182 "type 0x83 that is marked bootable, or a single GPT partition of type " \
3183 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
3184 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
3185 "to be bootable with systemd-nspawn."
3186
3187 static int dissect_image(
3188 int fd,
3189 char **root_device, bool *root_device_rw,
3190 char **home_device, bool *home_device_rw,
3191 char **srv_device, bool *srv_device_rw,
3192 bool *secondary) {
3193
3194 #ifdef HAVE_BLKID
3195 int home_nr = -1, srv_nr = -1;
3196 #ifdef GPT_ROOT_NATIVE
3197 int root_nr = -1;
3198 #endif
3199 #ifdef GPT_ROOT_SECONDARY
3200 int secondary_root_nr = -1;
3201 #endif
3202 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
3203 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
3204 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
3205 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3206 _cleanup_udev_unref_ struct udev *udev = NULL;
3207 struct udev_list_entry *first, *item;
3208 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
3209 bool is_gpt, is_mbr, multiple_generic = false;
3210 const char *pttype = NULL;
3211 blkid_partlist pl;
3212 struct stat st;
3213 unsigned i;
3214 int r;
3215
3216 assert(fd >= 0);
3217 assert(root_device);
3218 assert(home_device);
3219 assert(srv_device);
3220 assert(secondary);
3221 assert(arg_image);
3222
3223 b = blkid_new_probe();
3224 if (!b)
3225 return log_oom();
3226
3227 errno = 0;
3228 r = blkid_probe_set_device(b, fd, 0, 0);
3229 if (r != 0) {
3230 if (errno == 0)
3231 return log_oom();
3232
3233 log_error_errno(errno, "Failed to set device on blkid probe: %m");
3234 return -errno;
3235 }
3236
3237 blkid_probe_enable_partitions(b, 1);
3238 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
3239
3240 errno = 0;
3241 r = blkid_do_safeprobe(b);
3242 if (r == -2 || r == 1) {
3243 log_error("Failed to identify any partition table on\n"
3244 " %s\n"
3245 PARTITION_TABLE_BLURB, arg_image);
3246 return -EINVAL;
3247 } else if (r != 0) {
3248 if (errno == 0)
3249 errno = EIO;
3250 log_error_errno(errno, "Failed to probe: %m");
3251 return -errno;
3252 }
3253
3254 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
3255
3256 is_gpt = streq_ptr(pttype, "gpt");
3257 is_mbr = streq_ptr(pttype, "dos");
3258
3259 if (!is_gpt && !is_mbr) {
3260 log_error("No GPT or MBR partition table discovered on\n"
3261 " %s\n"
3262 PARTITION_TABLE_BLURB, arg_image);
3263 return -EINVAL;
3264 }
3265
3266 errno = 0;
3267 pl = blkid_probe_get_partitions(b);
3268 if (!pl) {
3269 if (errno == 0)
3270 return log_oom();
3271
3272 log_error("Failed to list partitions of %s", arg_image);
3273 return -errno;
3274 }
3275
3276 udev = udev_new();
3277 if (!udev)
3278 return log_oom();
3279
3280 if (fstat(fd, &st) < 0)
3281 return log_error_errno(errno, "Failed to stat block device: %m");
3282
3283 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
3284 if (!d)
3285 return log_oom();
3286
3287 for (i = 0;; i++) {
3288 int n, m;
3289
3290 if (i >= 10) {
3291 log_error("Kernel partitions never appeared.");
3292 return -ENXIO;
3293 }
3294
3295 e = udev_enumerate_new(udev);
3296 if (!e)
3297 return log_oom();
3298
3299 r = udev_enumerate_add_match_parent(e, d);
3300 if (r < 0)
3301 return log_oom();
3302
3303 r = udev_enumerate_scan_devices(e);
3304 if (r < 0)
3305 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
3306
3307 /* Count the partitions enumerated by the kernel */
3308 n = 0;
3309 first = udev_enumerate_get_list_entry(e);
3310 udev_list_entry_foreach(item, first)
3311 n++;
3312
3313 /* Count the partitions enumerated by blkid */
3314 m = blkid_partlist_numof_partitions(pl);
3315 if (n == m + 1)
3316 break;
3317 if (n > m + 1) {
3318 log_error("blkid and kernel partition list do not match.");
3319 return -EIO;
3320 }
3321 if (n < m + 1) {
3322 unsigned j;
3323
3324 /* The kernel has probed fewer partitions than
3325 * blkid? Maybe the kernel prober is still
3326 * running or it got EBUSY because udev
3327 * already opened the device. Let's reprobe
3328 * the device, which is a synchronous call
3329 * that waits until probing is complete. */
3330
3331 for (j = 0; j < 20; j++) {
3332
3333 r = ioctl(fd, BLKRRPART, 0);
3334 if (r < 0)
3335 r = -errno;
3336 if (r >= 0 || r != -EBUSY)
3337 break;
3338
3339 /* If something else has the device
3340 * open, such as an udev rule, the
3341 * ioctl will return EBUSY. Since
3342 * there's no way to wait until it
3343 * isn't busy anymore, let's just wait
3344 * a bit, and try again.
3345 *
3346 * This is really something they
3347 * should fix in the kernel! */
3348
3349 usleep(50 * USEC_PER_MSEC);
3350 }
3351
3352 if (r < 0)
3353 return log_error_errno(r, "Failed to reread partition table: %m");
3354 }
3355
3356 e = udev_enumerate_unref(e);
3357 }
3358
3359 first = udev_enumerate_get_list_entry(e);
3360 udev_list_entry_foreach(item, first) {
3361 _cleanup_udev_device_unref_ struct udev_device *q;
3362 const char *node;
3363 unsigned long long flags;
3364 blkid_partition pp;
3365 dev_t qn;
3366 int nr;
3367
3368 errno = 0;
3369 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
3370 if (!q) {
3371 if (!errno)
3372 errno = ENOMEM;
3373
3374 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
3375 return -errno;
3376 }
3377
3378 qn = udev_device_get_devnum(q);
3379 if (major(qn) == 0)
3380 continue;
3381
3382 if (st.st_rdev == qn)
3383 continue;
3384
3385 node = udev_device_get_devnode(q);
3386 if (!node)
3387 continue;
3388
3389 pp = blkid_partlist_devno_to_partition(pl, qn);
3390 if (!pp)
3391 continue;
3392
3393 flags = blkid_partition_get_flags(pp);
3394
3395 nr = blkid_partition_get_partno(pp);
3396 if (nr < 0)
3397 continue;
3398
3399 if (is_gpt) {
3400 sd_id128_t type_id;
3401 const char *stype;
3402
3403 if (flags & GPT_FLAG_NO_AUTO)
3404 continue;
3405
3406 stype = blkid_partition_get_type_string(pp);
3407 if (!stype)
3408 continue;
3409
3410 if (sd_id128_from_string(stype, &type_id) < 0)
3411 continue;
3412
3413 if (sd_id128_equal(type_id, GPT_HOME)) {
3414
3415 if (home && nr >= home_nr)
3416 continue;
3417
3418 home_nr = nr;
3419 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3420
3421 r = free_and_strdup(&home, node);
3422 if (r < 0)
3423 return log_oom();
3424
3425 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3426
3427 if (srv && nr >= srv_nr)
3428 continue;
3429
3430 srv_nr = nr;
3431 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3432
3433 r = free_and_strdup(&srv, node);
3434 if (r < 0)
3435 return log_oom();
3436 }
3437 #ifdef GPT_ROOT_NATIVE
3438 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3439
3440 if (root && nr >= root_nr)
3441 continue;
3442
3443 root_nr = nr;
3444 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3445
3446 r = free_and_strdup(&root, node);
3447 if (r < 0)
3448 return log_oom();
3449 }
3450 #endif
3451 #ifdef GPT_ROOT_SECONDARY
3452 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3453
3454 if (secondary_root && nr >= secondary_root_nr)
3455 continue;
3456
3457 secondary_root_nr = nr;
3458 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3459
3460 r = free_and_strdup(&secondary_root, node);
3461 if (r < 0)
3462 return log_oom();
3463 }
3464 #endif
3465 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3466
3467 if (generic)
3468 multiple_generic = true;
3469 else {
3470 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3471
3472 r = free_and_strdup(&generic, node);
3473 if (r < 0)
3474 return log_oom();
3475 }
3476 }
3477
3478 } else if (is_mbr) {
3479 int type;
3480
3481 if (flags != 0x80) /* Bootable flag */
3482 continue;
3483
3484 type = blkid_partition_get_type(pp);
3485 if (type != 0x83) /* Linux partition */
3486 continue;
3487
3488 if (generic)
3489 multiple_generic = true;
3490 else {
3491 generic_rw = true;
3492
3493 r = free_and_strdup(&root, node);
3494 if (r < 0)
3495 return log_oom();
3496 }
3497 }
3498 }
3499
3500 if (root) {
3501 *root_device = root;
3502 root = NULL;
3503
3504 *root_device_rw = root_rw;
3505 *secondary = false;
3506 } else if (secondary_root) {
3507 *root_device = secondary_root;
3508 secondary_root = NULL;
3509
3510 *root_device_rw = secondary_root_rw;
3511 *secondary = true;
3512 } else if (generic) {
3513
3514 /* There were no partitions with precise meanings
3515 * around, but we found generic partitions. In this
3516 * case, if there's only one, we can go ahead and boot
3517 * it, otherwise we bail out, because we really cannot
3518 * make any sense of it. */
3519
3520 if (multiple_generic) {
3521 log_error("Identified multiple bootable Linux partitions on\n"
3522 " %s\n"
3523 PARTITION_TABLE_BLURB, arg_image);
3524 return -EINVAL;
3525 }
3526
3527 *root_device = generic;
3528 generic = NULL;
3529
3530 *root_device_rw = generic_rw;
3531 *secondary = false;
3532 } else {
3533 log_error("Failed to identify root partition in disk image\n"
3534 " %s\n"
3535 PARTITION_TABLE_BLURB, arg_image);
3536 return -EINVAL;
3537 }
3538
3539 if (home) {
3540 *home_device = home;
3541 home = NULL;
3542
3543 *home_device_rw = home_rw;
3544 }
3545
3546 if (srv) {
3547 *srv_device = srv;
3548 srv = NULL;
3549
3550 *srv_device_rw = srv_rw;
3551 }
3552
3553 return 0;
3554 #else
3555 log_error("--image= is not supported, compiled without blkid support.");
3556 return -EOPNOTSUPP;
3557 #endif
3558 }
3559
/* Probes the file system type of block device `what` with libblkid and
 * mounts it on `where` (or on `where` + `directory` when `directory`
 * is non-NULL). The mount is read-only when `rw` is false or when the
 * global arg_read_only is set. LUKS volumes are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure. When
 * built without HAVE_BLKID this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *fstype, *target;
        unsigned long mount_flags;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        target = where;
        if (directory)
                target = strjoina(where, directory);

        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        /* We are only interested in the superblock type */
        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        switch (r) {

        case 0:
                break;

        case -1:
        case 1:
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;

        default:
                if (errno == 0)
                        errno = EIO;

                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        r = blkid_probe_lookup_value(probe, "TYPE", &fstype, NULL);
        if (r < 0) {
                if (errno == 0)
                        errno = EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        mount_flags = MS_NODEV;
        if (!rw)
                mount_flags |= MS_RDONLY;

        if (mount(what, target, fstype, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3623
3624 static int mount_devices(
3625 const char *where,
3626 const char *root_device, bool root_device_rw,
3627 const char *home_device, bool home_device_rw,
3628 const char *srv_device, bool srv_device_rw) {
3629 int r;
3630
3631 assert(where);
3632
3633 if (root_device) {
3634 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3635 if (r < 0)
3636 return log_error_errno(r, "Failed to mount root directory: %m");
3637 }
3638
3639 if (home_device) {
3640 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3641 if (r < 0)
3642 return log_error_errno(r, "Failed to mount home directory: %m");
3643 }
3644
3645 if (srv_device) {
3646 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3647 if (r < 0)
3648 return log_error_errno(r, "Failed to mount server data directory: %m");
3649 }
3650
3651 return 0;
3652 }
3653
3654 static void loop_remove(int nr, int *image_fd) {
3655 _cleanup_close_ int control = -1;
3656 int r;
3657
3658 if (nr < 0)
3659 return;
3660
3661 if (image_fd && *image_fd >= 0) {
3662 r = ioctl(*image_fd, LOOP_CLR_FD);
3663 if (r < 0)
3664 log_debug_errno(errno, "Failed to close loop image: %m");
3665 *image_fd = safe_close(*image_fd);
3666 }
3667
3668 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3669 if (control < 0) {
3670 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3671 return;
3672 }
3673
3674 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3675 if (r < 0)
3676 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3677 }
3678
3679 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3680 int pipe_fds[2];
3681 pid_t pid;
3682
3683 assert(database);
3684 assert(key);
3685 assert(rpid);
3686
3687 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3688 return log_error_errno(errno, "Failed to allocate pipe: %m");
3689
3690 pid = fork();
3691 if (pid < 0)
3692 return log_error_errno(errno, "Failed to fork getent child: %m");
3693 else if (pid == 0) {
3694 int nullfd;
3695 char *empty_env = NULL;
3696
3697 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3698 _exit(EXIT_FAILURE);
3699
3700 if (pipe_fds[0] > 2)
3701 safe_close(pipe_fds[0]);
3702 if (pipe_fds[1] > 2)
3703 safe_close(pipe_fds[1]);
3704
3705 nullfd = open("/dev/null", O_RDWR);
3706 if (nullfd < 0)
3707 _exit(EXIT_FAILURE);
3708
3709 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3710 _exit(EXIT_FAILURE);
3711
3712 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3713 _exit(EXIT_FAILURE);
3714
3715 if (nullfd > 2)
3716 safe_close(nullfd);
3717
3718 (void) reset_all_signal_handlers();
3719 (void) reset_signal_mask();
3720 close_all_fds(NULL, 0);
3721
3722 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3723 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3724 _exit(EXIT_FAILURE);
3725 }
3726
3727 pipe_fds[1] = safe_close(pipe_fds[1]);
3728
3729 *rpid = pid;
3730
3731 return pipe_fds[0];
3732 }
3733
3734 static int change_uid_gid(char **_home) {
3735 char line[LINE_MAX], *x, *u, *g, *h;
3736 const char *word, *state;
3737 _cleanup_free_ uid_t *uids = NULL;
3738 _cleanup_free_ char *home = NULL;
3739 _cleanup_fclose_ FILE *f = NULL;
3740 _cleanup_close_ int fd = -1;
3741 unsigned n_uids = 0;
3742 size_t sz = 0, l;
3743 uid_t uid;
3744 gid_t gid;
3745 pid_t pid;
3746 int r;
3747
3748 assert(_home);
3749
3750 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3751 /* Reset everything fully to 0, just in case */
3752
3753 r = reset_uid_gid();
3754 if (r < 0)
3755 return log_error_errno(r, "Failed to become root: %m");
3756
3757 *_home = NULL;
3758 return 0;
3759 }
3760
3761 /* First, get user credentials */
3762 fd = spawn_getent("passwd", arg_user, &pid);
3763 if (fd < 0)
3764 return fd;
3765
3766 f = fdopen(fd, "r");
3767 if (!f)
3768 return log_oom();
3769 fd = -1;
3770
3771 if (!fgets(line, sizeof(line), f)) {
3772
3773 if (!ferror(f)) {
3774 log_error("Failed to resolve user %s.", arg_user);
3775 return -ESRCH;
3776 }
3777
3778 log_error_errno(errno, "Failed to read from getent: %m");
3779 return -errno;
3780 }
3781
3782 truncate_nl(line);
3783
3784 wait_for_terminate_and_warn("getent passwd", pid, true);
3785
3786 x = strchr(line, ':');
3787 if (!x) {
3788 log_error("/etc/passwd entry has invalid user field.");
3789 return -EIO;
3790 }
3791
3792 u = strchr(x+1, ':');
3793 if (!u) {
3794 log_error("/etc/passwd entry has invalid password field.");
3795 return -EIO;
3796 }
3797
3798 u++;
3799 g = strchr(u, ':');
3800 if (!g) {
3801 log_error("/etc/passwd entry has invalid UID field.");
3802 return -EIO;
3803 }
3804
3805 *g = 0;
3806 g++;
3807 x = strchr(g, ':');
3808 if (!x) {
3809 log_error("/etc/passwd entry has invalid GID field.");
3810 return -EIO;
3811 }
3812
3813 *x = 0;
3814 h = strchr(x+1, ':');
3815 if (!h) {
3816 log_error("/etc/passwd entry has invalid GECOS field.");
3817 return -EIO;
3818 }
3819
3820 h++;
3821 x = strchr(h, ':');
3822 if (!x) {
3823 log_error("/etc/passwd entry has invalid home directory field.");
3824 return -EIO;
3825 }
3826
3827 *x = 0;
3828
3829 r = parse_uid(u, &uid);
3830 if (r < 0) {
3831 log_error("Failed to parse UID of user.");
3832 return -EIO;
3833 }
3834
3835 r = parse_gid(g, &gid);
3836 if (r < 0) {
3837 log_error("Failed to parse GID of user.");
3838 return -EIO;
3839 }
3840
3841 home = strdup(h);
3842 if (!home)
3843 return log_oom();
3844
3845 /* Second, get group memberships */
3846 fd = spawn_getent("initgroups", arg_user, &pid);
3847 if (fd < 0)
3848 return fd;
3849
3850 fclose(f);
3851 f = fdopen(fd, "r");
3852 if (!f)
3853 return log_oom();
3854 fd = -1;
3855
3856 if (!fgets(line, sizeof(line), f)) {
3857 if (!ferror(f)) {
3858 log_error("Failed to resolve user %s.", arg_user);
3859 return -ESRCH;
3860 }
3861
3862 log_error_errno(errno, "Failed to read from getent: %m");
3863 return -errno;
3864 }
3865
3866 truncate_nl(line);
3867
3868 wait_for_terminate_and_warn("getent initgroups", pid, true);
3869
3870 /* Skip over the username and subsequent separator whitespace */
3871 x = line;
3872 x += strcspn(x, WHITESPACE);
3873 x += strspn(x, WHITESPACE);
3874
3875 FOREACH_WORD(word, l, x, state) {
3876 char c[l+1];
3877
3878 memcpy(c, word, l);
3879 c[l] = 0;
3880
3881 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3882 return log_oom();
3883
3884 r = parse_uid(c, &uids[n_uids++]);
3885 if (r < 0) {
3886 log_error("Failed to parse group data from getent.");
3887 return -EIO;
3888 }
3889 }
3890
3891 r = mkdir_parents(home, 0775);
3892 if (r < 0)
3893 return log_error_errno(r, "Failed to make home root directory: %m");
3894
3895 r = mkdir_safe(home, 0755, uid, gid);
3896 if (r < 0 && r != -EEXIST)
3897 return log_error_errno(r, "Failed to make home directory: %m");
3898
3899 (void) fchown(STDIN_FILENO, uid, gid);
3900 (void) fchown(STDOUT_FILENO, uid, gid);
3901 (void) fchown(STDERR_FILENO, uid, gid);
3902
3903 if (setgroups(n_uids, uids) < 0)
3904 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3905
3906 if (setresgid(gid, gid, gid) < 0)
3907 return log_error_errno(errno, "setregid() failed: %m");
3908
3909 if (setresuid(uid, uid, uid) < 0)
3910 return log_error_errno(errno, "setreuid() failed: %m");
3911
3912 if (_home) {
3913 *_home = home;
3914 home = NULL;
3915 }
3916
3917 return 0;
3918 }
3919
3920 /*
3921 * Return values:
3922 * < 0 : wait_for_terminate() failed to get the state of the
3923 * container, the container was terminated by a signal, or
3924 * failed for an unknown reason. No change is made to the
3925 * container argument.
3926 * > 0 : The program executed in the container terminated with an
3927 * error. The exit code of the program executed in the
3928 * container is returned. The container argument has been set
3929 * to CONTAINER_TERMINATED.
3930 * 0 : The container is being rebooted, has been shut down or exited
3931 * successfully. The container argument has been set to either
3932 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3933 *
3934 * That is, success is indicated by a return value of zero, and an
3935 * error is indicated by a non-zero value.
3936 */
3937 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3938 siginfo_t status;
3939 int r;
3940
3941 r = wait_for_terminate(pid, &status);
3942 if (r < 0)
3943 return log_warning_errno(r, "Failed to wait for container: %m");
3944
3945 switch (status.si_code) {
3946
3947 case CLD_EXITED:
3948 if (status.si_status == 0) {
3949 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3950
3951 } else
3952 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3953
3954 *container = CONTAINER_TERMINATED;
3955 return status.si_status;
3956
3957 case CLD_KILLED:
3958 if (status.si_status == SIGINT) {
3959
3960 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3961 *container = CONTAINER_TERMINATED;
3962 return 0;
3963
3964 } else if (status.si_status == SIGHUP) {
3965
3966 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3967 *container = CONTAINER_REBOOTED;
3968 return 0;
3969 }
3970
3971 /* CLD_KILLED fallthrough */
3972
3973 case CLD_DUMPED:
3974 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3975 return -EIO;
3976
3977 default:
3978 log_error("Container %s failed due to unknown reason.", arg_machine);
3979 return -EIO;
3980 }
3981
3982 return r;
3983 }
3984
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
}
3986
3987 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3988 pid_t pid;
3989
3990 pid = PTR_TO_UINT32(userdata);
3991 if (pid > 0) {
3992 if (kill(pid, arg_kill_signal) >= 0) {
3993 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3994 sd_event_source_set_userdata(s, NULL);
3995 return 0;
3996 }
3997 }
3998
3999 sd_event_exit(sd_event_source_get_event(s), 0);
4000 return 0;
4001 }
4002
4003 static int determine_names(void) {
4004 int r;
4005
4006 if (!arg_image && !arg_directory) {
4007 if (arg_machine) {
4008 _cleanup_(image_unrefp) Image *i = NULL;
4009
4010 r = image_find(arg_machine, &i);
4011 if (r < 0)
4012 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
4013 else if (r == 0) {
4014 log_error("No image for machine '%s': %m", arg_machine);
4015 return -ENOENT;
4016 }
4017
4018 if (i->type == IMAGE_RAW)
4019 r = set_sanitized_path(&arg_image, i->path);
4020 else
4021 r = set_sanitized_path(&arg_directory, i->path);
4022 if (r < 0)
4023 return log_error_errno(r, "Invalid image directory: %m");
4024
4025 if (!arg_ephemeral)
4026 arg_read_only = arg_read_only || i->read_only;
4027 } else
4028 arg_directory = get_current_dir_name();
4029
4030 if (!arg_directory && !arg_machine) {
4031 log_error("Failed to determine path, please use -D or -i.");
4032 return -EINVAL;
4033 }
4034 }
4035
4036 if (!arg_machine) {
4037 if (arg_directory && path_equal(arg_directory, "/"))
4038 arg_machine = gethostname_malloc();
4039 else
4040 arg_machine = strdup(basename(arg_image ?: arg_directory));
4041
4042 if (!arg_machine)
4043 return log_oom();
4044
4045 hostname_cleanup(arg_machine);
4046 if (!machine_name_is_valid(arg_machine)) {
4047 log_error("Failed to determine machine name automatically, please use -M.");
4048 return -EINVAL;
4049 }
4050
4051 if (arg_ephemeral) {
4052 char *b;
4053
4054 /* Add a random suffix when this is an
4055 * ephemeral machine, so that we can run many
4056 * instances at once without manually having
4057 * to specify -M each time. */
4058
4059 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4060 return log_oom();
4061
4062 free(arg_machine);
4063 arg_machine = b;
4064 }
4065 }
4066
4067 return 0;
4068 }
4069
4070 static int determine_uid_shift(const char *directory) {
4071 int r;
4072
4073 if (!arg_userns) {
4074 arg_uid_shift = 0;
4075 return 0;
4076 }
4077
4078 if (arg_uid_shift == UID_INVALID) {
4079 struct stat st;
4080
4081 r = stat(directory, &st);
4082 if (r < 0)
4083 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4084
4085 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4086
4087 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4088 log_error("UID and GID base of %s don't match.", directory);
4089 return -EINVAL;
4090 }
4091
4092 arg_uid_range = UINT32_C(0x10000);
4093 }
4094
4095 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4096 log_error("UID base too high for UID range.");
4097 return -EINVAL;
4098 }
4099
4100 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4101 return 0;
4102 }
4103
4104 static int inner_child(
4105 Barrier *barrier,
4106 const char *directory,
4107 bool secondary,
4108 int kmsg_socket,
4109 int rtnl_socket,
4110 FDSet *fds,
4111 int argc,
4112 char *argv[]) {
4113
4114 _cleanup_free_ char *home = NULL;
4115 unsigned n_env = 2;
4116 const char *envp[] = {
4117 "PATH=" DEFAULT_PATH_SPLIT_USR,
4118 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4119 NULL, /* TERM */
4120 NULL, /* HOME */
4121 NULL, /* USER */
4122 NULL, /* LOGNAME */
4123 NULL, /* container_uuid */
4124 NULL, /* LISTEN_FDS */
4125 NULL, /* LISTEN_PID */
4126 NULL
4127 };
4128
4129 _cleanup_strv_free_ char **env_use = NULL;
4130 int r;
4131
4132 assert(barrier);
4133 assert(directory);
4134 assert(kmsg_socket >= 0);
4135
4136 if (arg_userns) {
4137 /* Tell the parent, that it now can write the UID map. */
4138 (void) barrier_place(barrier); /* #1 */
4139
4140 /* Wait until the parent wrote the UID map */
4141 if (!barrier_place_and_sync(barrier)) { /* #2 */
4142 log_error("Parent died too early");
4143 return -ESRCH;
4144 }
4145 }
4146
4147 r = mount_all(NULL, true);
4148 if (r < 0)
4149 return r;
4150
4151 /* Wait until we are cgroup-ified, so that we
4152 * can mount the right cgroup path writable */
4153 if (!barrier_place_and_sync(barrier)) { /* #3 */
4154 log_error("Parent died too early");
4155 return -ESRCH;
4156 }
4157
4158 r = mount_systemd_cgroup_writable("");
4159 if (r < 0)
4160 return r;
4161
4162 r = reset_uid_gid();
4163 if (r < 0)
4164 return log_error_errno(r, "Couldn't become new root: %m");
4165
4166 r = setup_boot_id(NULL);
4167 if (r < 0)
4168 return r;
4169
4170 r = setup_kmsg(NULL, kmsg_socket);
4171 if (r < 0)
4172 return r;
4173 kmsg_socket = safe_close(kmsg_socket);
4174
4175 umask(0022);
4176
4177 if (setsid() < 0)
4178 return log_error_errno(errno, "setsid() failed: %m");
4179
4180 if (arg_private_network)
4181 loopback_setup();
4182
4183 r = send_rtnl(rtnl_socket);
4184 if (r < 0)
4185 return r;
4186 rtnl_socket = safe_close(rtnl_socket);
4187
4188 if (drop_capabilities() < 0)
4189 return log_error_errno(errno, "drop_capabilities() failed: %m");
4190
4191 setup_hostname();
4192
4193 if (arg_personality != PERSONALITY_INVALID) {
4194 if (personality(arg_personality) < 0)
4195 return log_error_errno(errno, "personality() failed: %m");
4196 } else if (secondary) {
4197 if (personality(PER_LINUX32) < 0)
4198 return log_error_errno(errno, "personality() failed: %m");
4199 }
4200
4201 #ifdef HAVE_SELINUX
4202 if (arg_selinux_context)
4203 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4204 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4205 #endif
4206
4207 r = change_uid_gid(&home);
4208 if (r < 0)
4209 return r;
4210
4211 envp[n_env] = strv_find_prefix(environ, "TERM=");
4212 if (envp[n_env])
4213 n_env ++;
4214
4215 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4216 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4217 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4218 return log_oom();
4219
4220 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4221 char as_uuid[37];
4222
4223 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4224 return log_oom();
4225 }
4226
4227 if (fdset_size(fds) > 0) {
4228 r = fdset_cloexec(fds, false);
4229 if (r < 0)
4230 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4231
4232 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4233 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4234 return log_oom();
4235 }
4236
4237 env_use = strv_env_merge(2, envp, arg_setenv);
4238 if (!env_use)
4239 return log_oom();
4240
4241 /* Let the parent know that we are ready and
4242 * wait until the parent is ready with the
4243 * setup, too... */
4244 if (!barrier_place_and_sync(barrier)) { /* #4 */
4245 log_error("Parent died too early");
4246 return -ESRCH;
4247 }
4248
4249 /* Now, explicitly close the log, so that we
4250 * then can close all remaining fds. Closing
4251 * the log explicitly first has the benefit
4252 * that the logging subsystem knows about it,
4253 * and is thus ready to be reopened should we
4254 * need it again. Note that the other fds
4255 * closed here are at least the locking and
4256 * barrier fds. */
4257 log_close();
4258 (void) fdset_close_others(fds);
4259
4260 if (arg_boot) {
4261 char **a;
4262 size_t m;
4263
4264 /* Automatically search for the init system */
4265
4266 m = 1 + argc - optind;
4267 a = newa(char*, m + 1);
4268 memcpy(a + 1, argv + optind, m * sizeof(char*));
4269
4270 a[0] = (char*) "/usr/lib/systemd/systemd";
4271 execve(a[0], a, env_use);
4272
4273 a[0] = (char*) "/lib/systemd/systemd";
4274 execve(a[0], a, env_use);
4275
4276 a[0] = (char*) "/sbin/init";
4277 execve(a[0], a, env_use);
4278 } else if (argc > optind)
4279 execvpe(argv[optind], argv + optind, env_use);
4280 else {
4281 chdir(home ? home : "/root");
4282 execle("/bin/bash", "-bash", NULL, env_use);
4283 execle("/bin/sh", "-sh", NULL, env_use);
4284 }
4285
4286 (void) log_open();
4287 return log_error_errno(errno, "execv() failed: %m");
4288 }
4289
4290 static int outer_child(
4291 Barrier *barrier,
4292 const char *directory,
4293 const char *console,
4294 const char *root_device, bool root_device_rw,
4295 const char *home_device, bool home_device_rw,
4296 const char *srv_device, bool srv_device_rw,
4297 bool interactive,
4298 bool secondary,
4299 int pid_socket,
4300 int kmsg_socket,
4301 int rtnl_socket,
4302 int uid_shift_socket,
4303 FDSet *fds,
4304 int argc,
4305 char *argv[]) {
4306
4307 pid_t pid;
4308 ssize_t l;
4309 int r;
4310
4311 assert(barrier);
4312 assert(directory);
4313 assert(console);
4314 assert(pid_socket >= 0);
4315 assert(kmsg_socket >= 0);
4316
4317 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4318 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4319
4320 if (interactive) {
4321 close_nointr(STDIN_FILENO);
4322 close_nointr(STDOUT_FILENO);
4323 close_nointr(STDERR_FILENO);
4324
4325 r = open_terminal(console, O_RDWR);
4326 if (r != STDIN_FILENO) {
4327 if (r >= 0) {
4328 safe_close(r);
4329 r = -EINVAL;
4330 }
4331
4332 return log_error_errno(r, "Failed to open console: %m");
4333 }
4334
4335 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4336 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4337 return log_error_errno(errno, "Failed to duplicate console: %m");
4338 }
4339
4340 r = reset_audit_loginuid();
4341 if (r < 0)
4342 return r;
4343
4344 /* Mark everything as slave, so that we still
4345 * receive mounts from the real root, but don't
4346 * propagate mounts to the real root. */
4347 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4348 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4349
4350 r = mount_devices(directory,
4351 root_device, root_device_rw,
4352 home_device, home_device_rw,
4353 srv_device, srv_device_rw);
4354 if (r < 0)
4355 return r;
4356
4357 r = determine_uid_shift(directory);
4358 if (r < 0)
4359 return r;
4360
4361 if (arg_userns) {
4362 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4363 if (l < 0)
4364 return log_error_errno(errno, "Failed to send UID shift: %m");
4365 if (l != sizeof(arg_uid_shift)) {
4366 log_error("Short write while sending UID shift.");
4367 return -EIO;
4368 }
4369 }
4370
4371 /* Turn directory into bind mount */
4372 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4373 return log_error_errno(errno, "Failed to make bind mount: %m");
4374
4375 r = setup_volatile(directory);
4376 if (r < 0)
4377 return r;
4378
4379 r = setup_volatile_state(directory);
4380 if (r < 0)
4381 return r;
4382
4383 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4384 if (r < 0)
4385 return r;
4386
4387 if (arg_read_only) {
4388 r = bind_remount_recursive(directory, true);
4389 if (r < 0)
4390 return log_error_errno(r, "Failed to make tree read-only: %m");
4391 }
4392
4393 r = mount_all(directory, false);
4394 if (r < 0)
4395 return r;
4396
4397 if (copy_devnodes(directory) < 0)
4398 return r;
4399
4400 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4401
4402 if (setup_pts(directory) < 0)
4403 return r;
4404
4405 r = setup_propagate(directory);
4406 if (r < 0)
4407 return r;
4408
4409 r = setup_dev_console(directory, console);
4410 if (r < 0)
4411 return r;
4412
4413 r = setup_seccomp();
4414 if (r < 0)
4415 return r;
4416
4417 r = setup_timezone(directory);
4418 if (r < 0)
4419 return r;
4420
4421 r = setup_resolv_conf(directory);
4422 if (r < 0)
4423 return r;
4424
4425 r = setup_journal(directory);
4426 if (r < 0)
4427 return r;
4428
4429 r = mount_custom(directory);
4430 if (r < 0)
4431 return r;
4432
4433 r = mount_cgroup(directory);
4434 if (r < 0)
4435 return r;
4436
4437 r = mount_move_root(directory);
4438 if (r < 0)
4439 return log_error_errno(r, "Failed to move root directory: %m");
4440
4441 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4442 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4443 (arg_private_network ? CLONE_NEWNET : 0) |
4444 (arg_userns ? CLONE_NEWUSER : 0),
4445 NULL);
4446 if (pid < 0)
4447 return log_error_errno(errno, "Failed to fork inner child: %m");
4448
4449 if (pid == 0) {
4450 pid_socket = safe_close(pid_socket);
4451 uid_shift_socket = safe_close(uid_shift_socket);
4452
4453 /* The inner child has all namespaces that are
4454 * requested, so that we all are owned by the user if
4455 * user namespaces are turned on. */
4456
4457 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4458 if (r < 0)
4459 _exit(EXIT_FAILURE);
4460
4461 _exit(EXIT_SUCCESS);
4462 }
4463
4464 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4465 if (l < 0)
4466 return log_error_errno(errno, "Failed to send PID: %m");
4467 if (l != sizeof(pid)) {
4468 log_error("Short write while sending PID.");
4469 return -EIO;
4470 }
4471
4472 pid_socket = safe_close(pid_socket);
4473
4474 return 0;
4475 }
4476
4477 static int setup_uid_map(pid_t pid) {
4478 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4479 int r;
4480
4481 assert(pid > 1);
4482
4483 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4484 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4485 r = write_string_file(uid_map, line, 0);
4486 if (r < 0)
4487 return log_error_errno(r, "Failed to write UID map: %m");
4488
4489 /* We always assign the same UID and GID ranges */
4490 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4491 r = write_string_file(uid_map, line, 0);
4492 if (r < 0)
4493 return log_error_errno(r, "Failed to write GID map: %m");
4494
4495 return 0;
4496 }
4497
4498 static int chown_cgroup(pid_t pid) {
4499 _cleanup_free_ char *path = NULL, *fs = NULL;
4500 _cleanup_close_ int fd = -1;
4501 const char *fn;
4502 int r;
4503
4504 r = cg_pid_get_path(NULL, pid, &path);
4505 if (r < 0)
4506 return log_error_errno(r, "Failed to get container cgroup path: %m");
4507
4508 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4509 if (r < 0)
4510 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4511
4512 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4513 if (fd < 0)
4514 return log_error_errno(errno, "Failed to open %s: %m", fs);
4515
4516 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4517 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4518 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4519
4520 return 0;
4521 }
4522
4523 int main(int argc, char *argv[]) {
4524
4525 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4526 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4527 _cleanup_close_ int master = -1, image_fd = -1;
4528 _cleanup_fdset_free_ FDSet *fds = NULL;
4529 int r, n_fd_passed, loop_nr = -1;
4530 char veth_name[IFNAMSIZ];
4531 bool secondary = false, remove_subvol = false;
4532 sigset_t mask_chld;
4533 pid_t pid = 0;
4534 int ret = EXIT_SUCCESS;
4535 union in_addr_union exposed = {};
4536 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4537 bool interactive;
4538
4539 log_parse_environment();
4540 log_open();
4541
4542 r = parse_argv(argc, argv);
4543 if (r <= 0)
4544 goto finish;
4545
4546 r = determine_names();
4547 if (r < 0)
4548 goto finish;
4549
4550 if (geteuid() != 0) {
4551 log_error("Need to be root.");
4552 r = -EPERM;
4553 goto finish;
4554 }
4555
4556 n_fd_passed = sd_listen_fds(false);
4557 if (n_fd_passed > 0) {
4558 r = fdset_new_listen_fds(&fds, false);
4559 if (r < 0) {
4560 log_error_errno(r, "Failed to collect file descriptors: %m");
4561 goto finish;
4562 }
4563 }
4564
4565 if (arg_directory) {
4566 assert(!arg_image);
4567
4568 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4569 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4570 r = -EINVAL;
4571 goto finish;
4572 }
4573
4574 if (arg_ephemeral) {
4575 _cleanup_free_ char *np = NULL;
4576
4577 /* If the specified path is a mount point we
4578 * generate the new snapshot immediately
4579 * inside it under a random name. However if
4580 * the specified is not a mount point we
4581 * create the new snapshot in the parent
4582 * directory, just next to it. */
4583 r = path_is_mount_point(arg_directory, 0);
4584 if (r < 0) {
4585 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4586 goto finish;
4587 }
4588 if (r > 0)
4589 r = tempfn_random_child(arg_directory, "machine.", &np);
4590 else
4591 r = tempfn_random(arg_directory, "machine.", &np);
4592 if (r < 0) {
4593 log_error_errno(r, "Failed to generate name for snapshot: %m");
4594 goto finish;
4595 }
4596
4597 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4598 if (r < 0) {
4599 log_error_errno(r, "Failed to lock %s: %m", np);
4600 goto finish;
4601 }
4602
4603 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4604 if (r < 0) {
4605 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4606 goto finish;
4607 }
4608
4609 free(arg_directory);
4610 arg_directory = np;
4611 np = NULL;
4612
4613 remove_subvol = true;
4614
4615 } else {
4616 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4617 if (r == -EBUSY) {
4618 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4619 goto finish;
4620 }
4621 if (r < 0) {
4622 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4623 return r;
4624 }
4625
4626 if (arg_template) {
4627 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4628 if (r == -EEXIST) {
4629 if (!arg_quiet)
4630 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4631 } else if (r < 0) {
4632 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4633 goto finish;
4634 } else {
4635 if (!arg_quiet)
4636 log_info("Populated %s from template %s.", arg_directory, arg_template);
4637 }
4638 }
4639 }
4640
4641 if (arg_boot) {
4642 if (path_is_os_tree(arg_directory) <= 0) {
4643 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4644 r = -EINVAL;
4645 goto finish;
4646 }
4647 } else {
4648 const char *p;
4649
4650 p = strjoina(arg_directory,
4651 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4652 if (access(p, F_OK) < 0) {
4653 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
4654 r = -EINVAL;
4655 goto finish;
4656 }
4657 }
4658
4659 } else {
4660 char template[] = "/tmp/nspawn-root-XXXXXX";
4661
4662 assert(arg_image);
4663 assert(!arg_template);
4664
4665 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4666 if (r == -EBUSY) {
4667 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4668 goto finish;
4669 }
4670 if (r < 0) {
4671 r = log_error_errno(r, "Failed to create image lock: %m");
4672 goto finish;
4673 }
4674
4675 if (!mkdtemp(template)) {
4676 log_error_errno(errno, "Failed to create temporary directory: %m");
4677 r = -errno;
4678 goto finish;
4679 }
4680
4681 arg_directory = strdup(template);
4682 if (!arg_directory) {
4683 r = log_oom();
4684 goto finish;
4685 }
4686
4687 image_fd = setup_image(&device_path, &loop_nr);
4688 if (image_fd < 0) {
4689 r = image_fd;
4690 goto finish;
4691 }
4692
4693 r = dissect_image(image_fd,
4694 &root_device, &root_device_rw,
4695 &home_device, &home_device_rw,
4696 &srv_device, &srv_device_rw,
4697 &secondary);
4698 if (r < 0)
4699 goto finish;
4700 }
4701
4702 r = custom_mounts_prepare();
4703 if (r < 0)
4704 goto finish;
4705
4706 interactive =
4707 isatty(STDIN_FILENO) > 0 &&
4708 isatty(STDOUT_FILENO) > 0;
4709
4710 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4711 if (master < 0) {
4712 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4713 goto finish;
4714 }
4715
4716 r = ptsname_malloc(master, &console);
4717 if (r < 0) {
4718 r = log_error_errno(r, "Failed to determine tty name: %m");
4719 goto finish;
4720 }
4721
4722 if (unlockpt(master) < 0) {
4723 r = log_error_errno(errno, "Failed to unlock tty: %m");
4724 goto finish;
4725 }
4726
4727 if (!arg_quiet)
4728 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4729 arg_machine, arg_image ?: arg_directory);
4730
4731 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4732
4733 assert_se(sigemptyset(&mask_chld) == 0);
4734 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4735
4736 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4737 r = log_error_errno(errno, "Failed to become subreaper: %m");
4738 goto finish;
4739 }
4740
4741 for (;;) {
4742 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4743 uid_shift_socket_pair[2] = { -1, -1 };
4744 ContainerStatus container_status;
4745 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4746 static const struct sigaction sa = {
4747 .sa_handler = nop_handler,
4748 .sa_flags = SA_NOCLDSTOP,
4749 };
4750 int ifi = 0;
4751 ssize_t l;
4752 _cleanup_event_unref_ sd_event *event = NULL;
4753 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4754 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4755 char last_char = 0;
4756
4757 r = barrier_create(&barrier);
4758 if (r < 0) {
4759 log_error_errno(r, "Cannot initialize IPC barrier: %m");
4760 goto finish;
4761 }
4762
4763 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4764 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4765 goto finish;
4766 }
4767
4768 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4769 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4770 goto finish;
4771 }
4772
4773 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4774 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4775 goto finish;
4776 }
4777
4778 if (arg_userns)
4779 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4780 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4781 goto finish;
4782 }
4783
4784 /* Child can be killed before execv(), so handle SIGCHLD
4785 * in order to interrupt parent's blocking calls and
4786 * give it a chance to call wait() and terminate. */
4787 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4788 if (r < 0) {
4789 r = log_error_errno(errno, "Failed to change the signal mask: %m");
4790 goto finish;
4791 }
4792
4793 r = sigaction(SIGCHLD, &sa, NULL);
4794 if (r < 0) {
4795 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4796 goto finish;
4797 }
4798
4799 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
4800 if (pid < 0) {
4801 if (errno == EINVAL)
4802 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4803 else
4804 r = log_error_errno(errno, "clone() failed: %m");
4805
4806 goto finish;
4807 }
4808
4809 if (pid == 0) {
4810 /* The outer child only has a file system namespace. */
4811 barrier_set_role(&barrier, BARRIER_CHILD);
4812
4813 master = safe_close(master);
4814
4815 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4816 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4817 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4818 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4819
4820 (void) reset_all_signal_handlers();
4821 (void) reset_signal_mask();
4822
4823 r = outer_child(&barrier,
4824 arg_directory,
4825 console,
4826 root_device, root_device_rw,
4827 home_device, home_device_rw,
4828 srv_device, srv_device_rw,
4829 interactive,
4830 secondary,
4831 pid_socket_pair[1],
4832 kmsg_socket_pair[1],
4833 rtnl_socket_pair[1],
4834 uid_shift_socket_pair[1],
4835 fds,
4836 argc, argv);
4837 if (r < 0)
4838 _exit(EXIT_FAILURE);
4839
4840 _exit(EXIT_SUCCESS);
4841 }
4842
4843 barrier_set_role(&barrier, BARRIER_PARENT);
4844
4845 fdset_free(fds);
4846 fds = NULL;
4847
4848 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4849 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4850 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4851
4852 /* Wait for the outer child. */
4853 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4854 if (r < 0)
4855 goto finish;
4856 if (r != 0) {
4857 r = -EIO;
4858 goto finish;
4859 }
4860 pid = 0;
4861
4862 /* And now retrieve the PID of the inner child. */
4863 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4864 if (l < 0) {
4865 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4866 goto finish;
4867 }
4868 if (l != sizeof(pid)) {
4869 log_error("Short read while reading inner child PID: %m");
4870 r = EIO;
4871 goto finish;
4872 }
4873
4874 log_debug("Init process invoked as PID " PID_FMT, pid);
4875
4876 if (arg_userns) {
4877 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4878 log_error("Child died too early.");
4879 r = -ESRCH;
4880 goto finish;
4881 }
4882
4883 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4884 if (l < 0) {
4885 r = log_error_errno(errno, "Failed to read UID shift: %m");
4886 goto finish;
4887 }
4888 if (l != sizeof(arg_uid_shift)) {
4889 log_error("Short read while reading UID shift: %m");
4890 r = EIO;
4891 goto finish;
4892 }
4893
4894 r = setup_uid_map(pid);
4895 if (r < 0)
4896 goto finish;
4897
4898 (void) barrier_place(&barrier); /* #2 */
4899 }
4900
4901 r = move_network_interfaces(pid);
4902 if (r < 0)
4903 goto finish;
4904
4905 r = setup_veth(pid, veth_name, &ifi);
4906 if (r < 0)
4907 goto finish;
4908
4909 r = setup_bridge(veth_name, &ifi);
4910 if (r < 0)
4911 goto finish;
4912
4913 r = setup_macvlan(pid);
4914 if (r < 0)
4915 goto finish;
4916
4917 r = setup_ipvlan(pid);
4918 if (r < 0)
4919 goto finish;
4920
4921 r = register_machine(pid, ifi);
4922 if (r < 0)
4923 goto finish;
4924
4925 r = chown_cgroup(pid);
4926 if (r < 0)
4927 goto finish;
4928
4929 /* Notify the child that the parent is ready with all
4930 * its setup (including cgroup-ification), and that
4931 * the child can now hand over control to the code to
4932 * run inside the container. */
4933 (void) barrier_place(&barrier); /* #3 */
4934
4935 /* Block SIGCHLD here, before notifying child.
4936 * process_pty() will handle it with the other signals. */
4937 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4938
4939 /* Reset signal to default */
4940 r = default_signals(SIGCHLD, -1);
4941 if (r < 0) {
4942 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4943 goto finish;
4944 }
4945
4946 /* Let the child know that we are ready and wait that the child is completely ready now. */
4947 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4948 log_error("Client died too early.");
4949 r = -ESRCH;
4950 goto finish;
4951 }
4952
4953 sd_notifyf(false,
4954 "READY=1\n"
4955 "STATUS=Container running.\n"
4956 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4957
4958 r = sd_event_new(&event);
4959 if (r < 0) {
4960 log_error_errno(r, "Failed to get default event source: %m");
4961 goto finish;
4962 }
4963
4964 if (arg_kill_signal > 0) {
4965 /* Try to kill the init system on SIGINT or SIGTERM */
4966 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4967 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4968 } else {
4969 /* Immediately exit */
4970 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4971 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4972 }
4973
4974 /* simply exit on sigchld */
4975 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4976
4977 if (arg_expose_ports) {
4978 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4979 if (r < 0)
4980 goto finish;
4981
4982 (void) expose_ports(rtnl, &exposed);
4983 }
4984
4985 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4986
4987 r = pty_forward_new(event, master, true, !interactive, &forward);
4988 if (r < 0) {
4989 log_error_errno(r, "Failed to create PTY forwarder: %m");
4990 goto finish;
4991 }
4992
4993 r = sd_event_loop(event);
4994 if (r < 0) {
4995 log_error_errno(r, "Failed to run event loop: %m");
4996 goto finish;
4997 }
4998
4999 pty_forward_get_last_char(forward, &last_char);
5000
5001 forward = pty_forward_free(forward);
5002
5003 if (!arg_quiet && last_char != '\n')
5004 putc('\n', stdout);
5005
5006 /* Kill if it is not dead yet anyway */
5007 terminate_machine(pid);
5008
5009 /* Normally redundant, but better safe than sorry */
5010 kill(pid, SIGKILL);
5011
5012 r = wait_for_container(pid, &container_status);
5013 pid = 0;
5014
5015 if (r < 0)
5016 /* We failed to wait for the container, or the
5017 * container exited abnormally */
5018 goto finish;
5019 else if (r > 0 || container_status == CONTAINER_TERMINATED){
5020 /* The container exited with a non-zero
5021 * status, or with zero status and no reboot
5022 * was requested. */
5023 ret = r;
5024 break;
5025 }
5026
5027 /* CONTAINER_REBOOTED, loop again */
5028
5029 if (arg_keep_unit) {
5030 /* Special handling if we are running as a
5031 * service: instead of simply restarting the
5032 * machine we want to restart the entire
5033 * service, so let's inform systemd about this
5034 * with the special exit code 133. The service
5035 * file uses RestartForceExitStatus=133 so
5036 * that this results in a full nspawn
5037 * restart. This is necessary since we might
5038 * have cgroup parameters set we want to have
5039 * flushed out. */
5040 ret = 133;
5041 r = 0;
5042 break;
5043 }
5044
5045 flush_ports(&exposed);
5046 }
5047
5048 finish:
5049 sd_notify(false,
5050 "STOPPING=1\n"
5051 "STATUS=Terminating...");
5052
5053 if (pid > 0)
5054 kill(pid, SIGKILL);
5055
5056 /* Try to flush whatever is still queued in the pty */
5057 if (master >= 0)
5058 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5059
5060 loop_remove(loop_nr, &image_fd);
5061
5062 if (remove_subvol && arg_directory) {
5063 int k;
5064
5065 k = btrfs_subvol_remove(arg_directory, true);
5066 if (k < 0)
5067 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5068 }
5069
5070 if (arg_machine) {
5071 const char *p;
5072
5073 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5074 (void) rm_rf(p, REMOVE_ROOT);
5075 }
5076
5077 free(arg_directory);
5078 free(arg_template);
5079 free(arg_image);
5080 free(arg_machine);
5081 free(arg_user);
5082 strv_free(arg_setenv);
5083 strv_free(arg_network_interfaces);
5084 strv_free(arg_network_macvlan);
5085 strv_free(arg_network_ipvlan);
5086 custom_mount_free_all();
5087
5088 flush_ports(&exposed);
5089
5090 while (arg_expose_ports) {
5091 ExposePort *p = arg_expose_ports;
5092 LIST_REMOVE(ports, arg_expose_ports, p);
5093 free(p);
5094 }
5095
5096 return r < 0 ? EXIT_FAILURE : ret;
5097 }