]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #1043 from phomes/master
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* How a container run ended. NOTE(review): the code that produces and
 * consumes these values is outside this part of the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* the container terminated */
        CONTAINER_REBOOTED      /* the container rebooted */
} ContainerStatus;
116
/* Journal linking mode selected with --link-journal= (or -j). The
 * "try-" variants of the option map to LINK_GUEST/LINK_HOST with
 * arg_link_journal_try set in addition. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
123
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,            /* default, or --volatile=<false boolean> */
        VOLATILE_YES,           /* bare --volatile, or --volatile=<true boolean> */
        VOLATILE_STATE,         /* --volatile=state */
} Volatile;
129
/* Kind of a user-requested mount. The numeric order is also used as a
 * tie-breaker by custom_mount_compare() when two mounts share the same
 * destination. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND,      /* --bind= / --bind-ro= */
        CUSTOM_MOUNT_TMPFS,     /* --tmpfs= */
        CUSTOM_MOUNT_OVERLAY,   /* --overlay= / --overlay-ro= */
} CustomMountType;
135
136 typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144 } CustomMount;
145
146 static char *arg_directory = NULL;
147 static char *arg_template = NULL;
148 static char *arg_user = NULL;
149 static sd_id128_t arg_uuid = {};
150 static char *arg_machine = NULL;
151 static const char *arg_selinux_context = NULL;
152 static const char *arg_selinux_apifs_context = NULL;
153 static const char *arg_slice = NULL;
154 static bool arg_private_network = false;
155 static bool arg_read_only = false;
156 static bool arg_boot = false;
157 static bool arg_ephemeral = false;
158 static LinkJournal arg_link_journal = LINK_AUTO;
159 static bool arg_link_journal_try = false;
160 static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
182 (1ULL << CAP_SYS_RESOURCE) |
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
187 static CustomMount *arg_custom_mounts = NULL;
188 static unsigned arg_n_custom_mounts = 0;
189 static char **arg_setenv = NULL;
190 static bool arg_quiet = false;
191 static bool arg_share_system = false;
192 static bool arg_register = true;
193 static bool arg_keep_unit = false;
194 static char **arg_network_interfaces = NULL;
195 static char **arg_network_macvlan = NULL;
196 static char **arg_network_ipvlan = NULL;
197 static bool arg_network_veth = false;
198 static const char *arg_network_bridge = NULL;
199 static unsigned long arg_personality = PERSONALITY_INVALID;
200 static char *arg_image = NULL;
201 static Volatile arg_volatile = VOLATILE_NO;
202 static ExposePort *arg_expose_ports = NULL;
203 static char **arg_property = NULL;
204 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205 static bool arg_userns = false;
206 static int arg_kill_signal = 0;
207
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
239 " and container\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
261 " the container\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
266 " the container\n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name);
276 }
277
278 static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292 }
293
294 static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 arg_custom_mounts = mfree(arg_custom_mounts);
313 arg_n_custom_mounts = 0;
314 }
315
316 static int custom_mount_compare(const void *a, const void *b) {
317 const CustomMount *x = a, *y = b;
318 int r;
319
320 r = path_compare(x->destination, y->destination);
321 if (r != 0)
322 return r;
323
324 if (x->type < y->type)
325 return -1;
326 if (x->type > y->type)
327 return 1;
328
329 return 0;
330 }
331
332 static int custom_mounts_prepare(void) {
333 unsigned i;
334 int r;
335
336 /* Ensure the mounts are applied prefix first. */
337 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
338
339 /* Allocate working directories for the overlay file systems that need it */
340 for (i = 0; i < arg_n_custom_mounts; i++) {
341 CustomMount *m = &arg_custom_mounts[i];
342
343 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
344 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
345 return -EINVAL;
346 }
347
348 if (m->type != CUSTOM_MOUNT_OVERLAY)
349 continue;
350
351 if (m->work_dir)
352 continue;
353
354 if (m->read_only)
355 continue;
356
357 r = tempfn_random(m->source, NULL, &m->work_dir);
358 if (r < 0)
359 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
360 }
361
362 return 0;
363 }
364
/* Replaces *b with a cleaned-up, newly allocated version of path.
 *
 * The path is canonicalized if possible; if it does not exist (ENOENT) it
 * is only made absolute relative to the current working directory
 * instead. The result is passed through path_kill_slashes() before it is
 * stored, and the previous value of *b is freed.
 *
 * Returns 0 on success, or a negative errno-style value, in which case
 * *b is left unchanged. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Nothing there yet, so settle for an absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
385
386 static int parse_argv(int argc, char *argv[]) {
387
388 enum {
389 ARG_VERSION = 0x100,
390 ARG_PRIVATE_NETWORK,
391 ARG_UUID,
392 ARG_READ_ONLY,
393 ARG_CAPABILITY,
394 ARG_DROP_CAPABILITY,
395 ARG_LINK_JOURNAL,
396 ARG_BIND,
397 ARG_BIND_RO,
398 ARG_TMPFS,
399 ARG_OVERLAY,
400 ARG_OVERLAY_RO,
401 ARG_SETENV,
402 ARG_SHARE_SYSTEM,
403 ARG_REGISTER,
404 ARG_KEEP_UNIT,
405 ARG_NETWORK_INTERFACE,
406 ARG_NETWORK_MACVLAN,
407 ARG_NETWORK_IPVLAN,
408 ARG_NETWORK_BRIDGE,
409 ARG_PERSONALITY,
410 ARG_VOLATILE,
411 ARG_TEMPLATE,
412 ARG_PROPERTY,
413 ARG_PRIVATE_USERS,
414 ARG_KILL_SIGNAL,
415 };
416
417 static const struct option options[] = {
418 { "help", no_argument, NULL, 'h' },
419 { "version", no_argument, NULL, ARG_VERSION },
420 { "directory", required_argument, NULL, 'D' },
421 { "template", required_argument, NULL, ARG_TEMPLATE },
422 { "ephemeral", no_argument, NULL, 'x' },
423 { "user", required_argument, NULL, 'u' },
424 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
425 { "boot", no_argument, NULL, 'b' },
426 { "uuid", required_argument, NULL, ARG_UUID },
427 { "read-only", no_argument, NULL, ARG_READ_ONLY },
428 { "capability", required_argument, NULL, ARG_CAPABILITY },
429 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
430 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
431 { "bind", required_argument, NULL, ARG_BIND },
432 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
433 { "tmpfs", required_argument, NULL, ARG_TMPFS },
434 { "overlay", required_argument, NULL, ARG_OVERLAY },
435 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
436 { "machine", required_argument, NULL, 'M' },
437 { "slice", required_argument, NULL, 'S' },
438 { "setenv", required_argument, NULL, ARG_SETENV },
439 { "selinux-context", required_argument, NULL, 'Z' },
440 { "selinux-apifs-context", required_argument, NULL, 'L' },
441 { "quiet", no_argument, NULL, 'q' },
442 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
443 { "register", required_argument, NULL, ARG_REGISTER },
444 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
445 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
446 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
447 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
448 { "network-veth", no_argument, NULL, 'n' },
449 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
450 { "personality", required_argument, NULL, ARG_PERSONALITY },
451 { "image", required_argument, NULL, 'i' },
452 { "volatile", optional_argument, NULL, ARG_VOLATILE },
453 { "port", required_argument, NULL, 'p' },
454 { "property", required_argument, NULL, ARG_PROPERTY },
455 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
456 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
457 {}
458 };
459
460 int c, r;
461 uint64_t plus = 0, minus = 0;
462
463 assert(argc >= 0);
464 assert(argv);
465
466 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
467
468 switch (c) {
469
470 case 'h':
471 help();
472 return 0;
473
474 case ARG_VERSION:
475 puts(PACKAGE_STRING);
476 puts(SYSTEMD_FEATURES);
477 return 0;
478
479 case 'D':
480 r = set_sanitized_path(&arg_directory, optarg);
481 if (r < 0)
482 return log_error_errno(r, "Invalid root directory: %m");
483
484 break;
485
486 case ARG_TEMPLATE:
487 r = set_sanitized_path(&arg_template, optarg);
488 if (r < 0)
489 return log_error_errno(r, "Invalid template directory: %m");
490
491 break;
492
493 case 'i':
494 r = set_sanitized_path(&arg_image, optarg);
495 if (r < 0)
496 return log_error_errno(r, "Invalid image path: %m");
497
498 break;
499
500 case 'x':
501 arg_ephemeral = true;
502 break;
503
504 case 'u':
505 r = free_and_strdup(&arg_user, optarg);
506 if (r < 0)
507 return log_oom();
508
509 break;
510
511 case ARG_NETWORK_BRIDGE:
512 arg_network_bridge = optarg;
513
514 /* fall through */
515
516 case 'n':
517 arg_network_veth = true;
518 arg_private_network = true;
519 break;
520
521 case ARG_NETWORK_INTERFACE:
522 if (strv_extend(&arg_network_interfaces, optarg) < 0)
523 return log_oom();
524
525 arg_private_network = true;
526 break;
527
528 case ARG_NETWORK_MACVLAN:
529 if (strv_extend(&arg_network_macvlan, optarg) < 0)
530 return log_oom();
531
532 arg_private_network = true;
533 break;
534
535 case ARG_NETWORK_IPVLAN:
536 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
537 return log_oom();
538
539 /* fall through */
540
541 case ARG_PRIVATE_NETWORK:
542 arg_private_network = true;
543 break;
544
545 case 'b':
546 arg_boot = true;
547 break;
548
549 case ARG_UUID:
550 r = sd_id128_from_string(optarg, &arg_uuid);
551 if (r < 0) {
552 log_error("Invalid UUID: %s", optarg);
553 return r;
554 }
555 break;
556
557 case 'S':
558 arg_slice = optarg;
559 break;
560
561 case 'M':
562 if (isempty(optarg))
563 arg_machine = mfree(arg_machine);
564 else {
565 if (!machine_name_is_valid(optarg)) {
566 log_error("Invalid machine name: %s", optarg);
567 return -EINVAL;
568 }
569
570 r = free_and_strdup(&arg_machine, optarg);
571 if (r < 0)
572 return log_oom();
573
574 break;
575 }
576
577 case 'Z':
578 arg_selinux_context = optarg;
579 break;
580
581 case 'L':
582 arg_selinux_apifs_context = optarg;
583 break;
584
585 case ARG_READ_ONLY:
586 arg_read_only = true;
587 break;
588
589 case ARG_CAPABILITY:
590 case ARG_DROP_CAPABILITY: {
591 const char *state, *word;
592 size_t length;
593
594 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
595 _cleanup_free_ char *t;
596
597 t = strndup(word, length);
598 if (!t)
599 return log_oom();
600
601 if (streq(t, "all")) {
602 if (c == ARG_CAPABILITY)
603 plus = (uint64_t) -1;
604 else
605 minus = (uint64_t) -1;
606 } else {
607 int cap;
608
609 cap = capability_from_name(t);
610 if (cap < 0) {
611 log_error("Failed to parse capability %s.", t);
612 return -EINVAL;
613 }
614
615 if (c == ARG_CAPABILITY)
616 plus |= 1ULL << (uint64_t) cap;
617 else
618 minus |= 1ULL << (uint64_t) cap;
619 }
620 }
621
622 break;
623 }
624
625 case 'j':
626 arg_link_journal = LINK_GUEST;
627 arg_link_journal_try = true;
628 break;
629
630 case ARG_LINK_JOURNAL:
631 if (streq(optarg, "auto")) {
632 arg_link_journal = LINK_AUTO;
633 arg_link_journal_try = false;
634 } else if (streq(optarg, "no")) {
635 arg_link_journal = LINK_NO;
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "guest")) {
638 arg_link_journal = LINK_GUEST;
639 arg_link_journal_try = false;
640 } else if (streq(optarg, "host")) {
641 arg_link_journal = LINK_HOST;
642 arg_link_journal_try = false;
643 } else if (streq(optarg, "try-guest")) {
644 arg_link_journal = LINK_GUEST;
645 arg_link_journal_try = true;
646 } else if (streq(optarg, "try-host")) {
647 arg_link_journal = LINK_HOST;
648 arg_link_journal_try = true;
649 } else {
650 log_error("Failed to parse link journal mode %s", optarg);
651 return -EINVAL;
652 }
653
654 break;
655
656 case ARG_BIND:
657 case ARG_BIND_RO: {
658 const char *current = optarg;
659 _cleanup_free_ char *source = NULL, *destination = NULL;
660 CustomMount *m;
661
662 r = extract_many_words(&current, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
663 switch (r) {
664 case 1:
665 destination = strdup(source);
666 case 2:
667 break;
668 case -ENOMEM:
669 return log_oom();
670 default:
671 log_error("Invalid bind mount specification: %s", optarg);
672 return -EINVAL;
673 }
674
675 if (!source || !destination)
676 return log_oom();
677
678 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
679 log_error("Invalid bind mount specification: %s", optarg);
680 return -EINVAL;
681 }
682
683 m = custom_mount_add(CUSTOM_MOUNT_BIND);
684 if (!m)
685 return log_oom();
686
687 m->source = source;
688 m->destination = destination;
689 m->read_only = c == ARG_BIND_RO;
690
691 source = destination = NULL;
692
693 break;
694 }
695
696 case ARG_TMPFS: {
697 const char *current = optarg;
698 _cleanup_free_ char *path = NULL, *opts = NULL;
699 CustomMount *m;
700
701 r = extract_first_word(&current, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
702 if (r == -ENOMEM)
703 return log_oom();
704 else if (r < 0) {
705 log_error("Invalid tmpfs specification: %s", optarg);
706 return r;
707 }
708 if (r)
709 opts = strdup(current);
710 else
711 opts = strdup("mode=0755");
712
713 if (!path || !opts)
714 return log_oom();
715
716 if (!path_is_absolute(path)) {
717 log_error("Invalid tmpfs specification: %s", optarg);
718 return -EINVAL;
719 }
720
721 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
722 if (!m)
723 return log_oom();
724
725 m->destination = path;
726 m->options = opts;
727
728 path = opts = NULL;
729
730 break;
731 }
732
733 case ARG_OVERLAY:
734 case ARG_OVERLAY_RO: {
735 _cleanup_free_ char *upper = NULL, *destination = NULL;
736 _cleanup_strv_free_ char **lower = NULL;
737 CustomMount *m;
738 unsigned n = 0;
739 char **i;
740
741 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
742 if (r == -ENOMEM)
743 return log_oom();
744 else if (r < 0) {
745 log_error("Invalid overlay specification: %s", optarg);
746 return r;
747 }
748
749 STRV_FOREACH(i, lower) {
750 if (!path_is_absolute(*i)) {
751 log_error("Overlay path %s is not absolute.", *i);
752 return -EINVAL;
753 }
754
755 n++;
756 }
757
758 if (n < 2) {
759 log_error("--overlay= needs at least two colon-separated directories specified.");
760 return -EINVAL;
761 }
762
763 if (n == 2) {
764 /* If two parameters are specified,
765 * the first one is the lower, the
766 * second one the upper directory. And
767 * we'll also define the destination
768 * mount point the same as the upper. */
769 upper = lower[1];
770 lower[1] = NULL;
771
772 destination = strdup(upper);
773 if (!destination)
774 return log_oom();
775
776 } else {
777 upper = lower[n - 2];
778 destination = lower[n - 1];
779 lower[n - 2] = NULL;
780 }
781
782 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
783 if (!m)
784 return log_oom();
785
786 m->destination = destination;
787 m->source = upper;
788 m->lower = lower;
789 m->read_only = c == ARG_OVERLAY_RO;
790
791 upper = destination = NULL;
792 lower = NULL;
793
794 break;
795 }
796
797 case ARG_SETENV: {
798 char **n;
799
800 if (!env_assignment_is_valid(optarg)) {
801 log_error("Environment variable assignment '%s' is not valid.", optarg);
802 return -EINVAL;
803 }
804
805 n = strv_env_set(arg_setenv, optarg);
806 if (!n)
807 return log_oom();
808
809 strv_free(arg_setenv);
810 arg_setenv = n;
811 break;
812 }
813
814 case 'q':
815 arg_quiet = true;
816 break;
817
818 case ARG_SHARE_SYSTEM:
819 arg_share_system = true;
820 break;
821
822 case ARG_REGISTER:
823 r = parse_boolean(optarg);
824 if (r < 0) {
825 log_error("Failed to parse --register= argument: %s", optarg);
826 return r;
827 }
828
829 arg_register = r;
830 break;
831
832 case ARG_KEEP_UNIT:
833 arg_keep_unit = true;
834 break;
835
836 case ARG_PERSONALITY:
837
838 arg_personality = personality_from_string(optarg);
839 if (arg_personality == PERSONALITY_INVALID) {
840 log_error("Unknown or unsupported personality '%s'.", optarg);
841 return -EINVAL;
842 }
843
844 break;
845
846 case ARG_VOLATILE:
847
848 if (!optarg)
849 arg_volatile = VOLATILE_YES;
850 else {
851 r = parse_boolean(optarg);
852 if (r < 0) {
853 if (streq(optarg, "state"))
854 arg_volatile = VOLATILE_STATE;
855 else {
856 log_error("Failed to parse --volatile= argument: %s", optarg);
857 return r;
858 }
859 } else
860 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
861 }
862
863 break;
864
865 case 'p': {
866 const char *split, *e;
867 uint16_t container_port, host_port;
868 int protocol;
869 ExposePort *p;
870
871 if ((e = startswith(optarg, "tcp:")))
872 protocol = IPPROTO_TCP;
873 else if ((e = startswith(optarg, "udp:")))
874 protocol = IPPROTO_UDP;
875 else {
876 e = optarg;
877 protocol = IPPROTO_TCP;
878 }
879
880 split = strchr(e, ':');
881 if (split) {
882 char v[split - e + 1];
883
884 memcpy(v, e, split - e);
885 v[split - e] = 0;
886
887 r = safe_atou16(v, &host_port);
888 if (r < 0 || host_port <= 0) {
889 log_error("Failed to parse host port: %s", optarg);
890 return -EINVAL;
891 }
892
893 r = safe_atou16(split + 1, &container_port);
894 } else {
895 r = safe_atou16(e, &container_port);
896 host_port = container_port;
897 }
898
899 if (r < 0 || container_port <= 0) {
900 log_error("Failed to parse host port: %s", optarg);
901 return -EINVAL;
902 }
903
904 LIST_FOREACH(ports, p, arg_expose_ports) {
905 if (p->protocol == protocol && p->host_port == host_port) {
906 log_error("Duplicate port specification: %s", optarg);
907 return -EINVAL;
908 }
909 }
910
911 p = new(ExposePort, 1);
912 if (!p)
913 return log_oom();
914
915 p->protocol = protocol;
916 p->host_port = host_port;
917 p->container_port = container_port;
918
919 LIST_PREPEND(ports, arg_expose_ports, p);
920
921 break;
922 }
923
924 case ARG_PROPERTY:
925 if (strv_extend(&arg_property, optarg) < 0)
926 return log_oom();
927
928 break;
929
930 case ARG_PRIVATE_USERS:
931 if (optarg) {
932 _cleanup_free_ char *buffer = NULL;
933 const char *range, *shift;
934
935 range = strchr(optarg, ':');
936 if (range) {
937 buffer = strndup(optarg, range - optarg);
938 if (!buffer)
939 return log_oom();
940 shift = buffer;
941
942 range++;
943 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
944 log_error("Failed to parse UID range: %s", range);
945 return -EINVAL;
946 }
947 } else
948 shift = optarg;
949
950 if (parse_uid(shift, &arg_uid_shift) < 0) {
951 log_error("Failed to parse UID: %s", optarg);
952 return -EINVAL;
953 }
954 }
955
956 arg_userns = true;
957 break;
958
959 case ARG_KILL_SIGNAL:
960 arg_kill_signal = signal_from_string_try_harder(optarg);
961 if (arg_kill_signal < 0) {
962 log_error("Cannot parse signal: %s", optarg);
963 return -EINVAL;
964 }
965
966 break;
967
968 case '?':
969 return -EINVAL;
970
971 default:
972 assert_not_reached("Unhandled option");
973 }
974
975 if (arg_share_system)
976 arg_register = false;
977
978 if (arg_boot && arg_share_system) {
979 log_error("--boot and --share-system may not be combined.");
980 return -EINVAL;
981 }
982
983 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
984 log_error("--keep-unit may not be used when invoked from a user session.");
985 return -EINVAL;
986 }
987
988 if (arg_directory && arg_image) {
989 log_error("--directory= and --image= may not be combined.");
990 return -EINVAL;
991 }
992
993 if (arg_template && arg_image) {
994 log_error("--template= and --image= may not be combined.");
995 return -EINVAL;
996 }
997
998 if (arg_template && !(arg_directory || arg_machine)) {
999 log_error("--template= needs --directory= or --machine=.");
1000 return -EINVAL;
1001 }
1002
1003 if (arg_ephemeral && arg_template) {
1004 log_error("--ephemeral and --template= may not be combined.");
1005 return -EINVAL;
1006 }
1007
1008 if (arg_ephemeral && arg_image) {
1009 log_error("--ephemeral and --image= may not be combined.");
1010 return -EINVAL;
1011 }
1012
1013 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1014 log_error("--ephemeral and --link-journal= may not be combined.");
1015 return -EINVAL;
1016 }
1017
1018 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1019 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1020 return -EINVAL;
1021 }
1022
1023 if (arg_expose_ports && !arg_private_network) {
1024 log_error("Cannot use --port= without private networking.");
1025 return -EINVAL;
1026 }
1027
1028 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1029 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1030
1031 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1032
1033 if (arg_boot && arg_kill_signal <= 0)
1034 arg_kill_signal = SIGRTMIN+3;
1035
1036 return 1;
1037 }
1038
1039 static int tmpfs_patch_options(const char *options, char **ret) {
1040 char *buf = NULL;
1041
1042 if (arg_userns && arg_uid_shift != 0) {
1043 assert(arg_uid_shift != UID_INVALID);
1044
1045 if (options)
1046 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1047 else
1048 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1049 if (!buf)
1050 return -ENOMEM;
1051
1052 options = buf;
1053 }
1054
1055 #ifdef HAVE_SELINUX
1056 if (arg_selinux_apifs_context) {
1057 char *t;
1058
1059 if (options)
1060 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1061 else
1062 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1063 if (!t) {
1064 free(buf);
1065 return -ENOMEM;
1066 }
1067
1068 free(buf);
1069 buf = t;
1070 }
1071 #endif
1072
1073 *ret = buf;
1074 return !!buf;
1075 }
1076
1077 static int mount_all(const char *dest, bool userns) {
1078
1079 typedef struct MountPoint {
1080 const char *what;
1081 const char *where;
1082 const char *type;
1083 const char *options;
1084 unsigned long flags;
1085 bool fatal;
1086 bool userns;
1087 } MountPoint;
1088
1089 static const MountPoint mount_table[] = {
1090 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1091 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1092 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1093 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1094 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1095 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1096 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1097 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1098 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1099 #ifdef HAVE_SELINUX
1100 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1101 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
1102 #endif
1103 };
1104
1105 unsigned k;
1106 int r;
1107
1108 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1109 _cleanup_free_ char *where = NULL, *options = NULL;
1110 const char *o;
1111
1112 if (userns != mount_table[k].userns)
1113 continue;
1114
1115 where = prefix_root(dest, mount_table[k].where);
1116 if (!where)
1117 return log_oom();
1118
1119 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1120 if (r < 0 && r != -ENOENT)
1121 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1122
1123 /* Skip this entry if it is not a remount. */
1124 if (mount_table[k].what && r > 0)
1125 continue;
1126
1127 r = mkdir_p(where, 0755);
1128 if (r < 0) {
1129 if (mount_table[k].fatal)
1130 return log_error_errno(r, "Failed to create directory %s: %m", where);
1131
1132 log_warning_errno(r, "Failed to create directory %s: %m", where);
1133 continue;
1134 }
1135
1136 o = mount_table[k].options;
1137 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1138 r = tmpfs_patch_options(o, &options);
1139 if (r < 0)
1140 return log_oom();
1141 if (r > 0)
1142 o = options;
1143 }
1144
1145 if (mount(mount_table[k].what,
1146 where,
1147 mount_table[k].type,
1148 mount_table[k].flags,
1149 o) < 0) {
1150
1151 if (mount_table[k].fatal)
1152 return log_error_errno(errno, "mount(%s) failed: %m", where);
1153
1154 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1155 }
1156 }
1157
1158 return 0;
1159 }
1160
1161 static int mount_bind(const char *dest, CustomMount *m) {
1162 struct stat source_st, dest_st;
1163 const char *where;
1164 int r;
1165
1166 assert(m);
1167
1168 if (stat(m->source, &source_st) < 0)
1169 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1170
1171 where = prefix_roota(dest, m->destination);
1172
1173 if (stat(where, &dest_st) >= 0) {
1174 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1175 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1176 return -EINVAL;
1177 }
1178
1179 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1180 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1181 return -EINVAL;
1182 }
1183
1184 } else if (errno == ENOENT) {
1185 r = mkdir_parents_label(where, 0755);
1186 if (r < 0)
1187 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1188 } else {
1189 log_error_errno(errno, "Failed to stat %s: %m", where);
1190 return -errno;
1191 }
1192
1193 /* Create the mount point. Any non-directory file can be
1194 * mounted on any non-directory file (regular, fifo, socket,
1195 * char, block).
1196 */
1197 if (S_ISDIR(source_st.st_mode))
1198 r = mkdir_label(where, 0755);
1199 else
1200 r = touch(where);
1201 if (r < 0 && r != -EEXIST)
1202 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1203
1204 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1205 return log_error_errno(errno, "mount(%s) failed: %m", where);
1206
1207 if (m->read_only) {
1208 r = bind_remount_recursive(where, true);
1209 if (r < 0)
1210 return log_error_errno(r, "Read-only bind mount failed: %m");
1211 }
1212
1213 return 0;
1214 }
1215
1216 static int mount_tmpfs(const char *dest, CustomMount *m) {
1217 const char *where, *options;
1218 _cleanup_free_ char *buf = NULL;
1219 int r;
1220
1221 assert(dest);
1222 assert(m);
1223
1224 where = prefix_roota(dest, m->destination);
1225
1226 r = mkdir_p_label(where, 0755);
1227 if (r < 0 && r != -EEXIST)
1228 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1229
1230 r = tmpfs_patch_options(m->options, &buf);
1231 if (r < 0)
1232 return log_oom();
1233 options = r > 0 ? buf : m->options;
1234
1235 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1236 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1237
1238 return 0;
1239 }
1240
1241 static char *joined_and_escaped_lower_dirs(char * const *lower) {
1242 _cleanup_strv_free_ char **sv = NULL;
1243
1244 sv = strv_copy(lower);
1245 if (!sv)
1246 return NULL;
1247
1248 strv_reverse(sv);
1249
1250 if (!strv_shell_escape(sv, ",:"))
1251 return NULL;
1252
1253 return strv_join(sv, ":");
1254 }
1255
1256 static int mount_overlay(const char *dest, CustomMount *m) {
1257 _cleanup_free_ char *lower = NULL;
1258 const char *where, *options;
1259 int r;
1260
1261 assert(dest);
1262 assert(m);
1263
1264 where = prefix_roota(dest, m->destination);
1265
1266 r = mkdir_label(where, 0755);
1267 if (r < 0 && r != -EEXIST)
1268 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1269
1270 (void) mkdir_p_label(m->source, 0755);
1271
1272 lower = joined_and_escaped_lower_dirs(m->lower);
1273 if (!lower)
1274 return log_oom();
1275
1276 if (m->read_only) {
1277 _cleanup_free_ char *escaped_source = NULL;
1278
1279 escaped_source = shell_escape(m->source, ",:");
1280 if (!escaped_source)
1281 return log_oom();
1282
1283 options = strjoina("lowerdir=", escaped_source, ":", lower);
1284 } else {
1285 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
1286
1287 assert(m->work_dir);
1288 (void) mkdir_label(m->work_dir, 0700);
1289
1290 escaped_source = shell_escape(m->source, ",:");
1291 if (!escaped_source)
1292 return log_oom();
1293 escaped_work_dir = shell_escape(m->work_dir, ",:");
1294 if (!escaped_work_dir)
1295 return log_oom();
1296
1297 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
1298 }
1299
1300 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1301 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1302
1303 return 0;
1304 }
1305
1306 static int mount_custom(const char *dest) {
1307 unsigned i;
1308 int r;
1309
1310 assert(dest);
1311
1312 for (i = 0; i < arg_n_custom_mounts; i++) {
1313 CustomMount *m = &arg_custom_mounts[i];
1314
1315 switch (m->type) {
1316
1317 case CUSTOM_MOUNT_BIND:
1318 r = mount_bind(dest, m);
1319 break;
1320
1321 case CUSTOM_MOUNT_TMPFS:
1322 r = mount_tmpfs(dest, m);
1323 break;
1324
1325 case CUSTOM_MOUNT_OVERLAY:
1326 r = mount_overlay(dest, m);
1327 break;
1328
1329 default:
1330 assert_not_reached("Unknown custom mount type");
1331 }
1332
1333 if (r < 0)
1334 return r;
1335 }
1336
1337 return 0;
1338 }
1339
1340 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1341 char *to;
1342 int r;
1343
1344 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1345
1346 r = path_is_mount_point(to, 0);
1347 if (r < 0 && r != -ENOENT)
1348 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1349 if (r > 0)
1350 return 0;
1351
1352 mkdir_p(to, 0755);
1353
1354 /* The superblock mount options of the mount point need to be
1355 * identical to the hosts', and hence writable... */
1356 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1357 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1358
1359 /* ... hence let's only make the bind mount read-only, not the
1360 * superblock. */
1361 if (read_only) {
1362 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1363 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1364 }
1365 return 1;
1366 }
1367
1368 static int mount_cgroup(const char *dest) {
1369 _cleanup_set_free_free_ Set *controllers = NULL;
1370 const char *cgroup_root;
1371 int r;
1372
1373 controllers = set_new(&string_hash_ops);
1374 if (!controllers)
1375 return log_oom();
1376
1377 r = cg_kernel_controllers(controllers);
1378 if (r < 0)
1379 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1380
1381 for (;;) {
1382 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1383
1384 controller = set_steal_first(controllers);
1385 if (!controller)
1386 break;
1387
1388 origin = prefix_root("/sys/fs/cgroup/", controller);
1389 if (!origin)
1390 return log_oom();
1391
1392 r = readlink_malloc(origin, &combined);
1393 if (r == -EINVAL) {
1394 /* Not a symbolic link, but directly a single cgroup hierarchy */
1395
1396 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1397 if (r < 0)
1398 return r;
1399
1400 } else if (r < 0)
1401 return log_error_errno(r, "Failed to read link %s: %m", origin);
1402 else {
1403 _cleanup_free_ char *target = NULL;
1404
1405 target = prefix_root(dest, origin);
1406 if (!target)
1407 return log_oom();
1408
1409 /* A symbolic link, a combination of controllers in one hierarchy */
1410
1411 if (!filename_is_valid(combined)) {
1412 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1413 continue;
1414 }
1415
1416 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1417 if (r < 0)
1418 return r;
1419
1420 r = symlink_idempotent(combined, target);
1421 if (r == -EINVAL) {
1422 log_error("Invalid existing symlink for combined hierarchy");
1423 return r;
1424 }
1425 if (r < 0)
1426 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1427 }
1428 }
1429
1430 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1431 if (r < 0)
1432 return r;
1433
1434 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1435 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1436 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1437
1438 return 0;
1439 }
1440
1441 static int mount_systemd_cgroup_writable(const char *dest) {
1442 _cleanup_free_ char *own_cgroup_path = NULL;
1443 const char *systemd_root, *systemd_own;
1444 int r;
1445
1446 assert(dest);
1447
1448 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1449 if (r < 0)
1450 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1451
1452 /* Make our own cgroup a (writable) bind mount */
1453 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1454 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1455 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1456
1457 /* And then remount the systemd cgroup root read-only */
1458 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1459 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1460 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1461
1462 return 0;
1463 }
1464
1465 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1466 assert(p);
1467
1468 if (!arg_userns)
1469 return 0;
1470
1471 if (uid == UID_INVALID && gid == GID_INVALID)
1472 return 0;
1473
1474 if (uid != UID_INVALID) {
1475 uid += arg_uid_shift;
1476
1477 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1478 return -EOVERFLOW;
1479 }
1480
1481 if (gid != GID_INVALID) {
1482 gid += (gid_t) arg_uid_shift;
1483
1484 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1485 return -EOVERFLOW;
1486 }
1487
1488 if (lchown(p, uid, gid) < 0)
1489 return -errno;
1490
1491 return 0;
1492 }
1493
/* Creates the directory 'path' below 'root' with the given mode and, if it
 * was newly created, hands it to userns_lchown() with uid/gid. A directory
 * that exists already is left untouched.
 *
 * Returns 0 if the directory existed, -errno on other mkdir() failures, and
 * otherwise the result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1506
1507 static int setup_timezone(const char *dest) {
1508 _cleanup_free_ char *p = NULL, *q = NULL;
1509 const char *where, *check, *what;
1510 char *z, *y;
1511 int r;
1512
1513 assert(dest);
1514
1515 /* Fix the timezone, if possible */
1516 r = readlink_malloc("/etc/localtime", &p);
1517 if (r < 0) {
1518 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1519 return 0;
1520 }
1521
1522 z = path_startswith(p, "../usr/share/zoneinfo/");
1523 if (!z)
1524 z = path_startswith(p, "/usr/share/zoneinfo/");
1525 if (!z) {
1526 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1527 return 0;
1528 }
1529
1530 where = prefix_roota(dest, "/etc/localtime");
1531 r = readlink_malloc(where, &q);
1532 if (r >= 0) {
1533 y = path_startswith(q, "../usr/share/zoneinfo/");
1534 if (!y)
1535 y = path_startswith(q, "/usr/share/zoneinfo/");
1536
1537 /* Already pointing to the right place? Then do nothing .. */
1538 if (y && streq(y, z))
1539 return 0;
1540 }
1541
1542 check = strjoina("/usr/share/zoneinfo/", z);
1543 check = prefix_root(dest, check);
1544 if (laccess(check, F_OK) < 0) {
1545 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1546 return 0;
1547 }
1548
1549 r = unlink(where);
1550 if (r < 0 && errno != ENOENT) {
1551 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1552 return 0;
1553 }
1554
1555 what = strjoina("../usr/share/zoneinfo/", z);
1556 if (symlink(what, where) < 0) {
1557 log_error_errno(errno, "Failed to correct timezone of container: %m");
1558 return 0;
1559 }
1560
1561 r = userns_lchown(where, 0, 0);
1562 if (r < 0)
1563 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1564
1565 return 0;
1566 }
1567
1568 static int setup_resolv_conf(const char *dest) {
1569 const char *where = NULL;
1570 int r;
1571
1572 assert(dest);
1573
1574 if (arg_private_network)
1575 return 0;
1576
1577 /* Fix resolv.conf, if possible */
1578 where = prefix_roota(dest, "/etc/resolv.conf");
1579
1580 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1581 if (r < 0) {
1582 /* If the file already exists as symlink, let's
1583 * suppress the warning, under the assumption that
1584 * resolved or something similar runs inside and the
1585 * symlink points there.
1586 *
1587 * If the disk image is read-only, there's also no
1588 * point in complaining.
1589 */
1590 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1591 "Failed to copy /etc/resolv.conf to %s: %m", where);
1592 return 0;
1593 }
1594
1595 r = userns_lchown(where, 0, 0);
1596 if (r < 0)
1597 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1598
1599 return 0;
1600 }
1601
1602 static int setup_volatile_state(const char *directory) {
1603 _cleanup_free_ char *buf = NULL;
1604 const char *p, *options;
1605 int r;
1606
1607 assert(directory);
1608
1609 if (arg_volatile != VOLATILE_STATE)
1610 return 0;
1611
1612 /* --volatile=state means we simply overmount /var
1613 with a tmpfs, and the rest read-only. */
1614
1615 r = bind_remount_recursive(directory, true);
1616 if (r < 0)
1617 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1618
1619 p = prefix_roota(directory, "/var");
1620 r = mkdir(p, 0755);
1621 if (r < 0 && errno != EEXIST)
1622 return log_error_errno(errno, "Failed to create %s: %m", directory);
1623
1624 options = "mode=755";
1625 r = tmpfs_patch_options(options, &buf);
1626 if (r < 0)
1627 return log_oom();
1628 if (r > 0)
1629 options = buf;
1630
1631 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1632 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1633
1634 return 0;
1635 }
1636
1637 static int setup_volatile(const char *directory) {
1638 bool tmpfs_mounted = false, bind_mounted = false;
1639 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1640 _cleanup_free_ char *buf = NULL;
1641 const char *f, *t, *options;
1642 int r;
1643
1644 assert(directory);
1645
1646 if (arg_volatile != VOLATILE_YES)
1647 return 0;
1648
1649 /* --volatile=yes means we mount a tmpfs to the root dir, and
1650 the original /usr to use inside it, and that read-only. */
1651
1652 if (!mkdtemp(template))
1653 return log_error_errno(errno, "Failed to create temporary directory: %m");
1654
1655 options = "mode=755";
1656 r = tmpfs_patch_options(options, &buf);
1657 if (r < 0)
1658 return log_oom();
1659 if (r > 0)
1660 options = buf;
1661
1662 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1663 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1664 goto fail;
1665 }
1666
1667 tmpfs_mounted = true;
1668
1669 f = prefix_roota(directory, "/usr");
1670 t = prefix_roota(template, "/usr");
1671
1672 r = mkdir(t, 0755);
1673 if (r < 0 && errno != EEXIST) {
1674 r = log_error_errno(errno, "Failed to create %s: %m", t);
1675 goto fail;
1676 }
1677
1678 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1679 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1680 goto fail;
1681 }
1682
1683 bind_mounted = true;
1684
1685 r = bind_remount_recursive(t, true);
1686 if (r < 0) {
1687 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1688 goto fail;
1689 }
1690
1691 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1692 r = log_error_errno(errno, "Failed to move root mount: %m");
1693 goto fail;
1694 }
1695
1696 (void) rmdir(template);
1697
1698 return 0;
1699
1700 fail:
1701 if (bind_mounted)
1702 (void) umount(t);
1703
1704 if (tmpfs_mounted)
1705 (void) umount(template);
1706 (void) rmdir(template);
1707 return r;
1708 }
1709
1710 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1711 assert(s);
1712
1713 snprintf(s, 37,
1714 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1715 SD_ID128_FORMAT_VAL(id));
1716
1717 return s;
1718 }
1719
1720 static int setup_boot_id(const char *dest) {
1721 const char *from, *to;
1722 sd_id128_t rnd = {};
1723 char as_uuid[37];
1724 int r;
1725
1726 if (arg_share_system)
1727 return 0;
1728
1729 /* Generate a new randomized boot ID, so that each boot-up of
1730 * the container gets a new one */
1731
1732 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1733 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1734
1735 r = sd_id128_randomize(&rnd);
1736 if (r < 0)
1737 return log_error_errno(r, "Failed to generate random boot id: %m");
1738
1739 id128_format_as_uuid(rnd, as_uuid);
1740
1741 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1742 if (r < 0)
1743 return log_error_errno(r, "Failed to write boot id: %m");
1744
1745 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1746 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1747 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1748 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1749
1750 unlink(from);
1751 return r;
1752 }
1753
1754 static int copy_devnodes(const char *dest) {
1755
1756 static const char devnodes[] =
1757 "null\0"
1758 "zero\0"
1759 "full\0"
1760 "random\0"
1761 "urandom\0"
1762 "tty\0"
1763 "net/tun\0";
1764
1765 const char *d;
1766 int r = 0;
1767 _cleanup_umask_ mode_t u;
1768
1769 assert(dest);
1770
1771 u = umask(0000);
1772
1773 /* Create /dev/net, so that we can create /dev/net/tun in it */
1774 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1775 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1776
1777 NULSTR_FOREACH(d, devnodes) {
1778 _cleanup_free_ char *from = NULL, *to = NULL;
1779 struct stat st;
1780
1781 from = strappend("/dev/", d);
1782 to = prefix_root(dest, from);
1783
1784 if (stat(from, &st) < 0) {
1785
1786 if (errno != ENOENT)
1787 return log_error_errno(errno, "Failed to stat %s: %m", from);
1788
1789 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1790
1791 log_error("%s is not a char or block device, cannot copy.", from);
1792 return -EIO;
1793
1794 } else {
1795 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1796 if (errno != EPERM)
1797 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1798
1799 /* Some systems abusively restrict mknod but
1800 * allow bind mounts. */
1801 r = touch(to);
1802 if (r < 0)
1803 return log_error_errno(r, "touch (%s) failed: %m", to);
1804 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1805 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1806 }
1807
1808 r = userns_lchown(to, 0, 0);
1809 if (r < 0)
1810 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1811 }
1812 }
1813
1814 return r;
1815 }
1816
1817 static int setup_pts(const char *dest) {
1818 _cleanup_free_ char *options = NULL;
1819 const char *p;
1820
1821 #ifdef HAVE_SELINUX
1822 if (arg_selinux_apifs_context)
1823 (void) asprintf(&options,
1824 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1825 arg_uid_shift + TTY_GID,
1826 arg_selinux_apifs_context);
1827 else
1828 #endif
1829 (void) asprintf(&options,
1830 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1831 arg_uid_shift + TTY_GID);
1832
1833 if (!options)
1834 return log_oom();
1835
1836 /* Mount /dev/pts itself */
1837 p = prefix_roota(dest, "/dev/pts");
1838 if (mkdir(p, 0755) < 0)
1839 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1840 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1841 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1842 if (userns_lchown(p, 0, 0) < 0)
1843 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1844
1845 /* Create /dev/ptmx symlink */
1846 p = prefix_roota(dest, "/dev/ptmx");
1847 if (symlink("pts/ptmx", p) < 0)
1848 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1849 if (userns_lchown(p, 0, 0) < 0)
1850 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1851
1852 /* And fix /dev/pts/ptmx ownership */
1853 p = prefix_roota(dest, "/dev/pts/ptmx");
1854 if (userns_lchown(p, 0, 0) < 0)
1855 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1856
1857 return 0;
1858 }
1859
1860 static int setup_dev_console(const char *dest, const char *console) {
1861 _cleanup_umask_ mode_t u;
1862 const char *to;
1863 int r;
1864
1865 assert(dest);
1866 assert(console);
1867
1868 u = umask(0000);
1869
1870 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1871 if (r < 0)
1872 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1873
1874 /* We need to bind mount the right tty to /dev/console since
1875 * ptys can only exist on pts file systems. To have something
1876 * to bind mount things on we create a empty regular file. */
1877
1878 to = prefix_roota(dest, "/dev/console");
1879 r = touch(to);
1880 if (r < 0)
1881 return log_error_errno(r, "touch() for /dev/console failed: %m");
1882
1883 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1884 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1885
1886 return 0;
1887 }
1888
1889 static int setup_kmsg(const char *dest, int kmsg_socket) {
1890 const char *from, *to;
1891 _cleanup_umask_ mode_t u;
1892 int fd, k;
1893 union {
1894 struct cmsghdr cmsghdr;
1895 uint8_t buf[CMSG_SPACE(sizeof(int))];
1896 } control = {};
1897 struct msghdr mh = {
1898 .msg_control = &control,
1899 .msg_controllen = sizeof(control),
1900 };
1901 struct cmsghdr *cmsg;
1902
1903 assert(kmsg_socket >= 0);
1904
1905 u = umask(0000);
1906
1907 /* We create the kmsg FIFO as /run/kmsg, but immediately
1908 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1909 * on the reading side behave very similar to /proc/kmsg,
1910 * their writing side behaves differently from /dev/kmsg in
1911 * that writing blocks when nothing is reading. In order to
1912 * avoid any problems with containers deadlocking due to this
1913 * we simply make /dev/kmsg unavailable to the container. */
1914 from = prefix_roota(dest, "/run/kmsg");
1915 to = prefix_roota(dest, "/proc/kmsg");
1916
1917 if (mkfifo(from, 0600) < 0)
1918 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1919 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1920 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1921
1922 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1923 if (fd < 0)
1924 return log_error_errno(errno, "Failed to open fifo: %m");
1925
1926 cmsg = CMSG_FIRSTHDR(&mh);
1927 cmsg->cmsg_level = SOL_SOCKET;
1928 cmsg->cmsg_type = SCM_RIGHTS;
1929 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1930 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1931
1932 mh.msg_controllen = cmsg->cmsg_len;
1933
1934 /* Store away the fd in the socket, so that it stays open as
1935 * long as we run the child */
1936 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1937 safe_close(fd);
1938
1939 if (k < 0)
1940 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1941
1942 /* And now make the FIFO unavailable as /run/kmsg... */
1943 (void) unlink(from);
1944
1945 return 0;
1946 }
1947
1948 static int send_rtnl(int send_fd) {
1949 union {
1950 struct cmsghdr cmsghdr;
1951 uint8_t buf[CMSG_SPACE(sizeof(int))];
1952 } control = {};
1953 struct msghdr mh = {
1954 .msg_control = &control,
1955 .msg_controllen = sizeof(control),
1956 };
1957 struct cmsghdr *cmsg;
1958 _cleanup_close_ int fd = -1;
1959 ssize_t k;
1960
1961 assert(send_fd >= 0);
1962
1963 if (!arg_expose_ports)
1964 return 0;
1965
1966 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1967 if (fd < 0)
1968 return log_error_errno(errno, "Failed to allocate container netlink: %m");
1969
1970 cmsg = CMSG_FIRSTHDR(&mh);
1971 cmsg->cmsg_level = SOL_SOCKET;
1972 cmsg->cmsg_type = SCM_RIGHTS;
1973 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1974 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1975
1976 mh.msg_controllen = cmsg->cmsg_len;
1977
1978 /* Store away the fd in the socket, so that it stays open as
1979 * long as we run the child */
1980 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1981 if (k < 0)
1982 return log_error_errno(errno, "Failed to send netlink fd: %m");
1983
1984 return 0;
1985 }
1986
1987 static int flush_ports(union in_addr_union *exposed) {
1988 ExposePort *p;
1989 int r, af = AF_INET;
1990
1991 assert(exposed);
1992
1993 if (!arg_expose_ports)
1994 return 0;
1995
1996 if (in_addr_is_null(af, exposed))
1997 return 0;
1998
1999 log_debug("Lost IP address.");
2000
2001 LIST_FOREACH(ports, p, arg_expose_ports) {
2002 r = fw_add_local_dnat(false,
2003 af,
2004 p->protocol,
2005 NULL,
2006 NULL, 0,
2007 NULL, 0,
2008 p->host_port,
2009 exposed,
2010 p->container_port,
2011 NULL);
2012 if (r < 0)
2013 log_warning_errno(r, "Failed to modify firewall: %m");
2014 }
2015
2016 *exposed = IN_ADDR_NULL;
2017 return 0;
2018 }
2019
2020 static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
2021 _cleanup_free_ struct local_address *addresses = NULL;
2022 _cleanup_free_ char *pretty = NULL;
2023 union in_addr_union new_exposed;
2024 ExposePort *p;
2025 bool add;
2026 int af = AF_INET, r;
2027
2028 assert(exposed);
2029
2030 /* Invoked each time an address is added or removed inside the
2031 * container */
2032
2033 if (!arg_expose_ports)
2034 return 0;
2035
2036 r = local_addresses(rtnl, 0, af, &addresses);
2037 if (r < 0)
2038 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2039
2040 add = r > 0 &&
2041 addresses[0].family == af &&
2042 addresses[0].scope < RT_SCOPE_LINK;
2043
2044 if (!add)
2045 return flush_ports(exposed);
2046
2047 new_exposed = addresses[0].address;
2048 if (in_addr_equal(af, exposed, &new_exposed))
2049 return 0;
2050
2051 in_addr_to_string(af, &new_exposed, &pretty);
2052 log_debug("New container IP is %s.", strna(pretty));
2053
2054 LIST_FOREACH(ports, p, arg_expose_ports) {
2055
2056 r = fw_add_local_dnat(true,
2057 af,
2058 p->protocol,
2059 NULL,
2060 NULL, 0,
2061 NULL, 0,
2062 p->host_port,
2063 &new_exposed,
2064 p->container_port,
2065 in_addr_is_null(af, exposed) ? NULL : exposed);
2066 if (r < 0)
2067 log_warning_errno(r, "Failed to modify firewall: %m");
2068 }
2069
2070 *exposed = new_exposed;
2071 return 0;
2072 }
2073
2074 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2075 union in_addr_union *exposed = userdata;
2076
2077 assert(rtnl);
2078 assert(m);
2079 assert(exposed);
2080
2081 expose_ports(rtnl, exposed);
2082 return 0;
2083 }
2084
2085 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
2086 union {
2087 struct cmsghdr cmsghdr;
2088 uint8_t buf[CMSG_SPACE(sizeof(int))];
2089 } control = {};
2090 struct msghdr mh = {
2091 .msg_control = &control,
2092 .msg_controllen = sizeof(control),
2093 };
2094 struct cmsghdr *cmsg;
2095 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2096 int fd, r;
2097 ssize_t k;
2098
2099 assert(event);
2100 assert(recv_fd >= 0);
2101 assert(ret);
2102
2103 if (!arg_expose_ports)
2104 return 0;
2105
2106 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2107 if (k < 0)
2108 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2109
2110 cmsg = CMSG_FIRSTHDR(&mh);
2111 assert(cmsg->cmsg_level == SOL_SOCKET);
2112 assert(cmsg->cmsg_type == SCM_RIGHTS);
2113 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2114 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2115
2116 r = sd_netlink_open_fd(&rtnl, fd);
2117 if (r < 0) {
2118 safe_close(fd);
2119 return log_error_errno(r, "Failed to create rtnl object: %m");
2120 }
2121
2122 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2123 if (r < 0)
2124 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2125
2126 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2127 if (r < 0)
2128 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2129
2130 r = sd_netlink_attach_event(rtnl, event, 0);
2131 if (r < 0)
2132 return log_error_errno(r, "Failed to add to even loop: %m");
2133
2134 *ret = rtnl;
2135 rtnl = NULL;
2136
2137 return 0;
2138 }
2139
2140 static int setup_hostname(void) {
2141
2142 if (arg_share_system)
2143 return 0;
2144
2145 if (sethostname_idempotent(arg_machine) < 0)
2146 return -errno;
2147
2148 return 0;
2149 }
2150
2151 static int setup_journal(const char *directory) {
2152 sd_id128_t machine_id, this_id;
2153 _cleanup_free_ char *b = NULL, *d = NULL;
2154 const char *etc_machine_id, *p, *q;
2155 char *id;
2156 int r;
2157
2158 /* Don't link journals in ephemeral mode */
2159 if (arg_ephemeral)
2160 return 0;
2161
2162 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2163
2164 r = read_one_line_file(etc_machine_id, &b);
2165 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2166 return 0;
2167 else if (r < 0)
2168 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2169
2170 id = strstrip(b);
2171 if (isempty(id) && arg_link_journal == LINK_AUTO)
2172 return 0;
2173
2174 /* Verify validity */
2175 r = sd_id128_from_string(id, &machine_id);
2176 if (r < 0)
2177 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2178
2179 r = sd_id128_get_machine(&this_id);
2180 if (r < 0)
2181 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2182
2183 if (sd_id128_equal(machine_id, this_id)) {
2184 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2185 "Host and machine ids are equal (%s): refusing to link journals", id);
2186 if (arg_link_journal == LINK_AUTO)
2187 return 0;
2188 return -EEXIST;
2189 }
2190
2191 if (arg_link_journal == LINK_NO)
2192 return 0;
2193
2194 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2195 if (r < 0)
2196 return log_error_errno(r, "Failed to create /var: %m");
2197
2198 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2199 if (r < 0)
2200 return log_error_errno(r, "Failed to create /var/log: %m");
2201
2202 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2203 if (r < 0)
2204 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2205
2206 p = strjoina("/var/log/journal/", id);
2207 q = prefix_roota(directory, p);
2208
2209 if (path_is_mount_point(p, 0) > 0) {
2210 if (arg_link_journal != LINK_AUTO) {
2211 log_error("%s: already a mount point, refusing to use for journal", p);
2212 return -EEXIST;
2213 }
2214
2215 return 0;
2216 }
2217
2218 if (path_is_mount_point(q, 0) > 0) {
2219 if (arg_link_journal != LINK_AUTO) {
2220 log_error("%s: already a mount point, refusing to use for journal", q);
2221 return -EEXIST;
2222 }
2223
2224 return 0;
2225 }
2226
2227 r = readlink_and_make_absolute(p, &d);
2228 if (r >= 0) {
2229 if ((arg_link_journal == LINK_GUEST ||
2230 arg_link_journal == LINK_AUTO) &&
2231 path_equal(d, q)) {
2232
2233 r = userns_mkdir(directory, p, 0755, 0, 0);
2234 if (r < 0)
2235 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2236 return 0;
2237 }
2238
2239 if (unlink(p) < 0)
2240 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2241 } else if (r == -EINVAL) {
2242
2243 if (arg_link_journal == LINK_GUEST &&
2244 rmdir(p) < 0) {
2245
2246 if (errno == ENOTDIR) {
2247 log_error("%s already exists and is neither a symlink nor a directory", p);
2248 return r;
2249 } else {
2250 log_error_errno(errno, "Failed to remove %s: %m", p);
2251 return -errno;
2252 }
2253 }
2254 } else if (r != -ENOENT) {
2255 log_error_errno(errno, "readlink(%s) failed: %m", p);
2256 return r;
2257 }
2258
2259 if (arg_link_journal == LINK_GUEST) {
2260
2261 if (symlink(q, p) < 0) {
2262 if (arg_link_journal_try) {
2263 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2264 return 0;
2265 } else {
2266 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2267 return -errno;
2268 }
2269 }
2270
2271 r = userns_mkdir(directory, p, 0755, 0, 0);
2272 if (r < 0)
2273 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2274 return 0;
2275 }
2276
2277 if (arg_link_journal == LINK_HOST) {
2278 /* don't create parents here -- if the host doesn't have
2279 * permanent journal set up, don't force it here */
2280 r = mkdir(p, 0755);
2281 if (r < 0) {
2282 if (arg_link_journal_try) {
2283 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2284 return 0;
2285 } else {
2286 log_error_errno(errno, "Failed to create %s: %m", p);
2287 return r;
2288 }
2289 }
2290
2291 } else if (access(p, F_OK) < 0)
2292 return 0;
2293
2294 if (dir_is_empty(q) == 0)
2295 log_warning("%s is not empty, proceeding anyway.", q);
2296
2297 r = userns_mkdir(directory, p, 0755, 0, 0);
2298 if (r < 0) {
2299 log_error_errno(errno, "Failed to create %s: %m", q);
2300 return r;
2301 }
2302
2303 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2304 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2305
2306 return 0;
2307 }
2308
2309 static int drop_capabilities(void) {
2310 return capability_bounding_set_drop(~arg_retain, false);
2311 }
2312
2313 static int register_machine(pid_t pid, int local_ifindex) {
2314 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2315 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2316 int r;
2317
2318 if (!arg_register)
2319 return 0;
2320
2321 r = sd_bus_default_system(&bus);
2322 if (r < 0)
2323 return log_error_errno(r, "Failed to open system bus: %m");
2324
2325 if (arg_keep_unit) {
2326 r = sd_bus_call_method(
2327 bus,
2328 "org.freedesktop.machine1",
2329 "/org/freedesktop/machine1",
2330 "org.freedesktop.machine1.Manager",
2331 "RegisterMachineWithNetwork",
2332 &error,
2333 NULL,
2334 "sayssusai",
2335 arg_machine,
2336 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2337 "nspawn",
2338 "container",
2339 (uint32_t) pid,
2340 strempty(arg_directory),
2341 local_ifindex > 0 ? 1 : 0, local_ifindex);
2342 } else {
2343 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2344 char **i;
2345 unsigned j;
2346
2347 r = sd_bus_message_new_method_call(
2348 bus,
2349 &m,
2350 "org.freedesktop.machine1",
2351 "/org/freedesktop/machine1",
2352 "org.freedesktop.machine1.Manager",
2353 "CreateMachineWithNetwork");
2354 if (r < 0)
2355 return bus_log_create_error(r);
2356
2357 r = sd_bus_message_append(
2358 m,
2359 "sayssusai",
2360 arg_machine,
2361 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2362 "nspawn",
2363 "container",
2364 (uint32_t) pid,
2365 strempty(arg_directory),
2366 local_ifindex > 0 ? 1 : 0, local_ifindex);
2367 if (r < 0)
2368 return bus_log_create_error(r);
2369
2370 r = sd_bus_message_open_container(m, 'a', "(sv)");
2371 if (r < 0)
2372 return bus_log_create_error(r);
2373
2374 if (!isempty(arg_slice)) {
2375 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2376 if (r < 0)
2377 return bus_log_create_error(r);
2378 }
2379
2380 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2381 if (r < 0)
2382 return bus_log_create_error(r);
2383
2384 /* If you make changes here, also make sure to update
2385 * systemd-nspawn@.service, to keep the device
2386 * policies in sync regardless if we are run with or
2387 * without the --keep-unit switch. */
2388 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2389 /* Allow the container to
2390 * access and create the API
2391 * device nodes, so that
2392 * PrivateDevices= in the
2393 * container can work
2394 * fine */
2395 "/dev/null", "rwm",
2396 "/dev/zero", "rwm",
2397 "/dev/full", "rwm",
2398 "/dev/random", "rwm",
2399 "/dev/urandom", "rwm",
2400 "/dev/tty", "rwm",
2401 "/dev/net/tun", "rwm",
2402 /* Allow the container
2403 * access to ptys. However,
2404 * do not permit the
2405 * container to ever create
2406 * these device nodes. */
2407 "/dev/pts/ptmx", "rw",
2408 "char-pts", "rw");
2409 if (r < 0)
2410 return bus_log_create_error(r);
2411
2412 for (j = 0; j < arg_n_custom_mounts; j++) {
2413 CustomMount *cm = &arg_custom_mounts[j];
2414
2415 if (cm->type != CUSTOM_MOUNT_BIND)
2416 continue;
2417
2418 r = is_device_node(cm->source);
2419 if (r < 0)
2420 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2421
2422 if (r) {
2423 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2424 cm->source, cm->read_only ? "r" : "rw");
2425 if (r < 0)
2426 return log_error_errno(r, "Failed to append message arguments: %m");
2427 }
2428 }
2429
2430 if (arg_kill_signal != 0) {
2431 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2432 if (r < 0)
2433 return bus_log_create_error(r);
2434
2435 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2436 if (r < 0)
2437 return bus_log_create_error(r);
2438 }
2439
2440 STRV_FOREACH(i, arg_property) {
2441 r = sd_bus_message_open_container(m, 'r', "sv");
2442 if (r < 0)
2443 return bus_log_create_error(r);
2444
2445 r = bus_append_unit_property_assignment(m, *i);
2446 if (r < 0)
2447 return r;
2448
2449 r = sd_bus_message_close_container(m);
2450 if (r < 0)
2451 return bus_log_create_error(r);
2452 }
2453
2454 r = sd_bus_message_close_container(m);
2455 if (r < 0)
2456 return bus_log_create_error(r);
2457
2458 r = sd_bus_call(bus, m, 0, &error, NULL);
2459 }
2460
2461 if (r < 0) {
2462 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2463 return r;
2464 }
2465
2466 return 0;
2467 }
2468
2469 static int terminate_machine(pid_t pid) {
2470 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2471 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2472 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2473 const char *path;
2474 int r;
2475
2476 if (!arg_register)
2477 return 0;
2478
2479 /* If we are reusing the unit, then just exit, systemd will do
2480 * the right thing when we exit. */
2481 if (arg_keep_unit)
2482 return 0;
2483
2484 r = sd_bus_default_system(&bus);
2485 if (r < 0)
2486 return log_error_errno(r, "Failed to open system bus: %m");
2487
2488 r = sd_bus_call_method(
2489 bus,
2490 "org.freedesktop.machine1",
2491 "/org/freedesktop/machine1",
2492 "org.freedesktop.machine1.Manager",
2493 "GetMachineByPID",
2494 &error,
2495 &reply,
2496 "u",
2497 (uint32_t) pid);
2498 if (r < 0) {
2499 /* Note that the machine might already have been
2500 * cleaned up automatically, hence don't consider it a
2501 * failure if we cannot get the machine object. */
2502 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2503 return 0;
2504 }
2505
2506 r = sd_bus_message_read(reply, "o", &path);
2507 if (r < 0)
2508 return bus_log_parse_error(r);
2509
2510 r = sd_bus_call_method(
2511 bus,
2512 "org.freedesktop.machine1",
2513 path,
2514 "org.freedesktop.machine1.Machine",
2515 "Terminate",
2516 &error,
2517 NULL,
2518 NULL);
2519 if (r < 0) {
2520 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2521 return 0;
2522 }
2523
2524 return 0;
2525 }
2526
2527 static int reset_audit_loginuid(void) {
2528 _cleanup_free_ char *p = NULL;
2529 int r;
2530
2531 if (arg_share_system)
2532 return 0;
2533
2534 r = read_one_line_file("/proc/self/loginuid", &p);
2535 if (r == -ENOENT)
2536 return 0;
2537 if (r < 0)
2538 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2539
2540 /* Already reset? */
2541 if (streq(p, "4294967295"))
2542 return 0;
2543
2544 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
2545 if (r < 0) {
2546 log_error_errno(r,
2547 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2548 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2549 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2550 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2551 "using systemd-nspawn. Sleeping for 5s... (%m)");
2552
2553 sleep(5);
2554 }
2555
2556 return 0;
2557 }
2558
2559 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2560 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2561 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2562
2563 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2564 uint8_t result[8];
2565 size_t l, sz;
2566 uint8_t *v, *i;
2567 int r;
2568
2569 l = strlen(arg_machine);
2570 sz = sizeof(sd_id128_t) + l;
2571 if (idx > 0)
2572 sz += sizeof(idx);
2573
2574 v = alloca(sz);
2575
2576 /* fetch some persistent data unique to the host */
2577 r = sd_id128_get_machine((sd_id128_t*) v);
2578 if (r < 0)
2579 return r;
2580
2581 /* combine with some data unique (on this host) to this
2582 * container instance */
2583 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2584 if (idx > 0) {
2585 idx = htole64(idx);
2586 memcpy(i, &idx, sizeof(idx));
2587 }
2588
2589 /* Let's hash the host machine ID plus the container name. We
2590 * use a fixed, but originally randomly created hash key here. */
2591 siphash24(result, v, sz, hash_key.bytes);
2592
2593 assert_cc(ETH_ALEN <= sizeof(result));
2594 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2595
2596 /* see eth_random_addr in the kernel */
2597 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2598 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2599
2600 return 0;
2601 }
2602
2603 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2604 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2605 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2606 struct ether_addr mac_host, mac_container;
2607 int r, i;
2608
2609 if (!arg_private_network)
2610 return 0;
2611
2612 if (!arg_network_veth)
2613 return 0;
2614
2615 /* Use two different interface name prefixes depending whether
2616 * we are in bridge mode or not. */
2617 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2618 arg_network_bridge ? "vb" : "ve", arg_machine);
2619
2620 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2621 if (r < 0)
2622 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2623
2624 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2625 if (r < 0)
2626 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2627
2628 r = sd_netlink_open(&rtnl);
2629 if (r < 0)
2630 return log_error_errno(r, "Failed to connect to netlink: %m");
2631
2632 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2633 if (r < 0)
2634 return log_error_errno(r, "Failed to allocate netlink message: %m");
2635
2636 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
2637 if (r < 0)
2638 return log_error_errno(r, "Failed to add netlink interface name: %m");
2639
2640 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2641 if (r < 0)
2642 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2643
2644 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2645 if (r < 0)
2646 return log_error_errno(r, "Failed to open netlink container: %m");
2647
2648 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2649 if (r < 0)
2650 return log_error_errno(r, "Failed to open netlink container: %m");
2651
2652 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
2653 if (r < 0)
2654 return log_error_errno(r, "Failed to open netlink container: %m");
2655
2656 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
2657 if (r < 0)
2658 return log_error_errno(r, "Failed to add netlink interface name: %m");
2659
2660 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2661 if (r < 0)
2662 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2663
2664 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2665 if (r < 0)
2666 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2667
2668 r = sd_netlink_message_close_container(m);
2669 if (r < 0)
2670 return log_error_errno(r, "Failed to close netlink container: %m");
2671
2672 r = sd_netlink_message_close_container(m);
2673 if (r < 0)
2674 return log_error_errno(r, "Failed to close netlink container: %m");
2675
2676 r = sd_netlink_message_close_container(m);
2677 if (r < 0)
2678 return log_error_errno(r, "Failed to close netlink container: %m");
2679
2680 r = sd_netlink_call(rtnl, m, 0, NULL);
2681 if (r < 0)
2682 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2683
2684 i = (int) if_nametoindex(iface_name);
2685 if (i <= 0)
2686 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2687
2688 *ifi = i;
2689
2690 return 0;
2691 }
2692
2693 static int setup_bridge(const char veth_name[], int *ifi) {
2694 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2695 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2696 int r, bridge;
2697
2698 if (!arg_private_network)
2699 return 0;
2700
2701 if (!arg_network_veth)
2702 return 0;
2703
2704 if (!arg_network_bridge)
2705 return 0;
2706
2707 bridge = (int) if_nametoindex(arg_network_bridge);
2708 if (bridge <= 0)
2709 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2710
2711 *ifi = bridge;
2712
2713 r = sd_netlink_open(&rtnl);
2714 if (r < 0)
2715 return log_error_errno(r, "Failed to connect to netlink: %m");
2716
2717 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2718 if (r < 0)
2719 return log_error_errno(r, "Failed to allocate netlink message: %m");
2720
2721 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2722 if (r < 0)
2723 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2724
2725 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
2726 if (r < 0)
2727 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2728
2729 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
2730 if (r < 0)
2731 return log_error_errno(r, "Failed to add netlink master field: %m");
2732
2733 r = sd_netlink_call(rtnl, m, 0, NULL);
2734 if (r < 0)
2735 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2736
2737 return 0;
2738 }
2739
2740 static int parse_interface(struct udev *udev, const char *name) {
2741 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2742 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2743 int ifi;
2744
2745 ifi = (int) if_nametoindex(name);
2746 if (ifi <= 0)
2747 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2748
2749 sprintf(ifi_str, "n%i", ifi);
2750 d = udev_device_new_from_device_id(udev, ifi_str);
2751 if (!d)
2752 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2753
2754 if (udev_device_get_is_initialized(d) <= 0) {
2755 log_error("Network interface %s is not initialized yet.", name);
2756 return -EBUSY;
2757 }
2758
2759 return ifi;
2760 }
2761
2762 static int move_network_interfaces(pid_t pid) {
2763 _cleanup_udev_unref_ struct udev *udev = NULL;
2764 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2765 char **i;
2766 int r;
2767
2768 if (!arg_private_network)
2769 return 0;
2770
2771 if (strv_isempty(arg_network_interfaces))
2772 return 0;
2773
2774 r = sd_netlink_open(&rtnl);
2775 if (r < 0)
2776 return log_error_errno(r, "Failed to connect to netlink: %m");
2777
2778 udev = udev_new();
2779 if (!udev) {
2780 log_error("Failed to connect to udev.");
2781 return -ENOMEM;
2782 }
2783
2784 STRV_FOREACH(i, arg_network_interfaces) {
2785 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2786 int ifi;
2787
2788 ifi = parse_interface(udev, *i);
2789 if (ifi < 0)
2790 return ifi;
2791
2792 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2793 if (r < 0)
2794 return log_error_errno(r, "Failed to allocate netlink message: %m");
2795
2796 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2797 if (r < 0)
2798 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2799
2800 r = sd_netlink_call(rtnl, m, 0, NULL);
2801 if (r < 0)
2802 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2803 }
2804
2805 return 0;
2806 }
2807
2808 static int setup_macvlan(pid_t pid) {
2809 _cleanup_udev_unref_ struct udev *udev = NULL;
2810 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2811 unsigned idx = 0;
2812 char **i;
2813 int r;
2814
2815 if (!arg_private_network)
2816 return 0;
2817
2818 if (strv_isempty(arg_network_macvlan))
2819 return 0;
2820
2821 r = sd_netlink_open(&rtnl);
2822 if (r < 0)
2823 return log_error_errno(r, "Failed to connect to netlink: %m");
2824
2825 udev = udev_new();
2826 if (!udev) {
2827 log_error("Failed to connect to udev.");
2828 return -ENOMEM;
2829 }
2830
2831 STRV_FOREACH(i, arg_network_macvlan) {
2832 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2833 _cleanup_free_ char *n = NULL;
2834 struct ether_addr mac;
2835 int ifi;
2836
2837 ifi = parse_interface(udev, *i);
2838 if (ifi < 0)
2839 return ifi;
2840
2841 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2842 if (r < 0)
2843 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2844
2845 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2846 if (r < 0)
2847 return log_error_errno(r, "Failed to allocate netlink message: %m");
2848
2849 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2850 if (r < 0)
2851 return log_error_errno(r, "Failed to add netlink interface index: %m");
2852
2853 n = strappend("mv-", *i);
2854 if (!n)
2855 return log_oom();
2856
2857 strshorten(n, IFNAMSIZ-1);
2858
2859 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2860 if (r < 0)
2861 return log_error_errno(r, "Failed to add netlink interface name: %m");
2862
2863 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2864 if (r < 0)
2865 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2866
2867 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2868 if (r < 0)
2869 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2870
2871 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2872 if (r < 0)
2873 return log_error_errno(r, "Failed to open netlink container: %m");
2874
2875 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2876 if (r < 0)
2877 return log_error_errno(r, "Failed to open netlink container: %m");
2878
2879 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2880 if (r < 0)
2881 return log_error_errno(r, "Failed to append macvlan mode: %m");
2882
2883 r = sd_netlink_message_close_container(m);
2884 if (r < 0)
2885 return log_error_errno(r, "Failed to close netlink container: %m");
2886
2887 r = sd_netlink_message_close_container(m);
2888 if (r < 0)
2889 return log_error_errno(r, "Failed to close netlink container: %m");
2890
2891 r = sd_netlink_call(rtnl, m, 0, NULL);
2892 if (r < 0)
2893 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2894 }
2895
2896 return 0;
2897 }
2898
2899 static int setup_ipvlan(pid_t pid) {
2900 _cleanup_udev_unref_ struct udev *udev = NULL;
2901 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2902 char **i;
2903 int r;
2904
2905 if (!arg_private_network)
2906 return 0;
2907
2908 if (strv_isempty(arg_network_ipvlan))
2909 return 0;
2910
2911 r = sd_netlink_open(&rtnl);
2912 if (r < 0)
2913 return log_error_errno(r, "Failed to connect to netlink: %m");
2914
2915 udev = udev_new();
2916 if (!udev) {
2917 log_error("Failed to connect to udev.");
2918 return -ENOMEM;
2919 }
2920
2921 STRV_FOREACH(i, arg_network_ipvlan) {
2922 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2923 _cleanup_free_ char *n = NULL;
2924 int ifi;
2925
2926 ifi = parse_interface(udev, *i);
2927 if (ifi < 0)
2928 return ifi;
2929
2930 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2931 if (r < 0)
2932 return log_error_errno(r, "Failed to allocate netlink message: %m");
2933
2934 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2935 if (r < 0)
2936 return log_error_errno(r, "Failed to add netlink interface index: %m");
2937
2938 n = strappend("iv-", *i);
2939 if (!n)
2940 return log_oom();
2941
2942 strshorten(n, IFNAMSIZ-1);
2943
2944 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2945 if (r < 0)
2946 return log_error_errno(r, "Failed to add netlink interface name: %m");
2947
2948 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2949 if (r < 0)
2950 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2951
2952 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2953 if (r < 0)
2954 return log_error_errno(r, "Failed to open netlink container: %m");
2955
2956 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2957 if (r < 0)
2958 return log_error_errno(r, "Failed to open netlink container: %m");
2959
2960 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2961 if (r < 0)
2962 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2963
2964 r = sd_netlink_message_close_container(m);
2965 if (r < 0)
2966 return log_error_errno(r, "Failed to close netlink container: %m");
2967
2968 r = sd_netlink_message_close_container(m);
2969 if (r < 0)
2970 return log_error_errno(r, "Failed to close netlink container: %m");
2971
2972 r = sd_netlink_call(rtnl, m, 0, NULL);
2973 if (r < 0)
2974 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2975 }
2976
2977 return 0;
2978 }
2979
/* Builds and loads a seccomp filter (default action: allow).
 *
 * For every entry of the table below whose capability bit is not set
 * in arg_retain, the paired system call is made to fail with EPERM.
 * In addition, socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail
 * with EAFNOSUPPORT.
 *
 * Returns 0 on success (also if seccomp_load() reports -EINVAL, or if
 * built without HAVE_SECCOMP), a negative error code otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* Pairs of a capability and a system call to block when that
         * capability is not retained */
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                /* Capability retained? Then leave the call alone. */
                if ((arg_retain & (1ULL << blacklist[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* Reached on success and on failure alike */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3074
3075 static int setup_propagate(const char *root) {
3076 const char *p, *q;
3077
3078 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3079 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3080 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3081 (void) mkdir_p(p, 0600);
3082
3083 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3084 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3085
3086 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3087 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3088
3089 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3090 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3091
3092 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3093 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3094 return log_error_errno(errno, "Failed to install propagation bind mount.");
3095
3096 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3097 return log_error_errno(errno, "Failed to make propagation mount read-only");
3098
3099 return 0;
3100 }
3101
3102 static int setup_image(char **device_path, int *loop_nr) {
3103 struct loop_info64 info = {
3104 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3105 };
3106 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3107 _cleanup_free_ char* loopdev = NULL;
3108 struct stat st;
3109 int r, nr;
3110
3111 assert(device_path);
3112 assert(loop_nr);
3113 assert(arg_image);
3114
3115 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3116 if (fd < 0)
3117 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3118
3119 if (fstat(fd, &st) < 0)
3120 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3121
3122 if (S_ISBLK(st.st_mode)) {
3123 char *p;
3124
3125 p = strdup(arg_image);
3126 if (!p)
3127 return log_oom();
3128
3129 *device_path = p;
3130
3131 *loop_nr = -1;
3132
3133 r = fd;
3134 fd = -1;
3135
3136 return r;
3137 }
3138
3139 if (!S_ISREG(st.st_mode)) {
3140 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3141 return -EINVAL;
3142 }
3143
3144 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3145 if (control < 0)
3146 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3147
3148 nr = ioctl(control, LOOP_CTL_GET_FREE);
3149 if (nr < 0)
3150 return log_error_errno(errno, "Failed to allocate loop device: %m");
3151
3152 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3153 return log_oom();
3154
3155 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3156 if (loop < 0)
3157 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3158
3159 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3160 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3161
3162 if (arg_read_only)
3163 info.lo_flags |= LO_FLAGS_READ_ONLY;
3164
3165 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3166 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3167
3168 *device_path = loopdev;
3169 loopdev = NULL;
3170
3171 *loop_nr = nr;
3172
3173 r = loop;
3174 loop = -1;
3175
3176 return r;
3177 }
3178
/* Explanatory text that dissect_image() appends to its error messages
 * when no usable partition table is found. One string literal per line
 * of output. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
3185
3186 static int dissect_image(
3187 int fd,
3188 char **root_device, bool *root_device_rw,
3189 char **home_device, bool *home_device_rw,
3190 char **srv_device, bool *srv_device_rw,
3191 bool *secondary) {
3192
3193 #ifdef HAVE_BLKID
3194 int home_nr = -1, srv_nr = -1;
3195 #ifdef GPT_ROOT_NATIVE
3196 int root_nr = -1;
3197 #endif
3198 #ifdef GPT_ROOT_SECONDARY
3199 int secondary_root_nr = -1;
3200 #endif
3201 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
3202 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
3203 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
3204 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3205 _cleanup_udev_unref_ struct udev *udev = NULL;
3206 struct udev_list_entry *first, *item;
3207 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
3208 bool is_gpt, is_mbr, multiple_generic = false;
3209 const char *pttype = NULL;
3210 blkid_partlist pl;
3211 struct stat st;
3212 unsigned i;
3213 int r;
3214
3215 assert(fd >= 0);
3216 assert(root_device);
3217 assert(home_device);
3218 assert(srv_device);
3219 assert(secondary);
3220 assert(arg_image);
3221
3222 b = blkid_new_probe();
3223 if (!b)
3224 return log_oom();
3225
3226 errno = 0;
3227 r = blkid_probe_set_device(b, fd, 0, 0);
3228 if (r != 0) {
3229 if (errno == 0)
3230 return log_oom();
3231
3232 log_error_errno(errno, "Failed to set device on blkid probe: %m");
3233 return -errno;
3234 }
3235
3236 blkid_probe_enable_partitions(b, 1);
3237 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
3238
3239 errno = 0;
3240 r = blkid_do_safeprobe(b);
3241 if (r == -2 || r == 1) {
3242 log_error("Failed to identify any partition table on\n"
3243 " %s\n"
3244 PARTITION_TABLE_BLURB, arg_image);
3245 return -EINVAL;
3246 } else if (r != 0) {
3247 if (errno == 0)
3248 errno = EIO;
3249 log_error_errno(errno, "Failed to probe: %m");
3250 return -errno;
3251 }
3252
3253 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
3254
3255 is_gpt = streq_ptr(pttype, "gpt");
3256 is_mbr = streq_ptr(pttype, "dos");
3257
3258 if (!is_gpt && !is_mbr) {
3259 log_error("No GPT or MBR partition table discovered on\n"
3260 " %s\n"
3261 PARTITION_TABLE_BLURB, arg_image);
3262 return -EINVAL;
3263 }
3264
3265 errno = 0;
3266 pl = blkid_probe_get_partitions(b);
3267 if (!pl) {
3268 if (errno == 0)
3269 return log_oom();
3270
3271 log_error("Failed to list partitions of %s", arg_image);
3272 return -errno;
3273 }
3274
3275 udev = udev_new();
3276 if (!udev)
3277 return log_oom();
3278
3279 if (fstat(fd, &st) < 0)
3280 return log_error_errno(errno, "Failed to stat block device: %m");
3281
3282 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
3283 if (!d)
3284 return log_oom();
3285
3286 for (i = 0;; i++) {
3287 int n, m;
3288
3289 if (i >= 10) {
3290 log_error("Kernel partitions never appeared.");
3291 return -ENXIO;
3292 }
3293
3294 e = udev_enumerate_new(udev);
3295 if (!e)
3296 return log_oom();
3297
3298 r = udev_enumerate_add_match_parent(e, d);
3299 if (r < 0)
3300 return log_oom();
3301
3302 r = udev_enumerate_scan_devices(e);
3303 if (r < 0)
3304 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
3305
3306 /* Count the partitions enumerated by the kernel */
3307 n = 0;
3308 first = udev_enumerate_get_list_entry(e);
3309 udev_list_entry_foreach(item, first)
3310 n++;
3311
3312 /* Count the partitions enumerated by blkid */
3313 m = blkid_partlist_numof_partitions(pl);
3314 if (n == m + 1)
3315 break;
3316 if (n > m + 1) {
3317 log_error("blkid and kernel partition list do not match.");
3318 return -EIO;
3319 }
3320 if (n < m + 1) {
3321 unsigned j;
3322
3323 /* The kernel has probed fewer partitions than
3324 * blkid? Maybe the kernel prober is still
3325 * running or it got EBUSY because udev
3326 * already opened the device. Let's reprobe
3327 * the device, which is a synchronous call
3328 * that waits until probing is complete. */
3329
3330 for (j = 0; j < 20; j++) {
3331
3332 r = ioctl(fd, BLKRRPART, 0);
3333 if (r < 0)
3334 r = -errno;
3335 if (r >= 0 || r != -EBUSY)
3336 break;
3337
3338 /* If something else has the device
3339 * open, such as an udev rule, the
3340 * ioctl will return EBUSY. Since
3341 * there's no way to wait until it
3342 * isn't busy anymore, let's just wait
3343 * a bit, and try again.
3344 *
3345 * This is really something they
3346 * should fix in the kernel! */
3347
3348 usleep(50 * USEC_PER_MSEC);
3349 }
3350
3351 if (r < 0)
3352 return log_error_errno(r, "Failed to reread partition table: %m");
3353 }
3354
3355 e = udev_enumerate_unref(e);
3356 }
3357
3358 first = udev_enumerate_get_list_entry(e);
3359 udev_list_entry_foreach(item, first) {
3360 _cleanup_udev_device_unref_ struct udev_device *q;
3361 const char *node;
3362 unsigned long long flags;
3363 blkid_partition pp;
3364 dev_t qn;
3365 int nr;
3366
3367 errno = 0;
3368 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
3369 if (!q) {
3370 if (!errno)
3371 errno = ENOMEM;
3372
3373 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
3374 return -errno;
3375 }
3376
3377 qn = udev_device_get_devnum(q);
3378 if (major(qn) == 0)
3379 continue;
3380
3381 if (st.st_rdev == qn)
3382 continue;
3383
3384 node = udev_device_get_devnode(q);
3385 if (!node)
3386 continue;
3387
3388 pp = blkid_partlist_devno_to_partition(pl, qn);
3389 if (!pp)
3390 continue;
3391
3392 flags = blkid_partition_get_flags(pp);
3393
3394 nr = blkid_partition_get_partno(pp);
3395 if (nr < 0)
3396 continue;
3397
3398 if (is_gpt) {
3399 sd_id128_t type_id;
3400 const char *stype;
3401
3402 if (flags & GPT_FLAG_NO_AUTO)
3403 continue;
3404
3405 stype = blkid_partition_get_type_string(pp);
3406 if (!stype)
3407 continue;
3408
3409 if (sd_id128_from_string(stype, &type_id) < 0)
3410 continue;
3411
3412 if (sd_id128_equal(type_id, GPT_HOME)) {
3413
3414 if (home && nr >= home_nr)
3415 continue;
3416
3417 home_nr = nr;
3418 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3419
3420 r = free_and_strdup(&home, node);
3421 if (r < 0)
3422 return log_oom();
3423
3424 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3425
3426 if (srv && nr >= srv_nr)
3427 continue;
3428
3429 srv_nr = nr;
3430 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3431
3432 r = free_and_strdup(&srv, node);
3433 if (r < 0)
3434 return log_oom();
3435 }
3436 #ifdef GPT_ROOT_NATIVE
3437 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3438
3439 if (root && nr >= root_nr)
3440 continue;
3441
3442 root_nr = nr;
3443 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3444
3445 r = free_and_strdup(&root, node);
3446 if (r < 0)
3447 return log_oom();
3448 }
3449 #endif
3450 #ifdef GPT_ROOT_SECONDARY
3451 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3452
3453 if (secondary_root && nr >= secondary_root_nr)
3454 continue;
3455
3456 secondary_root_nr = nr;
3457 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3458
3459 r = free_and_strdup(&secondary_root, node);
3460 if (r < 0)
3461 return log_oom();
3462 }
3463 #endif
3464 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3465
3466 if (generic)
3467 multiple_generic = true;
3468 else {
3469 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3470
3471 r = free_and_strdup(&generic, node);
3472 if (r < 0)
3473 return log_oom();
3474 }
3475 }
3476
3477 } else if (is_mbr) {
3478 int type;
3479
3480 if (flags != 0x80) /* Bootable flag */
3481 continue;
3482
3483 type = blkid_partition_get_type(pp);
3484 if (type != 0x83) /* Linux partition */
3485 continue;
3486
3487 if (generic)
3488 multiple_generic = true;
3489 else {
3490 generic_rw = true;
3491
3492 r = free_and_strdup(&root, node);
3493 if (r < 0)
3494 return log_oom();
3495 }
3496 }
3497 }
3498
3499 if (root) {
3500 *root_device = root;
3501 root = NULL;
3502
3503 *root_device_rw = root_rw;
3504 *secondary = false;
3505 } else if (secondary_root) {
3506 *root_device = secondary_root;
3507 secondary_root = NULL;
3508
3509 *root_device_rw = secondary_root_rw;
3510 *secondary = true;
3511 } else if (generic) {
3512
3513 /* There were no partitions with precise meanings
3514 * around, but we found generic partitions. In this
3515 * case, if there's only one, we can go ahead and boot
3516 * it, otherwise we bail out, because we really cannot
3517 * make any sense of it. */
3518
3519 if (multiple_generic) {
3520 log_error("Identified multiple bootable Linux partitions on\n"
3521 " %s\n"
3522 PARTITION_TABLE_BLURB, arg_image);
3523 return -EINVAL;
3524 }
3525
3526 *root_device = generic;
3527 generic = NULL;
3528
3529 *root_device_rw = generic_rw;
3530 *secondary = false;
3531 } else {
3532 log_error("Failed to identify root partition in disk image\n"
3533 " %s\n"
3534 PARTITION_TABLE_BLURB, arg_image);
3535 return -EINVAL;
3536 }
3537
3538 if (home) {
3539 *home_device = home;
3540 home = NULL;
3541
3542 *home_device_rw = home_rw;
3543 }
3544
3545 if (srv) {
3546 *srv_device = srv;
3547 srv = NULL;
3548
3549 *srv_device_rw = srv_rw;
3550 }
3551
3552 return 0;
3553 #else
3554 log_error("--image= is not supported, compiled without blkid support.");
3555 return -EOPNOTSUPP;
3556 #endif
3557 }
3558
/* Probes the file system type of the block device "what" with libblkid and
 * mounts it with MS_NODEV on "where", or on "where" with "directory" appended
 * if "directory" is non-NULL. The mount is read-only if "rw" is false or if
 * arg_read_only is set. Devices whose detected type is "crypto_LUKS" are
 * refused.
 *
 * Returns 0 on success and a negative errno-style value on failure; every
 * failure is logged here. When built without blkid support the function
 * always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides per-partition settings */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence reset it first so that
         * "still 0" can be interpreted as an allocation failure. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        /* We are only interested in the file system type */
        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing was detected, -2: the result is ambivalent.
                 * Either way there is no single file system type to use. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Anything else (notably -1) is a genuine probing error */
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3622
3623 static int mount_devices(
3624 const char *where,
3625 const char *root_device, bool root_device_rw,
3626 const char *home_device, bool home_device_rw,
3627 const char *srv_device, bool srv_device_rw) {
3628 int r;
3629
3630 assert(where);
3631
3632 if (root_device) {
3633 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3634 if (r < 0)
3635 return log_error_errno(r, "Failed to mount root directory: %m");
3636 }
3637
3638 if (home_device) {
3639 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3640 if (r < 0)
3641 return log_error_errno(r, "Failed to mount home directory: %m");
3642 }
3643
3644 if (srv_device) {
3645 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3646 if (r < 0)
3647 return log_error_errno(r, "Failed to mount server data directory: %m");
3648 }
3649
3650 return 0;
3651 }
3652
3653 static void loop_remove(int nr, int *image_fd) {
3654 _cleanup_close_ int control = -1;
3655 int r;
3656
3657 if (nr < 0)
3658 return;
3659
3660 if (image_fd && *image_fd >= 0) {
3661 r = ioctl(*image_fd, LOOP_CLR_FD);
3662 if (r < 0)
3663 log_debug_errno(errno, "Failed to close loop image: %m");
3664 *image_fd = safe_close(*image_fd);
3665 }
3666
3667 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3668 if (control < 0) {
3669 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3670 return;
3671 }
3672
3673 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3674 if (r < 0)
3675 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3676 }
3677
3678 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3679 int pipe_fds[2];
3680 pid_t pid;
3681
3682 assert(database);
3683 assert(key);
3684 assert(rpid);
3685
3686 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3687 return log_error_errno(errno, "Failed to allocate pipe: %m");
3688
3689 pid = fork();
3690 if (pid < 0)
3691 return log_error_errno(errno, "Failed to fork getent child: %m");
3692 else if (pid == 0) {
3693 int nullfd;
3694 char *empty_env = NULL;
3695
3696 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3697 _exit(EXIT_FAILURE);
3698
3699 if (pipe_fds[0] > 2)
3700 safe_close(pipe_fds[0]);
3701 if (pipe_fds[1] > 2)
3702 safe_close(pipe_fds[1]);
3703
3704 nullfd = open("/dev/null", O_RDWR);
3705 if (nullfd < 0)
3706 _exit(EXIT_FAILURE);
3707
3708 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3709 _exit(EXIT_FAILURE);
3710
3711 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3712 _exit(EXIT_FAILURE);
3713
3714 if (nullfd > 2)
3715 safe_close(nullfd);
3716
3717 (void) reset_all_signal_handlers();
3718 (void) reset_signal_mask();
3719 close_all_fds(NULL, 0);
3720
3721 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3722 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3723 _exit(EXIT_FAILURE);
3724 }
3725
3726 pipe_fds[1] = safe_close(pipe_fds[1]);
3727
3728 *rpid = pid;
3729
3730 return pipe_fds[0];
3731 }
3732
3733 static int change_uid_gid(char **_home) {
3734 char line[LINE_MAX], *x, *u, *g, *h;
3735 const char *word, *state;
3736 _cleanup_free_ uid_t *uids = NULL;
3737 _cleanup_free_ char *home = NULL;
3738 _cleanup_fclose_ FILE *f = NULL;
3739 _cleanup_close_ int fd = -1;
3740 unsigned n_uids = 0;
3741 size_t sz = 0, l;
3742 uid_t uid;
3743 gid_t gid;
3744 pid_t pid;
3745 int r;
3746
3747 assert(_home);
3748
3749 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3750 /* Reset everything fully to 0, just in case */
3751
3752 r = reset_uid_gid();
3753 if (r < 0)
3754 return log_error_errno(r, "Failed to become root: %m");
3755
3756 *_home = NULL;
3757 return 0;
3758 }
3759
3760 /* First, get user credentials */
3761 fd = spawn_getent("passwd", arg_user, &pid);
3762 if (fd < 0)
3763 return fd;
3764
3765 f = fdopen(fd, "r");
3766 if (!f)
3767 return log_oom();
3768 fd = -1;
3769
3770 if (!fgets(line, sizeof(line), f)) {
3771
3772 if (!ferror(f)) {
3773 log_error("Failed to resolve user %s.", arg_user);
3774 return -ESRCH;
3775 }
3776
3777 log_error_errno(errno, "Failed to read from getent: %m");
3778 return -errno;
3779 }
3780
3781 truncate_nl(line);
3782
3783 wait_for_terminate_and_warn("getent passwd", pid, true);
3784
3785 x = strchr(line, ':');
3786 if (!x) {
3787 log_error("/etc/passwd entry has invalid user field.");
3788 return -EIO;
3789 }
3790
3791 u = strchr(x+1, ':');
3792 if (!u) {
3793 log_error("/etc/passwd entry has invalid password field.");
3794 return -EIO;
3795 }
3796
3797 u++;
3798 g = strchr(u, ':');
3799 if (!g) {
3800 log_error("/etc/passwd entry has invalid UID field.");
3801 return -EIO;
3802 }
3803
3804 *g = 0;
3805 g++;
3806 x = strchr(g, ':');
3807 if (!x) {
3808 log_error("/etc/passwd entry has invalid GID field.");
3809 return -EIO;
3810 }
3811
3812 *x = 0;
3813 h = strchr(x+1, ':');
3814 if (!h) {
3815 log_error("/etc/passwd entry has invalid GECOS field.");
3816 return -EIO;
3817 }
3818
3819 h++;
3820 x = strchr(h, ':');
3821 if (!x) {
3822 log_error("/etc/passwd entry has invalid home directory field.");
3823 return -EIO;
3824 }
3825
3826 *x = 0;
3827
3828 r = parse_uid(u, &uid);
3829 if (r < 0) {
3830 log_error("Failed to parse UID of user.");
3831 return -EIO;
3832 }
3833
3834 r = parse_gid(g, &gid);
3835 if (r < 0) {
3836 log_error("Failed to parse GID of user.");
3837 return -EIO;
3838 }
3839
3840 home = strdup(h);
3841 if (!home)
3842 return log_oom();
3843
3844 /* Second, get group memberships */
3845 fd = spawn_getent("initgroups", arg_user, &pid);
3846 if (fd < 0)
3847 return fd;
3848
3849 fclose(f);
3850 f = fdopen(fd, "r");
3851 if (!f)
3852 return log_oom();
3853 fd = -1;
3854
3855 if (!fgets(line, sizeof(line), f)) {
3856 if (!ferror(f)) {
3857 log_error("Failed to resolve user %s.", arg_user);
3858 return -ESRCH;
3859 }
3860
3861 log_error_errno(errno, "Failed to read from getent: %m");
3862 return -errno;
3863 }
3864
3865 truncate_nl(line);
3866
3867 wait_for_terminate_and_warn("getent initgroups", pid, true);
3868
3869 /* Skip over the username and subsequent separator whitespace */
3870 x = line;
3871 x += strcspn(x, WHITESPACE);
3872 x += strspn(x, WHITESPACE);
3873
3874 FOREACH_WORD(word, l, x, state) {
3875 char c[l+1];
3876
3877 memcpy(c, word, l);
3878 c[l] = 0;
3879
3880 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3881 return log_oom();
3882
3883 r = parse_uid(c, &uids[n_uids++]);
3884 if (r < 0) {
3885 log_error("Failed to parse group data from getent.");
3886 return -EIO;
3887 }
3888 }
3889
3890 r = mkdir_parents(home, 0775);
3891 if (r < 0)
3892 return log_error_errno(r, "Failed to make home root directory: %m");
3893
3894 r = mkdir_safe(home, 0755, uid, gid);
3895 if (r < 0 && r != -EEXIST)
3896 return log_error_errno(r, "Failed to make home directory: %m");
3897
3898 (void) fchown(STDIN_FILENO, uid, gid);
3899 (void) fchown(STDOUT_FILENO, uid, gid);
3900 (void) fchown(STDERR_FILENO, uid, gid);
3901
3902 if (setgroups(n_uids, uids) < 0)
3903 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3904
3905 if (setresgid(gid, gid, gid) < 0)
3906 return log_error_errno(errno, "setregid() failed: %m");
3907
3908 if (setresuid(uid, uid, uid) < 0)
3909 return log_error_errno(errno, "setreuid() failed: %m");
3910
3911 if (_home) {
3912 *_home = home;
3913 home = NULL;
3914 }
3915
3916 return 0;
3917 }
3918
3919 /*
3920 * Return values:
3921 * < 0 : wait_for_terminate() failed to get the state of the
3922 * container, the container was terminated by a signal, or
3923 * failed for an unknown reason. No change is made to the
3924 * container argument.
3925 * > 0 : The program executed in the container terminated with an
3926 * error. The exit code of the program executed in the
3927 * container is returned. The container argument has been set
3928 * to CONTAINER_TERMINATED.
3929 * 0 : The container is being rebooted, has been shut down or exited
3930 * successfully. The container argument has been set to either
3931 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3932 *
3933 * That is, success is indicated by a return value of zero, and an
3934 * error is indicated by a non-zero value.
3935 */
3936 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3937 siginfo_t status;
3938 int r;
3939
3940 r = wait_for_terminate(pid, &status);
3941 if (r < 0)
3942 return log_warning_errno(r, "Failed to wait for container: %m");
3943
3944 switch (status.si_code) {
3945
3946 case CLD_EXITED:
3947 if (status.si_status == 0) {
3948 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3949
3950 } else
3951 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3952
3953 *container = CONTAINER_TERMINATED;
3954 return status.si_status;
3955
3956 case CLD_KILLED:
3957 if (status.si_status == SIGINT) {
3958
3959 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3960 *container = CONTAINER_TERMINATED;
3961 return 0;
3962
3963 } else if (status.si_status == SIGHUP) {
3964
3965 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3966 *container = CONTAINER_REBOOTED;
3967 return 0;
3968 }
3969
3970 /* CLD_KILLED fallthrough */
3971
3972 case CLD_DUMPED:
3973 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3974 return -EIO;
3975
3976 default:
3977 log_error("Container %s failed due to unknown reason.", arg_machine);
3978 return -EIO;
3979 }
3980
3981 return r;
3982 }
3983
/* Handler with the signature of a signal handler that deliberately does
 * nothing. Presumably installed where a signal must be caught without
 * taking any action; confirm at the site where it is registered. */
static void nop_handler(int sig) {
        (void) sig;
}
3985
3986 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3987 pid_t pid;
3988
3989 pid = PTR_TO_UINT32(userdata);
3990 if (pid > 0) {
3991 if (kill(pid, arg_kill_signal) >= 0) {
3992 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3993 sd_event_source_set_userdata(s, NULL);
3994 return 0;
3995 }
3996 }
3997
3998 sd_event_exit(sd_event_source_get_event(s), 0);
3999 return 0;
4000 }
4001
4002 static int determine_names(void) {
4003 int r;
4004
4005 if (arg_template && !arg_directory && arg_machine) {
4006
4007 /* If --template= was specified then we should not
4008 * search for a machine, but instead create a new one
4009 * in /var/lib/machine. */
4010
4011 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
4012 if (!arg_directory)
4013 return log_oom();
4014 }
4015
4016 if (!arg_image && !arg_directory) {
4017 if (arg_machine) {
4018 _cleanup_(image_unrefp) Image *i = NULL;
4019
4020 r = image_find(arg_machine, &i);
4021 if (r < 0)
4022 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
4023 else if (r == 0) {
4024 log_error("No image for machine '%s': %m", arg_machine);
4025 return -ENOENT;
4026 }
4027
4028 if (i->type == IMAGE_RAW)
4029 r = set_sanitized_path(&arg_image, i->path);
4030 else
4031 r = set_sanitized_path(&arg_directory, i->path);
4032 if (r < 0)
4033 return log_error_errno(r, "Invalid image directory: %m");
4034
4035 if (!arg_ephemeral)
4036 arg_read_only = arg_read_only || i->read_only;
4037 } else
4038 arg_directory = get_current_dir_name();
4039
4040 if (!arg_directory && !arg_machine) {
4041 log_error("Failed to determine path, please use -D or -i.");
4042 return -EINVAL;
4043 }
4044 }
4045
4046 if (!arg_machine) {
4047 if (arg_directory && path_equal(arg_directory, "/"))
4048 arg_machine = gethostname_malloc();
4049 else
4050 arg_machine = strdup(basename(arg_image ?: arg_directory));
4051
4052 if (!arg_machine)
4053 return log_oom();
4054
4055 hostname_cleanup(arg_machine);
4056 if (!machine_name_is_valid(arg_machine)) {
4057 log_error("Failed to determine machine name automatically, please use -M.");
4058 return -EINVAL;
4059 }
4060
4061 if (arg_ephemeral) {
4062 char *b;
4063
4064 /* Add a random suffix when this is an
4065 * ephemeral machine, so that we can run many
4066 * instances at once without manually having
4067 * to specify -M each time. */
4068
4069 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4070 return log_oom();
4071
4072 free(arg_machine);
4073 arg_machine = b;
4074 }
4075 }
4076
4077 return 0;
4078 }
4079
4080 static int determine_uid_shift(const char *directory) {
4081 int r;
4082
4083 if (!arg_userns) {
4084 arg_uid_shift = 0;
4085 return 0;
4086 }
4087
4088 if (arg_uid_shift == UID_INVALID) {
4089 struct stat st;
4090
4091 r = stat(directory, &st);
4092 if (r < 0)
4093 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4094
4095 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4096
4097 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4098 log_error("UID and GID base of %s don't match.", directory);
4099 return -EINVAL;
4100 }
4101
4102 arg_uid_range = UINT32_C(0x10000);
4103 }
4104
4105 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4106 log_error("UID base too high for UID range.");
4107 return -EINVAL;
4108 }
4109
4110 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4111 return 0;
4112 }
4113
4114 static int inner_child(
4115 Barrier *barrier,
4116 const char *directory,
4117 bool secondary,
4118 int kmsg_socket,
4119 int rtnl_socket,
4120 FDSet *fds,
4121 int argc,
4122 char *argv[]) {
4123
4124 _cleanup_free_ char *home = NULL;
4125 unsigned n_env = 2;
4126 const char *envp[] = {
4127 "PATH=" DEFAULT_PATH_SPLIT_USR,
4128 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4129 NULL, /* TERM */
4130 NULL, /* HOME */
4131 NULL, /* USER */
4132 NULL, /* LOGNAME */
4133 NULL, /* container_uuid */
4134 NULL, /* LISTEN_FDS */
4135 NULL, /* LISTEN_PID */
4136 NULL
4137 };
4138
4139 _cleanup_strv_free_ char **env_use = NULL;
4140 int r;
4141
4142 assert(barrier);
4143 assert(directory);
4144 assert(kmsg_socket >= 0);
4145
4146 if (arg_userns) {
4147 /* Tell the parent, that it now can write the UID map. */
4148 (void) barrier_place(barrier); /* #1 */
4149
4150 /* Wait until the parent wrote the UID map */
4151 if (!barrier_place_and_sync(barrier)) { /* #2 */
4152 log_error("Parent died too early");
4153 return -ESRCH;
4154 }
4155 }
4156
4157 r = mount_all(NULL, true);
4158 if (r < 0)
4159 return r;
4160
4161 /* Wait until we are cgroup-ified, so that we
4162 * can mount the right cgroup path writable */
4163 if (!barrier_place_and_sync(barrier)) { /* #3 */
4164 log_error("Parent died too early");
4165 return -ESRCH;
4166 }
4167
4168 r = mount_systemd_cgroup_writable("");
4169 if (r < 0)
4170 return r;
4171
4172 r = reset_uid_gid();
4173 if (r < 0)
4174 return log_error_errno(r, "Couldn't become new root: %m");
4175
4176 r = setup_boot_id(NULL);
4177 if (r < 0)
4178 return r;
4179
4180 r = setup_kmsg(NULL, kmsg_socket);
4181 if (r < 0)
4182 return r;
4183 kmsg_socket = safe_close(kmsg_socket);
4184
4185 umask(0022);
4186
4187 if (setsid() < 0)
4188 return log_error_errno(errno, "setsid() failed: %m");
4189
4190 if (arg_private_network)
4191 loopback_setup();
4192
4193 r = send_rtnl(rtnl_socket);
4194 if (r < 0)
4195 return r;
4196 rtnl_socket = safe_close(rtnl_socket);
4197
4198 if (drop_capabilities() < 0)
4199 return log_error_errno(errno, "drop_capabilities() failed: %m");
4200
4201 setup_hostname();
4202
4203 if (arg_personality != PERSONALITY_INVALID) {
4204 if (personality(arg_personality) < 0)
4205 return log_error_errno(errno, "personality() failed: %m");
4206 } else if (secondary) {
4207 if (personality(PER_LINUX32) < 0)
4208 return log_error_errno(errno, "personality() failed: %m");
4209 }
4210
4211 #ifdef HAVE_SELINUX
4212 if (arg_selinux_context)
4213 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4214 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4215 #endif
4216
4217 r = change_uid_gid(&home);
4218 if (r < 0)
4219 return r;
4220
4221 envp[n_env] = strv_find_prefix(environ, "TERM=");
4222 if (envp[n_env])
4223 n_env ++;
4224
4225 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4226 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4227 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4228 return log_oom();
4229
4230 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4231 char as_uuid[37];
4232
4233 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4234 return log_oom();
4235 }
4236
4237 if (fdset_size(fds) > 0) {
4238 r = fdset_cloexec(fds, false);
4239 if (r < 0)
4240 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4241
4242 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4243 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4244 return log_oom();
4245 }
4246
4247 env_use = strv_env_merge(2, envp, arg_setenv);
4248 if (!env_use)
4249 return log_oom();
4250
4251 /* Let the parent know that we are ready and
4252 * wait until the parent is ready with the
4253 * setup, too... */
4254 if (!barrier_place_and_sync(barrier)) { /* #4 */
4255 log_error("Parent died too early");
4256 return -ESRCH;
4257 }
4258
4259 /* Now, explicitly close the log, so that we
4260 * then can close all remaining fds. Closing
4261 * the log explicitly first has the benefit
4262 * that the logging subsystem knows about it,
4263 * and is thus ready to be reopened should we
4264 * need it again. Note that the other fds
4265 * closed here are at least the locking and
4266 * barrier fds. */
4267 log_close();
4268 (void) fdset_close_others(fds);
4269
4270 if (arg_boot) {
4271 char **a;
4272 size_t m;
4273
4274 /* Automatically search for the init system */
4275
4276 m = 1 + argc - optind;
4277 a = newa(char*, m + 1);
4278 memcpy(a + 1, argv + optind, m * sizeof(char*));
4279
4280 a[0] = (char*) "/usr/lib/systemd/systemd";
4281 execve(a[0], a, env_use);
4282
4283 a[0] = (char*) "/lib/systemd/systemd";
4284 execve(a[0], a, env_use);
4285
4286 a[0] = (char*) "/sbin/init";
4287 execve(a[0], a, env_use);
4288 } else if (argc > optind)
4289 execvpe(argv[optind], argv + optind, env_use);
4290 else {
4291 chdir(home ? home : "/root");
4292 execle("/bin/bash", "-bash", NULL, env_use);
4293 execle("/bin/sh", "-sh", NULL, env_use);
4294 }
4295
4296 (void) log_open();
4297 return log_error_errno(errno, "execv() failed: %m");
4298 }
4299
4300 static int outer_child(
4301 Barrier *barrier,
4302 const char *directory,
4303 const char *console,
4304 const char *root_device, bool root_device_rw,
4305 const char *home_device, bool home_device_rw,
4306 const char *srv_device, bool srv_device_rw,
4307 bool interactive,
4308 bool secondary,
4309 int pid_socket,
4310 int kmsg_socket,
4311 int rtnl_socket,
4312 int uid_shift_socket,
4313 FDSet *fds,
4314 int argc,
4315 char *argv[]) {
4316
4317 pid_t pid;
4318 ssize_t l;
4319 int r;
4320
4321 assert(barrier);
4322 assert(directory);
4323 assert(console);
4324 assert(pid_socket >= 0);
4325 assert(kmsg_socket >= 0);
4326
4327 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4328 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4329
4330 if (interactive) {
4331 close_nointr(STDIN_FILENO);
4332 close_nointr(STDOUT_FILENO);
4333 close_nointr(STDERR_FILENO);
4334
4335 r = open_terminal(console, O_RDWR);
4336 if (r != STDIN_FILENO) {
4337 if (r >= 0) {
4338 safe_close(r);
4339 r = -EINVAL;
4340 }
4341
4342 return log_error_errno(r, "Failed to open console: %m");
4343 }
4344
4345 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4346 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4347 return log_error_errno(errno, "Failed to duplicate console: %m");
4348 }
4349
4350 r = reset_audit_loginuid();
4351 if (r < 0)
4352 return r;
4353
4354 /* Mark everything as slave, so that we still
4355 * receive mounts from the real root, but don't
4356 * propagate mounts to the real root. */
4357 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4358 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4359
4360 r = mount_devices(directory,
4361 root_device, root_device_rw,
4362 home_device, home_device_rw,
4363 srv_device, srv_device_rw);
4364 if (r < 0)
4365 return r;
4366
4367 r = determine_uid_shift(directory);
4368 if (r < 0)
4369 return r;
4370
4371 if (arg_userns) {
4372 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4373 if (l < 0)
4374 return log_error_errno(errno, "Failed to send UID shift: %m");
4375 if (l != sizeof(arg_uid_shift)) {
4376 log_error("Short write while sending UID shift.");
4377 return -EIO;
4378 }
4379 }
4380
4381 /* Turn directory into bind mount */
4382 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4383 return log_error_errno(errno, "Failed to make bind mount: %m");
4384
4385 r = setup_volatile(directory);
4386 if (r < 0)
4387 return r;
4388
4389 r = setup_volatile_state(directory);
4390 if (r < 0)
4391 return r;
4392
4393 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4394 if (r < 0)
4395 return r;
4396
4397 if (arg_read_only) {
4398 r = bind_remount_recursive(directory, true);
4399 if (r < 0)
4400 return log_error_errno(r, "Failed to make tree read-only: %m");
4401 }
4402
4403 r = mount_all(directory, false);
4404 if (r < 0)
4405 return r;
4406
4407 if (copy_devnodes(directory) < 0)
4408 return r;
4409
4410 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4411
4412 if (setup_pts(directory) < 0)
4413 return r;
4414
4415 r = setup_propagate(directory);
4416 if (r < 0)
4417 return r;
4418
4419 r = setup_dev_console(directory, console);
4420 if (r < 0)
4421 return r;
4422
4423 r = setup_seccomp();
4424 if (r < 0)
4425 return r;
4426
4427 r = setup_timezone(directory);
4428 if (r < 0)
4429 return r;
4430
4431 r = setup_resolv_conf(directory);
4432 if (r < 0)
4433 return r;
4434
4435 r = setup_journal(directory);
4436 if (r < 0)
4437 return r;
4438
4439 r = mount_custom(directory);
4440 if (r < 0)
4441 return r;
4442
4443 r = mount_cgroup(directory);
4444 if (r < 0)
4445 return r;
4446
4447 r = mount_move_root(directory);
4448 if (r < 0)
4449 return log_error_errno(r, "Failed to move root directory: %m");
4450
4451 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4452 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4453 (arg_private_network ? CLONE_NEWNET : 0) |
4454 (arg_userns ? CLONE_NEWUSER : 0),
4455 NULL);
4456 if (pid < 0)
4457 return log_error_errno(errno, "Failed to fork inner child: %m");
4458
4459 if (pid == 0) {
4460 pid_socket = safe_close(pid_socket);
4461 uid_shift_socket = safe_close(uid_shift_socket);
4462
4463 /* The inner child has all namespaces that are
4464 * requested, so that we all are owned by the user if
4465 * user namespaces are turned on. */
4466
4467 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4468 if (r < 0)
4469 _exit(EXIT_FAILURE);
4470
4471 _exit(EXIT_SUCCESS);
4472 }
4473
4474 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4475 if (l < 0)
4476 return log_error_errno(errno, "Failed to send PID: %m");
4477 if (l != sizeof(pid)) {
4478 log_error("Short write while sending PID.");
4479 return -EIO;
4480 }
4481
4482 pid_socket = safe_close(pid_socket);
4483
4484 return 0;
4485 }
4486
4487 static int setup_uid_map(pid_t pid) {
4488 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4489 int r;
4490
4491 assert(pid > 1);
4492
4493 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4494 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4495 r = write_string_file(uid_map, line, 0);
4496 if (r < 0)
4497 return log_error_errno(r, "Failed to write UID map: %m");
4498
4499 /* We always assign the same UID and GID ranges */
4500 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4501 r = write_string_file(uid_map, line, 0);
4502 if (r < 0)
4503 return log_error_errno(r, "Failed to write GID map: %m");
4504
4505 return 0;
4506 }
4507
4508 static int chown_cgroup(pid_t pid) {
4509 _cleanup_free_ char *path = NULL, *fs = NULL;
4510 _cleanup_close_ int fd = -1;
4511 const char *fn;
4512 int r;
4513
4514 r = cg_pid_get_path(NULL, pid, &path);
4515 if (r < 0)
4516 return log_error_errno(r, "Failed to get container cgroup path: %m");
4517
4518 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4519 if (r < 0)
4520 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4521
4522 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4523 if (fd < 0)
4524 return log_error_errno(errno, "Failed to open %s: %m", fs);
4525
4526 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4527 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4528 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4529
4530 return 0;
4531 }
4532
4533 int main(int argc, char *argv[]) {
4534
4535 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4536 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4537 _cleanup_close_ int master = -1, image_fd = -1;
4538 _cleanup_fdset_free_ FDSet *fds = NULL;
4539 int r, n_fd_passed, loop_nr = -1;
4540 char veth_name[IFNAMSIZ];
4541 bool secondary = false, remove_subvol = false;
4542 sigset_t mask_chld;
4543 pid_t pid = 0;
4544 int ret = EXIT_SUCCESS;
4545 union in_addr_union exposed = {};
4546 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4547 bool interactive;
4548
4549 log_parse_environment();
4550 log_open();
4551
4552 r = parse_argv(argc, argv);
4553 if (r <= 0)
4554 goto finish;
4555
4556 r = determine_names();
4557 if (r < 0)
4558 goto finish;
4559
4560 if (geteuid() != 0) {
4561 log_error("Need to be root.");
4562 r = -EPERM;
4563 goto finish;
4564 }
4565
4566 n_fd_passed = sd_listen_fds(false);
4567 if (n_fd_passed > 0) {
4568 r = fdset_new_listen_fds(&fds, false);
4569 if (r < 0) {
4570 log_error_errno(r, "Failed to collect file descriptors: %m");
4571 goto finish;
4572 }
4573 }
4574
4575 if (arg_directory) {
4576 assert(!arg_image);
4577
4578 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4579 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4580 r = -EINVAL;
4581 goto finish;
4582 }
4583
4584 if (arg_ephemeral) {
4585 _cleanup_free_ char *np = NULL;
4586
4587 /* If the specified path is a mount point we
4588 * generate the new snapshot immediately
4589 * inside it under a random name. However if
4590 * the specified is not a mount point we
4591 * create the new snapshot in the parent
4592 * directory, just next to it. */
4593 r = path_is_mount_point(arg_directory, 0);
4594 if (r < 0) {
4595 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4596 goto finish;
4597 }
4598 if (r > 0)
4599 r = tempfn_random_child(arg_directory, "machine.", &np);
4600 else
4601 r = tempfn_random(arg_directory, "machine.", &np);
4602 if (r < 0) {
4603 log_error_errno(r, "Failed to generate name for snapshot: %m");
4604 goto finish;
4605 }
4606
4607 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4608 if (r < 0) {
4609 log_error_errno(r, "Failed to lock %s: %m", np);
4610 goto finish;
4611 }
4612
4613 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4614 if (r < 0) {
4615 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4616 goto finish;
4617 }
4618
4619 free(arg_directory);
4620 arg_directory = np;
4621 np = NULL;
4622
4623 remove_subvol = true;
4624
4625 } else {
4626 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4627 if (r == -EBUSY) {
4628 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4629 goto finish;
4630 }
4631 if (r < 0) {
4632 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4633 return r;
4634 }
4635
4636 if (arg_template) {
4637 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4638 if (r == -EEXIST) {
4639 if (!arg_quiet)
4640 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4641 } else if (r < 0) {
4642 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4643 goto finish;
4644 } else {
4645 if (!arg_quiet)
4646 log_info("Populated %s from template %s.", arg_directory, arg_template);
4647 }
4648 }
4649 }
4650
4651 if (arg_boot) {
4652 if (path_is_os_tree(arg_directory) <= 0) {
4653 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4654 r = -EINVAL;
4655 goto finish;
4656 }
4657 } else {
4658 const char *p;
4659
4660 p = strjoina(arg_directory,
4661 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4662 if (access(p, F_OK) < 0) {
4663 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
4664 r = -EINVAL;
4665 goto finish;
4666 }
4667 }
4668
4669 } else {
4670 char template[] = "/tmp/nspawn-root-XXXXXX";
4671
4672 assert(arg_image);
4673 assert(!arg_template);
4674
4675 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4676 if (r == -EBUSY) {
4677 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4678 goto finish;
4679 }
4680 if (r < 0) {
4681 r = log_error_errno(r, "Failed to create image lock: %m");
4682 goto finish;
4683 }
4684
4685 if (!mkdtemp(template)) {
4686 log_error_errno(errno, "Failed to create temporary directory: %m");
4687 r = -errno;
4688 goto finish;
4689 }
4690
4691 arg_directory = strdup(template);
4692 if (!arg_directory) {
4693 r = log_oom();
4694 goto finish;
4695 }
4696
4697 image_fd = setup_image(&device_path, &loop_nr);
4698 if (image_fd < 0) {
4699 r = image_fd;
4700 goto finish;
4701 }
4702
4703 r = dissect_image(image_fd,
4704 &root_device, &root_device_rw,
4705 &home_device, &home_device_rw,
4706 &srv_device, &srv_device_rw,
4707 &secondary);
4708 if (r < 0)
4709 goto finish;
4710 }
4711
4712 r = custom_mounts_prepare();
4713 if (r < 0)
4714 goto finish;
4715
4716 interactive =
4717 isatty(STDIN_FILENO) > 0 &&
4718 isatty(STDOUT_FILENO) > 0;
4719
4720 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4721 if (master < 0) {
4722 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4723 goto finish;
4724 }
4725
4726 r = ptsname_malloc(master, &console);
4727 if (r < 0) {
4728 r = log_error_errno(r, "Failed to determine tty name: %m");
4729 goto finish;
4730 }
4731
4732 if (unlockpt(master) < 0) {
4733 r = log_error_errno(errno, "Failed to unlock tty: %m");
4734 goto finish;
4735 }
4736
4737 if (!arg_quiet)
4738 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4739 arg_machine, arg_image ?: arg_directory);
4740
4741 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4742
4743 assert_se(sigemptyset(&mask_chld) == 0);
4744 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4745
4746 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4747 r = log_error_errno(errno, "Failed to become subreaper: %m");
4748 goto finish;
4749 }
4750
4751 for (;;) {
4752 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4753 uid_shift_socket_pair[2] = { -1, -1 };
4754 ContainerStatus container_status;
4755 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4756 static const struct sigaction sa = {
4757 .sa_handler = nop_handler,
4758 .sa_flags = SA_NOCLDSTOP,
4759 };
4760 int ifi = 0;
4761 ssize_t l;
4762 _cleanup_event_unref_ sd_event *event = NULL;
4763 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4764 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4765 char last_char = 0;
4766
4767 r = barrier_create(&barrier);
4768 if (r < 0) {
4769 log_error_errno(r, "Cannot initialize IPC barrier: %m");
4770 goto finish;
4771 }
4772
4773 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4774 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4775 goto finish;
4776 }
4777
4778 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4779 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4780 goto finish;
4781 }
4782
4783 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4784 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4785 goto finish;
4786 }
4787
4788 if (arg_userns)
4789 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4790 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4791 goto finish;
4792 }
4793
4794 /* Child can be killed before execv(), so handle SIGCHLD
4795 * in order to interrupt parent's blocking calls and
4796 * give it a chance to call wait() and terminate. */
4797 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4798 if (r < 0) {
4799 r = log_error_errno(errno, "Failed to change the signal mask: %m");
4800 goto finish;
4801 }
4802
4803 r = sigaction(SIGCHLD, &sa, NULL);
4804 if (r < 0) {
4805 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4806 goto finish;
4807 }
4808
4809 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
4810 if (pid < 0) {
4811 if (errno == EINVAL)
4812 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4813 else
4814 r = log_error_errno(errno, "clone() failed: %m");
4815
4816 goto finish;
4817 }
4818
4819 if (pid == 0) {
4820 /* The outer child only has a file system namespace. */
4821 barrier_set_role(&barrier, BARRIER_CHILD);
4822
4823 master = safe_close(master);
4824
4825 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4826 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4827 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4828 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4829
4830 (void) reset_all_signal_handlers();
4831 (void) reset_signal_mask();
4832
4833 r = outer_child(&barrier,
4834 arg_directory,
4835 console,
4836 root_device, root_device_rw,
4837 home_device, home_device_rw,
4838 srv_device, srv_device_rw,
4839 interactive,
4840 secondary,
4841 pid_socket_pair[1],
4842 kmsg_socket_pair[1],
4843 rtnl_socket_pair[1],
4844 uid_shift_socket_pair[1],
4845 fds,
4846 argc, argv);
4847 if (r < 0)
4848 _exit(EXIT_FAILURE);
4849
4850 _exit(EXIT_SUCCESS);
4851 }
4852
4853 barrier_set_role(&barrier, BARRIER_PARENT);
4854
4855 fdset_free(fds);
4856 fds = NULL;
4857
4858 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4859 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4860 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4861
4862 /* Wait for the outer child. */
4863 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4864 if (r < 0)
4865 goto finish;
4866 if (r != 0) {
4867 r = -EIO;
4868 goto finish;
4869 }
4870 pid = 0;
4871
4872 /* And now retrieve the PID of the inner child. */
4873 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4874 if (l < 0) {
4875 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4876 goto finish;
4877 }
4878 if (l != sizeof(pid)) {
4879 log_error("Short read while reading inner child PID: %m");
4880 r = EIO;
4881 goto finish;
4882 }
4883
4884 log_debug("Init process invoked as PID " PID_FMT, pid);
4885
4886 if (arg_userns) {
4887 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4888 log_error("Child died too early.");
4889 r = -ESRCH;
4890 goto finish;
4891 }
4892
4893 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4894 if (l < 0) {
4895 r = log_error_errno(errno, "Failed to read UID shift: %m");
4896 goto finish;
4897 }
4898 if (l != sizeof(arg_uid_shift)) {
4899 log_error("Short read while reading UID shift: %m");
4900 r = EIO;
4901 goto finish;
4902 }
4903
4904 r = setup_uid_map(pid);
4905 if (r < 0)
4906 goto finish;
4907
4908 (void) barrier_place(&barrier); /* #2 */
4909 }
4910
4911 r = move_network_interfaces(pid);
4912 if (r < 0)
4913 goto finish;
4914
4915 r = setup_veth(pid, veth_name, &ifi);
4916 if (r < 0)
4917 goto finish;
4918
4919 r = setup_bridge(veth_name, &ifi);
4920 if (r < 0)
4921 goto finish;
4922
4923 r = setup_macvlan(pid);
4924 if (r < 0)
4925 goto finish;
4926
4927 r = setup_ipvlan(pid);
4928 if (r < 0)
4929 goto finish;
4930
4931 r = register_machine(pid, ifi);
4932 if (r < 0)
4933 goto finish;
4934
4935 r = chown_cgroup(pid);
4936 if (r < 0)
4937 goto finish;
4938
4939 /* Notify the child that the parent is ready with all
4940 * its setup (including cgroup-ification), and that
4941 * the child can now hand over control to the code to
4942 * run inside the container. */
4943 (void) barrier_place(&barrier); /* #3 */
4944
4945 /* Block SIGCHLD here, before notifying child.
4946 * process_pty() will handle it with the other signals. */
4947 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4948
4949 /* Reset signal to default */
4950 r = default_signals(SIGCHLD, -1);
4951 if (r < 0) {
4952 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4953 goto finish;
4954 }
4955
4956 /* Let the child know that we are ready and wait that the child is completely ready now. */
4957 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4958 log_error("Client died too early.");
4959 r = -ESRCH;
4960 goto finish;
4961 }
4962
4963 sd_notifyf(false,
4964 "READY=1\n"
4965 "STATUS=Container running.\n"
4966 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4967
4968 r = sd_event_new(&event);
4969 if (r < 0) {
4970 log_error_errno(r, "Failed to get default event source: %m");
4971 goto finish;
4972 }
4973
4974 if (arg_kill_signal > 0) {
4975 /* Try to kill the init system on SIGINT or SIGTERM */
4976 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4977 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4978 } else {
4979 /* Immediately exit */
4980 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4981 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4982 }
4983
4984 /* simply exit on sigchld */
4985 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4986
4987 if (arg_expose_ports) {
4988 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4989 if (r < 0)
4990 goto finish;
4991
4992 (void) expose_ports(rtnl, &exposed);
4993 }
4994
4995 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4996
4997 r = pty_forward_new(event, master, true, !interactive, &forward);
4998 if (r < 0) {
4999 log_error_errno(r, "Failed to create PTY forwarder: %m");
5000 goto finish;
5001 }
5002
5003 r = sd_event_loop(event);
5004 if (r < 0) {
5005 log_error_errno(r, "Failed to run event loop: %m");
5006 goto finish;
5007 }
5008
5009 pty_forward_get_last_char(forward, &last_char);
5010
5011 forward = pty_forward_free(forward);
5012
5013 if (!arg_quiet && last_char != '\n')
5014 putc('\n', stdout);
5015
5016 /* Kill if it is not dead yet anyway */
5017 terminate_machine(pid);
5018
5019 /* Normally redundant, but better safe than sorry */
5020 kill(pid, SIGKILL);
5021
5022 r = wait_for_container(pid, &container_status);
5023 pid = 0;
5024
5025 if (r < 0)
5026 /* We failed to wait for the container, or the
5027 * container exited abnormally */
5028 goto finish;
5029 else if (r > 0 || container_status == CONTAINER_TERMINATED){
5030 /* The container exited with a non-zero
5031 * status, or with zero status and no reboot
5032 * was requested. */
5033 ret = r;
5034 break;
5035 }
5036
5037 /* CONTAINER_REBOOTED, loop again */
5038
5039 if (arg_keep_unit) {
5040 /* Special handling if we are running as a
5041 * service: instead of simply restarting the
5042 * machine we want to restart the entire
5043 * service, so let's inform systemd about this
5044 * with the special exit code 133. The service
5045 * file uses RestartForceExitStatus=133 so
5046 * that this results in a full nspawn
5047 * restart. This is necessary since we might
5048 * have cgroup parameters set we want to have
5049 * flushed out. */
5050 ret = 133;
5051 r = 0;
5052 break;
5053 }
5054
5055 flush_ports(&exposed);
5056 }
5057
5058 finish:
5059 sd_notify(false,
5060 "STOPPING=1\n"
5061 "STATUS=Terminating...");
5062
5063 if (pid > 0)
5064 kill(pid, SIGKILL);
5065
5066 /* Try to flush whatever is still queued in the pty */
5067 if (master >= 0)
5068 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5069
5070 loop_remove(loop_nr, &image_fd);
5071
5072 if (remove_subvol && arg_directory) {
5073 int k;
5074
5075 k = btrfs_subvol_remove(arg_directory, true);
5076 if (k < 0)
5077 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5078 }
5079
5080 if (arg_machine) {
5081 const char *p;
5082
5083 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5084 (void) rm_rf(p, REMOVE_ROOT);
5085 }
5086
5087 free(arg_directory);
5088 free(arg_template);
5089 free(arg_image);
5090 free(arg_machine);
5091 free(arg_user);
5092 strv_free(arg_setenv);
5093 strv_free(arg_network_interfaces);
5094 strv_free(arg_network_macvlan);
5095 strv_free(arg_network_ipvlan);
5096 custom_mount_free_all();
5097
5098 flush_ports(&exposed);
5099
5100 while (arg_expose_ports) {
5101 ExposePort *p = arg_expose_ports;
5102 LIST_REMOVE(ports, arg_expose_ports, p);
5103 free(p);
5104 }
5105
5106 return r < 0 ? EXIT_FAILURE : ret;
5107 }