1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
44 #include <selinux/selinux.h>
52 #include <blkid/blkid.h>
55 #include "sd-daemon.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
66 #include "cgroup-util.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
75 #include "bus-error.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
82 #include "siphash24.h"
84 #include "base-filesystem.h"
86 #include "event-util.h"
87 #include "capability.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
102 #include "seccomp-util.h"
105 typedef struct ExposePort
{
108 uint16_t container_port
;
109 LIST_FIELDS(struct ExposePort
, ports
);
112 typedef enum ContainerStatus
{
113 CONTAINER_TERMINATED
,
117 typedef enum LinkJournal
{
124 typedef enum Volatile
{
130 typedef enum CustomMountType
{
133 CUSTOM_MOUNT_OVERLAY
,
136 typedef struct CustomMount
{
137 CustomMountType type
;
139 char *source
; /* for overlayfs this is the upper directory */
146 static char *arg_directory
= NULL
;
147 static char *arg_template
= NULL
;
148 static char *arg_user
= NULL
;
149 static sd_id128_t arg_uuid
= {};
150 static char *arg_machine
= NULL
;
151 static const char *arg_selinux_context
= NULL
;
152 static const char *arg_selinux_apifs_context
= NULL
;
153 static const char *arg_slice
= NULL
;
154 static bool arg_private_network
= false;
155 static bool arg_read_only
= false;
156 static bool arg_boot
= false;
157 static bool arg_ephemeral
= false;
158 static LinkJournal arg_link_journal
= LINK_AUTO
;
159 static bool arg_link_journal_try
= false;
160 static uint64_t arg_retain
=
161 (1ULL << CAP_CHOWN
) |
162 (1ULL << CAP_DAC_OVERRIDE
) |
163 (1ULL << CAP_DAC_READ_SEARCH
) |
164 (1ULL << CAP_FOWNER
) |
165 (1ULL << CAP_FSETID
) |
166 (1ULL << CAP_IPC_OWNER
) |
168 (1ULL << CAP_LEASE
) |
169 (1ULL << CAP_LINUX_IMMUTABLE
) |
170 (1ULL << CAP_NET_BIND_SERVICE
) |
171 (1ULL << CAP_NET_BROADCAST
) |
172 (1ULL << CAP_NET_RAW
) |
173 (1ULL << CAP_SETGID
) |
174 (1ULL << CAP_SETFCAP
) |
175 (1ULL << CAP_SETPCAP
) |
176 (1ULL << CAP_SETUID
) |
177 (1ULL << CAP_SYS_ADMIN
) |
178 (1ULL << CAP_SYS_CHROOT
) |
179 (1ULL << CAP_SYS_NICE
) |
180 (1ULL << CAP_SYS_PTRACE
) |
181 (1ULL << CAP_SYS_TTY_CONFIG
) |
182 (1ULL << CAP_SYS_RESOURCE
) |
183 (1ULL << CAP_SYS_BOOT
) |
184 (1ULL << CAP_AUDIT_WRITE
) |
185 (1ULL << CAP_AUDIT_CONTROL
) |
187 static CustomMount
*arg_custom_mounts
= NULL
;
188 static unsigned arg_n_custom_mounts
= 0;
189 static char **arg_setenv
= NULL
;
190 static bool arg_quiet
= false;
191 static bool arg_share_system
= false;
192 static bool arg_register
= true;
193 static bool arg_keep_unit
= false;
194 static char **arg_network_interfaces
= NULL
;
195 static char **arg_network_macvlan
= NULL
;
196 static char **arg_network_ipvlan
= NULL
;
197 static bool arg_network_veth
= false;
198 static const char *arg_network_bridge
= NULL
;
199 static unsigned long arg_personality
= PERSONALITY_INVALID
;
200 static char *arg_image
= NULL
;
201 static Volatile arg_volatile
= VOLATILE_NO
;
202 static ExposePort
*arg_expose_ports
= NULL
;
203 static char **arg_property
= NULL
;
204 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
205 static bool arg_userns
= false;
206 static int arg_kill_signal
= 0;
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name
);
278 static CustomMount
* custom_mount_add(CustomMountType t
) {
279 CustomMount
*c
, *ret
;
281 c
= realloc(arg_custom_mounts
, (arg_n_custom_mounts
+ 1) * sizeof(CustomMount
));
285 arg_custom_mounts
= c
;
286 ret
= arg_custom_mounts
+ arg_n_custom_mounts
;
287 arg_n_custom_mounts
++;
289 *ret
= (CustomMount
) { .type
= t
};
294 static void custom_mount_free_all(void) {
297 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
298 CustomMount
*m
= &arg_custom_mounts
[i
];
301 free(m
->destination
);
305 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
312 free(arg_custom_mounts
);
313 arg_custom_mounts
= NULL
;
314 arg_n_custom_mounts
= 0;
317 static int custom_mount_compare(const void *a
, const void *b
) {
318 const CustomMount
*x
= a
, *y
= b
;
321 r
= path_compare(x
->destination
, y
->destination
);
325 if (x
->type
< y
->type
)
327 if (x
->type
> y
->type
)
333 static int custom_mounts_prepare(void) {
337 /* Ensure the mounts are applied prefix first. */
338 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
340 /* Allocate working directories for the overlay file systems that need it */
341 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
342 CustomMount
*m
= &arg_custom_mounts
[i
];
344 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
353 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
355 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
361 static int set_sanitized_path(char **b
, const char *path
) {
367 p
= canonicalize_file_name(path
);
372 p
= path_make_absolute_cwd(path
);
378 *b
= path_kill_slashes(p
);
382 static int parse_argv(int argc
, char *argv
[]) {
401 ARG_NETWORK_INTERFACE
,
413 static const struct option options
[] = {
414 { "help", no_argument
, NULL
, 'h' },
415 { "version", no_argument
, NULL
, ARG_VERSION
},
416 { "directory", required_argument
, NULL
, 'D' },
417 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
418 { "ephemeral", no_argument
, NULL
, 'x' },
419 { "user", required_argument
, NULL
, 'u' },
420 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
421 { "boot", no_argument
, NULL
, 'b' },
422 { "uuid", required_argument
, NULL
, ARG_UUID
},
423 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
424 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
425 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
426 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
427 { "bind", required_argument
, NULL
, ARG_BIND
},
428 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
429 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
430 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
431 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
432 { "machine", required_argument
, NULL
, 'M' },
433 { "slice", required_argument
, NULL
, 'S' },
434 { "setenv", required_argument
, NULL
, ARG_SETENV
},
435 { "selinux-context", required_argument
, NULL
, 'Z' },
436 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
437 { "quiet", no_argument
, NULL
, 'q' },
438 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
439 { "register", required_argument
, NULL
, ARG_REGISTER
},
440 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
441 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
442 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
443 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
444 { "network-veth", no_argument
, NULL
, 'n' },
445 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
446 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
447 { "image", required_argument
, NULL
, 'i' },
448 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
449 { "port", required_argument
, NULL
, 'p' },
450 { "property", required_argument
, NULL
, ARG_PROPERTY
},
451 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
452 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
457 uint64_t plus
= 0, minus
= 0;
462 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
471 puts(PACKAGE_STRING
);
472 puts(SYSTEMD_FEATURES
);
476 r
= set_sanitized_path(&arg_directory
, optarg
);
478 return log_error_errno(r
, "Invalid root directory: %m");
483 r
= set_sanitized_path(&arg_template
, optarg
);
485 return log_error_errno(r
, "Invalid template directory: %m");
490 r
= set_sanitized_path(&arg_image
, optarg
);
492 return log_error_errno(r
, "Invalid image path: %m");
497 arg_ephemeral
= true;
502 arg_user
= strdup(optarg
);
508 case ARG_NETWORK_BRIDGE
:
509 arg_network_bridge
= optarg
;
514 arg_network_veth
= true;
515 arg_private_network
= true;
518 case ARG_NETWORK_INTERFACE
:
519 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
522 arg_private_network
= true;
525 case ARG_NETWORK_MACVLAN
:
526 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
529 arg_private_network
= true;
532 case ARG_NETWORK_IPVLAN
:
533 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
538 case ARG_PRIVATE_NETWORK
:
539 arg_private_network
= true;
547 r
= sd_id128_from_string(optarg
, &arg_uuid
);
549 log_error("Invalid UUID: %s", optarg
);
559 if (isempty(optarg
)) {
563 if (!machine_name_is_valid(optarg
)) {
564 log_error("Invalid machine name: %s", optarg
);
568 r
= free_and_strdup(&arg_machine
, optarg
);
576 arg_selinux_context
= optarg
;
580 arg_selinux_apifs_context
= optarg
;
584 arg_read_only
= true;
588 case ARG_DROP_CAPABILITY
: {
589 const char *state
, *word
;
592 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
593 _cleanup_free_
char *t
;
595 t
= strndup(word
, length
);
599 if (streq(t
, "all")) {
600 if (c
== ARG_CAPABILITY
)
601 plus
= (uint64_t) -1;
603 minus
= (uint64_t) -1;
607 cap
= capability_from_name(t
);
609 log_error("Failed to parse capability %s.", t
);
613 if (c
== ARG_CAPABILITY
)
614 plus
|= 1ULL << (uint64_t) cap
;
616 minus
|= 1ULL << (uint64_t) cap
;
624 arg_link_journal
= LINK_GUEST
;
625 arg_link_journal_try
= true;
628 case ARG_LINK_JOURNAL
:
629 if (streq(optarg
, "auto")) {
630 arg_link_journal
= LINK_AUTO
;
631 arg_link_journal_try
= false;
632 } else if (streq(optarg
, "no")) {
633 arg_link_journal
= LINK_NO
;
634 arg_link_journal_try
= false;
635 } else if (streq(optarg
, "guest")) {
636 arg_link_journal
= LINK_GUEST
;
637 arg_link_journal_try
= false;
638 } else if (streq(optarg
, "host")) {
639 arg_link_journal
= LINK_HOST
;
640 arg_link_journal_try
= false;
641 } else if (streq(optarg
, "try-guest")) {
642 arg_link_journal
= LINK_GUEST
;
643 arg_link_journal_try
= true;
644 } else if (streq(optarg
, "try-host")) {
645 arg_link_journal
= LINK_HOST
;
646 arg_link_journal_try
= true;
648 log_error("Failed to parse link journal mode %s", optarg
);
656 _cleanup_free_
char *source
= NULL
, *destination
= NULL
;
660 e
= strchr(optarg
, ':');
662 source
= strndup(optarg
, e
- optarg
);
663 destination
= strdup(e
+ 1);
665 source
= strdup(optarg
);
666 destination
= strdup(optarg
);
669 if (!source
|| !destination
)
672 if (!path_is_absolute(source
) || !path_is_absolute(destination
)) {
673 log_error("Invalid bind mount specification: %s", optarg
);
677 m
= custom_mount_add(CUSTOM_MOUNT_BIND
);
682 m
->destination
= destination
;
683 m
->read_only
= c
== ARG_BIND_RO
;
685 source
= destination
= NULL
;
691 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
695 e
= strchr(optarg
, ':');
697 path
= strndup(optarg
, e
- optarg
);
698 opts
= strdup(e
+ 1);
700 path
= strdup(optarg
);
701 opts
= strdup("mode=0755");
707 if (!path_is_absolute(path
)) {
708 log_error("Invalid tmpfs specification: %s", optarg
);
712 m
= custom_mount_add(CUSTOM_MOUNT_TMPFS
);
716 m
->destination
= path
;
725 case ARG_OVERLAY_RO
: {
726 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
727 _cleanup_strv_free_
char **lower
= NULL
;
732 lower
= strv_split(optarg
, ":");
736 STRV_FOREACH(i
, lower
) {
737 if (!path_is_absolute(*i
)) {
738 log_error("Overlay path %s is not absolute.", *i
);
746 log_error("--overlay= needs at least two colon-separated directories specified.");
751 /* If two parameters are specified,
752 * the first one is the lower, the
753 * second one the upper directory. And
754 * we'll also define the
755 * destination mount point the same as
760 destination
= strdup(upper
);
765 upper
= lower
[n
- 2];
766 destination
= lower
[n
- 1];
770 m
= custom_mount_add(CUSTOM_MOUNT_OVERLAY
);
774 m
->destination
= destination
;
777 m
->read_only
= c
== ARG_OVERLAY_RO
;
779 upper
= destination
= NULL
;
788 if (!env_assignment_is_valid(optarg
)) {
789 log_error("Environment variable assignment '%s' is not valid.", optarg
);
793 n
= strv_env_set(arg_setenv
, optarg
);
797 strv_free(arg_setenv
);
806 case ARG_SHARE_SYSTEM
:
807 arg_share_system
= true;
811 r
= parse_boolean(optarg
);
813 log_error("Failed to parse --register= argument: %s", optarg
);
821 arg_keep_unit
= true;
824 case ARG_PERSONALITY
:
826 arg_personality
= personality_from_string(optarg
);
827 if (arg_personality
== PERSONALITY_INVALID
) {
828 log_error("Unknown or unsupported personality '%s'.", optarg
);
837 arg_volatile
= VOLATILE_YES
;
839 r
= parse_boolean(optarg
);
841 if (streq(optarg
, "state"))
842 arg_volatile
= VOLATILE_STATE
;
844 log_error("Failed to parse --volatile= argument: %s", optarg
);
848 arg_volatile
= r
? VOLATILE_YES
: VOLATILE_NO
;
854 const char *split
, *e
;
855 uint16_t container_port
, host_port
;
859 if ((e
= startswith(optarg
, "tcp:")))
860 protocol
= IPPROTO_TCP
;
861 else if ((e
= startswith(optarg
, "udp:")))
862 protocol
= IPPROTO_UDP
;
865 protocol
= IPPROTO_TCP
;
868 split
= strchr(e
, ':');
870 char v
[split
- e
+ 1];
872 memcpy(v
, e
, split
- e
);
875 r
= safe_atou16(v
, &host_port
);
876 if (r
< 0 || host_port
<= 0) {
877 log_error("Failed to parse host port: %s", optarg
);
881 r
= safe_atou16(split
+ 1, &container_port
);
883 r
= safe_atou16(e
, &container_port
);
884 host_port
= container_port
;
887 if (r
< 0 || container_port
<= 0) {
888 log_error("Failed to parse host port: %s", optarg
);
892 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
893 if (p
->protocol
== protocol
&& p
->host_port
== host_port
) {
894 log_error("Duplicate port specification: %s", optarg
);
899 p
= new(ExposePort
, 1);
903 p
->protocol
= protocol
;
904 p
->host_port
= host_port
;
905 p
->container_port
= container_port
;
907 LIST_PREPEND(ports
, arg_expose_ports
, p
);
913 if (strv_extend(&arg_property
, optarg
) < 0)
918 case ARG_PRIVATE_USERS
:
920 _cleanup_free_
char *buffer
= NULL
;
921 const char *range
, *shift
;
923 range
= strchr(optarg
, ':');
925 buffer
= strndup(optarg
, range
- optarg
);
931 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
932 log_error("Failed to parse UID range: %s", range
);
938 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
939 log_error("Failed to parse UID: %s", optarg
);
947 case ARG_KILL_SIGNAL
:
948 arg_kill_signal
= signal_from_string_try_harder(optarg
);
949 if (arg_kill_signal
< 0) {
950 log_error("Cannot parse signal: %s", optarg
);
960 assert_not_reached("Unhandled option");
963 if (arg_share_system
)
964 arg_register
= false;
966 if (arg_boot
&& arg_share_system
) {
967 log_error("--boot and --share-system may not be combined.");
971 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
972 log_error("--keep-unit may not be used when invoked from a user session.");
976 if (arg_directory
&& arg_image
) {
977 log_error("--directory= and --image= may not be combined.");
981 if (arg_template
&& arg_image
) {
982 log_error("--template= and --image= may not be combined.");
986 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
987 log_error("--template= needs --directory= or --machine=.");
991 if (arg_ephemeral
&& arg_template
) {
992 log_error("--ephemeral and --template= may not be combined.");
996 if (arg_ephemeral
&& arg_image
) {
997 log_error("--ephemeral and --image= may not be combined.");
1001 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
1002 log_error("--ephemeral and --link-journal= may not be combined.");
1006 if (arg_volatile
!= VOLATILE_NO
&& arg_read_only
) {
1007 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1011 if (arg_expose_ports
&& !arg_private_network
) {
1012 log_error("Cannot use --port= without private networking.");
1016 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
1017 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
1019 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
1021 if (arg_boot
&& arg_kill_signal
<= 0)
1022 arg_kill_signal
= SIGRTMIN
+3;
1027 static int tmpfs_patch_options(const char *options
, char **ret
) {
1030 if (arg_userns
&& arg_uid_shift
!= 0) {
1033 (void) asprintf(&buf
, "%s,uid=" UID_FMT
",gid=" UID_FMT
, options
, arg_uid_shift
, arg_uid_shift
);
1035 (void) asprintf(&buf
, "uid=" UID_FMT
",gid=" UID_FMT
, arg_uid_shift
, arg_uid_shift
);
1043 if (arg_selinux_apifs_context
) {
1047 t
= strjoin(options
, ",context=\"", arg_selinux_apifs_context
, "\"", NULL
);
1049 t
= strjoin("context=\"", arg_selinux_apifs_context
, "\"", NULL
);
1064 static int mount_all(const char *dest
, bool userns
) {
1066 typedef struct MountPoint
{
1070 const char *options
;
1071 unsigned long flags
;
1076 static const MountPoint mount_table
[] = {
1077 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, true },
1078 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, true, true }, /* Bind mount first */
1079 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, true, true }, /* Then, make it r/o */
1080 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, false },
1081 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, true, false },
1082 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true, false },
1083 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1084 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1085 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME
, true, false },
1087 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, false, false }, /* Bind mount first */
1088 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, false, false }, /* Then, make it r/o */
1095 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
1096 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
1099 if (userns
!= mount_table
[k
].userns
)
1102 where
= prefix_root(dest
, mount_table
[k
].where
);
1106 r
= path_is_mount_point(where
, AT_SYMLINK_FOLLOW
);
1107 if (r
< 0 && r
!= -ENOENT
)
1108 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
1110 /* Skip this entry if it is not a remount. */
1111 if (mount_table
[k
].what
&& r
> 0)
1114 r
= mkdir_p(where
, 0755);
1116 if (mount_table
[k
].fatal
)
1117 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
1119 log_warning_errno(r
, "Failed to create directory %s: %m", where
);
1123 o
= mount_table
[k
].options
;
1124 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
1125 r
= tmpfs_patch_options(o
, &options
);
1132 if (mount(mount_table
[k
].what
,
1134 mount_table
[k
].type
,
1135 mount_table
[k
].flags
,
1138 if (mount_table
[k
].fatal
)
1139 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1141 log_warning_errno(errno
, "mount(%s) failed, ignoring: %m", where
);
1148 static int mount_bind(const char *dest
, CustomMount
*m
) {
1149 struct stat source_st
, dest_st
;
1155 if (stat(m
->source
, &source_st
) < 0)
1156 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
1158 where
= prefix_roota(dest
, m
->destination
);
1160 if (stat(where
, &dest_st
) >= 0) {
1161 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
1162 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
1166 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
1167 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
1171 } else if (errno
== ENOENT
) {
1172 r
= mkdir_parents_label(where
, 0755);
1174 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
1176 log_error_errno(errno
, "Failed to stat %s: %m", where
);
1180 /* Create the mount point. Any non-directory file can be
1181 * mounted on any non-directory file (regular, fifo, socket,
1184 if (S_ISDIR(source_st
.st_mode
))
1185 r
= mkdir_label(where
, 0755);
1188 if (r
< 0 && r
!= -EEXIST
)
1189 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
1191 if (mount(m
->source
, where
, NULL
, MS_BIND
, NULL
) < 0)
1192 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1195 r
= bind_remount_recursive(where
, true);
1197 return log_error_errno(r
, "Read-only bind mount failed: %m");
1203 static int mount_tmpfs(const char *dest
, CustomMount
*m
) {
1204 const char *where
, *options
;
1205 _cleanup_free_
char *buf
= NULL
;
1211 where
= prefix_roota(dest
, m
->destination
);
1213 r
= mkdir_p_label(where
, 0755);
1214 if (r
< 0 && r
!= -EEXIST
)
1215 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
1217 r
= tmpfs_patch_options(m
->options
, &buf
);
1220 options
= r
> 0 ? buf
: m
->options
;
1222 if (mount("tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
) < 0)
1223 return log_error_errno(errno
, "tmpfs mount to %s failed: %m", where
);
1228 static int mount_overlay(const char *dest
, CustomMount
*m
) {
1229 _cleanup_free_
char *lower
= NULL
;
1230 const char *where
, *options
;
1236 where
= prefix_roota(dest
, m
->destination
);
1238 r
= mkdir_label(where
, 0755);
1239 if (r
< 0 && r
!= -EEXIST
)
1240 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
1242 (void) mkdir_p_label(m
->source
, 0755);
1244 strv_reverse(m
->lower
);
1245 lower
= strv_join(m
->lower
, ":");
1246 strv_reverse(m
->lower
);
1251 options
= strjoina("lowerdir=", m
->source
, ":", lower
);
1253 assert(m
->work_dir
);
1254 (void) mkdir_label(m
->work_dir
, 0700);
1256 options
= strjoina("lowerdir=", lower
, ",upperdir=", m
->source
, ",workdir=", m
->work_dir
);
1259 if (mount("overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
) < 0)
1260 return log_error_errno(errno
, "overlay mount to %s failed: %m", where
);
1265 static int mount_custom(const char *dest
) {
1271 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
1272 CustomMount
*m
= &arg_custom_mounts
[i
];
1276 case CUSTOM_MOUNT_BIND
:
1277 r
= mount_bind(dest
, m
);
1280 case CUSTOM_MOUNT_TMPFS
:
1281 r
= mount_tmpfs(dest
, m
);
1284 case CUSTOM_MOUNT_OVERLAY
:
1285 r
= mount_overlay(dest
, m
);
1289 assert_not_reached("Unknown custom mount type");
1299 static int mount_cgroup_hierarchy(const char *dest
, const char *controller
, const char *hierarchy
, bool read_only
) {
1303 to
= strjoina(dest
, "/sys/fs/cgroup/", hierarchy
);
1305 r
= path_is_mount_point(to
, 0);
1306 if (r
< 0 && r
!= -ENOENT
)
1307 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
1313 /* The superblock mount options of the mount point need to be
1314 * identical to the hosts', and hence writable... */
1315 if (mount("cgroup", to
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, controller
) < 0)
1316 return log_error_errno(errno
, "Failed to mount to %s: %m", to
);
1318 /* ... hence let's only make the bind mount read-only, not the
1321 if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1322 return log_error_errno(errno
, "Failed to remount %s read-only: %m", to
);
1327 static int mount_cgroup(const char *dest
) {
1328 _cleanup_set_free_free_ Set
*controllers
= NULL
;
1329 const char *cgroup_root
;
1332 controllers
= set_new(&string_hash_ops
);
1336 r
= cg_kernel_controllers(controllers
);
1338 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1341 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
1343 controller
= set_steal_first(controllers
);
1347 origin
= prefix_root("/sys/fs/cgroup/", controller
);
1351 r
= readlink_malloc(origin
, &combined
);
1353 /* Not a symbolic link, but directly a single cgroup hierarchy */
1355 r
= mount_cgroup_hierarchy(dest
, controller
, controller
, true);
1360 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
1362 _cleanup_free_
char *target
= NULL
;
1364 target
= prefix_root(dest
, origin
);
1368 /* A symbolic link, a combination of controllers in one hierarchy */
1370 if (!filename_is_valid(combined
)) {
1371 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
1375 r
= mount_cgroup_hierarchy(dest
, combined
, combined
, true);
1379 r
= symlink_idempotent(combined
, target
);
1381 log_error("Invalid existing symlink for combined hierarchy");
1385 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1389 r
= mount_cgroup_hierarchy(dest
, "name=systemd,xattr", "systemd", false);
1393 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1394 if (mount(NULL
, cgroup_root
, NULL
, MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755") < 0)
1395 return log_error_errno(errno
, "Failed to remount %s read-only: %m", cgroup_root
);
1400 static int mount_systemd_cgroup_writable(const char *dest
) {
1401 _cleanup_free_
char *own_cgroup_path
= NULL
;
1402 const char *systemd_root
, *systemd_own
;
1407 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
1409 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
1411 /* Make our own cgroup a (writable) bind mount */
1412 systemd_own
= strjoina(dest
, "/sys/fs/cgroup/systemd", own_cgroup_path
);
1413 if (mount(systemd_own
, systemd_own
, NULL
, MS_BIND
, NULL
) < 0)
1414 return log_error_errno(errno
, "Failed to turn %s into a bind mount: %m", own_cgroup_path
);
1416 /* And then remount the systemd cgroup root read-only */
1417 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
1418 if (mount(NULL
, systemd_root
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1419 return log_error_errno(errno
, "Failed to mount cgroup root read-only: %m");
1424 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1430 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1433 if (uid
!= UID_INVALID
) {
1434 uid
+= arg_uid_shift
;
1436 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1440 if (gid
!= GID_INVALID
) {
1441 gid
+= (gid_t
) arg_uid_shift
;
1443 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1447 if (lchown(p
, uid
, gid
) < 0)
1453 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1456 q
= prefix_roota(root
, path
);
1457 if (mkdir(q
, mode
) < 0) {
1458 if (errno
== EEXIST
)
1463 return userns_lchown(q
, uid
, gid
);
1466 static int setup_timezone(const char *dest
) {
1467 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1468 const char *where
, *check
, *what
;
1474 /* Fix the timezone, if possible */
1475 r
= readlink_malloc("/etc/localtime", &p
);
1477 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1481 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1483 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1485 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1489 where
= prefix_roota(dest
, "/etc/localtime");
1490 r
= readlink_malloc(where
, &q
);
1492 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1494 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1496 /* Already pointing to the right place? Then do nothing .. */
1497 if (y
&& streq(y
, z
))
1501 check
= strjoina("/usr/share/zoneinfo/", z
);
1502 check
= prefix_root(dest
, check
);
1503 if (laccess(check
, F_OK
) < 0) {
1504 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1509 if (r
< 0 && errno
!= ENOENT
) {
1510 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1514 what
= strjoina("../usr/share/zoneinfo/", z
);
1515 if (symlink(what
, where
) < 0) {
1516 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1520 r
= userns_lchown(where
, 0, 0);
1522 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1527 static int setup_resolv_conf(const char *dest
) {
1528 const char *where
= NULL
;
1533 if (arg_private_network
)
1536 /* Fix resolv.conf, if possible */
1537 where
= prefix_roota(dest
, "/etc/resolv.conf");
1539 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1541 /* If the file already exists as symlink, let's
1542 * suppress the warning, under the assumption that
1543 * resolved or something similar runs inside and the
1544 * symlink points there.
1546 * If the disk image is read-only, there's also no
1547 * point in complaining.
1549 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1550 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1554 r
= userns_lchown(where
, 0, 0);
1556 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1561 static int setup_volatile_state(const char *directory
) {
1562 _cleanup_free_
char *buf
= NULL
;
1563 const char *p
, *options
;
1568 if (arg_volatile
!= VOLATILE_STATE
)
1571 /* --volatile=state means we simply overmount /var
1572 with a tmpfs, and the rest read-only. */
1574 r
= bind_remount_recursive(directory
, true);
1576 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
1578 p
= prefix_roota(directory
, "/var");
1580 if (r
< 0 && errno
!= EEXIST
)
1581 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1583 options
= "mode=755";
1584 r
= tmpfs_patch_options(options
, &buf
);
1590 if (mount("tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
) < 0)
1591 return log_error_errno(errno
, "Failed to mount tmpfs to /var: %m");
/* Implements --volatile=yes (guarded by the arg_volatile != VOLATILE_YES
 * check below): a tmpfs is mounted on a temporary directory created with
 * mkdtemp(), the tree's /usr is recursively bind mounted into it and
 * remounted read-only, and finally the temporary mount is moved onto
 * directory with MS_MOVE. The tmpfs_mounted/bind_mounted flags record how
 * far setup got; the trailing umount()/rmdir() calls are the cleanup for the
 * temporary directory. NOTE(review): the statements that consume those flags
 * and the jumps into the cleanup code are not visible in this extract. */
1596 static int setup_volatile(const char *directory
) {
1597 bool tmpfs_mounted
= false, bind_mounted
= false;
1598 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1599 _cleanup_free_
char *buf
= NULL
;
1600 const char *f
, *t
, *options
;
1605 if (arg_volatile
!= VOLATILE_YES
)
1608 /* --volatile=yes means we mount a tmpfs to the root dir, and
1609 the original /usr to use inside it, and that read-only. */
1611 if (!mkdtemp(template))
1612 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1614 options
= "mode=755";
1615 r
= tmpfs_patch_options(options
, &buf
);
1621 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME
, options
) < 0) {
1622 r
= log_error_errno(errno
, "Failed to mount tmpfs for root directory: %m");
1626 tmpfs_mounted
= true;
1628 f
= prefix_roota(directory
, "/usr");
1629 t
= prefix_roota(template, "/usr");
1632 if (r
< 0 && errno
!= EEXIST
) {
1633 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1637 if (mount(f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
1638 r
= log_error_errno(errno
, "Failed to create /usr bind mount: %m");
1642 bind_mounted
= true;
1644 r
= bind_remount_recursive(t
, true);
1646 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
1650 if (mount(template, directory
, NULL
, MS_MOVE
, NULL
) < 0) {
1651 r
= log_error_errno(errno
, "Failed to move root mount: %m");
1655 (void) rmdir(template);
1664 (void) umount(template);
1665 (void) rmdir(template);
/* Formats the 128-bit id in the 8-4-4-4-12 hexadecimal UUID layout given by
 * the format string below. s is declared as a 37 byte buffer, i.e. the 36
 * characters of that layout plus the terminating NUL. NOTE(review): the
 * formatting call itself and the return statement are not visible in this
 * extract; presumably the text is written into s and s is returned. */
1669 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1673 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1674 SD_ID128_FORMAT_VAL(id
));
/* Gives the container rooted at dest its own boot ID: a random 128-bit ID is
 * generated, formatted as a UUID, written to
 * <dest>/run/proc-sys-kernel-random-boot-id and that file is bind mounted
 * over <dest>/proc/sys/kernel/random/boot_id, followed by a read-only
 * remount. A failed bind mount is recorded in r; a failed read-only remount
 * is only logged as a warning. The arg_share_system check at the top guards
 * the whole procedure (its return statement is not visible in this extract). */
1679 static int setup_boot_id(const char *dest
) {
1680 const char *from
, *to
;
1681 sd_id128_t rnd
= {};
1685 if (arg_share_system
)
1688 /* Generate a new randomized boot ID, so that each boot-up of
1689 * the container gets a new one */
1691 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1692 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1694 r
= sd_id128_randomize(&rnd
);
1696 return log_error_errno(r
, "Failed to generate random boot id: %m");
1698 id128_format_as_uuid(rnd
, as_uuid
);
1700 r
= write_string_file(from
, as_uuid
);
1702 return log_error_errno(r
, "Failed to write boot id: %m");
1704 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1705 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1706 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1707 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1713 static int copy_devnodes(const char *dest
) {
1715 static const char devnodes
[] =
1726 _cleanup_umask_ mode_t u
;
1732 /* Create /dev/net, so that we can create /dev/net/tun in it */
1733 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1734 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1736 NULSTR_FOREACH(d
, devnodes
) {
1737 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1740 from
= strappend("/dev/", d
);
1741 to
= prefix_root(dest
, from
);
1743 if (stat(from
, &st
) < 0) {
1745 if (errno
!= ENOENT
)
1746 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1748 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1750 log_error("%s is not a char or block device, cannot copy.", from
);
1754 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1756 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1758 /* Some systems abusively restrict mknod but
1759 * allow bind mounts. */
1762 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1763 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1764 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1767 r
= userns_lchown(to
, 0, 0);
1769 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1776 static int setup_pts(const char *dest
) {
1777 _cleanup_free_
char *options
= NULL
;
1781 if (arg_selinux_apifs_context
)
1782 (void) asprintf(&options
,
1783 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT
",gid=" GID_FMT
",context=\"%s\"",
1785 arg_uid_shift
+ TTY_GID
,
1786 arg_selinux_apifs_context
);
1789 (void) asprintf(&options
,
1790 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT
",gid=" GID_FMT
,
1792 arg_uid_shift
+ TTY_GID
);
1797 /* Mount /dev/pts itself */
1798 p
= prefix_roota(dest
, "/dev/pts");
1799 if (mkdir(p
, 0755) < 0)
1800 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1801 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1802 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1803 if (userns_lchown(p
, 0, 0) < 0)
1804 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1806 /* Create /dev/ptmx symlink */
1807 p
= prefix_roota(dest
, "/dev/ptmx");
1808 if (symlink("pts/ptmx", p
) < 0)
1809 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1810 if (userns_lchown(p
, 0, 0) < 0)
1811 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1813 /* And fix /dev/pts/ptmx ownership */
1814 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1815 if (userns_lchown(p
, 0, 0) < 0)
1816 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
/* Makes the TTY named by console available as <dest>/dev/console. The TTY is
 * first given mode 0600 and uid/gid arg_uid_shift via chmod_and_chown(), then
 * bind mounted onto <dest>/dev/console. As the in-body comment explains, an
 * empty regular file serves as the mount point; the call creating it is not
 * visible in this extract, only its error message ("touch() for /dev/console
 * failed"). */
1821 static int setup_dev_console(const char *dest
, const char *console
) {
1822 _cleanup_umask_ mode_t u
;
1831 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1833 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1835 /* We need to bind mount the right tty to /dev/console since
1836 * ptys can only exist on pts file systems. To have something
1837 * to bind mount things on we create a empty regular file. */
1839 to
= prefix_roota(dest
, "/dev/console");
1842 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1844 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1845 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provides a fake /proc/kmsg inside the container rooted at dest. A FIFO is
 * created as <dest>/run/kmsg, bind mounted over <dest>/proc/kmsg, opened, and
 * the resulting file descriptor is sent as SCM_RIGHTS ancillary data over
 * kmsg_socket so that it stays open for the lifetime of the child. Afterwards
 * the FIFO is unlinked from /run again. kmsg_socket must be a valid
 * descriptor (asserted). */
1850 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1851 const char *from
, *to
;
1852 _cleanup_umask_ mode_t u
;
1855 struct cmsghdr cmsghdr
;
1856 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1858 struct msghdr mh
= {
1859 .msg_control
= &control
,
1860 .msg_controllen
= sizeof(control
),
1862 struct cmsghdr
*cmsg
;
1864 assert(kmsg_socket
>= 0);
1868 /* We create the kmsg FIFO as /run/kmsg, but immediately
1869 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1870 * on the reading side behave very similar to /proc/kmsg,
1871 * their writing side behaves differently from /dev/kmsg in
1872 * that writing blocks when nothing is reading. In order to
1873 * avoid any problems with containers deadlocking due to this
1874 * we simply make /dev/kmsg unavailable to the container. */
1875 from
= prefix_roota(dest
, "/run/kmsg");
1876 to
= prefix_roota(dest
, "/proc/kmsg");
1878 if (mkfifo(from
, 0600) < 0)
1879 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1880 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1881 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1883 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1885 return log_error_errno(errno
, "Failed to open fifo: %m");
1887 cmsg
= CMSG_FIRSTHDR(&mh
);
1888 cmsg
->cmsg_level
= SOL_SOCKET
;
1889 cmsg
->cmsg_type
= SCM_RIGHTS
;
1890 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1891 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1893 mh
.msg_controllen
= cmsg
->cmsg_len
;
1895 /* Store away the fd in the socket, so that it stays open as
1896 * long as we run the child */
1897 k
= sendmsg(kmsg_socket
, &mh
, MSG_NOSIGNAL
);
1901 return log_error_errno(errno
, "Failed to send FIFO fd: %m");
1903 /* And now make the FIFO unavailable as /run/kmsg... */
1904 (void) unlink(from
);
/* Allocates a non-blocking NETLINK_ROUTE socket and passes it as SCM_RIGHTS
 * ancillary data over send_fd (which must be a valid descriptor, asserted).
 * Only relevant when port exposure is configured: the function is guarded by
 * the !arg_expose_ports check. The local fd is declared _cleanup_close_, so
 * this function's copy is closed again when it returns. */
1909 static int send_rtnl(int send_fd
) {
1911 struct cmsghdr cmsghdr
;
1912 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1914 struct msghdr mh
= {
1915 .msg_control
= &control
,
1916 .msg_controllen
= sizeof(control
),
1918 struct cmsghdr
*cmsg
;
1919 _cleanup_close_
int fd
= -1;
1922 assert(send_fd
>= 0);
1924 if (!arg_expose_ports
)
1927 fd
= socket(PF_NETLINK
, SOCK_RAW
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, NETLINK_ROUTE
);
1929 return log_error_errno(errno
, "Failed to allocate container netlink: %m");
1931 cmsg
= CMSG_FIRSTHDR(&mh
);
1932 cmsg
->cmsg_level
= SOL_SOCKET
;
1933 cmsg
->cmsg_type
= SCM_RIGHTS
;
1934 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1935 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1937 mh
.msg_controllen
= cmsg
->cmsg_len
;
1939 /* Store away the fd in the socket, so that it stays open as
1940 * long as we run the child */
1941 k
= sendmsg(send_fd
, &mh
, MSG_NOSIGNAL
);
1943 return log_error_errno(errno
, "Failed to send netlink fd: %m");
/* Removes the firewall DNAT rules for all entries of arg_expose_ports
 * (fw_add_local_dnat() is invoked with false as its first argument) and then
 * resets *exposed to IN_ADDR_NULL. Guarded by checks for an empty
 * arg_expose_ports list and for *exposed already being the null address.
 * Failures to change the firewall are only logged as warnings. Only AF_INET
 * is handled. */
1948 static int flush_ports(union in_addr_union
*exposed
) {
1950 int r
, af
= AF_INET
;
1954 if (!arg_expose_ports
)
1957 if (in_addr_is_null(af
, exposed
))
1960 log_debug("Lost IP address.");
1962 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
1963 r
= fw_add_local_dnat(false,
1974 log_warning_errno(r
, "Failed to modify firewall: %m");
1977 *exposed
= IN_ADDR_NULL
;
/* Re-evaluates which container address the exposed ports should be forwarded
 * to. The local AF_INET addresses are enumerated through rtnl; the first
 * entry is used as the new target. If it equals *exposed nothing changes,
 * otherwise fw_add_local_dnat() is called (with true) for every entry of
 * arg_expose_ports, passing the previous address unless it was null, and
 * *exposed is updated. One branch falls back to flush_ports(); its exact
 * condition is only partly visible in this extract. Firewall failures are
 * logged as warnings only. */
1981 static int expose_ports(sd_netlink
*rtnl
, union in_addr_union
*exposed
) {
1982 _cleanup_free_
struct local_address
*addresses
= NULL
;
1983 _cleanup_free_
char *pretty
= NULL
;
1984 union in_addr_union new_exposed
;
1987 int af
= AF_INET
, r
;
1991 /* Invoked each time an address is added or removed inside the
1994 if (!arg_expose_ports
)
1997 r
= local_addresses(rtnl
, 0, af
, &addresses
);
1999 return log_error_errno(r
, "Failed to enumerate local addresses: %m");
2002 addresses
[0].family
== af
&&
2003 addresses
[0].scope
< RT_SCOPE_LINK
;
2006 return flush_ports(exposed
);
2008 new_exposed
= addresses
[0].address
;
2009 if (in_addr_equal(af
, exposed
, &new_exposed
))
2012 in_addr_to_string(af
, &new_exposed
, &pretty
);
2013 log_debug("New container IP is %s.", strna(pretty
));
2015 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
2017 r
= fw_add_local_dnat(true,
2026 in_addr_is_null(af
, exposed
) ? NULL
: exposed
);
2028 log_warning_errno(r
, "Failed to modify firewall: %m");
2031 *exposed
= new_exposed
;
2035 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
2036 union in_addr_union
*exposed
= userdata
;
2042 expose_ports(rtnl
, exposed
);
2046 static int watch_rtnl(sd_event
*event
, int recv_fd
, union in_addr_union
*exposed
, sd_netlink
**ret
) {
2048 struct cmsghdr cmsghdr
;
2049 uint8_t buf
[CMSG_SPACE(sizeof(int))];
2051 struct msghdr mh
= {
2052 .msg_control
= &control
,
2053 .msg_controllen
= sizeof(control
),
2055 struct cmsghdr
*cmsg
;
2056 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2061 assert(recv_fd
>= 0);
2064 if (!arg_expose_ports
)
2067 k
= recvmsg(recv_fd
, &mh
, MSG_NOSIGNAL
);
2069 return log_error_errno(errno
, "Failed to recv netlink fd: %m");
2071 cmsg
= CMSG_FIRSTHDR(&mh
);
2072 assert(cmsg
->cmsg_level
== SOL_SOCKET
);
2073 assert(cmsg
->cmsg_type
== SCM_RIGHTS
);
2074 assert(cmsg
->cmsg_len
== CMSG_LEN(sizeof(int)));
2075 memcpy(&fd
, CMSG_DATA(cmsg
), sizeof(int));
2077 r
= sd_netlink_open_fd(&rtnl
, fd
);
2080 return log_error_errno(r
, "Failed to create rtnl object: %m");
2083 r
= sd_netlink_add_match(rtnl
, RTM_NEWADDR
, on_address_change
, exposed
);
2085 return log_error_errno(r
, "Failed to subscribe to RTM_NEWADDR messages: %m");
2087 r
= sd_netlink_add_match(rtnl
, RTM_DELADDR
, on_address_change
, exposed
);
2089 return log_error_errno(r
, "Failed to subscribe to RTM_DELADDR messages: %m");
2091 r
= sd_netlink_attach_event(rtnl
, event
, 0);
2093 return log_error_errno(r
, "Failed to add to even loop: %m");
2101 static int setup_hostname(void) {
2103 if (arg_share_system
)
2106 if (sethostname_idempotent(arg_machine
) < 0)
/* Links the journal of the container rooted at directory with the host's
 * journal, as selected by arg_link_journal. The container's /etc/machine-id
 * is read and validated; linking is refused when it equals the host's
 * machine ID. /var, /var/log and /var/log/journal are created inside the
 * container, then p (the host's /var/log/journal/<id>) and q (the same path
 * inside the container) are examined: existing mount points are refused,
 * LINK_GUEST creates a symlink from p to q, LINK_HOST ends with a bind mount
 * of p onto q. With arg_link_journal_try set, some failures are only logged
 * at debug level.
 * NOTE(review): several error paths below log errno right after
 * "r = userns_mkdir(...)", while other call sites in this file log the
 * value returned by userns_mkdir() itself; errno may be stale in those
 * places, so consider logging r instead. To be confirmed against the full
 * source. */
2112 static int setup_journal(const char *directory
) {
2113 sd_id128_t machine_id
, this_id
;
2114 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
2115 const char *etc_machine_id
, *p
, *q
;
2119 /* Don't link journals in ephemeral mode */
2123 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
2125 r
= read_one_line_file(etc_machine_id
, &b
);
2126 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
2129 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
2132 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
2135 /* Verify validity */
2136 r
= sd_id128_from_string(id
, &machine_id
);
2138 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
2140 r
= sd_id128_get_machine(&this_id
);
2142 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
2144 if (sd_id128_equal(machine_id
, this_id
)) {
2145 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
2146 "Host and machine ids are equal (%s): refusing to link journals", id
);
2147 if (arg_link_journal
== LINK_AUTO
)
2152 if (arg_link_journal
== LINK_NO
)
2155 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
2157 return log_error_errno(r
, "Failed to create /var: %m");
2159 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
2161 return log_error_errno(r
, "Failed to create /var/log: %m");
2163 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
2165 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
2167 p
= strjoina("/var/log/journal/", id
);
2168 q
= prefix_roota(directory
, p
);
2170 if (path_is_mount_point(p
, 0) > 0) {
2171 if (arg_link_journal
!= LINK_AUTO
) {
2172 log_error("%s: already a mount point, refusing to use for journal", p
);
2179 if (path_is_mount_point(q
, 0) > 0) {
2180 if (arg_link_journal
!= LINK_AUTO
) {
2181 log_error("%s: already a mount point, refusing to use for journal", q
);
2188 r
= readlink_and_make_absolute(p
, &d
);
2190 if ((arg_link_journal
== LINK_GUEST
||
2191 arg_link_journal
== LINK_AUTO
) &&
2194 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2196 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2201 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
2202 } else if (r
== -EINVAL
) {
2204 if (arg_link_journal
== LINK_GUEST
&&
2207 if (errno
== ENOTDIR
) {
2208 log_error("%s already exists and is neither a symlink nor a directory", p
);
2211 log_error_errno(errno
, "Failed to remove %s: %m", p
);
2215 } else if (r
!= -ENOENT
) {
2216 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
2220 if (arg_link_journal
== LINK_GUEST
) {
2222 if (symlink(q
, p
) < 0) {
2223 if (arg_link_journal_try
) {
2224 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
2227 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
2232 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2234 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2238 if (arg_link_journal
== LINK_HOST
) {
2239 /* don't create parents here -- if the host doesn't have
2240 * permanent journal set up, don't force it here */
2243 if (arg_link_journal_try
) {
2244 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
2247 log_error_errno(errno
, "Failed to create %s: %m", p
);
2252 } else if (access(p
, F_OK
) < 0)
2255 if (dir_is_empty(q
) == 0)
2256 log_warning("%s is not empty, proceeding anyway.", q
);
2258 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2260 log_error_errno(errno
, "Failed to create %s: %m", q
);
2264 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
2265 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
2270 static int drop_capabilities(void) {
2271 return capability_bounding_set_drop(~arg_retain
, false);
/* Registers the container with org.freedesktop.machine1 over the system bus.
 * With arg_keep_unit set, RegisterMachineWithNetwork is called directly.
 * Otherwise a CreateMachineWithNetwork method call is assembled by hand; its
 * trailing array of (sv) properties carries Slice (when arg_slice is set),
 * DevicePolicy=strict, a DeviceAllow list for the API device nodes, one
 * DeviceAllow entry per bind-mounted device node among the custom mounts,
 * KillSignal plus KillMode=mixed (when arg_kill_signal is non-zero) and every
 * assignment listed in arg_property. local_ifindex is only appended when it
 * is positive. A failing bus call is logged with the bus error message. */
2274 static int register_machine(pid_t pid
, int local_ifindex
) {
2275 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2276 _cleanup_bus_close_unref_ sd_bus
*bus
= NULL
;
2282 r
= sd_bus_default_system(&bus
);
2284 return log_error_errno(r
, "Failed to open system bus: %m");
2286 if (arg_keep_unit
) {
2287 r
= sd_bus_call_method(
2289 "org.freedesktop.machine1",
2290 "/org/freedesktop/machine1",
2291 "org.freedesktop.machine1.Manager",
2292 "RegisterMachineWithNetwork",
2297 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2301 strempty(arg_directory
),
2302 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2304 _cleanup_bus_message_unref_ sd_bus_message
*m
= NULL
;
2308 r
= sd_bus_message_new_method_call(
2311 "org.freedesktop.machine1",
2312 "/org/freedesktop/machine1",
2313 "org.freedesktop.machine1.Manager",
2314 "CreateMachineWithNetwork");
2316 return bus_log_create_error(r
);
2318 r
= sd_bus_message_append(
2322 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2326 strempty(arg_directory
),
2327 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2329 return bus_log_create_error(r
);
2331 r
= sd_bus_message_open_container(m
, 'a', "(sv)");
2333 return bus_log_create_error(r
);
2335 if (!isempty(arg_slice
)) {
2336 r
= sd_bus_message_append(m
, "(sv)", "Slice", "s", arg_slice
);
2338 return bus_log_create_error(r
);
2341 r
= sd_bus_message_append(m
, "(sv)", "DevicePolicy", "s", "strict");
2343 return bus_log_create_error(r
);
2345 /* If you make changes here, also make sure to update
2346 * systemd-nspawn@.service, to keep the device
2347 * policies in sync regardless if we are run with or
2348 * without the --keep-unit switch. */
2349 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 9,
2350 /* Allow the container to
2351 * access and create the API
2352 * device nodes, so that
2353 * PrivateDevices= in the
2354 * container can work
2359 "/dev/random", "rwm",
2360 "/dev/urandom", "rwm",
2362 "/dev/net/tun", "rwm",
2363 /* Allow the container
2364 * access to ptys. However,
2366 * container to ever create
2367 * these device nodes. */
2368 "/dev/pts/ptmx", "rw",
2371 return bus_log_create_error(r
);
2373 for (j
= 0; j
< arg_n_custom_mounts
; j
++) {
2374 CustomMount
*cm
= &arg_custom_mounts
[j
];
2376 if (cm
->type
!= CUSTOM_MOUNT_BIND
)
2379 r
= is_device_node(cm
->source
);
2381 return log_error_errno(r
, "Failed to stat %s: %m", cm
->source
);
2384 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 1,
2385 cm
->source
, cm
->read_only
? "r" : "rw");
2387 return log_error_errno(r
, "Failed to append message arguments: %m");
2391 if (arg_kill_signal
!= 0) {
2392 r
= sd_bus_message_append(m
, "(sv)", "KillSignal", "i", arg_kill_signal
);
2394 return bus_log_create_error(r
);
2396 r
= sd_bus_message_append(m
, "(sv)", "KillMode", "s", "mixed");
2398 return bus_log_create_error(r
);
2401 STRV_FOREACH(i
, arg_property
) {
2402 r
= sd_bus_message_open_container(m
, 'r', "sv");
2404 return bus_log_create_error(r
);
2406 r
= bus_append_unit_property_assignment(m
, *i
);
2410 r
= sd_bus_message_close_container(m
);
2412 return bus_log_create_error(r
);
2415 r
= sd_bus_message_close_container(m
);
2417 return bus_log_create_error(r
);
2419 r
= sd_bus_call(bus
, m
, 0, &error
, NULL
);
2423 log_error("Failed to register machine: %s", bus_error_message(&error
, r
));
/* Asks org.freedesktop.machine1 over the system bus to terminate the
 * machine. A first Manager call yields the machine's object path (read from
 * the reply as "o"), a second call is made on the
 * org.freedesktop.machine1.Machine interface. Failures of either call are
 * only logged at debug level, since the machine may already have been
 * cleaned up. Per the in-body comment nothing is done when the unit is
 * being reused. NOTE(review): the method names and arguments of both calls
 * are not visible in this extract. */
2430 static int terminate_machine(pid_t pid
) {
2431 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2432 _cleanup_bus_message_unref_ sd_bus_message
*reply
= NULL
;
2433 _cleanup_bus_close_unref_ sd_bus
*bus
= NULL
;
2440 /* If we are reusing the unit, then just exit, systemd will do
2441 * the right thing when we exit. */
2445 r
= sd_bus_default_system(&bus
);
2447 return log_error_errno(r
, "Failed to open system bus: %m");
2449 r
= sd_bus_call_method(
2451 "org.freedesktop.machine1",
2452 "/org/freedesktop/machine1",
2453 "org.freedesktop.machine1.Manager",
2460 /* Note that the machine might already have been
2461 * cleaned up automatically, hence don't consider it a
2462 * failure if we cannot get the machine object. */
2463 log_debug("Failed to get machine: %s", bus_error_message(&error
, r
));
2467 r
= sd_bus_message_read(reply
, "o", &path
);
2469 return bus_log_parse_error(r
);
2471 r
= sd_bus_call_method(
2473 "org.freedesktop.machine1",
2475 "org.freedesktop.machine1.Machine",
2481 log_debug("Failed to terminate machine: %s", bus_error_message(&error
, r
));
2488 static int reset_audit_loginuid(void) {
2489 _cleanup_free_
char *p
= NULL
;
2492 if (arg_share_system
)
2495 r
= read_one_line_file("/proc/self/loginuid", &p
);
2499 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
2501 /* Already reset? */
2502 if (streq(p
, "4294967295"))
2505 r
= write_string_file("/proc/self/loginuid", "4294967295");
2508 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2509 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2510 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2511 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2512 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Fixed 128-bit keys passed as hash_key to generate_mac(): one for the host
 * side of the veth link, one for the container side, one for MACVLAN
 * interfaces. */
2520 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2521 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2522 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a MAC address for the container and stores it in *mac. The input
 * buffer v starts with the host's machine ID, followed by arg_machine and
 * (on a path whose condition is not visible in this extract) the bytes of
 * idx. It is hashed with siphash24() keyed by hash_key, and the first
 * ETH_ALEN bytes of the result become the address. The multicast bit is then
 * cleared and the locally-assigned bit set, mirroring the kernel's
 * eth_random_addr(). */
2524 static int generate_mac(struct ether_addr
*mac
, sd_id128_t hash_key
, uint64_t idx
) {
2530 l
= strlen(arg_machine
);
2531 sz
= sizeof(sd_id128_t
) + l
;
2537 /* fetch some persistent data unique to the host */
2538 r
= sd_id128_get_machine((sd_id128_t
*) v
);
2542 /* combine with some data unique (on this host) to this
2543 * container instance */
2544 i
= mempcpy(v
+ sizeof(sd_id128_t
), arg_machine
, l
);
2547 memcpy(i
, &idx
, sizeof(idx
));
2550 /* Let's hash the host machine ID plus the container name. We
2551 * use a fixed, but originally randomly created hash key here. */
2552 siphash24(result
, v
, sz
, hash_key
.bytes
);
2554 assert_cc(ETH_ALEN
<= sizeof(result
));
2555 memcpy(mac
->ether_addr_octet
, result
, ETH_ALEN
);
2557 /* see eth_random_addr in the kernel */
2558 mac
->ether_addr_octet
[0] &= 0xfe; /* clear multicast bit */
2559 mac
->ether_addr_octet
[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates the veth link between host and container via rtnetlink. The host
 * side is named "ve-<machine>" ("vb-<machine>" when arg_network_bridge is
 * set), written into iface_name; the peer is named "host0" and is created
 * in the network namespace of pid (IFLA_NET_NS_PID). Both ends receive MAC
 * addresses from generate_mac(). Afterwards the host-side interface index is
 * looked up with if_nametoindex(). Guarded by the arg_private_network and
 * arg_network_veth checks. NOTE(review): the statement storing the index
 * (presumably into *ifi) is not visible in this extract. */
2564 static int setup_veth(pid_t pid
, char iface_name
[IFNAMSIZ
], int *ifi
) {
2565 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2566 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2567 struct ether_addr mac_host
, mac_container
;
2570 if (!arg_private_network
)
2573 if (!arg_network_veth
)
2576 /* Use two different interface name prefixes depending whether
2577 * we are in bridge mode or not. */
2578 snprintf(iface_name
, IFNAMSIZ
- 1, "%s-%s",
2579 arg_network_bridge
? "vb" : "ve", arg_machine
);
2581 r
= generate_mac(&mac_container
, CONTAINER_HASH_KEY
, 0);
2583 return log_error_errno(r
, "Failed to generate predictable MAC address for container side: %m");
2585 r
= generate_mac(&mac_host
, HOST_HASH_KEY
, 0);
2587 return log_error_errno(r
, "Failed to generate predictable MAC address for host side: %m");
2589 r
= sd_netlink_open(&rtnl
);
2591 return log_error_errno(r
, "Failed to connect to netlink: %m");
2593 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2595 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2597 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, iface_name
);
2599 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2601 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_host
);
2603 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2605 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2607 return log_error_errno(r
, "Failed to open netlink container: %m");
2609 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "veth");
2611 return log_error_errno(r
, "Failed to open netlink container: %m");
2613 r
= sd_netlink_message_open_container(m
, VETH_INFO_PEER
);
2615 return log_error_errno(r
, "Failed to open netlink container: %m");
2617 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, "host0");
2619 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2621 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_container
);
2623 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2625 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2627 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2629 r
= sd_netlink_message_close_container(m
);
2631 return log_error_errno(r
, "Failed to close netlink container: %m");
2633 r
= sd_netlink_message_close_container(m
);
2635 return log_error_errno(r
, "Failed to close netlink container: %m");
2637 r
= sd_netlink_message_close_container(m
);
2639 return log_error_errno(r
, "Failed to close netlink container: %m");
2641 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2643 return log_error_errno(r
, "Failed to add new veth interfaces (host0, %s): %m", iface_name
);
2645 i
= (int) if_nametoindex(iface_name
);
2647 return log_error_errno(errno
, "Failed to resolve interface %s: %m", iface_name
);
/* Attaches the host-side veth interface veth_name to the bridge named by
 * arg_network_bridge. The bridge's interface index is resolved with
 * if_nametoindex(), then an RTM_SETLINK message is sent that sets IFF_UP on
 * the interface and makes the bridge its IFLA_MASTER. Guarded by the
 * arg_private_network, arg_network_veth and arg_network_bridge checks.
 * NOTE(review): how ifi is used is not visible in this extract. */
2654 static int setup_bridge(const char veth_name
[], int *ifi
) {
2655 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2656 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2659 if (!arg_private_network
)
2662 if (!arg_network_veth
)
2665 if (!arg_network_bridge
)
2668 bridge
= (int) if_nametoindex(arg_network_bridge
);
2670 return log_error_errno(errno
, "Failed to resolve interface %s: %m", arg_network_bridge
);
2674 r
= sd_netlink_open(&rtnl
);
2676 return log_error_errno(r
, "Failed to connect to netlink: %m");
2678 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, 0);
2680 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2682 r
= sd_rtnl_message_link_set_flags(m
, IFF_UP
, IFF_UP
);
2684 return log_error_errno(r
, "Failed to set IFF_UP flag: %m");
2686 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, veth_name
);
2688 return log_error_errno(r
, "Failed to add netlink interface name field: %m");
2690 r
= sd_netlink_message_append_u32(m
, IFLA_MASTER
, bridge
);
2692 return log_error_errno(r
, "Failed to add netlink master field: %m");
2694 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2696 return log_error_errno(r
, "Failed to add veth interface to bridge: %m");
/* Resolves the network interface name to its index with if_nametoindex()
 * and checks through udev (device id "n<ifindex>") that the interface has
 * been initialized; an uninitialized interface is reported as an error.
 * Callers in this file use the return value as the interface index. */
2701 static int parse_interface(struct udev
*udev
, const char *name
) {
2702 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
2703 char ifi_str
[2 + DECIMAL_STR_MAX(int)];
2706 ifi
= (int) if_nametoindex(name
);
2708 return log_error_errno(errno
, "Failed to resolve interface %s: %m", name
);
2710 sprintf(ifi_str
, "n%i", ifi
);
2711 d
= udev_device_new_from_device_id(udev
, ifi_str
);
2713 return log_error_errno(errno
, "Failed to get udev device for interface %s: %m", name
);
2715 if (udev_device_get_is_initialized(d
) <= 0) {
2716 log_error("Network interface %s is not initialized yet.", name
);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of pid: each name is resolved with parse_interface() and an
 * RTM_SETLINK message carrying IFLA_NET_NS_PID is sent for it. Guarded by
 * the arg_private_network check and by the list being non-empty. */
2723 static int move_network_interfaces(pid_t pid
) {
2724 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2725 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2729 if (!arg_private_network
)
2732 if (strv_isempty(arg_network_interfaces
))
2735 r
= sd_netlink_open(&rtnl
);
2737 return log_error_errno(r
, "Failed to connect to netlink: %m");
2741 log_error("Failed to connect to udev.");
2745 STRV_FOREACH(i
, arg_network_interfaces
) {
2746 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2749 ifi
= parse_interface(udev
, *i
);
2753 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, ifi
);
2755 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2757 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2759 return log_error_errno(r
, "Failed to append namespace PID to netlink message: %m");
2761 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2763 return log_error_errno(r
, "Failed to move interface %s to namespace: %m", *i
);
/* Creates one MACVLAN interface (bridge mode) per entry of
 * arg_network_macvlan inside the network namespace of pid. For each parent
 * interface the index is resolved with parse_interface(), a MAC address is
 * generated with generate_mac() using a running idx, and an RTM_NEWLINK
 * message is sent. The new interface is named "mv-<parent>", shortened to
 * IFNAMSIZ-1 characters. Guarded by the arg_private_network check and by the
 * list being non-empty. */
2769 static int setup_macvlan(pid_t pid
) {
2770 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2771 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2776 if (!arg_private_network
)
2779 if (strv_isempty(arg_network_macvlan
))
2782 r
= sd_netlink_open(&rtnl
);
2784 return log_error_errno(r
, "Failed to connect to netlink: %m");
2788 log_error("Failed to connect to udev.");
2792 STRV_FOREACH(i
, arg_network_macvlan
) {
2793 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2794 _cleanup_free_
char *n
= NULL
;
2795 struct ether_addr mac
;
2798 ifi
= parse_interface(udev
, *i
);
2802 r
= generate_mac(&mac
, MACVLAN_HASH_KEY
, idx
++);
2804 return log_error_errno(r
, "Failed to create MACVLAN MAC address: %m");
2806 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2808 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2810 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
2812 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2814 n
= strappend("mv-", *i
);
2818 strshorten(n
, IFNAMSIZ
-1);
2820 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
2822 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2824 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac
);
2826 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2828 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2830 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2832 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2834 return log_error_errno(r
, "Failed to open netlink container: %m");
2836 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "macvlan");
2838 return log_error_errno(r
, "Failed to open netlink container: %m");
2840 r
= sd_netlink_message_append_u32(m
, IFLA_MACVLAN_MODE
, MACVLAN_MODE_BRIDGE
);
2842 return log_error_errno(r
, "Failed to append macvlan mode: %m");
2844 r
= sd_netlink_message_close_container(m
);
2846 return log_error_errno(r
, "Failed to close netlink container: %m");
2848 r
= sd_netlink_message_close_container(m
);
2850 return log_error_errno(r
, "Failed to close netlink container: %m");
2852 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2854 return log_error_errno(r
, "Failed to add new macvlan interfaces: %m");
2860 static int setup_ipvlan(pid_t pid
) {
2861 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2862 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2866 if (!arg_private_network
)
2869 if (strv_isempty(arg_network_ipvlan
))
2872 r
= sd_netlink_open(&rtnl
);
2874 return log_error_errno(r
, "Failed to connect to netlink: %m");
2878 log_error("Failed to connect to udev.");
2882 STRV_FOREACH(i
, arg_network_ipvlan
) {
2883 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2884 _cleanup_free_
char *n
= NULL
;
2887 ifi
= parse_interface(udev
, *i
);
2891 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2893 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2895 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
2897 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2899 n
= strappend("iv-", *i
);
2903 strshorten(n
, IFNAMSIZ
-1);
2905 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
2907 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2909 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2911 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2913 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2915 return log_error_errno(r
, "Failed to open netlink container: %m");
2917 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "ipvlan");
2919 return log_error_errno(r
, "Failed to open netlink container: %m");
2921 r
= sd_netlink_message_append_u16(m
, IFLA_IPVLAN_MODE
, IPVLAN_MODE_L2
);
2923 return log_error_errno(r
, "Failed to add ipvlan mode: %m");
2925 r
= sd_netlink_message_close_container(m
);
2927 return log_error_errno(r
, "Failed to close netlink container: %m");
2929 r
= sd_netlink_message_close_container(m
);
2931 return log_error_errno(r
, "Failed to close netlink container: %m");
2933 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2935 return log_error_errno(r
, "Failed to add new ipvlan interfaces: %m");
/* Installs the container's seccomp filter: a default-allow filter that makes
 * a fixed list of syscalls fail with EPERM unless the capability guarding
 * them is set in arg_retain, and that makes creation of NETLINK_AUDIT
 * sockets fail with EAFNOSUPPORT. A seccomp_load() result of -EINVAL is
 * logged at debug level and treated as success. Compiles to a no-op without
 * HAVE_SECCOMP. Returns 0 on success, negative errno-style value on
 * failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx seccomp;
        unsigned i;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (i = 0; i < ELEMENTSOF(blacklist); i++) {
                /* Leave the syscall available if its capability is retained */
                if (arg_retain & (1ULL << blacklist[i].capability))
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto finish;
        }

        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
3036 static int setup_propagate(const char *root
) {
3039 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3040 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3041 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3042 (void) mkdir_p(p
, 0600);
3044 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
3045 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
3047 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3048 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
3050 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3051 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
3053 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
3054 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
3055 return log_error_errno(errno
, "Failed to install propagation bind mount.");
3057 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
3058 return log_error_errno(errno
, "Failed to make propagation mount read-only");
3063 static int setup_image(char **device_path
, int *loop_nr
) {
3064 struct loop_info64 info
= {
3065 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
3067 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
3068 _cleanup_free_
char* loopdev
= NULL
;
3072 assert(device_path
);
3076 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3078 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
3080 if (fstat(fd
, &st
) < 0)
3081 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
3083 if (S_ISBLK(st
.st_mode
)) {
3086 p
= strdup(arg_image
);
3100 if (!S_ISREG(st
.st_mode
)) {
3101 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
3105 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3107 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
3109 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
3111 return log_error_errno(errno
, "Failed to allocate loop device: %m");
3113 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
3116 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3118 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
3120 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
3121 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
3124 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
3126 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
3127 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
3129 *device_path
= loopdev
;
/* Explanatory text appended to the partition table error messages of
 * dissect_image(). */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
/* Probes the partition table of the block device 'fd' with libblkid, waits
 * until the kernel (via udev) exposes the same partitions, and picks the
 * partitions to mount.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored; of several
 * home/srv/root partitions of the same type the one with the lowest
 * partition number wins. MBR: only bootable (flags 0x80) partitions of type
 * 0x83 are considered. The root device is chosen in the order native root,
 * secondary-architecture root, single generic Linux partition.
 *
 * On success the device node paths (newly allocated, owned by the caller)
 * and their writability are stored in the output parameters; *secondary
 * tells whether the secondary-architecture root was chosen. home/srv outputs
 * are only written if such a partition exists. Returns 0 on success,
 * negative errno-style value on failure. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambiguous result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel's view of the partitions matches blkid's */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break; /* the enumeration includes the whole-disk device itself */
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Track the candidate in 'generic' (not
                                 * 'root'), so that a second bootable Linux
                                 * partition is noticed by the check above
                                 * and rejected below. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
/* Mounts the block device 'what' at 'where' + 'directory' ('directory' may
 * be NULL), using the file system type libblkid detects on it. The mount is
 * MS_NODEV, and read-only unless 'rw' is true and arg_read_only is off.
 * LUKS volumes are rejected. Returns 0 on success, negative errno-style
 * value on failure; without HAVE_BLKID always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected. A plain
                 * error (-1) is reported with its errno below, same as
                 * in dissect_image(). */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3584 static int mount_devices(
3586 const char *root_device
, bool root_device_rw
,
3587 const char *home_device
, bool home_device_rw
,
3588 const char *srv_device
, bool srv_device_rw
) {
3594 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
3596 return log_error_errno(r
, "Failed to mount root directory: %m");
3600 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
3602 return log_error_errno(r
, "Failed to mount home directory: %m");
3606 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
3608 return log_error_errno(r
, "Failed to mount server data directory: %m");
3614 static void loop_remove(int nr
, int *image_fd
) {
3615 _cleanup_close_
int control
= -1;
3621 if (image_fd
&& *image_fd
>= 0) {
3622 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
3624 log_debug_errno(errno
, "Failed to close loop image: %m");
3625 *image_fd
= safe_close(*image_fd
);
3628 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3630 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
3634 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
3636 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
3639 static int spawn_getent(const char *database
, const char *key
, pid_t
*rpid
) {
3647 if (pipe2(pipe_fds
, O_CLOEXEC
) < 0)
3648 return log_error_errno(errno
, "Failed to allocate pipe: %m");
3652 return log_error_errno(errno
, "Failed to fork getent child: %m");
3653 else if (pid
== 0) {
3655 char *empty_env
= NULL
;
3657 if (dup3(pipe_fds
[1], STDOUT_FILENO
, 0) < 0)
3658 _exit(EXIT_FAILURE
);
3660 if (pipe_fds
[0] > 2)
3661 safe_close(pipe_fds
[0]);
3662 if (pipe_fds
[1] > 2)
3663 safe_close(pipe_fds
[1]);
3665 nullfd
= open("/dev/null", O_RDWR
);
3667 _exit(EXIT_FAILURE
);
3669 if (dup3(nullfd
, STDIN_FILENO
, 0) < 0)
3670 _exit(EXIT_FAILURE
);
3672 if (dup3(nullfd
, STDERR_FILENO
, 0) < 0)
3673 _exit(EXIT_FAILURE
);
3678 (void) reset_all_signal_handlers();
3679 (void) reset_signal_mask();
3680 close_all_fds(NULL
, 0);
3682 execle("/usr/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3683 execle("/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3684 _exit(EXIT_FAILURE
);
3687 pipe_fds
[1] = safe_close(pipe_fds
[1]);
3694 static int change_uid_gid(char **_home
) {
3695 char line
[LINE_MAX
], *x
, *u
, *g
, *h
;
3696 const char *word
, *state
;
3697 _cleanup_free_ uid_t
*uids
= NULL
;
3698 _cleanup_free_
char *home
= NULL
;
3699 _cleanup_fclose_
FILE *f
= NULL
;
3700 _cleanup_close_
int fd
= -1;
3701 unsigned n_uids
= 0;
3710 if (!arg_user
|| streq(arg_user
, "root") || streq(arg_user
, "0")) {
3711 /* Reset everything fully to 0, just in case */
3713 r
= reset_uid_gid();
3715 return log_error_errno(r
, "Failed to become root: %m");
3721 /* First, get user credentials */
3722 fd
= spawn_getent("passwd", arg_user
, &pid
);
3726 f
= fdopen(fd
, "r");
3731 if (!fgets(line
, sizeof(line
), f
)) {
3734 log_error("Failed to resolve user %s.", arg_user
);
3738 log_error_errno(errno
, "Failed to read from getent: %m");
3744 wait_for_terminate_and_warn("getent passwd", pid
, true);
3746 x
= strchr(line
, ':');
3748 log_error("/etc/passwd entry has invalid user field.");
3752 u
= strchr(x
+1, ':');
3754 log_error("/etc/passwd entry has invalid password field.");
3761 log_error("/etc/passwd entry has invalid UID field.");
3769 log_error("/etc/passwd entry has invalid GID field.");
3774 h
= strchr(x
+1, ':');
3776 log_error("/etc/passwd entry has invalid GECOS field.");
3783 log_error("/etc/passwd entry has invalid home directory field.");
3789 r
= parse_uid(u
, &uid
);
3791 log_error("Failed to parse UID of user.");
3795 r
= parse_gid(g
, &gid
);
3797 log_error("Failed to parse GID of user.");
3805 /* Second, get group memberships */
3806 fd
= spawn_getent("initgroups", arg_user
, &pid
);
3811 f
= fdopen(fd
, "r");
3816 if (!fgets(line
, sizeof(line
), f
)) {
3818 log_error("Failed to resolve user %s.", arg_user
);
3822 log_error_errno(errno
, "Failed to read from getent: %m");
3828 wait_for_terminate_and_warn("getent initgroups", pid
, true);
3830 /* Skip over the username and subsequent separator whitespace */
3832 x
+= strcspn(x
, WHITESPACE
);
3833 x
+= strspn(x
, WHITESPACE
);
3835 FOREACH_WORD(word
, l
, x
, state
) {
3841 if (!GREEDY_REALLOC(uids
, sz
, n_uids
+1))
3844 r
= parse_uid(c
, &uids
[n_uids
++]);
3846 log_error("Failed to parse group data from getent.");
3851 r
= mkdir_parents(home
, 0775);
3853 return log_error_errno(r
, "Failed to make home root directory: %m");
3855 r
= mkdir_safe(home
, 0755, uid
, gid
);
3856 if (r
< 0 && r
!= -EEXIST
)
3857 return log_error_errno(r
, "Failed to make home directory: %m");
3859 (void) fchown(STDIN_FILENO
, uid
, gid
);
3860 (void) fchown(STDOUT_FILENO
, uid
, gid
);
3861 (void) fchown(STDERR_FILENO
, uid
, gid
);
3863 if (setgroups(n_uids
, uids
) < 0)
3864 return log_error_errno(errno
, "Failed to set auxiliary groups: %m");
3866 if (setresgid(gid
, gid
, gid
) < 0)
3867 return log_error_errno(errno
, "setregid() failed: %m");
3869 if (setresuid(uid
, uid
, uid
) < 0)
3870 return log_error_errno(errno
, "setreuid() failed: %m");
3882 * < 0 : wait_for_terminate() failed to get the state of the
3883 * container, the container was terminated by a signal, or
3884 * failed for an unknown reason. No change is made to the
3885 * container argument.
3886 * > 0 : The program executed in the container terminated with an
3887 * error. The exit code of the program executed in the
3888 * container is returned. The container argument has been set
3889 * to CONTAINER_TERMINATED.
3890 * 0 : The container is being rebooted, has been shut down or exited
3891 * successfully. The container argument has been set to either
3892 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3894 * That is, success is indicated by a return value of zero, and an
3895 * error is indicated by a non-zero value.
3897 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
3901 r
= wait_for_terminate(pid
, &status
);
3903 return log_warning_errno(r
, "Failed to wait for container: %m");
3905 switch (status
.si_code
) {
3908 if (status
.si_status
== 0) {
3909 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
3912 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
3914 *container
= CONTAINER_TERMINATED
;
3915 return status
.si_status
;
3918 if (status
.si_status
== SIGINT
) {
3920 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
3921 *container
= CONTAINER_TERMINATED
;
3924 } else if (status
.si_status
== SIGHUP
) {
3926 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
3927 *container
= CONTAINER_REBOOTED
;
3931 /* CLD_KILLED fallthrough */
3934 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
3938 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {}
3947 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
3950 pid
= PTR_TO_UINT32(userdata
);
3952 if (kill(pid
, arg_kill_signal
) >= 0) {
3953 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3954 sd_event_source_set_userdata(s
, NULL
);
3959 sd_event_exit(sd_event_source_get_event(s
), 0);
3963 static int determine_names(void) {
3966 if (!arg_image
&& !arg_directory
) {
3968 _cleanup_(image_unrefp
) Image
*i
= NULL
;
3970 r
= image_find(arg_machine
, &i
);
3972 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
3974 log_error("No image for machine '%s': %m", arg_machine
);
3978 if (i
->type
== IMAGE_RAW
)
3979 r
= set_sanitized_path(&arg_image
, i
->path
);
3981 r
= set_sanitized_path(&arg_directory
, i
->path
);
3983 return log_error_errno(r
, "Invalid image directory: %m");
3986 arg_read_only
= arg_read_only
|| i
->read_only
;
3988 arg_directory
= get_current_dir_name();
3990 if (!arg_directory
&& !arg_machine
) {
3991 log_error("Failed to determine path, please use -D or -i.");
3997 if (arg_directory
&& path_equal(arg_directory
, "/"))
3998 arg_machine
= gethostname_malloc();
4000 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
4005 hostname_cleanup(arg_machine
, false);
4006 if (!machine_name_is_valid(arg_machine
)) {
4007 log_error("Failed to determine machine name automatically, please use -M.");
4011 if (arg_ephemeral
) {
4014 /* Add a random suffix when this is an
4015 * ephemeral machine, so that we can run many
4016 * instances at once without manually having
4017 * to specify -M each time. */
4019 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
4030 static int determine_uid_shift(const char *directory
) {
4038 if (arg_uid_shift
== UID_INVALID
) {
4041 r
= stat(directory
, &st
);
4043 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
4045 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
4047 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
4048 log_error("UID and GID base of %s don't match.", directory
);
4052 arg_uid_range
= UINT32_C(0x10000);
4055 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
4056 log_error("UID base too high for UID range.");
4060 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
4064 static int inner_child(
4066 const char *directory
,
4074 _cleanup_free_
char *home
= NULL
;
4076 const char *envp
[] = {
4077 "PATH=" DEFAULT_PATH_SPLIT_USR
,
4078 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4083 NULL
, /* container_uuid */
4084 NULL
, /* LISTEN_FDS */
4085 NULL
, /* LISTEN_PID */
4089 _cleanup_strv_free_
char **env_use
= NULL
;
4094 assert(kmsg_socket
>= 0);
4097 /* Tell the parent, that it now can write the UID map. */
4098 (void) barrier_place(barrier
); /* #1 */
4100 /* Wait until the parent wrote the UID map */
4101 if (!barrier_place_and_sync(barrier
)) { /* #2 */
4102 log_error("Parent died too early");
4107 r
= mount_all(NULL
, true);
4111 /* Wait until we are cgroup-ified, so that we
4112 * can mount the right cgroup path writable */
4113 if (!barrier_place_and_sync(barrier
)) { /* #3 */
4114 log_error("Parent died too early");
4118 r
= mount_systemd_cgroup_writable("");
4122 r
= reset_uid_gid();
4124 return log_error_errno(r
, "Couldn't become new root: %m");
4126 r
= setup_boot_id(NULL
);
4130 r
= setup_kmsg(NULL
, kmsg_socket
);
4133 kmsg_socket
= safe_close(kmsg_socket
);
4138 return log_error_errno(errno
, "setsid() failed: %m");
4140 if (arg_private_network
)
4143 r
= send_rtnl(rtnl_socket
);
4146 rtnl_socket
= safe_close(rtnl_socket
);
4148 if (drop_capabilities() < 0)
4149 return log_error_errno(errno
, "drop_capabilities() failed: %m");
4153 if (arg_personality
!= PERSONALITY_INVALID
) {
4154 if (personality(arg_personality
) < 0)
4155 return log_error_errno(errno
, "personality() failed: %m");
4156 } else if (secondary
) {
4157 if (personality(PER_LINUX32
) < 0)
4158 return log_error_errno(errno
, "personality() failed: %m");
4162 if (arg_selinux_context
)
4163 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
4164 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
4167 r
= change_uid_gid(&home
);
4171 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
4175 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
4176 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
4177 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
4180 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
4183 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
4187 if (fdset_size(fds
) > 0) {
4188 r
= fdset_cloexec(fds
, false);
4190 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
4192 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
4193 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
4197 env_use
= strv_env_merge(2, envp
, arg_setenv
);
4201 /* Let the parent know that we are ready and
4202 * wait until the parent is ready with the
4204 if (!barrier_place_and_sync(barrier
)) { /* #4 */
4205 log_error("Parent died too early");
4209 /* Now, explicitly close the log, so that we
4210 * then can close all remaining fds. Closing
4211 * the log explicitly first has the benefit
4212 * that the logging subsystem knows about it,
4213 * and is thus ready to be reopened should we
4214 * need it again. Note that the other fds
4215 * closed here are at least the locking and
4218 (void) fdset_close_others(fds
);
4224 /* Automatically search for the init system */
4226 m
= 1 + argc
- optind
;
4227 a
= newa(char*, m
+ 1);
4228 memcpy(a
+ 1, argv
+ optind
, m
* sizeof(char*));
4230 a
[0] = (char*) "/usr/lib/systemd/systemd";
4231 execve(a
[0], a
, env_use
);
4233 a
[0] = (char*) "/lib/systemd/systemd";
4234 execve(a
[0], a
, env_use
);
4236 a
[0] = (char*) "/sbin/init";
4237 execve(a
[0], a
, env_use
);
4238 } else if (argc
> optind
)
4239 execvpe(argv
[optind
], argv
+ optind
, env_use
);
4241 chdir(home
? home
: "/root");
4242 execle("/bin/bash", "-bash", NULL
, env_use
);
4243 execle("/bin/sh", "-sh", NULL
, env_use
);
4247 return log_error_errno(errno
, "execv() failed: %m");
4250 static int outer_child(
4252 const char *directory
,
4253 const char *console
,
4254 const char *root_device
, bool root_device_rw
,
4255 const char *home_device
, bool home_device_rw
,
4256 const char *srv_device
, bool srv_device_rw
,
4273 assert(pid_socket
>= 0);
4274 assert(kmsg_socket
>= 0);
4276 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
4277 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
4280 close_nointr(STDIN_FILENO
);
4281 close_nointr(STDOUT_FILENO
);
4282 close_nointr(STDERR_FILENO
);
4284 r
= open_terminal(console
, O_RDWR
);
4285 if (r
!= STDIN_FILENO
) {
4291 return log_error_errno(r
, "Failed to open console: %m");
4294 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
4295 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
4296 return log_error_errno(errno
, "Failed to duplicate console: %m");
4299 r
= reset_audit_loginuid();
4303 /* Mark everything as slave, so that we still
4304 * receive mounts from the real root, but don't
4305 * propagate mounts to the real root. */
4306 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
4307 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
4309 r
= mount_devices(directory
,
4310 root_device
, root_device_rw
,
4311 home_device
, home_device_rw
,
4312 srv_device
, srv_device_rw
);
4316 /* Turn directory into bind mount */
4317 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
4318 return log_error_errno(errno
, "Failed to make bind mount: %m");
4320 r
= setup_volatile(directory
);
4324 r
= setup_volatile_state(directory
);
4328 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
4332 if (arg_read_only
) {
4333 r
= bind_remount_recursive(directory
, true);
4335 return log_error_errno(r
, "Failed to make tree read-only: %m");
4338 r
= mount_all(directory
, false);
4342 if (copy_devnodes(directory
) < 0)
4345 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
4347 if (setup_pts(directory
) < 0)
4350 r
= setup_propagate(directory
);
4354 r
= setup_dev_console(directory
, console
);
4358 r
= setup_seccomp();
4362 r
= setup_timezone(directory
);
4366 r
= setup_resolv_conf(directory
);
4370 r
= setup_journal(directory
);
4374 r
= mount_custom(directory
);
4378 r
= mount_cgroup(directory
);
4382 r
= mount_move_root(directory
);
4384 return log_error_errno(r
, "Failed to move root directory: %m");
4386 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
4387 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
4388 (arg_private_network
? CLONE_NEWNET
: 0) |
4389 (arg_userns
? CLONE_NEWUSER
: 0),
4392 return log_error_errno(errno
, "Failed to fork inner child: %m");
4395 pid_socket
= safe_close(pid_socket
);
4397 /* The inner child has all namespaces that are
4398 * requested, so that we all are owned by the user if
4399 * user namespaces are turned on. */
4401 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
, argc
, argv
);
4403 _exit(EXIT_FAILURE
);
4405 _exit(EXIT_SUCCESS
);
4408 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
4410 return log_error_errno(errno
, "Failed to send PID: %m");
4411 if (l
!= sizeof(pid
)) {
4412 log_error("Short write while sending PID.");
4416 pid_socket
= safe_close(pid_socket
);
4421 static int setup_uid_map(pid_t pid
) {
4422 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
4427 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
4428 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
4429 r
= write_string_file(uid_map
, line
);
4431 return log_error_errno(r
, "Failed to write UID map: %m");
4433 /* We always assign the same UID and GID ranges */
4434 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
4435 r
= write_string_file(uid_map
, line
);
4437 return log_error_errno(r
, "Failed to write GID map: %m");
/* Changes the ownership of the cgroup of process 'pid' to arg_uid_shift.
 *
 * The cgroup path of 'pid' is looked up with cg_pid_get_path(), translated
 * into a file system path for SYSTEMD_CGROUP_CONTROLLER with cg_get_path(),
 * and that directory is opened. The directory itself (".") and a fixed set of
 * control files in it are then chown()ed so that both owner and group become
 * arg_uid_shift.
 *
 * Failing to look up or open the cgroup is logged as an error and returned via
 * log_error_errno(); a failing chown of an individual file is only logged as a
 * warning and otherwise ignored. */
4442 static int chown_cgroup(pid_t pid
) {
/* 'path', 'fs' and 'fd' carry the _cleanup_free_ / _cleanup_close_ attributes,
 * so no explicit release calls appear in this function. */
4443 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
4444 _cleanup_close_
int fd
= -1;
4448 r
= cg_pid_get_path(NULL
, pid
, &path
);
4450 return log_error_errno(r
, "Failed to get container cgroup path: %m");
4452 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
4454 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
/* Open the cgroup directory once; the names below are resolved relative to it. */
4456 fd
= open(fs
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
4458 return log_error_errno(errno
, "Failed to open %s: %m", fs
);
/* arg_uid_shift is passed as both the new owner and the new group. */
4460 FOREACH_STRING(fn
, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4461 if (fchownat(fd
, fn
, arg_uid_shift
, arg_uid_shift
, 0) < 0)
4462 log_warning_errno(errno
, "Failed to chown() cgroup file %s, ignoring: %m", fn
);
/* Program entry point.
 *
 * Rough sequence, as far as it can be read from the calls below:
 *   - parse the command line, require root, collect passed file descriptors;
 *   - prepare the container root: either a directory tree (optionally an
 *     ephemeral snapshot of it, or one populated from a template) or a disk
 *     image that is set up and dissected, with a temporary directory used as
 *     arg_directory;
 *   - allocate a pseudo tty, block signals and become a child subreaper;
 *   - create a barrier and three socket pairs, then clone() the outer child,
 *     which runs outer_child();
 *   - in the parent: wait for the outer child, receive the PID of the inner
 *     child, set up ID maps, networking, machine registration and cgroup
 *     ownership, synchronizing with the child through the barrier;
 *   - run an sd-event loop with a PTY forwarder until the container ends, then
 *     evaluate the container status (a reboot loops again);
 *   - clean up loop device, ephemeral snapshot, propagate directory and the
 *     argument storage.
 *
 * Returns EXIT_FAILURE when 'r' is negative at the end, otherwise 'ret'. */
4467 int main(int argc
, char *argv
[]) {
4469 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
4470 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
4471 _cleanup_close_
int master
= -1, image_fd
= -1;
4472 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
4473 int r
, n_fd_passed
, loop_nr
= -1;
4474 char veth_name
[IFNAMSIZ
];
4475 bool secondary
= false, remove_subvol
= false;
4478 int ret
= EXIT_SUCCESS
;
4479 union in_addr_union exposed
= {};
4480 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
4483 log_parse_environment();
/* Argument parsing and the determine_*() steps that follow it. */
4486 r
= parse_argv(argc
, argv
);
4490 r
= determine_names();
4494 r
= determine_uid_shift(arg_directory
);
/* Refuse to run unless the effective UID is root. */
4498 if (geteuid() != 0) {
4499 log_error("Need to be root.");
/* If file descriptors were passed to us (sd_listen_fds()), collect them in 'fds'. */
4504 n_fd_passed
= sd_listen_fds(false);
4505 if (n_fd_passed
> 0) {
4506 r
= fdset_new_listen_fds(&fds
, false);
4508 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory tree case. */
4513 if (arg_directory
) {
4516 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
4517 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4522 if (arg_ephemeral
) {
4523 _cleanup_free_
char *np
= NULL
;
4525 /* If the specified path is a mount point we
4526 * generate the new snapshot immediately
4527 * inside it under a random name. However if
4528 * the specified path is not a mount point we
4529 * create the new snapshot in the parent
4530 * directory, just next to it. */
4531 r
= path_is_mount_point(arg_directory
, 0);
4533 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
4537 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
4539 r
= tempfn_random(arg_directory
, "machine.", &np
);
4541 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* Lock the snapshot path: shared lock when read-only, exclusive otherwise,
 * and never blocking (LOCK_NB). */
4545 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4547 log_error_errno(r
, "Failed to lock %s: %m", np
);
/* Snapshot arg_directory into np; read-only when arg_read_only is set. */
4551 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4553 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
/* Drop the old arg_directory string. remove_subvol makes the cleanup code at
 * the end of main() call btrfs_subvol_remove() on arg_directory. */
4557 free(arg_directory
);
4561 remove_subvol
= true;
/* Lock arg_directory itself, with the same flag selection as above. */
4564 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4566 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
4570 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate arg_directory from arg_template by snapshotting the template. */
4575 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4578 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
4580 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
4584 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity check: refuse directories that do not look like an OS tree. */
4590 if (path_is_os_tree(arg_directory
) <= 0) {
4591 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
/* Check for the program to run inside the tree: argv[optind] when it is an
 * absolute path, otherwise "/usr/bin/". */
4598 p
= strjoina(arg_directory
,
4599 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
4600 if (access(p
, F_OK
) < 0) {
4601 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
/* Disk image case: lock the image, create a temporary directory whose name
 * becomes arg_directory, then set up and dissect the image. */
4608 char template[] = "/tmp/nspawn-root-XXXXXX";
4611 assert(!arg_template
);
4613 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4615 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
4619 r
= log_error_errno(r
, "Failed to create image lock: %m");
4623 if (!mkdtemp(template)) {
4624 log_error_errno(errno
, "Failed to create temporary directory: %m");
4629 arg_directory
= strdup(template);
4630 if (!arg_directory
) {
4635 image_fd
= setup_image(&device_path
, &loop_nr
);
/* dissect_image() reports a device and a read-write flag for each of root,
 * home and srv. */
4641 r
= dissect_image(image_fd
,
4642 &root_device
, &root_device_rw
,
4643 &home_device
, &home_device_rw
,
4644 &srv_device
, &srv_device_rw
,
4650 r
= custom_mounts_prepare();
/* True only when both stdin and stdout are ttys. */
4655 isatty(STDIN_FILENO
) > 0 &&
4656 isatty(STDOUT_FILENO
) > 0;
/* Allocate a pseudo tty: open the master side, look up the name of the other
 * side into 'console', and unlock it. */
4658 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
4660 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
4664 r
= ptsname_malloc(master
, &console
);
4666 r
= log_error_errno(r
, "Failed to determine tty name: %m");
4670 if (unlockpt(master
) < 0) {
4671 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
4676 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4677 arg_machine
, arg_image
?: arg_directory
);
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT. mask_chld contains only
 * SIGCHLD so that it can be unblocked and re-blocked separately below. */
4679 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
4681 assert_se(sigemptyset(&mask_chld
) == 0);
4682 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
/* Mark this process as a child subreaper (see prctl(2)). */
4684 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
4685 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* State for one container run; a CONTAINER_REBOOTED result further down
 * loops back to a new run. */
4690 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 };
4691 ContainerStatus container_status
;
4692 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
/* No-op SIGCHLD handler, installed further down. */
4693 static const struct sigaction sa
= {
4694 .sa_handler
= nop_handler
,
4695 .sa_flags
= SA_NOCLDSTOP
,
4699 _cleanup_event_unref_ sd_event
*event
= NULL
;
4700 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
4701 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
/* Barrier used for the numbered synchronization points with the child. */
4704 r
= barrier_create(&barrier
);
4706 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
/* Three datagram socket pairs shared with the children: kmsg, rtnl, and the
 * one over which the PID of the inner child is received. */
4710 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
4711 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
4715 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
4716 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
4720 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
4721 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
4725 /* Child can be killed before execv(), so handle SIGCHLD
4726 * in order to interrupt parent's blocking calls and
4727 * give it a chance to call wait() and terminate. */
4728 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
4730 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
4734 r
= sigaction(SIGCHLD
, &sa
, NULL
);
4736 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Fork the outer child; only a new mount namespace is requested here.
 * NOTE(review): the EINVAL hint below mentions UTS, IPC, PID and NET
 * namespaces although this clone() passes only CLONE_NEWNS -- the wording
 * looks stale for this call site; confirm. */
4740 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
4742 if (errno
== EINVAL
)
4743 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4745 r
= log_error_errno(errno
, "clone() failed: %m");
4751 /* The outer child only has a file system namespace. */
4752 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* Child side: close the pty master and index 0 of each socket pair, and reset
 * signal handlers and signal mask before running outer_child(). */
4754 master
= safe_close(master
);
4756 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
4757 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
4758 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
4760 (void) reset_all_signal_handlers();
4761 (void) reset_signal_mask();
4763 r
= outer_child(&barrier
,
4766 root_device
, root_device_rw
,
4767 home_device
, home_device_rw
,
4768 srv_device
, srv_device_rw
,
4772 kmsg_socket_pair
[1],
4773 rtnl_socket_pair
[1],
4777 _exit(EXIT_FAILURE
);
4779 _exit(EXIT_SUCCESS
);
4782 barrier_set_role(&barrier
, BARRIER_PARENT
);
/* Parent side: close index 1 of each socket pair (the ends passed to the child). */
4787 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
4788 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
4789 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
4791 /* Wait for the outer child. */
4792 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
4801 /* And now retrieve the PID of the inner child. */
4802 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
4804 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
/* NOTE(review): the message below ends in %m, but a short read does not set
 * errno, so the appended error text is likely meaningless here -- confirm and
 * consider dropping the %m. */
4807 if (l
!= sizeof(pid
)) {
4808 log_error("Short read while reading inner child PID: %m");
4813 log_debug("Init process invoked as PID " PID_FMT
, pid
);
4816 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
4817 log_error("Child died too early.");
/* Install the UID/GID maps for the inner child before signalling barrier #2. */
4822 r
= setup_uid_map(pid
);
4826 (void) barrier_place(&barrier
); /* #2 */
/* Network setup steps, each addressed by the PID of the inner child or by the
 * veth interface name/index obtained from setup_veth(). */
4829 r
= move_network_interfaces(pid
);
4833 r
= setup_veth(pid
, veth_name
, &ifi
);
4837 r
= setup_bridge(veth_name
, &ifi
);
4841 r
= setup_macvlan(pid
);
4845 r
= setup_ipvlan(pid
);
4849 r
= register_machine(pid
, ifi
);
4853 r
= chown_cgroup(pid
);
4857 /* Notify the child that the parent is ready with all
4858 * its setup (including cgroup-ification), and that
4859 * the child can now hand over control to the code to
4860 * run inside the container. */
4861 (void) barrier_place(&barrier
); /* #3 */
4863 /* Block SIGCHLD here, before notifying child.
4864 * process_pty() will handle it with the other signals. */
4865 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
4867 /* Reset signal to default */
4868 r
= default_signals(SIGCHLD
, -1);
4870 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
4874 /* Let the child know that we are ready and wait that the child is completely ready now. */
4875 if (!barrier_place_and_sync(&barrier
)) { /* #5 */
4876 log_error("Client died too early.");
4883 "STATUS=Container running.\n"
4884 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
/* Event loop setup: signal sources first, then rtnl watching and the PTY
 * forwarder. */
4886 r
= sd_event_new(&event
);
4888 log_error_errno(r
, "Failed to get default event source: %m");
4892 if (arg_kill_signal
> 0) {
4893 /* Try to kill the init system on SIGINT or SIGTERM */
4894 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4895 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4897 /* Immediately exit */
4898 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
4899 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
4902 /* simply exit on sigchld */
4903 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
4905 if (arg_expose_ports
) {
4906 r
= watch_rtnl(event
, rtnl_socket_pair
[0], &exposed
, &rtnl
);
4910 (void) expose_ports(rtnl
, &exposed
);
4913 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* Attach the PTY forwarder to 'master' and run the event loop until it exits. */
4915 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
4917 log_error_errno(r
, "Failed to create PTY forwarder: %m");
4921 r
= sd_event_loop(event
);
4923 log_error_errno(r
, "Failed to run event loop: %m");
/* Fetch the last forwarded character; it is compared against '\n' just below. */
4927 pty_forward_get_last_char(forward
, &last_char
);
4929 forward
= pty_forward_free(forward
);
4931 if (!arg_quiet
&& last_char
!= '\n')
4934 /* Kill if it is not dead yet anyway */
4935 terminate_machine(pid
);
4937 /* Normally redundant, but better safe than sorry */
4940 r
= wait_for_container(pid
, &container_status
);
4944 /* We failed to wait for the container, or the
4945 * container exited abnormally */
4947 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
4948 /* The container exited with a non-zero
4949 * status, or with zero status and no reboot
4955 /* CONTAINER_REBOOTED, loop again */
4957 if (arg_keep_unit
) {
4958 /* Special handling if we are running as a
4959 * service: instead of simply restarting the
4960 * machine we want to restart the entire
4961 * service, so let's inform systemd about this
4962 * with the special exit code 133. The service
4963 * file uses RestartForceExitStatus=133 so
4964 * that this results in a full nspawn
4965 * restart. This is necessary since we might
4966 * have cgroup parameters set we want to have
4973 flush_ports(&exposed
);
4979 "STATUS=Terminating...");
4984 /* Try to flush whatever is still queued in the pty */
4986 (void) copy_bytes(master
, STDOUT_FILENO
, (off_t
) -1, false);
/* Cleanup from here on: loop device, ephemeral snapshot, propagate directory,
 * then the storage behind the arg_* settings. */
4988 loop_remove(loop_nr
, &image_fd
);
/* remove_subvol is set in the --ephemeral branch above; a failure to remove
 * the subvolume is only logged as a warning. */
4990 if (remove_subvol
&& arg_directory
) {
4993 k
= btrfs_subvol_remove(arg_directory
, true);
4995 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
/* Remove the per-machine directory below /run/systemd/nspawn/propagate/. */
5001 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
5002 (void) rm_rf(p
, REMOVE_ROOT
);
5005 free(arg_directory
);
5010 strv_free(arg_setenv
);
5011 strv_free(arg_network_interfaces
);
5012 strv_free(arg_network_macvlan
);
5013 strv_free(arg_network_ipvlan
);
5014 custom_mount_free_all();
5016 flush_ports(&exposed
);
/* Drain the arg_expose_ports list one entry at a time. */
5018 while (arg_expose_ports
) {
5019 ExposePort
*p
= arg_expose_ports
;
5020 LIST_REMOVE(ports
, arg_expose_ports
, p
);
/* A negative 'r' always yields EXIT_FAILURE; otherwise 'ret' is the exit status. */
5024 return r
< 0 ? EXIT_FAILURE
: ret
;