1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
44 #include <selinux/selinux.h>
52 #include <blkid/blkid.h>
55 #include "sd-daemon.h"
59 #include "random-util.h"
66 #include "cgroup-util.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
75 #include "bus-error.h"
78 #include "rtnl-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
82 #include "siphash24.h"
84 #include "base-filesystem.h"
86 #include "event-util.h"
87 #include "capability.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
92 #include "in-addr-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
102 #include "seccomp-util.h"
105 typedef struct ExposePort
{
108 uint16_t container_port
;
109 LIST_FIELDS(struct ExposePort
, ports
);
112 typedef enum ContainerStatus
{
113 CONTAINER_TERMINATED
,
117 typedef enum LinkJournal
{
124 typedef enum Volatile
{
130 typedef enum CustomMountType
{
133 CUSTOM_MOUNT_OVERLAY
,
136 typedef struct CustomMount
{
137 CustomMountType type
;
139 char *source
; /* for overlayfs this is the upper directory */
146 static char *arg_directory
= NULL
;
147 static char *arg_template
= NULL
;
148 static char *arg_user
= NULL
;
149 static sd_id128_t arg_uuid
= {};
150 static char *arg_machine
= NULL
;
151 static const char *arg_selinux_context
= NULL
;
152 static const char *arg_selinux_apifs_context
= NULL
;
153 static const char *arg_slice
= NULL
;
154 static bool arg_private_network
= false;
155 static bool arg_read_only
= false;
156 static bool arg_boot
= false;
157 static bool arg_ephemeral
= false;
158 static LinkJournal arg_link_journal
= LINK_AUTO
;
159 static bool arg_link_journal_try
= false;
160 static uint64_t arg_retain
=
161 (1ULL << CAP_CHOWN
) |
162 (1ULL << CAP_DAC_OVERRIDE
) |
163 (1ULL << CAP_DAC_READ_SEARCH
) |
164 (1ULL << CAP_FOWNER
) |
165 (1ULL << CAP_FSETID
) |
166 (1ULL << CAP_IPC_OWNER
) |
168 (1ULL << CAP_LEASE
) |
169 (1ULL << CAP_LINUX_IMMUTABLE
) |
170 (1ULL << CAP_NET_BIND_SERVICE
) |
171 (1ULL << CAP_NET_BROADCAST
) |
172 (1ULL << CAP_NET_RAW
) |
173 (1ULL << CAP_SETGID
) |
174 (1ULL << CAP_SETFCAP
) |
175 (1ULL << CAP_SETPCAP
) |
176 (1ULL << CAP_SETUID
) |
177 (1ULL << CAP_SYS_ADMIN
) |
178 (1ULL << CAP_SYS_CHROOT
) |
179 (1ULL << CAP_SYS_NICE
) |
180 (1ULL << CAP_SYS_PTRACE
) |
181 (1ULL << CAP_SYS_TTY_CONFIG
) |
182 (1ULL << CAP_SYS_RESOURCE
) |
183 (1ULL << CAP_SYS_BOOT
) |
184 (1ULL << CAP_AUDIT_WRITE
) |
185 (1ULL << CAP_AUDIT_CONTROL
) |
187 static CustomMount
*arg_custom_mounts
= NULL
;
188 static unsigned arg_n_custom_mounts
= 0;
189 static char **arg_setenv
= NULL
;
190 static bool arg_quiet
= false;
191 static bool arg_share_system
= false;
192 static bool arg_register
= true;
193 static bool arg_keep_unit
= false;
194 static char **arg_network_interfaces
= NULL
;
195 static char **arg_network_macvlan
= NULL
;
196 static char **arg_network_ipvlan
= NULL
;
197 static bool arg_network_veth
= false;
198 static const char *arg_network_bridge
= NULL
;
199 static unsigned long arg_personality
= PERSONALITY_INVALID
;
200 static char *arg_image
= NULL
;
201 static Volatile arg_volatile
= VOLATILE_NO
;
202 static ExposePort
*arg_expose_ports
= NULL
;
203 static char **arg_property
= NULL
;
204 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
205 static bool arg_userns
= false;
206 static int arg_kill_signal
= 0;
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name
);
278 static CustomMount
* custom_mount_add(CustomMountType t
) {
279 CustomMount
*c
, *ret
;
281 c
= realloc(arg_custom_mounts
, (arg_n_custom_mounts
+ 1) * sizeof(CustomMount
));
285 arg_custom_mounts
= c
;
286 ret
= arg_custom_mounts
+ arg_n_custom_mounts
;
287 arg_n_custom_mounts
++;
289 *ret
= (CustomMount
) { .type
= t
};
294 static void custom_mount_free_all(void) {
297 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
298 CustomMount
*m
= &arg_custom_mounts
[i
];
301 free(m
->destination
);
305 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
312 free(arg_custom_mounts
);
313 arg_custom_mounts
= NULL
;
314 arg_n_custom_mounts
= 0;
317 static int custom_mount_compare(const void *a
, const void *b
) {
318 const CustomMount
*x
= a
, *y
= b
;
321 r
= path_compare(x
->destination
, y
->destination
);
325 if (x
->type
< y
->type
)
327 if (x
->type
> y
->type
)
333 static int custom_mounts_prepare(void) {
337 /* Ensure the mounts are applied prefix first. */
338 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
340 /* Allocate working directories for the overlay file systems that need it */
341 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
342 CustomMount
*m
= &arg_custom_mounts
[i
];
344 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
353 r
= tempfn_random(m
->source
, &m
->work_dir
);
355 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
361 static int set_sanitized_path(char **b
, const char *path
) {
367 p
= canonicalize_file_name(path
);
372 p
= path_make_absolute_cwd(path
);
378 *b
= path_kill_slashes(p
);
382 static int parse_argv(int argc
, char *argv
[]) {
401 ARG_NETWORK_INTERFACE
,
413 static const struct option options
[] = {
414 { "help", no_argument
, NULL
, 'h' },
415 { "version", no_argument
, NULL
, ARG_VERSION
},
416 { "directory", required_argument
, NULL
, 'D' },
417 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
418 { "ephemeral", no_argument
, NULL
, 'x' },
419 { "user", required_argument
, NULL
, 'u' },
420 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
421 { "boot", no_argument
, NULL
, 'b' },
422 { "uuid", required_argument
, NULL
, ARG_UUID
},
423 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
424 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
425 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
426 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
427 { "bind", required_argument
, NULL
, ARG_BIND
},
428 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
429 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
430 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
431 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
432 { "machine", required_argument
, NULL
, 'M' },
433 { "slice", required_argument
, NULL
, 'S' },
434 { "setenv", required_argument
, NULL
, ARG_SETENV
},
435 { "selinux-context", required_argument
, NULL
, 'Z' },
436 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
437 { "quiet", no_argument
, NULL
, 'q' },
438 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
439 { "register", required_argument
, NULL
, ARG_REGISTER
},
440 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
441 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
442 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
443 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
444 { "network-veth", no_argument
, NULL
, 'n' },
445 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
446 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
447 { "image", required_argument
, NULL
, 'i' },
448 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
449 { "port", required_argument
, NULL
, 'p' },
450 { "property", required_argument
, NULL
, ARG_PROPERTY
},
451 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
452 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
457 uint64_t plus
= 0, minus
= 0;
462 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
471 puts(PACKAGE_STRING
);
472 puts(SYSTEMD_FEATURES
);
476 r
= set_sanitized_path(&arg_directory
, optarg
);
478 return log_error_errno(r
, "Invalid root directory: %m");
483 r
= set_sanitized_path(&arg_template
, optarg
);
485 return log_error_errno(r
, "Invalid template directory: %m");
490 r
= set_sanitized_path(&arg_image
, optarg
);
492 return log_error_errno(r
, "Invalid image path: %m");
497 arg_ephemeral
= true;
502 arg_user
= strdup(optarg
);
508 case ARG_NETWORK_BRIDGE
:
509 arg_network_bridge
= optarg
;
514 arg_network_veth
= true;
515 arg_private_network
= true;
518 case ARG_NETWORK_INTERFACE
:
519 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
522 arg_private_network
= true;
525 case ARG_NETWORK_MACVLAN
:
526 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
529 arg_private_network
= true;
532 case ARG_NETWORK_IPVLAN
:
533 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
538 case ARG_PRIVATE_NETWORK
:
539 arg_private_network
= true;
547 r
= sd_id128_from_string(optarg
, &arg_uuid
);
549 log_error("Invalid UUID: %s", optarg
);
559 if (isempty(optarg
)) {
563 if (!machine_name_is_valid(optarg
)) {
564 log_error("Invalid machine name: %s", optarg
);
568 r
= free_and_strdup(&arg_machine
, optarg
);
576 arg_selinux_context
= optarg
;
580 arg_selinux_apifs_context
= optarg
;
584 arg_read_only
= true;
588 case ARG_DROP_CAPABILITY
: {
589 const char *state
, *word
;
592 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
593 _cleanup_free_
char *t
;
595 t
= strndup(word
, length
);
599 if (streq(t
, "all")) {
600 if (c
== ARG_CAPABILITY
)
601 plus
= (uint64_t) -1;
603 minus
= (uint64_t) -1;
607 cap
= capability_from_name(t
);
609 log_error("Failed to parse capability %s.", t
);
613 if (c
== ARG_CAPABILITY
)
614 plus
|= 1ULL << (uint64_t) cap
;
616 minus
|= 1ULL << (uint64_t) cap
;
624 arg_link_journal
= LINK_GUEST
;
625 arg_link_journal_try
= true;
628 case ARG_LINK_JOURNAL
:
629 if (streq(optarg
, "auto")) {
630 arg_link_journal
= LINK_AUTO
;
631 arg_link_journal_try
= false;
632 } else if (streq(optarg
, "no")) {
633 arg_link_journal
= LINK_NO
;
634 arg_link_journal_try
= false;
635 } else if (streq(optarg
, "guest")) {
636 arg_link_journal
= LINK_GUEST
;
637 arg_link_journal_try
= false;
638 } else if (streq(optarg
, "host")) {
639 arg_link_journal
= LINK_HOST
;
640 arg_link_journal_try
= false;
641 } else if (streq(optarg
, "try-guest")) {
642 arg_link_journal
= LINK_GUEST
;
643 arg_link_journal_try
= true;
644 } else if (streq(optarg
, "try-host")) {
645 arg_link_journal
= LINK_HOST
;
646 arg_link_journal_try
= true;
648 log_error("Failed to parse link journal mode %s", optarg
);
656 _cleanup_free_
char *source
= NULL
, *destination
= NULL
;
660 e
= strchr(optarg
, ':');
662 source
= strndup(optarg
, e
- optarg
);
663 destination
= strdup(e
+ 1);
665 source
= strdup(optarg
);
666 destination
= strdup(optarg
);
669 if (!source
|| !destination
)
672 if (!path_is_absolute(source
) || !path_is_absolute(destination
)) {
673 log_error("Invalid bind mount specification: %s", optarg
);
677 m
= custom_mount_add(CUSTOM_MOUNT_BIND
);
682 m
->destination
= destination
;
683 m
->read_only
= c
== ARG_BIND_RO
;
685 source
= destination
= NULL
;
691 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
695 e
= strchr(optarg
, ':');
697 path
= strndup(optarg
, e
- optarg
);
698 opts
= strdup(e
+ 1);
700 path
= strdup(optarg
);
701 opts
= strdup("mode=0755");
707 if (!path_is_absolute(path
)) {
708 log_error("Invalid tmpfs specification: %s", optarg
);
712 m
= custom_mount_add(CUSTOM_MOUNT_TMPFS
);
716 m
->destination
= path
;
725 case ARG_OVERLAY_RO
: {
726 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
727 _cleanup_strv_free_
char **lower
= NULL
;
732 lower
= strv_split(optarg
, ":");
736 STRV_FOREACH(i
, lower
) {
737 if (!path_is_absolute(*i
)) {
738 log_error("Overlay path %s is not absolute.", *i
);
746 log_error("--overlay= needs at least two colon-separated directories specified.");
751 /* If two parameters are specified,
752 * the first one is the lower, the
753 * second one the upper directory. And
754 * we'll also define the the
755 * destination mount point the same as
760 destination
= strdup(upper
);
765 upper
= lower
[n
- 2];
766 destination
= lower
[n
- 1];
770 m
= custom_mount_add(CUSTOM_MOUNT_OVERLAY
);
774 m
->destination
= destination
;
777 m
->read_only
= c
== ARG_OVERLAY_RO
;
779 upper
= destination
= NULL
;
788 if (!env_assignment_is_valid(optarg
)) {
789 log_error("Environment variable assignment '%s' is not valid.", optarg
);
793 n
= strv_env_set(arg_setenv
, optarg
);
797 strv_free(arg_setenv
);
806 case ARG_SHARE_SYSTEM
:
807 arg_share_system
= true;
811 r
= parse_boolean(optarg
);
813 log_error("Failed to parse --register= argument: %s", optarg
);
821 arg_keep_unit
= true;
824 case ARG_PERSONALITY
:
826 arg_personality
= personality_from_string(optarg
);
827 if (arg_personality
== PERSONALITY_INVALID
) {
828 log_error("Unknown or unsupported personality '%s'.", optarg
);
837 arg_volatile
= VOLATILE_YES
;
839 r
= parse_boolean(optarg
);
841 if (streq(optarg
, "state"))
842 arg_volatile
= VOLATILE_STATE
;
844 log_error("Failed to parse --volatile= argument: %s", optarg
);
848 arg_volatile
= r
? VOLATILE_YES
: VOLATILE_NO
;
854 const char *split
, *e
;
855 uint16_t container_port
, host_port
;
859 if ((e
= startswith(optarg
, "tcp:")))
860 protocol
= IPPROTO_TCP
;
861 else if ((e
= startswith(optarg
, "udp:")))
862 protocol
= IPPROTO_UDP
;
865 protocol
= IPPROTO_TCP
;
868 split
= strchr(e
, ':');
870 char v
[split
- e
+ 1];
872 memcpy(v
, e
, split
- e
);
875 r
= safe_atou16(v
, &host_port
);
876 if (r
< 0 || host_port
<= 0) {
877 log_error("Failed to parse host port: %s", optarg
);
881 r
= safe_atou16(split
+ 1, &container_port
);
883 r
= safe_atou16(e
, &container_port
);
884 host_port
= container_port
;
887 if (r
< 0 || container_port
<= 0) {
888 log_error("Failed to parse host port: %s", optarg
);
892 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
893 if (p
->protocol
== protocol
&& p
->host_port
== host_port
) {
894 log_error("Duplicate port specification: %s", optarg
);
899 p
= new(ExposePort
, 1);
903 p
->protocol
= protocol
;
904 p
->host_port
= host_port
;
905 p
->container_port
= container_port
;
907 LIST_PREPEND(ports
, arg_expose_ports
, p
);
913 if (strv_extend(&arg_property
, optarg
) < 0)
918 case ARG_PRIVATE_USERS
:
920 _cleanup_free_
char *buffer
= NULL
;
921 const char *range
, *shift
;
923 range
= strchr(optarg
, ':');
925 buffer
= strndup(optarg
, range
- optarg
);
931 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
932 log_error("Failed to parse UID range: %s", range
);
938 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
939 log_error("Failed to parse UID: %s", optarg
);
947 case ARG_KILL_SIGNAL
:
948 arg_kill_signal
= signal_from_string_try_harder(optarg
);
949 if (arg_kill_signal
< 0) {
950 log_error("Cannot parse signal: %s", optarg
);
960 assert_not_reached("Unhandled option");
963 if (arg_share_system
)
964 arg_register
= false;
966 if (arg_boot
&& arg_share_system
) {
967 log_error("--boot and --share-system may not be combined.");
971 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
972 log_error("--keep-unit may not be used when invoked from a user session.");
976 if (arg_directory
&& arg_image
) {
977 log_error("--directory= and --image= may not be combined.");
981 if (arg_template
&& arg_image
) {
982 log_error("--template= and --image= may not be combined.");
986 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
987 log_error("--template= needs --directory= or --machine=.");
991 if (arg_ephemeral
&& arg_template
) {
992 log_error("--ephemeral and --template= may not be combined.");
996 if (arg_ephemeral
&& arg_image
) {
997 log_error("--ephemeral and --image= may not be combined.");
1001 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
1002 log_error("--ephemeral and --link-journal= may not be combined.");
1006 if (arg_volatile
!= VOLATILE_NO
&& arg_read_only
) {
1007 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1011 if (arg_expose_ports
&& !arg_private_network
) {
1012 log_error("Cannot use --port= without private networking.");
1016 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
1018 if (arg_boot
&& arg_kill_signal
<= 0)
1019 arg_kill_signal
= SIGRTMIN
+3;
1024 static int tmpfs_patch_options(const char *options
, char **ret
) {
1027 if (arg_userns
&& arg_uid_shift
!= 0) {
1030 (void) asprintf(&buf
, "%s,uid=" UID_FMT
",gid=" UID_FMT
, options
, arg_uid_shift
, arg_uid_shift
);
1032 (void) asprintf(&buf
, "uid=" UID_FMT
",gid=" UID_FMT
, arg_uid_shift
, arg_uid_shift
);
1040 if (arg_selinux_apifs_context
) {
1044 t
= strjoin(options
, ",context=\"", arg_selinux_apifs_context
, "\"", NULL
);
1046 t
= strjoin("context=\"", arg_selinux_apifs_context
, "\"", NULL
);
1061 static int mount_all(const char *dest
, bool userns
) {
1063 typedef struct MountPoint
{
1067 const char *options
;
1068 unsigned long flags
;
1073 static const MountPoint mount_table
[] = {
1074 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, true },
1075 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, true, true }, /* Bind mount first */
1076 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, true, true }, /* Then, make it r/o */
1077 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, false },
1078 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, true, false },
1079 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true, false },
1080 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1081 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1082 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME
, true, false },
1084 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, false, false }, /* Bind mount first */
1085 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, false, false }, /* Then, make it r/o */
1092 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
1093 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
1096 if (userns
!= mount_table
[k
].userns
)
1099 where
= prefix_root(dest
, mount_table
[k
].where
);
1103 r
= path_is_mount_point(where
, AT_SYMLINK_FOLLOW
);
1104 if (r
< 0 && r
!= -ENOENT
)
1105 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
1107 /* Skip this entry if it is not a remount. */
1108 if (mount_table
[k
].what
&& r
> 0)
1111 r
= mkdir_p(where
, 0755);
1113 if (mount_table
[k
].fatal
)
1114 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
1116 log_warning_errno(r
, "Failed to create directory %s: %m", where
);
1120 o
= mount_table
[k
].options
;
1121 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
1122 r
= tmpfs_patch_options(o
, &options
);
1129 if (mount(mount_table
[k
].what
,
1131 mount_table
[k
].type
,
1132 mount_table
[k
].flags
,
1135 if (mount_table
[k
].fatal
)
1136 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1138 log_warning_errno(errno
, "mount(%s) failed, ignoring: %m", where
);
1145 static int mount_bind(const char *dest
, CustomMount
*m
) {
1146 struct stat source_st
, dest_st
;
1152 if (stat(m
->source
, &source_st
) < 0)
1153 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
1155 where
= prefix_roota(dest
, m
->destination
);
1157 if (stat(where
, &dest_st
) >= 0) {
1158 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
1159 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
1163 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
1164 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
1168 } else if (errno
== ENOENT
) {
1169 r
= mkdir_parents_label(where
, 0755);
1171 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
1173 log_error_errno(errno
, "Failed to stat %s: %m", where
);
1177 /* Create the mount point. Any non-directory file can be
1178 * mounted on any non-directory file (regular, fifo, socket,
1181 if (S_ISDIR(source_st
.st_mode
))
1182 r
= mkdir_label(where
, 0755);
1185 if (r
< 0 && r
!= -EEXIST
)
1186 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
1188 if (mount(m
->source
, where
, NULL
, MS_BIND
, NULL
) < 0)
1189 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1192 r
= bind_remount_recursive(where
, true);
1194 return log_error_errno(r
, "Read-only bind mount failed: %m");
1200 static int mount_tmpfs(const char *dest
, CustomMount
*m
) {
1201 const char *where
, *options
;
1202 _cleanup_free_
char *buf
= NULL
;
1208 where
= prefix_roota(dest
, m
->destination
);
1210 r
= mkdir_p_label(where
, 0755);
1211 if (r
< 0 && r
!= -EEXIST
)
1212 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
1214 r
= tmpfs_patch_options(m
->options
, &buf
);
1217 options
= r
> 0 ? buf
: m
->options
;
1219 if (mount("tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
) < 0)
1220 return log_error_errno(errno
, "tmpfs mount to %s failed: %m", where
);
1225 static int mount_overlay(const char *dest
, CustomMount
*m
) {
1226 _cleanup_free_
char *lower
= NULL
;
1227 const char *where
, *options
;
1233 where
= prefix_roota(dest
, m
->destination
);
1235 r
= mkdir_label(where
, 0755);
1236 if (r
< 0 && r
!= -EEXIST
)
1237 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
1239 (void) mkdir_p_label(m
->source
, 0755);
1241 strv_reverse(m
->lower
);
1242 lower
= strv_join(m
->lower
, ":");
1243 strv_reverse(m
->lower
);
1248 options
= strjoina("lowerdir=", m
->source
, ":", lower
);
1250 assert(m
->work_dir
);
1251 (void) mkdir_label(m
->work_dir
, 0700);
1253 options
= strjoina("lowerdir=", lower
, ",upperdir=", m
->source
, ",workdir=", m
->work_dir
);
1256 if (mount("overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
) < 0)
1257 return log_error_errno(errno
, "overlay mount to %s failed: %m", where
);
1262 static int mount_custom(const char *dest
) {
1268 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
1269 CustomMount
*m
= &arg_custom_mounts
[i
];
1273 case CUSTOM_MOUNT_BIND
:
1274 r
= mount_bind(dest
, m
);
1277 case CUSTOM_MOUNT_TMPFS
:
1278 r
= mount_tmpfs(dest
, m
);
1281 case CUSTOM_MOUNT_OVERLAY
:
1282 r
= mount_overlay(dest
, m
);
1286 assert_not_reached("Unknown custom mount type");
1296 static int mount_cgroup_hierarchy(const char *dest
, const char *controller
, const char *hierarchy
, bool read_only
) {
1300 to
= strjoina(dest
, "/sys/fs/cgroup/", hierarchy
);
1302 r
= path_is_mount_point(to
, 0);
1303 if (r
< 0 && r
!= -ENOENT
)
1304 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
1310 /* The superblock mount options of the mount point need to be
1311 * identical to the hosts', and hence writable... */
1312 if (mount("cgroup", to
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, controller
) < 0)
1313 return log_error_errno(errno
, "Failed to mount to %s: %m", to
);
1315 /* ... hence let's only make the bind mount read-only, not the
1318 if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1319 return log_error_errno(errno
, "Failed to remount %s read-only: %m", to
);
1324 static int mount_cgroup(const char *dest
) {
1325 _cleanup_set_free_free_ Set
*controllers
= NULL
;
1326 const char *cgroup_root
;
1329 controllers
= set_new(&string_hash_ops
);
1333 r
= cg_kernel_controllers(controllers
);
1335 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1338 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
1340 controller
= set_steal_first(controllers
);
1344 origin
= prefix_root("/sys/fs/cgroup/", controller
);
1348 r
= readlink_malloc(origin
, &combined
);
1350 /* Not a symbolic link, but directly a single cgroup hierarchy */
1352 r
= mount_cgroup_hierarchy(dest
, controller
, controller
, true);
1357 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
1359 _cleanup_free_
char *target
= NULL
;
1361 target
= prefix_root(dest
, origin
);
1365 /* A symbolic link, a combination of controllers in one hierarchy */
1367 if (!filename_is_valid(combined
)) {
1368 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
1372 r
= mount_cgroup_hierarchy(dest
, combined
, combined
, true);
1376 r
= symlink_idempotent(combined
, target
);
1378 log_error("Invalid existing symlink for combined hierarchy");
1382 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1386 r
= mount_cgroup_hierarchy(dest
, "name=systemd,xattr", "systemd", false);
1390 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1391 if (mount(NULL
, cgroup_root
, NULL
, MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755") < 0)
1392 return log_error_errno(errno
, "Failed to remount %s read-only: %m", cgroup_root
);
1397 static int mount_systemd_cgroup_writable(const char *dest
) {
1398 _cleanup_free_
char *own_cgroup_path
= NULL
;
1399 const char *systemd_root
, *systemd_own
;
1404 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
1406 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
1408 /* Make our own cgroup a (writable) bind mount */
1409 systemd_own
= strjoina(dest
, "/sys/fs/cgroup/systemd", own_cgroup_path
);
1410 if (mount(systemd_own
, systemd_own
, NULL
, MS_BIND
, NULL
) < 0)
1411 return log_error_errno(errno
, "Failed to turn %s into a bind mount: %m", own_cgroup_path
);
1413 /* And then remount the systemd cgroup root read-only */
1414 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
1415 if (mount(NULL
, systemd_root
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1416 return log_error_errno(errno
, "Failed to mount cgroup root read-only: %m");
1421 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1427 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1430 if (uid
!= UID_INVALID
) {
1431 uid
+= arg_uid_shift
;
1433 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1437 if (gid
!= GID_INVALID
) {
1438 gid
+= (gid_t
) arg_uid_shift
;
1440 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1444 if (lchown(p
, uid
, gid
) < 0)
1450 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1453 q
= prefix_roota(root
, path
);
1454 if (mkdir(q
, mode
) < 0) {
1455 if (errno
== EEXIST
)
1460 return userns_lchown(q
, uid
, gid
);
1463 static int setup_timezone(const char *dest
) {
1464 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1465 const char *where
, *check
, *what
;
1471 /* Fix the timezone, if possible */
1472 r
= readlink_malloc("/etc/localtime", &p
);
1474 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1478 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1480 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1482 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1486 where
= prefix_roota(dest
, "/etc/localtime");
1487 r
= readlink_malloc(where
, &q
);
1489 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1491 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1493 /* Already pointing to the right place? Then do nothing .. */
1494 if (y
&& streq(y
, z
))
1498 check
= strjoina("/usr/share/zoneinfo/", z
);
1499 check
= prefix_root(dest
, check
);
1500 if (laccess(check
, F_OK
) < 0) {
1501 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1506 if (r
< 0 && errno
!= ENOENT
) {
1507 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1511 what
= strjoina("../usr/share/zoneinfo/", z
);
1512 if (symlink(what
, where
) < 0) {
1513 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1517 r
= userns_lchown(where
, 0, 0);
1519 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1524 static int setup_resolv_conf(const char *dest
) {
1525 const char *where
= NULL
;
1530 if (arg_private_network
)
1533 /* Fix resolv.conf, if possible */
1534 where
= prefix_roota(dest
, "/etc/resolv.conf");
1536 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1538 log_warning_errno(r
, "Failed to copy /etc/resolv.conf to %s: %m", where
);
1542 r
= userns_lchown(where
, 0, 0);
1544 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1549 static int setup_volatile_state(const char *directory
) {
1550 _cleanup_free_
char *buf
= NULL
;
1551 const char *p
, *options
;
1556 if (arg_volatile
!= VOLATILE_STATE
)
1559 /* --volatile=state means we simply overmount /var
1560 with a tmpfs, and the rest read-only. */
1562 r
= bind_remount_recursive(directory
, true);
1564 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
1566 p
= prefix_roota(directory
, "/var");
1568 if (r
< 0 && errno
!= EEXIST
)
1569 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1571 options
= "mode=755";
1572 r
= tmpfs_patch_options(options
, &buf
);
1578 if (mount("tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
) < 0)
1579 return log_error_errno(errno
, "Failed to mount tmpfs to /var: %m");
1584 static int setup_volatile(const char *directory
) {
1585 bool tmpfs_mounted
= false, bind_mounted
= false;
1586 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1587 _cleanup_free_
char *buf
= NULL
;
1588 const char *f
, *t
, *options
;
1593 if (arg_volatile
!= VOLATILE_YES
)
1596 /* --volatile=yes means we mount a tmpfs to the root dir, and
1597 the original /usr to use inside it, and that read-only. */
1599 if (!mkdtemp(template))
1600 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1602 options
= "mode=755";
1603 r
= tmpfs_patch_options(options
, &buf
);
1609 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME
, options
) < 0) {
1610 r
= log_error_errno(errno
, "Failed to mount tmpfs for root directory: %m");
1614 tmpfs_mounted
= true;
1616 f
= prefix_roota(directory
, "/usr");
1617 t
= prefix_roota(template, "/usr");
1620 if (r
< 0 && errno
!= EEXIST
) {
1621 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1625 if (mount(f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
1626 r
= log_error_errno(errno
, "Failed to create /usr bind mount: %m");
1630 bind_mounted
= true;
1632 r
= bind_remount_recursive(t
, true);
1634 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
1638 if (mount(template, directory
, NULL
, MS_MOVE
, NULL
) < 0) {
1639 r
= log_error_errno(errno
, "Failed to move root mount: %m");
1643 (void) rmdir(template);
1652 (void) umount(template);
1653 (void) rmdir(template);
1657 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1661 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1662 SD_ID128_FORMAT_VAL(id
));
1667 static int setup_boot_id(const char *dest
) {
1668 const char *from
, *to
;
1669 sd_id128_t rnd
= {};
1673 if (arg_share_system
)
1676 /* Generate a new randomized boot ID, so that each boot-up of
1677 * the container gets a new one */
1679 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1680 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1682 r
= sd_id128_randomize(&rnd
);
1684 return log_error_errno(r
, "Failed to generate random boot id: %m");
1686 id128_format_as_uuid(rnd
, as_uuid
);
1688 r
= write_string_file(from
, as_uuid
);
1690 return log_error_errno(r
, "Failed to write boot id: %m");
1692 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1693 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1694 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1695 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1701 static int copy_devnodes(const char *dest
) {
1703 static const char devnodes
[] =
1714 _cleanup_umask_ mode_t u
;
1720 /* Create /dev/net, so that we can create /dev/net/tun in it */
1721 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1722 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1724 NULSTR_FOREACH(d
, devnodes
) {
1725 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1728 from
= strappend("/dev/", d
);
1729 to
= prefix_root(dest
, from
);
1731 if (stat(from
, &st
) < 0) {
1733 if (errno
!= ENOENT
)
1734 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1736 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1738 log_error("%s is not a char or block device, cannot copy.", from
);
1742 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1744 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1746 /* Some systems abusively restrict mknod but
1747 * allow bind mounts. */
1750 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1751 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1752 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1755 r
= userns_lchown(to
, 0, 0);
1757 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1764 static int setup_pts(const char *dest
) {
1765 _cleanup_free_
char *options
= NULL
;
1769 if (arg_selinux_apifs_context
)
1770 (void) asprintf(&options
,
1771 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT
",gid=" GID_FMT
",context=\"%s\"",
1773 arg_uid_shift
+ TTY_GID
,
1774 arg_selinux_apifs_context
);
1777 (void) asprintf(&options
,
1778 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT
",gid=" GID_FMT
,
1780 arg_uid_shift
+ TTY_GID
);
1785 /* Mount /dev/pts itself */
1786 p
= prefix_roota(dest
, "/dev/pts");
1787 if (mkdir(p
, 0755) < 0)
1788 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1789 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1790 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1791 if (userns_lchown(p
, 0, 0) < 0)
1792 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1794 /* Create /dev/ptmx symlink */
1795 p
= prefix_roota(dest
, "/dev/ptmx");
1796 if (symlink("pts/ptmx", p
) < 0)
1797 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1798 if (userns_lchown(p
, 0, 0) < 0)
1799 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1801 /* And fix /dev/pts/ptmx ownership */
1802 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1803 if (userns_lchown(p
, 0, 0) < 0)
1804 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
1809 static int setup_dev_console(const char *dest
, const char *console
) {
1810 _cleanup_umask_ mode_t u
;
1819 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1821 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1823 /* We need to bind mount the right tty to /dev/console since
1824 * ptys can only exist on pts file systems. To have something
1825 * to bind mount things on we create a empty regular file. */
1827 to
= prefix_roota(dest
, "/dev/console");
1830 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1832 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1833 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1838 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1839 const char *from
, *to
;
1840 _cleanup_umask_ mode_t u
;
1843 struct cmsghdr cmsghdr
;
1844 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1846 struct msghdr mh
= {
1847 .msg_control
= &control
,
1848 .msg_controllen
= sizeof(control
),
1850 struct cmsghdr
*cmsg
;
1852 assert(kmsg_socket
>= 0);
1856 /* We create the kmsg FIFO as /run/kmsg, but immediately
1857 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1858 * on the reading side behave very similar to /proc/kmsg,
1859 * their writing side behaves differently from /dev/kmsg in
1860 * that writing blocks when nothing is reading. In order to
1861 * avoid any problems with containers deadlocking due to this
1862 * we simply make /dev/kmsg unavailable to the container. */
1863 from
= prefix_roota(dest
, "/run/kmsg");
1864 to
= prefix_roota(dest
, "/proc/kmsg");
1866 if (mkfifo(from
, 0600) < 0)
1867 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1868 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1869 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1871 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1873 return log_error_errno(errno
, "Failed to open fifo: %m");
1875 cmsg
= CMSG_FIRSTHDR(&mh
);
1876 cmsg
->cmsg_level
= SOL_SOCKET
;
1877 cmsg
->cmsg_type
= SCM_RIGHTS
;
1878 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1879 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1881 mh
.msg_controllen
= cmsg
->cmsg_len
;
1883 /* Store away the fd in the socket, so that it stays open as
1884 * long as we run the child */
1885 k
= sendmsg(kmsg_socket
, &mh
, MSG_NOSIGNAL
);
1889 return log_error_errno(errno
, "Failed to send FIFO fd: %m");
1891 /* And now make the FIFO unavailable as /run/kmsg... */
1892 (void) unlink(from
);
1897 static int send_rtnl(int send_fd
) {
1899 struct cmsghdr cmsghdr
;
1900 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1902 struct msghdr mh
= {
1903 .msg_control
= &control
,
1904 .msg_controllen
= sizeof(control
),
1906 struct cmsghdr
*cmsg
;
1907 _cleanup_close_
int fd
= -1;
1910 assert(send_fd
>= 0);
1912 if (!arg_expose_ports
)
1915 fd
= socket(PF_NETLINK
, SOCK_RAW
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, NETLINK_ROUTE
);
1917 return log_error_errno(errno
, "Failed to allocate container netlink: %m");
1919 cmsg
= CMSG_FIRSTHDR(&mh
);
1920 cmsg
->cmsg_level
= SOL_SOCKET
;
1921 cmsg
->cmsg_type
= SCM_RIGHTS
;
1922 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1923 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1925 mh
.msg_controllen
= cmsg
->cmsg_len
;
1927 /* Store away the fd in the socket, so that it stays open as
1928 * long as we run the child */
1929 k
= sendmsg(send_fd
, &mh
, MSG_NOSIGNAL
);
1931 return log_error_errno(errno
, "Failed to send netlink fd: %m");
1936 static int flush_ports(union in_addr_union
*exposed
) {
1938 int r
, af
= AF_INET
;
1942 if (!arg_expose_ports
)
1945 if (in_addr_is_null(af
, exposed
))
1948 log_debug("Lost IP address.");
1950 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
1951 r
= fw_add_local_dnat(false,
1962 log_warning_errno(r
, "Failed to modify firewall: %m");
1965 *exposed
= IN_ADDR_NULL
;
1969 static int expose_ports(sd_rtnl
*rtnl
, union in_addr_union
*exposed
) {
1970 _cleanup_free_
struct local_address
*addresses
= NULL
;
1971 _cleanup_free_
char *pretty
= NULL
;
1972 union in_addr_union new_exposed
;
1975 int af
= AF_INET
, r
;
1979 /* Invoked each time an address is added or removed inside the
1982 if (!arg_expose_ports
)
1985 r
= local_addresses(rtnl
, 0, af
, &addresses
);
1987 return log_error_errno(r
, "Failed to enumerate local addresses: %m");
1990 addresses
[0].family
== af
&&
1991 addresses
[0].scope
< RT_SCOPE_LINK
;
1994 return flush_ports(exposed
);
1996 new_exposed
= addresses
[0].address
;
1997 if (in_addr_equal(af
, exposed
, &new_exposed
))
2000 in_addr_to_string(af
, &new_exposed
, &pretty
);
2001 log_debug("New container IP is %s.", strna(pretty
));
2003 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
2005 r
= fw_add_local_dnat(true,
2014 in_addr_is_null(af
, exposed
) ? NULL
: exposed
);
2016 log_warning_errno(r
, "Failed to modify firewall: %m");
2019 *exposed
= new_exposed
;
2023 static int on_address_change(sd_rtnl
*rtnl
, sd_rtnl_message
*m
, void *userdata
) {
2024 union in_addr_union
*exposed
= userdata
;
2030 expose_ports(rtnl
, exposed
);
2034 static int watch_rtnl(sd_event
*event
, int recv_fd
, union in_addr_union
*exposed
, sd_rtnl
**ret
) {
2036 struct cmsghdr cmsghdr
;
2037 uint8_t buf
[CMSG_SPACE(sizeof(int))];
2039 struct msghdr mh
= {
2040 .msg_control
= &control
,
2041 .msg_controllen
= sizeof(control
),
2043 struct cmsghdr
*cmsg
;
2044 _cleanup_rtnl_unref_ sd_rtnl
*rtnl
= NULL
;
2049 assert(recv_fd
>= 0);
2052 if (!arg_expose_ports
)
2055 k
= recvmsg(recv_fd
, &mh
, MSG_NOSIGNAL
);
2057 return log_error_errno(errno
, "Failed to recv netlink fd: %m");
2059 cmsg
= CMSG_FIRSTHDR(&mh
);
2060 assert(cmsg
->cmsg_level
== SOL_SOCKET
);
2061 assert(cmsg
->cmsg_type
== SCM_RIGHTS
);
2062 assert(cmsg
->cmsg_len
== CMSG_LEN(sizeof(int)));
2063 memcpy(&fd
, CMSG_DATA(cmsg
), sizeof(int));
2065 r
= sd_rtnl_open_fd(&rtnl
, fd
, 1, RTNLGRP_IPV4_IFADDR
);
2068 return log_error_errno(r
, "Failed to create rtnl object: %m");
2071 r
= sd_rtnl_add_match(rtnl
, RTM_NEWADDR
, on_address_change
, exposed
);
2073 return log_error_errno(r
, "Failed to subscribe to RTM_NEWADDR messages: %m");
2075 r
= sd_rtnl_add_match(rtnl
, RTM_DELADDR
, on_address_change
, exposed
);
2077 return log_error_errno(r
, "Failed to subscribe to RTM_DELADDR messages: %m");
2079 r
= sd_rtnl_attach_event(rtnl
, event
, 0);
2081 return log_error_errno(r
, "Failed to add to even loop: %m");
2089 static int setup_hostname(void) {
2091 if (arg_share_system
)
2094 if (sethostname_idempotent(arg_machine
) < 0)
2100 static int setup_journal(const char *directory
) {
2101 sd_id128_t machine_id
, this_id
;
2102 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
2103 const char *etc_machine_id
, *p
, *q
;
2107 /* Don't link journals in ephemeral mode */
2111 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
2113 r
= read_one_line_file(etc_machine_id
, &b
);
2114 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
2117 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
2120 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
2123 /* Verify validity */
2124 r
= sd_id128_from_string(id
, &machine_id
);
2126 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
2128 r
= sd_id128_get_machine(&this_id
);
2130 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
2132 if (sd_id128_equal(machine_id
, this_id
)) {
2133 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
2134 "Host and machine ids are equal (%s): refusing to link journals", id
);
2135 if (arg_link_journal
== LINK_AUTO
)
2140 if (arg_link_journal
== LINK_NO
)
2143 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
2145 return log_error_errno(r
, "Failed to create /var: %m");
2147 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
2149 return log_error_errno(r
, "Failed to create /var/log: %m");
2151 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
2153 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
2155 p
= strjoina("/var/log/journal/", id
);
2156 q
= prefix_roota(directory
, p
);
2158 if (path_is_mount_point(p
, 0) > 0) {
2159 if (arg_link_journal
!= LINK_AUTO
) {
2160 log_error("%s: already a mount point, refusing to use for journal", p
);
2167 if (path_is_mount_point(q
, 0) > 0) {
2168 if (arg_link_journal
!= LINK_AUTO
) {
2169 log_error("%s: already a mount point, refusing to use for journal", q
);
2176 r
= readlink_and_make_absolute(p
, &d
);
2178 if ((arg_link_journal
== LINK_GUEST
||
2179 arg_link_journal
== LINK_AUTO
) &&
2182 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2184 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2189 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
2190 } else if (r
== -EINVAL
) {
2192 if (arg_link_journal
== LINK_GUEST
&&
2195 if (errno
== ENOTDIR
) {
2196 log_error("%s already exists and is neither a symlink nor a directory", p
);
2199 log_error_errno(errno
, "Failed to remove %s: %m", p
);
2203 } else if (r
!= -ENOENT
) {
2204 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
2208 if (arg_link_journal
== LINK_GUEST
) {
2210 if (symlink(q
, p
) < 0) {
2211 if (arg_link_journal_try
) {
2212 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
2215 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
2220 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2222 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2226 if (arg_link_journal
== LINK_HOST
) {
2227 /* don't create parents here -- if the host doesn't have
2228 * permanent journal set up, don't force it here */
2231 if (arg_link_journal_try
) {
2232 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
2235 log_error_errno(errno
, "Failed to create %s: %m", p
);
2240 } else if (access(p
, F_OK
) < 0)
2243 if (dir_is_empty(q
) == 0)
2244 log_warning("%s is not empty, proceeding anyway.", q
);
2246 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2248 log_error_errno(errno
, "Failed to create %s: %m", q
);
2252 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
2253 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
2258 static int drop_capabilities(void) {
2259 return capability_bounding_set_drop(~arg_retain
, false);
2262 static int register_machine(pid_t pid
, int local_ifindex
) {
2263 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2264 _cleanup_bus_close_unref_ sd_bus
*bus
= NULL
;
2270 r
= sd_bus_default_system(&bus
);
2272 return log_error_errno(r
, "Failed to open system bus: %m");
2274 if (arg_keep_unit
) {
2275 r
= sd_bus_call_method(
2277 "org.freedesktop.machine1",
2278 "/org/freedesktop/machine1",
2279 "org.freedesktop.machine1.Manager",
2280 "RegisterMachineWithNetwork",
2285 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2289 strempty(arg_directory
),
2290 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2292 _cleanup_bus_message_unref_ sd_bus_message
*m
= NULL
;
2296 r
= sd_bus_message_new_method_call(
2299 "org.freedesktop.machine1",
2300 "/org/freedesktop/machine1",
2301 "org.freedesktop.machine1.Manager",
2302 "CreateMachineWithNetwork");
2304 return bus_log_create_error(r
);
2306 r
= sd_bus_message_append(
2310 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2314 strempty(arg_directory
),
2315 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2317 return bus_log_create_error(r
);
2319 r
= sd_bus_message_open_container(m
, 'a', "(sv)");
2321 return bus_log_create_error(r
);
2323 if (!isempty(arg_slice
)) {
2324 r
= sd_bus_message_append(m
, "(sv)", "Slice", "s", arg_slice
);
2326 return bus_log_create_error(r
);
2329 r
= sd_bus_message_append(m
, "(sv)", "DevicePolicy", "s", "strict");
2331 return bus_log_create_error(r
);
2333 /* If you make changes here, also make sure to update
2334 * systemd-nspawn@.service, to keep the device
2335 * policies in sync regardless if we are run with or
2336 * without the --keep-unit switch. */
2337 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 9,
2338 /* Allow the container to
2339 * access and create the API
2340 * device nodes, so that
2341 * PrivateDevices= in the
2342 * container can work
2347 "/dev/random", "rwm",
2348 "/dev/urandom", "rwm",
2350 "/dev/net/tun", "rwm",
2351 /* Allow the container
2352 * access to ptys. However,
2354 * container to ever create
2355 * these device nodes. */
2356 "/dev/pts/ptmx", "rw",
2359 return bus_log_create_error(r
);
2361 for (j
= 0; j
< arg_n_custom_mounts
; j
++) {
2362 CustomMount
*cm
= &arg_custom_mounts
[j
];
2364 if (cm
->type
!= CUSTOM_MOUNT_BIND
)
2367 r
= is_device_node(cm
->source
);
2369 return log_error_errno(r
, "Failed to stat %s: %m", cm
->source
);
2372 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 1,
2373 cm
->source
, cm
->read_only
? "r" : "rw");
2375 return log_error_errno(r
, "Failed to append message arguments: %m");
2379 if (arg_kill_signal
!= 0) {
2380 r
= sd_bus_message_append(m
, "(sv)", "KillSignal", "i", arg_kill_signal
);
2382 return bus_log_create_error(r
);
2384 r
= sd_bus_message_append(m
, "(sv)", "KillMode", "s", "mixed");
2386 return bus_log_create_error(r
);
2389 STRV_FOREACH(i
, arg_property
) {
2390 r
= sd_bus_message_open_container(m
, 'r', "sv");
2392 return bus_log_create_error(r
);
2394 r
= bus_append_unit_property_assignment(m
, *i
);
2398 r
= sd_bus_message_close_container(m
);
2400 return bus_log_create_error(r
);
2403 r
= sd_bus_message_close_container(m
);
2405 return bus_log_create_error(r
);
2407 r
= sd_bus_call(bus
, m
, 0, &error
, NULL
);
2411 log_error("Failed to register machine: %s", bus_error_message(&error
, r
));
2418 static int terminate_machine(pid_t pid
) {
2419 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2420 _cleanup_bus_message_unref_ sd_bus_message
*reply
= NULL
;
2421 _cleanup_bus_close_unref_ sd_bus
*bus
= NULL
;
2428 /* If we are reusing the unit, then just exit, systemd will do
2429 * the right thing when we exit. */
2433 r
= sd_bus_default_system(&bus
);
2435 return log_error_errno(r
, "Failed to open system bus: %m");
2437 r
= sd_bus_call_method(
2439 "org.freedesktop.machine1",
2440 "/org/freedesktop/machine1",
2441 "org.freedesktop.machine1.Manager",
2448 /* Note that the machine might already have been
2449 * cleaned up automatically, hence don't consider it a
2450 * failure if we cannot get the machine object. */
2451 log_debug("Failed to get machine: %s", bus_error_message(&error
, r
));
2455 r
= sd_bus_message_read(reply
, "o", &path
);
2457 return bus_log_parse_error(r
);
2459 r
= sd_bus_call_method(
2461 "org.freedesktop.machine1",
2463 "org.freedesktop.machine1.Machine",
2469 log_debug("Failed to terminate machine: %s", bus_error_message(&error
, r
));
2476 static int reset_audit_loginuid(void) {
2477 _cleanup_free_
char *p
= NULL
;
2480 if (arg_share_system
)
2483 r
= read_one_line_file("/proc/self/loginuid", &p
);
2487 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
2489 /* Already reset? */
2490 if (streq(p
, "4294967295"))
2493 r
= write_string_file("/proc/self/loginuid", "4294967295");
2496 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2497 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2498 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2499 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2500 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Fixed hash keys handed to generate_mac(), one per kind of interface */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2512 static int generate_mac(struct ether_addr
*mac
, sd_id128_t hash_key
, uint64_t idx
) {
2518 l
= strlen(arg_machine
);
2519 sz
= sizeof(sd_id128_t
) + l
;
2525 /* fetch some persistent data unique to the host */
2526 r
= sd_id128_get_machine((sd_id128_t
*) v
);
2530 /* combine with some data unique (on this host) to this
2531 * container instance */
2532 i
= mempcpy(v
+ sizeof(sd_id128_t
), arg_machine
, l
);
2535 memcpy(i
, &idx
, sizeof(idx
));
2538 /* Let's hash the host machine ID plus the container name. We
2539 * use a fixed, but originally randomly created hash key here. */
2540 siphash24(result
, v
, sz
, hash_key
.bytes
);
2542 assert_cc(ETH_ALEN
<= sizeof(result
));
2543 memcpy(mac
->ether_addr_octet
, result
, ETH_ALEN
);
2545 /* see eth_random_addr in the kernel */
2546 mac
->ether_addr_octet
[0] &= 0xfe; /* clear multicast bit */
2547 mac
->ether_addr_octet
[0] |= 0x02; /* set local assignment bit (IEEE802) */
2552 static int setup_veth(pid_t pid
, char iface_name
[IFNAMSIZ
], int *ifi
) {
2553 _cleanup_rtnl_message_unref_ sd_rtnl_message
*m
= NULL
;
2554 _cleanup_rtnl_unref_ sd_rtnl
*rtnl
= NULL
;
2555 struct ether_addr mac_host
, mac_container
;
2558 if (!arg_private_network
)
2561 if (!arg_network_veth
)
2564 /* Use two different interface name prefixes depending whether
2565 * we are in bridge mode or not. */
2566 snprintf(iface_name
, IFNAMSIZ
- 1, "%s-%s",
2567 arg_network_bridge
? "vb" : "ve", arg_machine
);
2569 r
= generate_mac(&mac_container
, CONTAINER_HASH_KEY
, 0);
2571 return log_error_errno(r
, "Failed to generate predictable MAC address for container side: %m");
2573 r
= generate_mac(&mac_host
, HOST_HASH_KEY
, 0);
2575 return log_error_errno(r
, "Failed to generate predictable MAC address for host side: %m");
2577 r
= sd_rtnl_open(&rtnl
, 0);
2579 return log_error_errno(r
, "Failed to connect to netlink: %m");
2581 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2583 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2585 r
= sd_rtnl_message_append_string(m
, IFLA_IFNAME
, iface_name
);
2587 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2589 r
= sd_rtnl_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_host
);
2591 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2593 r
= sd_rtnl_message_open_container(m
, IFLA_LINKINFO
);
2595 return log_error_errno(r
, "Failed to open netlink container: %m");
2597 r
= sd_rtnl_message_open_container_union(m
, IFLA_INFO_DATA
, "veth");
2599 return log_error_errno(r
, "Failed to open netlink container: %m");
2601 r
= sd_rtnl_message_open_container(m
, VETH_INFO_PEER
);
2603 return log_error_errno(r
, "Failed to open netlink container: %m");
2605 r
= sd_rtnl_message_append_string(m
, IFLA_IFNAME
, "host0");
2607 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2609 r
= sd_rtnl_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_container
);
2611 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2613 r
= sd_rtnl_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2615 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2617 r
= sd_rtnl_message_close_container(m
);
2619 return log_error_errno(r
, "Failed to close netlink container: %m");
2621 r
= sd_rtnl_message_close_container(m
);
2623 return log_error_errno(r
, "Failed to close netlink container: %m");
2625 r
= sd_rtnl_message_close_container(m
);
2627 return log_error_errno(r
, "Failed to close netlink container: %m");
2629 r
= sd_rtnl_call(rtnl
, m
, 0, NULL
);
2631 return log_error_errno(r
, "Failed to add new veth interfaces (host0, %s): %m", iface_name
);
2633 i
= (int) if_nametoindex(iface_name
);
2635 return log_error_errno(errno
, "Failed to resolve interface %s: %m", iface_name
);
2642 static int setup_bridge(const char veth_name
[], int *ifi
) {
2643 _cleanup_rtnl_message_unref_ sd_rtnl_message
*m
= NULL
;
2644 _cleanup_rtnl_unref_ sd_rtnl
*rtnl
= NULL
;
2647 if (!arg_private_network
)
2650 if (!arg_network_veth
)
2653 if (!arg_network_bridge
)
2656 bridge
= (int) if_nametoindex(arg_network_bridge
);
2658 return log_error_errno(errno
, "Failed to resolve interface %s: %m", arg_network_bridge
);
2662 r
= sd_rtnl_open(&rtnl
, 0);
2664 return log_error_errno(r
, "Failed to connect to netlink: %m");
2666 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, 0);
2668 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2670 r
= sd_rtnl_message_link_set_flags(m
, IFF_UP
, IFF_UP
);
2672 return log_error_errno(r
, "Failed to set IFF_UP flag: %m");
2674 r
= sd_rtnl_message_append_string(m
, IFLA_IFNAME
, veth_name
);
2676 return log_error_errno(r
, "Failed to add netlink interface name field: %m");
2678 r
= sd_rtnl_message_append_u32(m
, IFLA_MASTER
, bridge
);
2680 return log_error_errno(r
, "Failed to add netlink master field: %m");
2682 r
= sd_rtnl_call(rtnl
, m
, 0, NULL
);
2684 return log_error_errno(r
, "Failed to add veth interface to bridge: %m");
2689 static int parse_interface(struct udev
*udev
, const char *name
) {
2690 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
2691 char ifi_str
[2 + DECIMAL_STR_MAX(int)];
2694 ifi
= (int) if_nametoindex(name
);
2696 return log_error_errno(errno
, "Failed to resolve interface %s: %m", name
);
2698 sprintf(ifi_str
, "n%i", ifi
);
2699 d
= udev_device_new_from_device_id(udev
, ifi_str
);
2701 return log_error_errno(errno
, "Failed to get udev device for interface %s: %m", name
);
2703 if (udev_device_get_is_initialized(d
) <= 0) {
2704 log_error("Network interface %s is not initialized yet.", name
);
2711 static int move_network_interfaces(pid_t pid
) {
2712 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2713 _cleanup_rtnl_unref_ sd_rtnl
*rtnl
= NULL
;
2717 if (!arg_private_network
)
2720 if (strv_isempty(arg_network_interfaces
))
2723 r
= sd_rtnl_open(&rtnl
, 0);
2725 return log_error_errno(r
, "Failed to connect to netlink: %m");
2729 log_error("Failed to connect to udev.");
2733 STRV_FOREACH(i
, arg_network_interfaces
) {
2734 _cleanup_rtnl_message_unref_ sd_rtnl_message
*m
= NULL
;
2737 ifi
= parse_interface(udev
, *i
);
2741 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, ifi
);
2743 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2745 r
= sd_rtnl_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2747 return log_error_errno(r
, "Failed to append namespace PID to netlink message: %m");
2749 r
= sd_rtnl_call(rtnl
, m
, 0, NULL
);
2751 return log_error_errno(r
, "Failed to move interface %s to namespace: %m", *i
);
2757 static int setup_macvlan(pid_t pid
) {
2758 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2759 _cleanup_rtnl_unref_ sd_rtnl
*rtnl
= NULL
;
2764 if (!arg_private_network
)
2767 if (strv_isempty(arg_network_macvlan
))
2770 r
= sd_rtnl_open(&rtnl
, 0);
2772 return log_error_errno(r
, "Failed to connect to netlink: %m");
2776 log_error("Failed to connect to udev.");
2780 STRV_FOREACH(i
, arg_network_macvlan
) {
2781 _cleanup_rtnl_message_unref_ sd_rtnl_message
*m
= NULL
;
2782 _cleanup_free_
char *n
= NULL
;
2783 struct ether_addr mac
;
2786 ifi
= parse_interface(udev
, *i
);
2790 r
= generate_mac(&mac
, MACVLAN_HASH_KEY
, idx
++);
2792 return log_error_errno(r
, "Failed to create MACVLAN MAC address: %m");
2794 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2796 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2798 r
= sd_rtnl_message_append_u32(m
, IFLA_LINK
, ifi
);
2800 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2802 n
= strappend("mv-", *i
);
2806 strshorten(n
, IFNAMSIZ
-1);
2808 r
= sd_rtnl_message_append_string(m
, IFLA_IFNAME
, n
);
2810 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2812 r
= sd_rtnl_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac
);
2814 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2816 r
= sd_rtnl_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2818 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2820 r
= sd_rtnl_message_open_container(m
, IFLA_LINKINFO
);
2822 return log_error_errno(r
, "Failed to open netlink container: %m");
2824 r
= sd_rtnl_message_open_container_union(m
, IFLA_INFO_DATA
, "macvlan");
2826 return log_error_errno(r
, "Failed to open netlink container: %m");
2828 r
= sd_rtnl_message_append_u32(m
, IFLA_MACVLAN_MODE
, MACVLAN_MODE_BRIDGE
);
2830 return log_error_errno(r
, "Failed to append macvlan mode: %m");
2832 r
= sd_rtnl_message_close_container(m
);
2834 return log_error_errno(r
, "Failed to close netlink container: %m");
2836 r
= sd_rtnl_message_close_container(m
);
2838 return log_error_errno(r
, "Failed to close netlink container: %m");
2840 r
= sd_rtnl_call(rtnl
, m
, 0, NULL
);
2842 return log_error_errno(r
, "Failed to add new macvlan interfaces: %m");
2848 static int setup_ipvlan(pid_t pid
) {
2849 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2850 _cleanup_rtnl_unref_ sd_rtnl
*rtnl
= NULL
;
2854 if (!arg_private_network
)
2857 if (strv_isempty(arg_network_ipvlan
))
2860 r
= sd_rtnl_open(&rtnl
, 0);
2862 return log_error_errno(r
, "Failed to connect to netlink: %m");
2866 log_error("Failed to connect to udev.");
2870 STRV_FOREACH(i
, arg_network_ipvlan
) {
2871 _cleanup_rtnl_message_unref_ sd_rtnl_message
*m
= NULL
;
2872 _cleanup_free_
char *n
= NULL
;
2875 ifi
= parse_interface(udev
, *i
);
2879 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2881 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2883 r
= sd_rtnl_message_append_u32(m
, IFLA_LINK
, ifi
);
2885 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2887 n
= strappend("iv-", *i
);
2891 strshorten(n
, IFNAMSIZ
-1);
2893 r
= sd_rtnl_message_append_string(m
, IFLA_IFNAME
, n
);
2895 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2897 r
= sd_rtnl_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2899 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2901 r
= sd_rtnl_message_open_container(m
, IFLA_LINKINFO
);
2903 return log_error_errno(r
, "Failed to open netlink container: %m");
2905 r
= sd_rtnl_message_open_container_union(m
, IFLA_INFO_DATA
, "ipvlan");
2907 return log_error_errno(r
, "Failed to open netlink container: %m");
2909 r
= sd_rtnl_message_append_u16(m
, IFLA_IPVLAN_MODE
, IPVLAN_MODE_L2
);
2911 return log_error_errno(r
, "Failed to add ipvlan mode: %m");
2913 r
= sd_rtnl_message_close_container(m
);
2915 return log_error_errno(r
, "Failed to close netlink container: %m");
2917 r
= sd_rtnl_message_close_container(m
);
2919 return log_error_errno(r
, "Failed to close netlink container: %m");
2921 r
= sd_rtnl_call(rtnl
, m
, 0, NULL
);
2923 return log_error_errno(r
, "Failed to add new ipvlan interfaces: %m");
/* Installs the container's seccomp filter. The filter allows everything by
 * default and then:
 *
 *  - makes each syscall of the blacklist fail with EPERM, unless the
 *    capability associated with it is part of arg_retain;
 *  - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * NO_NEW_PRIVS is explicitly switched off on the filter before it is loaded.
 *
 * Returns 0 on success, a negative errno-style value on failure (logged
 * here). Without HAVE_SECCOMP this is a no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx seccomp;
        unsigned i;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (i = 0; i < ELEMENTSOF(blacklist); i++) {
                /* The capability is retained, hence leave the syscall usable */
                if (arg_retain & (1ULL << blacklist[i].capability))
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path, success or failure */
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
3017 static int setup_propagate(const char *root
) {
3020 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3021 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3022 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3023 (void) mkdir_p(p
, 0600);
3025 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
3026 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
3028 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3029 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
3031 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3032 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
3034 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
3035 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
3036 return log_error_errno(errno
, "Failed to install propagation bind mount.");
3038 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
3039 return log_error_errno(errno
, "Failed to make propagation mount read-only");
3044 static int setup_image(char **device_path
, int *loop_nr
) {
3045 struct loop_info64 info
= {
3046 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
3048 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
3049 _cleanup_free_
char* loopdev
= NULL
;
3053 assert(device_path
);
3057 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3059 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
3061 if (fstat(fd
, &st
) < 0)
3062 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
3064 if (S_ISBLK(st
.st_mode
)) {
3067 p
= strdup(arg_image
);
3081 if (!S_ISREG(st
.st_mode
)) {
3082 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
3086 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3088 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
3090 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
3092 return log_error_errno(errno
, "Failed to allocate loop device: %m");
3094 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
3097 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3099 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
3101 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
3102 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
3105 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
3107 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
3108 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
3110 *device_path
= loopdev
;
3121 #define PARTITION_TABLE_BLURB \
3122 "Note that the disk image needs to either contain only a single MBR partition of\n" \
3123 "type 0x83 that is marked bootable, or a single GPT partition of type " \
3124 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
3125 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
3126 "to be bootable with systemd-nspawn."
3128 static int dissect_image(
3130 char **root_device
, bool *root_device_rw
,
3131 char **home_device
, bool *home_device_rw
,
3132 char **srv_device
, bool *srv_device_rw
,
3136 int home_nr
= -1, srv_nr
= -1;
3137 #ifdef GPT_ROOT_NATIVE
3140 #ifdef GPT_ROOT_SECONDARY
3141 int secondary_root_nr
= -1;
3143 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
3144 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
3145 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
3146 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
3147 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
3148 struct udev_list_entry
*first
, *item
;
3149 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
3150 bool is_gpt
, is_mbr
, multiple_generic
= false;
3151 const char *pttype
= NULL
;
3158 assert(root_device
);
3159 assert(home_device
);
3164 b
= blkid_new_probe();
3169 r
= blkid_probe_set_device(b
, fd
, 0, 0);
3174 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
3178 blkid_probe_enable_partitions(b
, 1);
3179 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
3182 r
= blkid_do_safeprobe(b
);
3183 if (r
== -2 || r
== 1) {
3184 log_error("Failed to identify any partition table on\n"
3186 PARTITION_TABLE_BLURB
, arg_image
);
3188 } else if (r
!= 0) {
3191 log_error_errno(errno
, "Failed to probe: %m");
3195 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
3197 is_gpt
= streq_ptr(pttype
, "gpt");
3198 is_mbr
= streq_ptr(pttype
, "dos");
3200 if (!is_gpt
&& !is_mbr
) {
3201 log_error("No GPT or MBR partition table discovered on\n"
3203 PARTITION_TABLE_BLURB
, arg_image
);
3208 pl
= blkid_probe_get_partitions(b
);
3213 log_error("Failed to list partitions of %s", arg_image
);
3221 if (fstat(fd
, &st
) < 0)
3222 return log_error_errno(errno
, "Failed to stat block device: %m");
3224 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
3232 log_error("Kernel partitions never appeared.");
3236 e
= udev_enumerate_new(udev
);
3240 r
= udev_enumerate_add_match_parent(e
, d
);
3244 r
= udev_enumerate_scan_devices(e
);
3246 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
3248 /* Count the partitions enumerated by the kernel */
3250 first
= udev_enumerate_get_list_entry(e
);
3251 udev_list_entry_foreach(item
, first
)
3254 /* Count the partitions enumerated by blkid */
3255 m
= blkid_partlist_numof_partitions(pl
);
3259 log_error("blkid and kernel partition list do not match.");
3265 /* The kernel has probed fewer partitions than
3266 * blkid? Maybe the kernel prober is still
3267 * running or it got EBUSY because udev
3268 * already opened the device. Let's reprobe
3269 * the device, which is a synchronous call
3270 * that waits until probing is complete. */
3272 for (j
= 0; j
< 20; j
++) {
3274 r
= ioctl(fd
, BLKRRPART
, 0);
3277 if (r
>= 0 || r
!= -EBUSY
)
3280 /* If something else has the device
3281 * open, such as an udev rule, the
3282 * ioctl will return EBUSY. Since
3283 * there's no way to wait until it
3284 * isn't busy anymore, let's just wait
3285 * a bit, and try again.
3287 * This is really something they
3288 * should fix in the kernel! */
3290 usleep(50 * USEC_PER_MSEC
);
3294 return log_error_errno(r
, "Failed to reread partition table: %m");
3297 e
= udev_enumerate_unref(e
);
3300 first
= udev_enumerate_get_list_entry(e
);
3301 udev_list_entry_foreach(item
, first
) {
3302 _cleanup_udev_device_unref_
struct udev_device
*q
;
3304 unsigned long long flags
;
3310 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
3315 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
3319 qn
= udev_device_get_devnum(q
);
3323 if (st
.st_rdev
== qn
)
3326 node
= udev_device_get_devnode(q
);
3330 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
3334 flags
= blkid_partition_get_flags(pp
);
3336 nr
= blkid_partition_get_partno(pp
);
3344 if (flags
& GPT_FLAG_NO_AUTO
)
3347 stype
= blkid_partition_get_type_string(pp
);
3351 if (sd_id128_from_string(stype
, &type_id
) < 0)
3354 if (sd_id128_equal(type_id
, GPT_HOME
)) {
3356 if (home
&& nr
>= home_nr
)
3360 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3362 r
= free_and_strdup(&home
, node
);
3366 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
3368 if (srv
&& nr
>= srv_nr
)
3372 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3374 r
= free_and_strdup(&srv
, node
);
3378 #ifdef GPT_ROOT_NATIVE
3379 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
3381 if (root
&& nr
>= root_nr
)
3385 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3387 r
= free_and_strdup(&root
, node
);
3392 #ifdef GPT_ROOT_SECONDARY
3393 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
3395 if (secondary_root
&& nr
>= secondary_root_nr
)
3398 secondary_root_nr
= nr
;
3399 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3401 r
= free_and_strdup(&secondary_root
, node
);
3406 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
3409 multiple_generic
= true;
3411 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3413 r
= free_and_strdup(&generic
, node
);
3419 } else if (is_mbr
) {
3422 if (flags
!= 0x80) /* Bootable flag */
3425 type
= blkid_partition_get_type(pp
);
3426 if (type
!= 0x83) /* Linux partition */
3430 multiple_generic
= true;
3434 r
= free_and_strdup(&root
, node
);
3442 *root_device
= root
;
3445 *root_device_rw
= root_rw
;
3447 } else if (secondary_root
) {
3448 *root_device
= secondary_root
;
3449 secondary_root
= NULL
;
3451 *root_device_rw
= secondary_root_rw
;
3453 } else if (generic
) {
3455 /* There were no partitions with precise meanings
3456 * around, but we found generic partitions. In this
3457 * case, if there's only one, we can go ahead and boot
3458 * it, otherwise we bail out, because we really cannot
3459 * make any sense of it. */
3461 if (multiple_generic
) {
3462 log_error("Identified multiple bootable Linux partitions on\n"
3464 PARTITION_TABLE_BLURB
, arg_image
);
3468 *root_device
= generic
;
3471 *root_device_rw
= generic_rw
;
3474 log_error("Failed to identify root partition in disk image\n"
3476 PARTITION_TABLE_BLURB
, arg_image
);
3481 *home_device
= home
;
3484 *home_device_rw
= home_rw
;
3491 *srv_device_rw
= srv_rw
;
3496 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device "what" on "where" + "directory" (or on "where"
 * itself if "directory" is NULL). The file system type is probed with
 * libblkid; LUKS containers are refused. The mount uses MS_NODEV and is
 * read-only unless "rw" is true and arg_read_only is unset.
 *
 * Returns 0 on success, a negative errno-style value on failure (logged
 * here). Without HAVE_BLKID this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* blkid does not reliably set errno, hence reset it before each call
         * and substitute a sensible default if it is still 0 afterwards. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* 1: nothing detected, -2: ambiguous result. Both mean that the file
         * system type cannot be determined. -1 is a genuine probing error and
         * is reported through the errno path below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3565 static int mount_devices(
3567 const char *root_device
, bool root_device_rw
,
3568 const char *home_device
, bool home_device_rw
,
3569 const char *srv_device
, bool srv_device_rw
) {
3575 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
3577 return log_error_errno(r
, "Failed to mount root directory: %m");
3581 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
3583 return log_error_errno(r
, "Failed to mount home directory: %m");
3587 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
3589 return log_error_errno(r
, "Failed to mount server data directory: %m");
3595 static void loop_remove(int nr
, int *image_fd
) {
3596 _cleanup_close_
int control
= -1;
3602 if (image_fd
&& *image_fd
>= 0) {
3603 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
3605 log_debug_errno(errno
, "Failed to close loop image: %m");
3606 *image_fd
= safe_close(*image_fd
);
3609 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3611 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
3615 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
3617 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
/* Forks off "getent <database> <key>" with an empty environment, stdin and
 * stderr connected to /dev/null and stdout connected to a pipe.
 *
 * Returns the read end of the pipe (>= 0, owned by the caller) and stores the
 * child's PID in *rpid; returns a negative errno-style value if the pipe or
 * the child could not be created (logged here). */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;
        int r;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0)
                return log_error_errno(errno, "Failed to allocate pipe: %m");

        pid = fork();
        if (pid < 0) {
                r = log_error_errno(errno, "Failed to fork getent child: %m");

                /* Nobody else will ever see the pipe, so don't leak it */
                safe_close(pipe_fds[0]);
                safe_close(pipe_fds[1]);
                return r;
        }

        if (pid == 0) {
                int nullfd;
                char *empty_env = NULL;

                /* Child: logging is not usable here, just exit on any failure */

                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (pipe_fds[0] > 2)
                        safe_close(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        safe_close(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        safe_close(nullfd);

                (void) reset_all_signal_handlers();
                (void) reset_signal_mask();
                close_all_fds(NULL, 0);

                /* Try both common locations of the binary */
                execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* Parent: keep only the read end */
        pipe_fds[1] = safe_close(pipe_fds[1]);

        *rpid = pid;

        return pipe_fds[0];
}
3675 static int change_uid_gid(char **_home
) {
3676 char line
[LINE_MAX
], *x
, *u
, *g
, *h
;
3677 const char *word
, *state
;
3678 _cleanup_free_ uid_t
*uids
= NULL
;
3679 _cleanup_free_
char *home
= NULL
;
3680 _cleanup_fclose_
FILE *f
= NULL
;
3681 _cleanup_close_
int fd
= -1;
3682 unsigned n_uids
= 0;
3691 if (!arg_user
|| streq(arg_user
, "root") || streq(arg_user
, "0")) {
3692 /* Reset everything fully to 0, just in case */
3694 r
= reset_uid_gid();
3696 return log_error_errno(r
, "Failed to become root: %m");
3702 /* First, get user credentials */
3703 fd
= spawn_getent("passwd", arg_user
, &pid
);
3707 f
= fdopen(fd
, "r");
3712 if (!fgets(line
, sizeof(line
), f
)) {
3715 log_error("Failed to resolve user %s.", arg_user
);
3719 log_error_errno(errno
, "Failed to read from getent: %m");
3725 wait_for_terminate_and_warn("getent passwd", pid
, true);
3727 x
= strchr(line
, ':');
3729 log_error("/etc/passwd entry has invalid user field.");
3733 u
= strchr(x
+1, ':');
3735 log_error("/etc/passwd entry has invalid password field.");
3742 log_error("/etc/passwd entry has invalid UID field.");
3750 log_error("/etc/passwd entry has invalid GID field.");
3755 h
= strchr(x
+1, ':');
3757 log_error("/etc/passwd entry has invalid GECOS field.");
3764 log_error("/etc/passwd entry has invalid home directory field.");
3770 r
= parse_uid(u
, &uid
);
3772 log_error("Failed to parse UID of user.");
3776 r
= parse_gid(g
, &gid
);
3778 log_error("Failed to parse GID of user.");
3786 /* Second, get group memberships */
3787 fd
= spawn_getent("initgroups", arg_user
, &pid
);
3792 f
= fdopen(fd
, "r");
3797 if (!fgets(line
, sizeof(line
), f
)) {
3799 log_error("Failed to resolve user %s.", arg_user
);
3803 log_error_errno(errno
, "Failed to read from getent: %m");
3809 wait_for_terminate_and_warn("getent initgroups", pid
, true);
3811 /* Skip over the username and subsequent separator whitespace */
3813 x
+= strcspn(x
, WHITESPACE
);
3814 x
+= strspn(x
, WHITESPACE
);
3816 FOREACH_WORD(word
, l
, x
, state
) {
3822 if (!GREEDY_REALLOC(uids
, sz
, n_uids
+1))
3825 r
= parse_uid(c
, &uids
[n_uids
++]);
3827 log_error("Failed to parse group data from getent.");
3832 r
= mkdir_parents(home
, 0775);
3834 return log_error_errno(r
, "Failed to make home root directory: %m");
3836 r
= mkdir_safe(home
, 0755, uid
, gid
);
3837 if (r
< 0 && r
!= -EEXIST
)
3838 return log_error_errno(r
, "Failed to make home directory: %m");
3840 (void) fchown(STDIN_FILENO
, uid
, gid
);
3841 (void) fchown(STDOUT_FILENO
, uid
, gid
);
3842 (void) fchown(STDERR_FILENO
, uid
, gid
);
3844 if (setgroups(n_uids
, uids
) < 0)
3845 return log_error_errno(errno
, "Failed to set auxiliary groups: %m");
3847 if (setresgid(gid
, gid
, gid
) < 0)
3848 return log_error_errno(errno
, "setregid() failed: %m");
3850 if (setresuid(uid
, uid
, uid
) < 0)
3851 return log_error_errno(errno
, "setreuid() failed: %m");
3863 * < 0 : wait_for_terminate() failed to get the state of the
3864 * container, the container was terminated by a signal, or
3865 * failed for an unknown reason. No change is made to the
3866 * container argument.
3867 * > 0 : The program executed in the container terminated with an
3868 * error. The exit code of the program executed in the
3869 * container is returned. The container argument has been set
3870 * to CONTAINER_TERMINATED.
3871 * 0 : The container is being rebooted, has been shut down or exited
3872 * successfully. The container argument has been set to either
3873 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3875 * That is, success is indicated by a return value of zero, and an
3876 * error is indicated by a non-zero value.
3878 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
3882 r
= wait_for_terminate(pid
, &status
);
3884 return log_warning_errno(r
, "Failed to wait for container: %m");
3886 switch (status
.si_code
) {
3889 if (status
.si_status
== 0) {
3890 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
3893 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
3895 *container
= CONTAINER_TERMINATED
;
3896 return status
.si_status
;
3899 if (status
.si_status
== SIGINT
) {
3901 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
3902 *container
= CONTAINER_TERMINATED
;
3905 } else if (status
.si_status
== SIGHUP
) {
3907 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
3908 *container
= CONTAINER_REBOOTED
;
3912 /* CLD_KILLED fallthrough */
3915 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
3919 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal handler that intentionally does nothing; "sig" is ignored. */
static void nop_handler(int sig) {
}
3928 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
3931 pid
= PTR_TO_UINT32(userdata
);
3933 if (kill(pid
, arg_kill_signal
) >= 0) {
3934 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3935 sd_event_source_set_userdata(s
, NULL
);
3940 sd_event_exit(sd_event_source_get_event(s
), 0);
3944 static int determine_names(void) {
3947 if (!arg_image
&& !arg_directory
) {
3949 _cleanup_(image_unrefp
) Image
*i
= NULL
;
3951 r
= image_find(arg_machine
, &i
);
3953 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
3955 log_error("No image for machine '%s': %m", arg_machine
);
3959 if (i
->type
== IMAGE_RAW
)
3960 r
= set_sanitized_path(&arg_image
, i
->path
);
3962 r
= set_sanitized_path(&arg_directory
, i
->path
);
3964 return log_error_errno(r
, "Invalid image directory: %m");
3967 arg_read_only
= arg_read_only
|| i
->read_only
;
3969 arg_directory
= get_current_dir_name();
3971 if (!arg_directory
&& !arg_machine
) {
3972 log_error("Failed to determine path, please use -D or -i.");
3978 if (arg_directory
&& path_equal(arg_directory
, "/"))
3979 arg_machine
= gethostname_malloc();
3981 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
3986 hostname_cleanup(arg_machine
, false);
3987 if (!machine_name_is_valid(arg_machine
)) {
3988 log_error("Failed to determine machine name automatically, please use -M.");
3992 if (arg_ephemeral
) {
3995 /* Add a random suffix when this is an
3996 * ephemeral machine, so that we can run many
3997 * instances at once without manually having
3998 * to specify -M each time. */
4000 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
4011 static int determine_uid_shift(const char *directory
) {
4019 if (arg_uid_shift
== UID_INVALID
) {
4022 r
= stat(directory
, &st
);
4024 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
4026 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
4028 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
4029 log_error("UID and GID base of %s don't match.", directory
);
4033 arg_uid_range
= UINT32_C(0x10000);
4036 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
4037 log_error("UID base too high for UID range.");
4041 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
4045 static int inner_child(
4047 const char *directory
,
4055 _cleanup_free_
char *home
= NULL
;
4057 const char *envp
[] = {
4058 "PATH=" DEFAULT_PATH_SPLIT_USR
,
4059 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4064 NULL
, /* container_uuid */
4065 NULL
, /* LISTEN_FDS */
4066 NULL
, /* LISTEN_PID */
4070 _cleanup_strv_free_
char **env_use
= NULL
;
4075 assert(kmsg_socket
>= 0);
4078 /* Tell the parent, that it now can write the UID map. */
4079 (void) barrier_place(barrier
); /* #1 */
4081 /* Wait until the parent wrote the UID map */
4082 if (!barrier_place_and_sync(barrier
)) { /* #2 */
4083 log_error("Parent died too early");
4088 r
= mount_all(NULL
, true);
4092 /* Wait until we are cgroup-ified, so that we
4093 * can mount the right cgroup path writable */
4094 if (!barrier_place_and_sync(barrier
)) { /* #3 */
4095 log_error("Parent died too early");
4099 r
= mount_systemd_cgroup_writable("");
4103 r
= reset_uid_gid();
4105 return log_error_errno(r
, "Couldn't become new root: %m");
4107 r
= setup_boot_id(NULL
);
4111 r
= setup_kmsg(NULL
, kmsg_socket
);
4114 kmsg_socket
= safe_close(kmsg_socket
);
4119 return log_error_errno(errno
, "setsid() failed: %m");
4121 if (arg_private_network
)
4124 r
= send_rtnl(rtnl_socket
);
4127 rtnl_socket
= safe_close(rtnl_socket
);
4129 if (drop_capabilities() < 0)
4130 return log_error_errno(errno
, "drop_capabilities() failed: %m");
4134 if (arg_personality
!= PERSONALITY_INVALID
) {
4135 if (personality(arg_personality
) < 0)
4136 return log_error_errno(errno
, "personality() failed: %m");
4137 } else if (secondary
) {
4138 if (personality(PER_LINUX32
) < 0)
4139 return log_error_errno(errno
, "personality() failed: %m");
4143 if (arg_selinux_context
)
4144 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
4145 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
4148 r
= change_uid_gid(&home
);
4152 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
4156 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
4157 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
4158 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
4161 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
4164 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
4168 if (fdset_size(fds
) > 0) {
4169 r
= fdset_cloexec(fds
, false);
4171 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
4173 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
4174 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
4178 env_use
= strv_env_merge(2, envp
, arg_setenv
);
4182 /* Let the parent know that we are ready and
4183 * wait until the parent is ready with the
4185 if (!barrier_place_and_sync(barrier
)) { /* #4 */
4186 log_error("Parent died too early");
4190 /* Now, explicitly close the log, so that we
4191 * then can close all remaining fds. Closing
4192 * the log explicitly first has the benefit
4193 * that the logging subsystem knows about it,
4194 * and is thus ready to be reopened should we
4195 * need it again. Note that the other fds
4196 * closed here are at least the locking and
4199 (void) fdset_close_others(fds
);
4205 /* Automatically search for the init system */
4207 m
= 1 + argc
- optind
;
4208 a
= newa(char*, m
+ 1);
4209 memcpy(a
+ 1, argv
+ optind
, m
* sizeof(char*));
4211 a
[0] = (char*) "/usr/lib/systemd/systemd";
4212 execve(a
[0], a
, env_use
);
4214 a
[0] = (char*) "/lib/systemd/systemd";
4215 execve(a
[0], a
, env_use
);
4217 a
[0] = (char*) "/sbin/init";
4218 execve(a
[0], a
, env_use
);
4219 } else if (argc
> optind
)
4220 execvpe(argv
[optind
], argv
+ optind
, env_use
);
4222 chdir(home
? home
: "/root");
4223 execle("/bin/bash", "-bash", NULL
, env_use
);
4224 execle("/bin/sh", "-sh", NULL
, env_use
);
4228 return log_error_errno(errno
, "execv() failed: %m");
4231 static int outer_child(
4233 const char *directory
,
4234 const char *console
,
4235 const char *root_device
, bool root_device_rw
,
4236 const char *home_device
, bool home_device_rw
,
4237 const char *srv_device
, bool srv_device_rw
,
4254 assert(pid_socket
>= 0);
4255 assert(kmsg_socket
>= 0);
4257 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
4258 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
4261 close_nointr(STDIN_FILENO
);
4262 close_nointr(STDOUT_FILENO
);
4263 close_nointr(STDERR_FILENO
);
4265 r
= open_terminal(console
, O_RDWR
);
4266 if (r
!= STDIN_FILENO
) {
4272 return log_error_errno(r
, "Failed to open console: %m");
4275 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
4276 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
4277 return log_error_errno(errno
, "Failed to duplicate console: %m");
4280 r
= reset_audit_loginuid();
4284 /* Mark everything as slave, so that we still
4285 * receive mounts from the real root, but don't
4286 * propagate mounts to the real root. */
4287 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
4288 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
4290 r
= mount_devices(directory
,
4291 root_device
, root_device_rw
,
4292 home_device
, home_device_rw
,
4293 srv_device
, srv_device_rw
);
4297 r
= determine_uid_shift(directory
);
4301 /* Turn directory into bind mount */
4302 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
4303 return log_error_errno(errno
, "Failed to make bind mount: %m");
4305 r
= setup_volatile(directory
);
4309 r
= setup_volatile_state(directory
);
4313 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
4317 if (arg_read_only
) {
4318 r
= bind_remount_recursive(directory
, true);
4320 return log_error_errno(r
, "Failed to make tree read-only: %m");
4323 r
= mount_all(directory
, false);
4327 if (copy_devnodes(directory
) < 0)
4330 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
4332 if (setup_pts(directory
) < 0)
4335 r
= setup_propagate(directory
);
4339 r
= setup_dev_console(directory
, console
);
4343 r
= setup_seccomp();
4347 r
= setup_timezone(directory
);
4351 r
= setup_resolv_conf(directory
);
4355 r
= setup_journal(directory
);
4359 r
= mount_custom(directory
);
4363 r
= mount_cgroup(directory
);
4367 r
= mount_move_root(directory
);
4369 return log_error_errno(r
, "Failed to move root directory: %m");
4371 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
4372 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
4373 (arg_private_network
? CLONE_NEWNET
: 0) |
4374 (arg_userns
? CLONE_NEWUSER
: 0),
4377 return log_error_errno(errno
, "Failed to fork inner child: %m");
4380 pid_socket
= safe_close(pid_socket
);
4382 /* The inner child has all namespaces that are
4383 * requested, so that we all are owned by the user if
4384 * user namespaces are turned on. */
4386 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
, argc
, argv
);
4388 _exit(EXIT_FAILURE
);
4390 _exit(EXIT_SUCCESS
);
4393 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
4395 return log_error_errno(errno
, "Failed to send PID: %m");
4396 if (l
!= sizeof(pid
)) {
4397 log_error("Short write while sending PID.");
4401 pid_socket
= safe_close(pid_socket
);
4406 static int setup_uid_map(pid_t pid
) {
4407 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
4412 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
4413 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
4414 r
= write_string_file(uid_map
, line
);
4416 return log_error_errno(r
, "Failed to write UID map: %m");
4418 /* We always assign the same UID and GID ranges */
4419 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
4420 r
= write_string_file(uid_map
, line
);
4422 return log_error_errno(r
, "Failed to write GID map: %m");
/* chown_cgroup(): changes ownership of the cgroup directory of `pid`.
 *
 * Steps visible here:
 *   1. cg_pid_get_path() looks up the cgroup path of pid.
 *   2. cg_get_path() turns that path into a file system path for
 *      SYSTEMD_CGROUP_CONTROLLER.
 *   3. The resulting directory is opened (read-only, close-on-exec,
 *      O_DIRECTORY).
 *   4. fchownat() is applied, relative to that directory, to "." and to a
 *      fixed list of control files, using arg_uid_shift as both the owner
 *      and the group.
 *
 * Failures of steps 1-3 are logged and the log_error_errno() value is
 * returned. A failing fchownat() is only logged as a warning ("ignoring")
 * and the loop moves on to the next name.
 *
 * NOTE(review): the declarations of r and fn, the conditions guarding the
 * error returns and the final return are not part of this excerpt. */
4427 static int chown_cgroup(pid_t pid
) {
/* path and fs are freed, and fd is closed, automatically via the
 * _cleanup_free_ / _cleanup_close_ attributes when the function returns. */
4428 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
4429 _cleanup_close_
int fd
= -1;
4433 r
= cg_pid_get_path(NULL
, pid
, &path
);
4435 return log_error_errno(r
, "Failed to get container cgroup path: %m");
4437 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
4439 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
4441 fd
= open(fs
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
4443 return log_error_errno(errno
, "Failed to open %s: %m", fs
);
/* Best effort: each name is chowned on its own and a failure for one name
 * does not stop the others from being tried. */
4445 FOREACH_STRING(fn
, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4446 if (fchownat(fd
, fn
, arg_uid_shift
, arg_uid_shift
, 0) < 0)
4447 log_warning_errno(errno
, "Failed to chown() cgroup file %s, ignoring: %m", fn
);
/* main(): program entry point.
 *
 * Statements visible in this excerpt, in order:
 *   - parse the environment for logging, parse the command line and
 *     determine names;
 *   - log an error if the effective UID is not 0;
 *   - collect passed-in file descriptors into fds;
 *   - prepare the root: either a directory (optionally an ephemeral
 *     snapshot, or populated from arg_template), or a disk image for which
 *     a temporary directory is created and setup_image()/dissect_image()
 *     are called;
 *   - prepare custom mounts, open a pseudo tty, block signals and become a
 *     child subreaper;
 *   - create a barrier and three socket pairs, fork the outer child with
 *     raw_clone(), then do the parent-side setup (UID/GID map, network
 *     interfaces, machine registration, cgroup ownership), synchronized
 *     with the child through the barrier;
 *   - run an sd_event loop with a PTY forwarder, then terminate and wait
 *     for the container;
 *   - clean up and return EXIT_FAILURE when r < 0, otherwise ret.
 *
 * NOTE(review): many original lines (declarations, conditionals, gotos and
 * labels, braces) are absent from this excerpt. The comments below describe
 * only statements that are visible; the control flow between them must be
 * confirmed against the complete file. */
4452 int main(int argc
, char *argv
[]) {
4454 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
4455 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
4456 _cleanup_close_
int master
= -1, image_fd
= -1;
4457 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
4458 int r
, n_fd_passed
, loop_nr
= -1;
4459 char veth_name
[IFNAMSIZ
];
4460 bool secondary
= false, remove_subvol
= false;
4461 sigset_t mask
, mask_chld
;
4463 int ret
= EXIT_SUCCESS
;
4464 union in_addr_union exposed
= {};
4465 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
4468 log_parse_environment();
/* Command line parsing and name determination; each result lands in r. */
4471 r
= parse_argv(argc
, argv
);
4475 r
= determine_names();
/* A non-zero effective UID is reported as an error. */
4479 if (geteuid() != 0) {
4480 log_error("Need to be root.");
/* If any file descriptors were passed in, gather them into fds. */
4485 n_fd_passed
= sd_listen_fds(false);
4486 if (n_fd_passed
> 0) {
4487 r
= fdset_new_listen_fds(&fds
, false);
4489 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory-based root. */
4494 if (arg_directory
) {
/* "/" is only accepted as the container root together with arg_ephemeral. */
4497 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
4498 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4503 if (arg_ephemeral
) {
4504 _cleanup_free_
char *np
= NULL
;
4506 /* If the specified path is a mount point we
4507 * generate the new snapshot immediately
4508 * inside it under a random name. However if
4509 * the specified path is not a mount point we
4510 * create the new snapshot in the parent
4511 * directory, just next to it. */
4512 r
= path_is_mount_point(arg_directory
, 0);
4514 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
4518 r
= tempfn_random_child(arg_directory
, &np
);
4520 r
= tempfn_random(arg_directory
, &np
);
4522 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* Lock the snapshot path: shared when read-only, exclusive otherwise, and
 * never blocking (LOCK_NB). */
4526 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4528 log_error_errno(r
, "Failed to lock %s: %m", np
);
4532 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4534 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
/* The old arg_directory string is released here; remove_subvol records that
 * a subvolume is to be removed during cleanup (see the btrfs_subvol_remove()
 * call near the end).
 * NOTE(review): the reassignment of arg_directory is not visible in this
 * excerpt; presumably it takes over np. Confirm. */
4538 free(arg_directory
);
4542 remove_subvol
= true;
/* Lock the directory tree itself with the same shared/exclusive,
 * non-blocking policy as above. */
4545 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4547 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
4551 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate arg_directory from arg_template via a snapshot. The messages
 * below distinguish "already exists", failure and success; the conditions
 * selecting between them are not visible here. */
4556 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4559 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
4561 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
4565 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree: it has to look like an OS tree, and the path
 * to execute (argv[optind] when absolute, otherwise "/usr/bin/") has to
 * exist below arg_directory. */
4571 if (path_is_os_tree(arg_directory
) <= 0) {
4572 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
4579 p
= strjoina(arg_directory
,
4580 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
4581 if (access(p
, F_OK
) < 0) {
4582 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
/* Image-based root: arg_image is locked, a temporary directory created by
 * mkdtemp() becomes arg_directory, and the image is set up and dissected
 * into root/home/srv devices.
 * NOTE(review): presumably this is the else branch of "if (arg_directory)";
 * the branch itself is not visible here. Confirm. */
4589 char template[] = "/tmp/nspawn-root-XXXXXX";
4592 assert(!arg_template
);
4594 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4596 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
4600 r
= log_error_errno(r
, "Failed to create image lock: %m");
4604 if (!mkdtemp(template)) {
4605 log_error_errno(errno
, "Failed to create temporary directory: %m");
4610 arg_directory
= strdup(template);
4611 if (!arg_directory
) {
4616 image_fd
= setup_image(&device_path
, &loop_nr
);
4622 r
= dissect_image(image_fd
,
4623 &root_device
, &root_device_rw
,
4624 &home_device
, &home_device_rw
,
4625 &srv_device
, &srv_device_rw
,
4631 r
= custom_mounts_prepare();
4636 isatty(STDIN_FILENO
) > 0 &&
4637 isatty(STDOUT_FILENO
) > 0;
/* Open the pseudo tty master, store the name returned by ptsname_malloc()
 * in console, and unlock the pty. */
4639 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
4641 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
4645 r
= ptsname_malloc(master
, &console
);
4647 r
= log_error_errno(r
, "Failed to determine tty name: %m");
4651 if (unlockpt(master
) < 0) {
4652 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
4657 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4658 arg_machine
, arg_image
?: arg_directory
);
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT in this process. mask_chld
 * contains SIGCHLD alone; it is used further down to unblock and later
 * re-block just that signal. */
4660 assert_se(sigemptyset(&mask
) == 0);
4661 sigset_add_many(&mask
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1);
4662 assert_se(sigprocmask(SIG_BLOCK
, &mask
, NULL
) == 0);
4664 assert_se(sigemptyset(&mask_chld
) == 0);
4665 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
4667 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
4668 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* State for one container run. The "loop again" comment further down
 * suggests these declarations sit inside a loop whose header is not visible
 * in this excerpt. Confirm. */
4673 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 };
4674 ContainerStatus container_status
;
4675 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
/* SIGCHLD handler installed below: a no-op handler. */
4676 static const struct sigaction sa
= {
4677 .sa_handler
= nop_handler
,
4678 .sa_flags
= SA_NOCLDSTOP
,
4682 _cleanup_event_unref_ sd_event
*event
= NULL
;
4683 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
4684 _cleanup_rtnl_unref_ sd_rtnl
*rtnl
= NULL
;
4687 r
= barrier_create(&barrier
);
4689 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
/* Three AF_UNIX datagram socket pairs shared with the child: kmsg, rtnl and
 * pid. The parent keeps index 0 and closes index 1 after the fork; the
 * child does the opposite (see below). */
4693 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
4694 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
4698 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
4699 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
4703 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
4704 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
4708 /* Child can be killed before execv(), so handle SIGCHLD
4709 * in order to interrupt parent's blocking calls and
4710 * give it a chance to call wait() and terminate. */
4711 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
4713 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
4717 r
= sigaction(SIGCHLD
, &sa
, NULL
);
4719 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Fork the outer child with CLONE_NEWNS only. An EINVAL failure gets a
 * dedicated message hinting at missing kernel namespace support. */
4723 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
4725 if (errno
== EINVAL
)
4726 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4728 r
= log_error_errno(errno
, "clone() failed: %m");
4734 /* The outer child only has a file system namespace. */
4735 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* Child side: drop the pty master and the parent's ends of the socket
 * pairs, reset signal handlers and mask, then run outer_child() and
 * _exit(). */
4737 master
= safe_close(master
);
4739 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
4740 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
4741 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
4743 (void) reset_all_signal_handlers();
4744 (void) reset_signal_mask();
4746 r
= outer_child(&barrier
,
4749 root_device
, root_device_rw
,
4750 home_device
, home_device_rw
,
4751 srv_device
, srv_device_rw
,
4755 kmsg_socket_pair
[1],
4756 rtnl_socket_pair
[1],
4760 _exit(EXIT_FAILURE
);
4762 _exit(EXIT_SUCCESS
);
/* Parent side: take the parent role on the barrier and close the child's
 * ends of the socket pairs. */
4765 barrier_set_role(&barrier
, BARRIER_PARENT
);
4770 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
4771 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
4772 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
4774 /* Wait for the outer child. */
4775 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
4784 /* And now retrieve the PID of the inner child. */
4785 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
4787 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
/* NOTE(review): the message below ends in %m, but this branch is taken for
 * a short read (l != sizeof(pid)), not for a failed recv(), so errno may
 * not describe the problem. Confirm whether %m is intended. */
4790 if (l
!= sizeof(pid
)) {
4791 log_error("Short read while reading inner child PID: %m");
/* From here on pid holds the value received over the pid socket. */
4796 log_debug("Init process invoked as PID " PID_FMT
, pid
);
4799 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
4800 log_error("Child died too early.");
4805 r
= setup_uid_map(pid
);
4809 (void) barrier_place(&barrier
); /* #2 */
/* Parent-side network setup, machine registration and cgroup ownership for
 * the received PID. Any conditions guarding the individual calls are not
 * visible in this excerpt. */
4812 r
= move_network_interfaces(pid
);
4816 r
= setup_veth(pid
, veth_name
, &ifi
);
4820 r
= setup_bridge(veth_name
, &ifi
);
4824 r
= setup_macvlan(pid
);
4828 r
= setup_ipvlan(pid
);
4832 r
= register_machine(pid
, ifi
);
4836 r
= chown_cgroup(pid
);
4840 /* Notify the child that the parent is ready with all
4841 * its setup (including cgroup-ification), and that
4842 * the child can now hand over control to the code to
4843 * run inside the container. */
4844 (void) barrier_place(&barrier
); /* #3 */
4846 /* Block SIGCHLD here, before notifying child.
4847 * process_pty() will handle it with the other signals. */
4848 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
4850 /* Reset signal to default */
4851 r
= default_signals(SIGCHLD
, -1);
4853 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
4857 /* Let the child know that we are ready and wait that the child is completely ready now. */
4858 if (!barrier_place_and_sync(&barrier
)) { /* #5 */
4859 log_error("Client died too early.");
4866 "STATUS=Container running.\n"
4867 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
4869 r
= sd_event_new(&event
);
4871 log_error_errno(r
, "Failed to get default event source: %m");
/* Signal sources on the event loop: with a kill signal configured, SIGINT
 * and SIGTERM go to on_orderly_shutdown() with the PID as userdata;
 * otherwise they are registered with a NULL handler, as is SIGCHLD.
 * NOTE(review): the return values of sd_event_add_signal() are not checked
 * in the visible code. */
4875 if (arg_kill_signal
> 0) {
4876 /* Try to kill the init system on SIGINT or SIGTERM */
4877 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4878 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4880 /* Immediately exit */
4881 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
4882 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
4885 /* simply exit on sigchld */
4886 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
/* With exposed ports configured, watch rtnl via the parent's rtnl socket
 * and expose the ports; the expose_ports() result is deliberately
 * discarded. */
4888 if (arg_expose_ports
) {
4889 r
= watch_rtnl(event
, rtnl_socket_pair
[0], &exposed
, &rtnl
);
4893 (void) expose_ports(rtnl
, &exposed
);
4896 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* Attach a PTY forwarder for the master to the event loop and run the loop;
 * afterwards the last forwarded character is queried and the forwarder
 * freed. */
4898 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
4900 log_error_errno(r
, "Failed to create PTY forwarder: %m");
4904 r
= sd_event_loop(event
);
4906 log_error_errno(r
, "Failed to run event loop: %m");
4910 pty_forward_get_last_char(forward
, &last_char
);
4912 forward
= pty_forward_free(forward
);
4914 if (!arg_quiet
&& last_char
!= '\n')
4917 /* Kill if it is not dead yet anyway */
4918 terminate_machine(pid
);
4920 /* Normally redundant, but better safe than sorry */
4923 r
= wait_for_container(pid
, &container_status
);
4927 /* We failed to wait for the container, or the
4928 * container exited abnormally */
4930 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
4931 /* The container exited with a non-zero
4932 * status, or with zero status and no reboot
4938 /* CONTAINER_REBOOTED, loop again */
4940 if (arg_keep_unit
) {
4941 /* Special handling if we are running as a
4942 * service: instead of simply restarting the
4943 * machine we want to restart the entire
4944 * service, so let's inform systemd about this
4945 * with the special exit code 133. The service
4946 * file uses RestartForceExitStatus=133 so
4947 * that this results in a full nspawn
4948 * restart. This is necessary since we might
4949 * have cgroup parameters set we want to have
4956 flush_ports(&exposed
);
4962 "STATUS=Terminating...");
4967 loop_remove(loop_nr
, &image_fd
);
4969 if (remove_subvol
&& arg_directory
) {
4972 k
= btrfs_subvol_remove(arg_directory
, true);
4974 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
4980 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
4981 (void) rm_rf(p
, REMOVE_ROOT
);
4984 free(arg_directory
);
4989 strv_free(arg_setenv
);
4990 strv_free(arg_network_interfaces
);
4991 strv_free(arg_network_macvlan
);
4992 strv_free(arg_network_ipvlan
);
4993 custom_mount_free_all();
4995 flush_ports(&exposed
);
4997 while (arg_expose_ports
) {
4998 ExposePort
*p
= arg_expose_ports
;
4999 LIST_REMOVE(ports
, arg_expose_ports
, p
);
5003 return r
< 0 ? EXIT_FAILURE
: ret
;