1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
44 #include <selinux/selinux.h>
52 #include <blkid/blkid.h>
55 #include "sd-daemon.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
66 #include "cgroup-util.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
75 #include "bus-error.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
82 #include "siphash24.h"
84 #include "base-filesystem.h"
86 #include "event-util.h"
87 #include "capability.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
102 #include "seccomp-util.h"
106 #include "nspawn-settings.h"
107 #include "nspawn-mount.h"
109 typedef enum ContainerStatus
{
110 CONTAINER_TERMINATED
,
114 typedef enum LinkJournal
{
121 static char *arg_directory
= NULL
;
122 static char *arg_template
= NULL
;
123 static char *arg_user
= NULL
;
124 static sd_id128_t arg_uuid
= {};
125 static char *arg_machine
= NULL
;
126 static const char *arg_selinux_context
= NULL
;
127 static const char *arg_selinux_apifs_context
= NULL
;
128 static const char *arg_slice
= NULL
;
129 static bool arg_private_network
= false;
130 static bool arg_read_only
= false;
131 static bool arg_boot
= false;
132 static bool arg_ephemeral
= false;
133 static LinkJournal arg_link_journal
= LINK_AUTO
;
134 static bool arg_link_journal_try
= false;
135 static uint64_t arg_retain
=
136 (1ULL << CAP_CHOWN
) |
137 (1ULL << CAP_DAC_OVERRIDE
) |
138 (1ULL << CAP_DAC_READ_SEARCH
) |
139 (1ULL << CAP_FOWNER
) |
140 (1ULL << CAP_FSETID
) |
141 (1ULL << CAP_IPC_OWNER
) |
143 (1ULL << CAP_LEASE
) |
144 (1ULL << CAP_LINUX_IMMUTABLE
) |
145 (1ULL << CAP_NET_BIND_SERVICE
) |
146 (1ULL << CAP_NET_BROADCAST
) |
147 (1ULL << CAP_NET_RAW
) |
148 (1ULL << CAP_SETGID
) |
149 (1ULL << CAP_SETFCAP
) |
150 (1ULL << CAP_SETPCAP
) |
151 (1ULL << CAP_SETUID
) |
152 (1ULL << CAP_SYS_ADMIN
) |
153 (1ULL << CAP_SYS_CHROOT
) |
154 (1ULL << CAP_SYS_NICE
) |
155 (1ULL << CAP_SYS_PTRACE
) |
156 (1ULL << CAP_SYS_TTY_CONFIG
) |
157 (1ULL << CAP_SYS_RESOURCE
) |
158 (1ULL << CAP_SYS_BOOT
) |
159 (1ULL << CAP_AUDIT_WRITE
) |
160 (1ULL << CAP_AUDIT_CONTROL
) |
162 static CustomMount
*arg_custom_mounts
= NULL
;
163 static unsigned arg_n_custom_mounts
= 0;
164 static char **arg_setenv
= NULL
;
165 static bool arg_quiet
= false;
166 static bool arg_share_system
= false;
167 static bool arg_register
= true;
168 static bool arg_keep_unit
= false;
169 static char **arg_network_interfaces
= NULL
;
170 static char **arg_network_macvlan
= NULL
;
171 static char **arg_network_ipvlan
= NULL
;
172 static bool arg_network_veth
= false;
173 static char *arg_network_bridge
= NULL
;
174 static unsigned long arg_personality
= PERSONALITY_INVALID
;
175 static char *arg_image
= NULL
;
176 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
177 static ExposePort
*arg_expose_ports
= NULL
;
178 static char **arg_property
= NULL
;
179 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
180 static bool arg_userns
= false;
181 static int arg_kill_signal
= 0;
182 static bool arg_unified_cgroup_hierarchy
= false;
183 static SettingsMask arg_settings_mask
= 0;
184 static int arg_settings_trusted
= -1;
185 static char **arg_parameters
= NULL
;
187 static void help(void) {
188 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
189 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
190 " -h --help Show this help\n"
191 " --version Print version string\n"
192 " -q --quiet Do not show status information\n"
193 " -D --directory=PATH Root directory for the container\n"
194 " --template=PATH Initialize root directory from template directory,\n"
196 " -x --ephemeral Run container with snapshot of root directory, and\n"
197 " remove it after exit\n"
198 " -i --image=PATH File system device or disk image for the container\n"
199 " -b --boot Boot up full system (i.e. invoke init)\n"
200 " -u --user=USER Run the command under specified user or uid\n"
201 " -M --machine=NAME Set the machine name for the container\n"
202 " --uuid=UUID Set a specific machine UUID for the container\n"
203 " -S --slice=SLICE Place the container in the specified slice\n"
204 " --property=NAME=VALUE Set scope unit property\n"
205 " --private-users[=UIDBASE[:NUIDS]]\n"
206 " Run within user namespace\n"
207 " --private-network Disable network in container\n"
208 " --network-interface=INTERFACE\n"
209 " Assign an existing network interface to the\n"
211 " --network-macvlan=INTERFACE\n"
212 " Create a macvlan network interface based on an\n"
213 " existing network interface to the container\n"
214 " --network-ipvlan=INTERFACE\n"
215 " Create a ipvlan network interface based on an\n"
216 " existing network interface to the container\n"
217 " -n --network-veth Add a virtual ethernet connection between host\n"
219 " --network-bridge=INTERFACE\n"
220 " Add a virtual ethernet connection between host\n"
221 " and container and add it to an existing bridge on\n"
223 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
224 " Expose a container IP port on the host\n"
225 " -Z --selinux-context=SECLABEL\n"
226 " Set the SELinux security context to be used by\n"
227 " processes in the container\n"
228 " -L --selinux-apifs-context=SECLABEL\n"
229 " Set the SELinux security context to be used by\n"
230 " API/tmpfs file systems in the container\n"
231 " --capability=CAP In addition to the default, retain specified\n"
233 " --drop-capability=CAP Drop the specified capability from the default set\n"
234 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
235 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
236 " try-guest, try-host\n"
237 " -j Equivalent to --link-journal=try-guest\n"
238 " --read-only Mount the root directory read-only\n"
239 " --bind=PATH[:PATH[:OPTIONS]]\n"
240 " Bind mount a file or directory from the host into\n"
242 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
243 " Similar, but creates a read-only bind mount\n"
244 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
245 " --overlay=PATH[:PATH...]:PATH\n"
246 " Create an overlay mount from the host to \n"
248 " --overlay-ro=PATH[:PATH...]:PATH\n"
249 " Similar, but creates a read-only overlay mount\n"
250 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
251 " --share-system Share system namespaces with host\n"
252 " --register=BOOLEAN Register container as machine\n"
253 " --keep-unit Do not register a scope for the machine, reuse\n"
254 " the service unit nspawn is running in\n"
255 " --volatile[=MODE] Run the system in volatile mode\n"
256 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
257 , program_invocation_short_name
);
261 static int custom_mounts_prepare(void) {
265 /* Ensure the mounts are applied prefix first. */
266 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
268 /* Allocate working directories for the overlay file systems that need it */
269 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
270 CustomMount
*m
= &arg_custom_mounts
[i
];
272 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
273 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
277 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
286 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
288 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
294 static int set_sanitized_path(char **b
, const char *path
) {
300 p
= canonicalize_file_name(path
);
305 p
= path_make_absolute_cwd(path
);
311 *b
= path_kill_slashes(p
);
315 static int detect_unified_cgroup_hierarchy(void) {
319 /* Allow the user to control whether the unified hierarchy is used */
320 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
322 r
= parse_boolean(e
);
324 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
326 arg_unified_cgroup_hierarchy
= r
;
330 /* Otherwise inherit the default from the host system */
333 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
335 arg_unified_cgroup_hierarchy
= r
;
339 int expose_port_parse(ExposePort
**l
, const char *s
) {
341 const char *split
, *e
;
342 uint16_t container_port
, host_port
;
347 if ((e
= startswith(s
, "tcp:")))
348 protocol
= IPPROTO_TCP
;
349 else if ((e
= startswith(s
, "udp:")))
350 protocol
= IPPROTO_UDP
;
353 protocol
= IPPROTO_TCP
;
356 split
= strchr(e
, ':');
358 char v
[split
- e
+ 1];
360 memcpy(v
, e
, split
- e
);
363 r
= safe_atou16(v
, &host_port
);
364 if (r
< 0 || host_port
<= 0)
367 r
= safe_atou16(split
+ 1, &container_port
);
369 r
= safe_atou16(e
, &container_port
);
370 host_port
= container_port
;
373 if (r
< 0 || container_port
<= 0)
376 LIST_FOREACH(ports
, p
, arg_expose_ports
)
377 if (p
->protocol
== protocol
&& p
->host_port
== host_port
)
380 p
= new(ExposePort
, 1);
384 p
->protocol
= protocol
;
385 p
->host_port
= host_port
;
386 p
->container_port
= container_port
;
388 LIST_PREPEND(ports
, *l
, p
);
393 static int parse_argv(int argc
, char *argv
[]) {
412 ARG_NETWORK_INTERFACE
,
425 static const struct option options
[] = {
426 { "help", no_argument
, NULL
, 'h' },
427 { "version", no_argument
, NULL
, ARG_VERSION
},
428 { "directory", required_argument
, NULL
, 'D' },
429 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
430 { "ephemeral", no_argument
, NULL
, 'x' },
431 { "user", required_argument
, NULL
, 'u' },
432 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
433 { "boot", no_argument
, NULL
, 'b' },
434 { "uuid", required_argument
, NULL
, ARG_UUID
},
435 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
436 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
437 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
438 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
439 { "bind", required_argument
, NULL
, ARG_BIND
},
440 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
441 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
442 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
443 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
444 { "machine", required_argument
, NULL
, 'M' },
445 { "slice", required_argument
, NULL
, 'S' },
446 { "setenv", required_argument
, NULL
, ARG_SETENV
},
447 { "selinux-context", required_argument
, NULL
, 'Z' },
448 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
449 { "quiet", no_argument
, NULL
, 'q' },
450 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
451 { "register", required_argument
, NULL
, ARG_REGISTER
},
452 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
453 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
454 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
455 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
456 { "network-veth", no_argument
, NULL
, 'n' },
457 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
458 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
459 { "image", required_argument
, NULL
, 'i' },
460 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
461 { "port", required_argument
, NULL
, 'p' },
462 { "property", required_argument
, NULL
, ARG_PROPERTY
},
463 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
464 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
465 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
470 uint64_t plus
= 0, minus
= 0;
471 bool mask_all_settings
= false, mask_no_settings
= false;
476 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
485 puts(PACKAGE_STRING
);
486 puts(SYSTEMD_FEATURES
);
490 r
= set_sanitized_path(&arg_directory
, optarg
);
492 return log_error_errno(r
, "Invalid root directory: %m");
497 r
= set_sanitized_path(&arg_template
, optarg
);
499 return log_error_errno(r
, "Invalid template directory: %m");
504 r
= set_sanitized_path(&arg_image
, optarg
);
506 return log_error_errno(r
, "Invalid image path: %m");
511 arg_ephemeral
= true;
515 r
= free_and_strdup(&arg_user
, optarg
);
519 arg_settings_mask
|= SETTING_USER
;
522 case ARG_NETWORK_BRIDGE
:
523 r
= free_and_strdup(&arg_network_bridge
, optarg
);
530 arg_network_veth
= true;
531 arg_private_network
= true;
532 arg_settings_mask
|= SETTING_NETWORK
;
535 case ARG_NETWORK_INTERFACE
:
536 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
539 arg_private_network
= true;
540 arg_settings_mask
|= SETTING_NETWORK
;
543 case ARG_NETWORK_MACVLAN
:
544 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
547 arg_private_network
= true;
548 arg_settings_mask
|= SETTING_NETWORK
;
551 case ARG_NETWORK_IPVLAN
:
552 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
557 case ARG_PRIVATE_NETWORK
:
558 arg_private_network
= true;
559 arg_settings_mask
|= SETTING_NETWORK
;
564 arg_settings_mask
|= SETTING_BOOT
;
568 r
= sd_id128_from_string(optarg
, &arg_uuid
);
570 log_error("Invalid UUID: %s", optarg
);
574 arg_settings_mask
|= SETTING_MACHINE_ID
;
583 arg_machine
= mfree(arg_machine
);
585 if (!machine_name_is_valid(optarg
)) {
586 log_error("Invalid machine name: %s", optarg
);
590 r
= free_and_strdup(&arg_machine
, optarg
);
598 arg_selinux_context
= optarg
;
602 arg_selinux_apifs_context
= optarg
;
606 arg_read_only
= true;
607 arg_settings_mask
|= SETTING_READ_ONLY
;
611 case ARG_DROP_CAPABILITY
: {
612 const char *state
, *word
;
615 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
616 _cleanup_free_
char *t
;
618 t
= strndup(word
, length
);
622 if (streq(t
, "all")) {
623 if (c
== ARG_CAPABILITY
)
624 plus
= (uint64_t) -1;
626 minus
= (uint64_t) -1;
630 cap
= capability_from_name(t
);
632 log_error("Failed to parse capability %s.", t
);
636 if (c
== ARG_CAPABILITY
)
637 plus
|= 1ULL << (uint64_t) cap
;
639 minus
|= 1ULL << (uint64_t) cap
;
643 arg_settings_mask
|= SETTING_CAPABILITY
;
648 arg_link_journal
= LINK_GUEST
;
649 arg_link_journal_try
= true;
652 case ARG_LINK_JOURNAL
:
653 if (streq(optarg
, "auto")) {
654 arg_link_journal
= LINK_AUTO
;
655 arg_link_journal_try
= false;
656 } else if (streq(optarg
, "no")) {
657 arg_link_journal
= LINK_NO
;
658 arg_link_journal_try
= false;
659 } else if (streq(optarg
, "guest")) {
660 arg_link_journal
= LINK_GUEST
;
661 arg_link_journal_try
= false;
662 } else if (streq(optarg
, "host")) {
663 arg_link_journal
= LINK_HOST
;
664 arg_link_journal_try
= false;
665 } else if (streq(optarg
, "try-guest")) {
666 arg_link_journal
= LINK_GUEST
;
667 arg_link_journal_try
= true;
668 } else if (streq(optarg
, "try-host")) {
669 arg_link_journal
= LINK_HOST
;
670 arg_link_journal_try
= true;
672 log_error("Failed to parse link journal mode %s", optarg
);
680 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
682 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
684 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
688 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
690 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
692 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
696 case ARG_OVERLAY_RO
: {
697 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
698 _cleanup_strv_free_
char **lower
= NULL
;
703 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
707 log_error("Invalid overlay specification: %s", optarg
);
711 STRV_FOREACH(i
, lower
) {
712 if (!path_is_absolute(*i
)) {
713 log_error("Overlay path %s is not absolute.", *i
);
721 log_error("--overlay= needs at least two colon-separated directories specified.");
726 /* If two parameters are specified,
727 * the first one is the lower, the
728 * second one the upper directory. And
729 * we'll also define the destination
730 * mount point the same as the upper. */
734 destination
= strdup(upper
);
739 upper
= lower
[n
- 2];
740 destination
= lower
[n
- 1];
744 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
748 m
->destination
= destination
;
751 m
->read_only
= c
== ARG_OVERLAY_RO
;
753 upper
= destination
= NULL
;
756 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
763 if (!env_assignment_is_valid(optarg
)) {
764 log_error("Environment variable assignment '%s' is not valid.", optarg
);
768 n
= strv_env_set(arg_setenv
, optarg
);
772 strv_free(arg_setenv
);
775 arg_settings_mask
|= SETTING_ENVIRONMENT
;
783 case ARG_SHARE_SYSTEM
:
784 arg_share_system
= true;
788 r
= parse_boolean(optarg
);
790 log_error("Failed to parse --register= argument: %s", optarg
);
798 arg_keep_unit
= true;
801 case ARG_PERSONALITY
:
803 arg_personality
= personality_from_string(optarg
);
804 if (arg_personality
== PERSONALITY_INVALID
) {
805 log_error("Unknown or unsupported personality '%s'.", optarg
);
809 arg_settings_mask
|= SETTING_PERSONALITY
;
815 arg_volatile_mode
= VOLATILE_YES
;
819 m
= volatile_mode_from_string(optarg
);
821 log_error("Failed to parse --volatile= argument: %s", optarg
);
824 arg_volatile_mode
= m
;
827 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
831 r
= expose_port_parse(&arg_expose_ports
, optarg
);
833 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
835 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
837 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
841 if (strv_extend(&arg_property
, optarg
) < 0)
846 case ARG_PRIVATE_USERS
:
848 _cleanup_free_
char *buffer
= NULL
;
849 const char *range
, *shift
;
851 range
= strchr(optarg
, ':');
853 buffer
= strndup(optarg
, range
- optarg
);
859 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
860 log_error("Failed to parse UID range: %s", range
);
866 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
867 log_error("Failed to parse UID: %s", optarg
);
875 case ARG_KILL_SIGNAL
:
876 arg_kill_signal
= signal_from_string_try_harder(optarg
);
877 if (arg_kill_signal
< 0) {
878 log_error("Cannot parse signal: %s", optarg
);
882 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
887 /* no → do not read files
888 * yes → read files, do not override cmdline, trust only subset
889 * override → read files, override cmdline, trust only subset
890 * trusted → read files, do not override cmdline, trust all
893 r
= parse_boolean(optarg
);
895 if (streq(optarg
, "trusted")) {
896 mask_all_settings
= false;
897 mask_no_settings
= false;
898 arg_settings_trusted
= true;
900 } else if (streq(optarg
, "override")) {
901 mask_all_settings
= false;
902 mask_no_settings
= true;
903 arg_settings_trusted
= -1;
905 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
908 mask_all_settings
= false;
909 mask_no_settings
= false;
910 arg_settings_trusted
= -1;
913 mask_all_settings
= true;
914 mask_no_settings
= false;
915 arg_settings_trusted
= false;
924 assert_not_reached("Unhandled option");
927 if (arg_share_system
)
928 arg_register
= false;
930 if (arg_boot
&& arg_share_system
) {
931 log_error("--boot and --share-system may not be combined.");
935 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
936 log_error("--keep-unit may not be used when invoked from a user session.");
940 if (arg_directory
&& arg_image
) {
941 log_error("--directory= and --image= may not be combined.");
945 if (arg_template
&& arg_image
) {
946 log_error("--template= and --image= may not be combined.");
950 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
951 log_error("--template= needs --directory= or --machine=.");
955 if (arg_ephemeral
&& arg_template
) {
956 log_error("--ephemeral and --template= may not be combined.");
960 if (arg_ephemeral
&& arg_image
) {
961 log_error("--ephemeral and --image= may not be combined.");
965 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
966 log_error("--ephemeral and --link-journal= may not be combined.");
970 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
971 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
974 arg_parameters
= strv_copy(argv
+ optind
);
978 arg_settings_mask
|= SETTING_BOOT
;
981 /* Load all settings from .nspawn files */
982 if (mask_no_settings
)
983 arg_settings_mask
= 0;
985 /* Don't load any settings from .nspawn files */
986 if (mask_all_settings
)
987 arg_settings_mask
= _SETTINGS_MASK_ALL
;
989 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
991 r
= detect_unified_cgroup_hierarchy();
998 static int verify_arguments(void) {
1000 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
1001 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1005 if (arg_expose_ports
&& !arg_private_network
) {
1006 log_error("Cannot use --port= without private networking.");
1010 if (arg_boot
&& arg_kill_signal
<= 0)
1011 arg_kill_signal
= SIGRTMIN
+3;
1016 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1022 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1025 if (uid
!= UID_INVALID
) {
1026 uid
+= arg_uid_shift
;
1028 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1032 if (gid
!= GID_INVALID
) {
1033 gid
+= (gid_t
) arg_uid_shift
;
1035 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1039 if (lchown(p
, uid
, gid
) < 0)
1045 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1048 q
= prefix_roota(root
, path
);
1049 if (mkdir(q
, mode
) < 0) {
1050 if (errno
== EEXIST
)
1055 return userns_lchown(q
, uid
, gid
);
1058 static int setup_timezone(const char *dest
) {
1059 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1060 const char *where
, *check
, *what
;
1066 /* Fix the timezone, if possible */
1067 r
= readlink_malloc("/etc/localtime", &p
);
1069 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1073 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1075 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1077 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1081 where
= prefix_roota(dest
, "/etc/localtime");
1082 r
= readlink_malloc(where
, &q
);
1084 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1086 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1088 /* Already pointing to the right place? Then do nothing .. */
1089 if (y
&& streq(y
, z
))
1093 check
= strjoina("/usr/share/zoneinfo/", z
);
1094 check
= prefix_root(dest
, check
);
1095 if (laccess(check
, F_OK
) < 0) {
1096 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1101 if (r
< 0 && errno
!= ENOENT
) {
1102 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1106 what
= strjoina("../usr/share/zoneinfo/", z
);
1107 if (symlink(what
, where
) < 0) {
1108 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1112 r
= userns_lchown(where
, 0, 0);
1114 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1119 static int setup_resolv_conf(const char *dest
) {
1120 const char *where
= NULL
;
1125 if (arg_private_network
)
1128 /* Fix resolv.conf, if possible */
1129 where
= prefix_roota(dest
, "/etc/resolv.conf");
1131 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1133 /* If the file already exists as symlink, let's
1134 * suppress the warning, under the assumption that
1135 * resolved or something similar runs inside and the
1136 * symlink points there.
1138 * If the disk image is read-only, there's also no
1139 * point in complaining.
1141 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1142 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1146 r
= userns_lchown(where
, 0, 0);
1148 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1153 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1157 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1158 SD_ID128_FORMAT_VAL(id
));
1163 static int setup_boot_id(const char *dest
) {
1164 const char *from
, *to
;
1165 sd_id128_t rnd
= {};
1169 if (arg_share_system
)
1172 /* Generate a new randomized boot ID, so that each boot-up of
1173 * the container gets a new one */
1175 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1176 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1178 r
= sd_id128_randomize(&rnd
);
1180 return log_error_errno(r
, "Failed to generate random boot id: %m");
1182 id128_format_as_uuid(rnd
, as_uuid
);
1184 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1186 return log_error_errno(r
, "Failed to write boot id: %m");
1188 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1189 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1190 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1191 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1197 static int copy_devnodes(const char *dest
) {
1199 static const char devnodes
[] =
1210 _cleanup_umask_ mode_t u
;
1216 /* Create /dev/net, so that we can create /dev/net/tun in it */
1217 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1218 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1220 NULSTR_FOREACH(d
, devnodes
) {
1221 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1224 from
= strappend("/dev/", d
);
1225 to
= prefix_root(dest
, from
);
1227 if (stat(from
, &st
) < 0) {
1229 if (errno
!= ENOENT
)
1230 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1232 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1234 log_error("%s is not a char or block device, cannot copy.", from
);
1238 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1240 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1242 /* Some systems abusively restrict mknod but
1243 * allow bind mounts. */
1246 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1247 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1248 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1251 r
= userns_lchown(to
, 0, 0);
1253 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1260 static int setup_pts(const char *dest
) {
1261 _cleanup_free_
char *options
= NULL
;
1265 if (arg_selinux_apifs_context
)
1266 (void) asprintf(&options
,
1267 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1268 arg_uid_shift
+ TTY_GID
,
1269 arg_selinux_apifs_context
);
1272 (void) asprintf(&options
,
1273 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1274 arg_uid_shift
+ TTY_GID
);
1279 /* Mount /dev/pts itself */
1280 p
= prefix_roota(dest
, "/dev/pts");
1281 if (mkdir(p
, 0755) < 0)
1282 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1283 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1284 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1285 if (userns_lchown(p
, 0, 0) < 0)
1286 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1288 /* Create /dev/ptmx symlink */
1289 p
= prefix_roota(dest
, "/dev/ptmx");
1290 if (symlink("pts/ptmx", p
) < 0)
1291 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1292 if (userns_lchown(p
, 0, 0) < 0)
1293 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1295 /* And fix /dev/pts/ptmx ownership */
1296 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1297 if (userns_lchown(p
, 0, 0) < 0)
1298 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
1303 static int setup_dev_console(const char *dest
, const char *console
) {
1304 _cleanup_umask_ mode_t u
;
1313 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1315 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1317 /* We need to bind mount the right tty to /dev/console since
1318 * ptys can only exist on pts file systems. To have something
1319 * to bind mount things on we create an empty regular file. */
1321 to
= prefix_roota(dest
, "/dev/console");
1324 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1326 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1327 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1332 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1333 const char *from
, *to
;
1334 _cleanup_umask_ mode_t u
;
1337 struct cmsghdr cmsghdr
;
1338 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1340 struct msghdr mh
= {
1341 .msg_control
= &control
,
1342 .msg_controllen
= sizeof(control
),
1344 struct cmsghdr
*cmsg
;
1346 assert(kmsg_socket
>= 0);
1350 /* We create the kmsg FIFO as /run/kmsg, but immediately
1351 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1352 * on the reading side behave very similar to /proc/kmsg,
1353 * their writing side behaves differently from /dev/kmsg in
1354 * that writing blocks when nothing is reading. In order to
1355 * avoid any problems with containers deadlocking due to this
1356 * we simply make /dev/kmsg unavailable to the container. */
1357 from
= prefix_roota(dest
, "/run/kmsg");
1358 to
= prefix_roota(dest
, "/proc/kmsg");
1360 if (mkfifo(from
, 0600) < 0)
1361 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1362 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1363 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1365 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1367 return log_error_errno(errno
, "Failed to open fifo: %m");
1369 cmsg
= CMSG_FIRSTHDR(&mh
);
1370 cmsg
->cmsg_level
= SOL_SOCKET
;
1371 cmsg
->cmsg_type
= SCM_RIGHTS
;
1372 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1373 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1375 mh
.msg_controllen
= cmsg
->cmsg_len
;
1377 /* Store away the fd in the socket, so that it stays open as
1378 * long as we run the child */
1379 k
= sendmsg(kmsg_socket
, &mh
, MSG_NOSIGNAL
);
1383 return log_error_errno(errno
, "Failed to send FIFO fd: %m");
1385 /* And now make the FIFO unavailable as /run/kmsg... */
1386 (void) unlink(from
);
/* Allocates a non-blocking, close-on-exec NETLINK_ROUTE socket and passes
 * its file descriptor as SCM_RIGHTS ancillary data over send_fd, logging
 * an error if either the socket() or the sendmsg() call fails.
 *
 * NOTE(review): the statement guarded by the arg_expose_ports check is
 * not visible in this chunk; presumably the function returns early when
 * no ports are to be exposed -- confirm against the full file. */
1391 static int send_rtnl(int send_fd
) {
1393 struct cmsghdr cmsghdr
;
1394 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1396 struct msghdr mh
= {
1397 .msg_control
= &control
,
1398 .msg_controllen
= sizeof(control
),
1400 struct cmsghdr
*cmsg
;
1401 _cleanup_close_
int fd
= -1;
1404 assert(send_fd
>= 0);
1406 if (!arg_expose_ports
)
1409 fd
= socket(PF_NETLINK
, SOCK_RAW
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, NETLINK_ROUTE
);
1411 return log_error_errno(errno
, "Failed to allocate container netlink: %m");
1413 cmsg
= CMSG_FIRSTHDR(&mh
);
1414 cmsg
->cmsg_level
= SOL_SOCKET
;
1415 cmsg
->cmsg_type
= SCM_RIGHTS
;
1416 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1417 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1419 mh
.msg_controllen
= cmsg
->cmsg_len
;
1421 /* Store away the fd in the socket, so that it stays open as
1422 * long as we run the child */
1423 k
= sendmsg(send_fd
, &mh
, MSG_NOSIGNAL
);
1425 return log_error_errno(errno
, "Failed to send netlink fd: %m");
/* Undoes the firewall DNAT rules for the address stored in *exposed: for
 * every entry of arg_expose_ports it calls fw_add_local_dnat() with
 * add=false, warns (without aborting the loop, as far as visible) when a
 * rule cannot be changed, and finally resets *exposed to IN_ADDR_NULL.
 * Only AF_INET is handled (af is fixed to AF_INET).
 *
 * NOTE(review): the statements guarded by the arg_expose_ports and
 * in_addr_is_null() checks are not visible here; presumably both are
 * early returns. */
1430 static int flush_ports(union in_addr_union
*exposed
) {
1432 int r
, af
= AF_INET
;
1436 if (!arg_expose_ports
)
1439 if (in_addr_is_null(af
, exposed
))
1442 log_debug("Lost IP address.");
1444 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
1445 r
= fw_add_local_dnat(false,
1456 log_warning_errno(r
, "Failed to modify firewall: %m");
1459 *exposed
= IN_ADDR_NULL
;
/* Re-evaluates which container address the exposed ports should be
 * forwarded to. Enumerates the AF_INET addresses reachable through rtnl
 * with local_addresses(), looks at the first entry (family must match and
 * scope must be below RT_SCOPE_LINK), and:
 *   - calls flush_ports() on one visible path (the guarding condition is
 *     not visible here; presumably "no usable address"),
 *   - compares the new address with *exposed via in_addr_equal(),
 *   - otherwise logs the new address, installs DNAT rules with
 *     fw_add_local_dnat(true, ...) for every entry of arg_expose_ports
 *     (passing the previous address unless it is null), and stores the
 *     new address in *exposed.
 * Firewall failures are only logged as warnings. */
1463 static int expose_ports(sd_netlink
*rtnl
, union in_addr_union
*exposed
) {
1464 _cleanup_free_
struct local_address
*addresses
= NULL
;
1465 _cleanup_free_
char *pretty
= NULL
;
1466 union in_addr_union new_exposed
;
1469 int af
= AF_INET
, r
;
1473 /* Invoked each time an address is added or removed inside the
1476 if (!arg_expose_ports
)
1479 r
= local_addresses(rtnl
, 0, af
, &addresses
);
1481 return log_error_errno(r
, "Failed to enumerate local addresses: %m");
1484 addresses
[0].family
== af
&&
1485 addresses
[0].scope
< RT_SCOPE_LINK
;
1488 return flush_ports(exposed
);
1490 new_exposed
= addresses
[0].address
;
1491 if (in_addr_equal(af
, exposed
, &new_exposed
))
1494 in_addr_to_string(af
, &new_exposed
, &pretty
);
1495 log_debug("New container IP is %s.", strna(pretty
));
1497 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
1499 r
= fw_add_local_dnat(true,
1508 in_addr_is_null(af
, exposed
) ? NULL
: exposed
);
1510 log_warning_errno(r
, "Failed to modify firewall: %m");
1513 *exposed
= new_exposed
;
/* Tears down a list of ExposePort entries starting at p; the visible part
 * unlinks an entry q from the "ports" list with LIST_REMOVE().
 * NOTE(review): the loop and the release of each entry are not visible
 * in this chunk -- confirm against the full file. */
1517 void expose_port_free_all(ExposePort
*p
) {
1521 LIST_REMOVE(ports
, p
, q
);
/* Netlink match callback (registered in watch_rtnl() for RTM_NEWADDR and
 * RTM_DELADDR): userdata is the union in_addr_union tracking the currently
 * exposed address; it is handed to expose_ports() so the port forwarding
 * can follow the container's address. The result of expose_ports() is not
 * used on the visible line. */
1526 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1527 union in_addr_union
*exposed
= userdata
;
1533 expose_ports(rtnl
, exposed
);
1537 static int watch_rtnl(sd_event
*event
, int recv_fd
, union in_addr_union
*exposed
, sd_netlink
**ret
) {
1539 struct cmsghdr cmsghdr
;
1540 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1542 struct msghdr mh
= {
1543 .msg_control
= &control
,
1544 .msg_controllen
= sizeof(control
),
1546 struct cmsghdr
*cmsg
;
1547 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
1552 assert(recv_fd
>= 0);
1555 if (!arg_expose_ports
)
1558 k
= recvmsg(recv_fd
, &mh
, MSG_NOSIGNAL
);
1560 return log_error_errno(errno
, "Failed to recv netlink fd: %m");
1562 cmsg
= CMSG_FIRSTHDR(&mh
);
1563 assert(cmsg
->cmsg_level
== SOL_SOCKET
);
1564 assert(cmsg
->cmsg_type
== SCM_RIGHTS
);
1565 assert(cmsg
->cmsg_len
== CMSG_LEN(sizeof(int)));
1566 memcpy(&fd
, CMSG_DATA(cmsg
), sizeof(int));
1568 r
= sd_netlink_open_fd(&rtnl
, fd
);
1571 return log_error_errno(r
, "Failed to create rtnl object: %m");
1574 r
= sd_netlink_add_match(rtnl
, RTM_NEWADDR
, on_address_change
, exposed
);
1576 return log_error_errno(r
, "Failed to subscribe to RTM_NEWADDR messages: %m");
1578 r
= sd_netlink_add_match(rtnl
, RTM_DELADDR
, on_address_change
, exposed
);
1580 return log_error_errno(r
, "Failed to subscribe to RTM_DELADDR messages: %m");
1582 r
= sd_netlink_attach_event(rtnl
, event
, 0);
1584 return log_error_errno(r
, "Failed to add to even loop: %m");
/* Sets the hostname to arg_machine via sethostname_idempotent().
 * NOTE(review): the statements following the arg_share_system check and
 * the failure check are not visible here; presumably the function is a
 * no-op when the system namespaces are shared and returns an error when
 * setting the hostname fails. */
1592 static int setup_hostname(void) {
1594 if (arg_share_system
)
1597 if (sethostname_idempotent(arg_machine
) < 0)
1603 static int setup_journal(const char *directory
) {
1604 sd_id128_t machine_id
, this_id
;
1605 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1606 const char *etc_machine_id
, *p
, *q
;
1610 /* Don't link journals in ephemeral mode */
1614 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1616 r
= read_one_line_file(etc_machine_id
, &b
);
1617 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1620 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1623 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1626 /* Verify validity */
1627 r
= sd_id128_from_string(id
, &machine_id
);
1629 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1631 r
= sd_id128_get_machine(&this_id
);
1633 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1635 if (sd_id128_equal(machine_id
, this_id
)) {
1636 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1637 "Host and machine ids are equal (%s): refusing to link journals", id
);
1638 if (arg_link_journal
== LINK_AUTO
)
1643 if (arg_link_journal
== LINK_NO
)
1646 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1648 return log_error_errno(r
, "Failed to create /var: %m");
1650 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1652 return log_error_errno(r
, "Failed to create /var/log: %m");
1654 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1656 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1658 p
= strjoina("/var/log/journal/", id
);
1659 q
= prefix_roota(directory
, p
);
1661 if (path_is_mount_point(p
, 0) > 0) {
1662 if (arg_link_journal
!= LINK_AUTO
) {
1663 log_error("%s: already a mount point, refusing to use for journal", p
);
1670 if (path_is_mount_point(q
, 0) > 0) {
1671 if (arg_link_journal
!= LINK_AUTO
) {
1672 log_error("%s: already a mount point, refusing to use for journal", q
);
1679 r
= readlink_and_make_absolute(p
, &d
);
1681 if ((arg_link_journal
== LINK_GUEST
||
1682 arg_link_journal
== LINK_AUTO
) &&
1685 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1687 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1692 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1693 } else if (r
== -EINVAL
) {
1695 if (arg_link_journal
== LINK_GUEST
&&
1698 if (errno
== ENOTDIR
) {
1699 log_error("%s already exists and is neither a symlink nor a directory", p
);
1702 log_error_errno(errno
, "Failed to remove %s: %m", p
);
1706 } else if (r
!= -ENOENT
) {
1707 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
1711 if (arg_link_journal
== LINK_GUEST
) {
1713 if (symlink(q
, p
) < 0) {
1714 if (arg_link_journal_try
) {
1715 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1718 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1723 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1725 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1729 if (arg_link_journal
== LINK_HOST
) {
1730 /* don't create parents here -- if the host doesn't have
1731 * permanent journal set up, don't force it here */
1734 if (arg_link_journal_try
) {
1735 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1738 log_error_errno(errno
, "Failed to create %s: %m", p
);
1743 } else if (access(p
, F_OK
) < 0)
1746 if (dir_is_empty(q
) == 0)
1747 log_warning("%s is not empty, proceeding anyway.", q
);
1749 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1751 log_error_errno(errno
, "Failed to create %s: %m", q
);
1755 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1756 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
/* Drops every capability that is not part of the arg_retain mask from the
 * capability bounding set and returns the helper's result unchanged. The
 * second argument (false) is passed through as-is; its meaning is defined
 * by capability_bounding_set_drop(), which is not visible in this chunk. */
1761 static int drop_capabilities(void) {
1762 return capability_bounding_set_drop(~arg_retain
, false);
/* Registers the container with org.freedesktop.machine1 over the system
 * bus.
 *
 * If arg_keep_unit is set, the Manager method RegisterMachineWithNetwork
 * is called directly. Otherwise a CreateMachineWithNetwork call is built
 * by hand and extended with an array of (sv) unit properties:
 *   - "Slice", if arg_slice is non-empty
 *   - "DevicePolicy" = "strict" plus a "DeviceAllow" list of API device
 *     nodes
 *   - one "DeviceAllow" entry per custom bind mount whose source is a
 *     device node ("r" for read-only mounts, "rw" otherwise)
 *   - "KillSignal" and "KillMode" = "mixed", if arg_kill_signal != 0
 *   - every assignment listed in arg_property
 *
 * Both calls pass arg_uuid, arg_directory and, when local_ifindex > 0, a
 * one-element interface index array. Several call arguments are on lines
 * not visible in this chunk. Failures are logged; a failed bus call is
 * reported as "Failed to register machine". */
1765 static int register_machine(pid_t pid
, int local_ifindex
) {
1766 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
1767 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
1773 r
= sd_bus_default_system(&bus
);
1775 return log_error_errno(r
, "Failed to open system bus: %m");
1777 if (arg_keep_unit
) {
1778 r
= sd_bus_call_method(
1780 "org.freedesktop.machine1",
1781 "/org/freedesktop/machine1",
1782 "org.freedesktop.machine1.Manager",
1783 "RegisterMachineWithNetwork",
1788 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
1792 strempty(arg_directory
),
1793 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
1795 _cleanup_bus_message_unref_ sd_bus_message
*m
= NULL
;
1799 r
= sd_bus_message_new_method_call(
1802 "org.freedesktop.machine1",
1803 "/org/freedesktop/machine1",
1804 "org.freedesktop.machine1.Manager",
1805 "CreateMachineWithNetwork");
1807 return bus_log_create_error(r
);
1809 r
= sd_bus_message_append(
1813 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
1817 strempty(arg_directory
),
1818 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
1820 return bus_log_create_error(r
);
1822 r
= sd_bus_message_open_container(m
, 'a', "(sv)");
1824 return bus_log_create_error(r
);
1826 if (!isempty(arg_slice
)) {
1827 r
= sd_bus_message_append(m
, "(sv)", "Slice", "s", arg_slice
);
1829 return bus_log_create_error(r
);
1832 r
= sd_bus_message_append(m
, "(sv)", "DevicePolicy", "s", "strict");
1834 return bus_log_create_error(r
);
1836 /* If you make changes here, also make sure to update
1837 * systemd-nspawn@.service, to keep the device
1838 * policies in sync regardless if we are run with or
1839 * without the --keep-unit switch. */
1840 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 9,
1841 /* Allow the container to
1842 * access and create the API
1843 * device nodes, so that
1844 * PrivateDevices= in the
1845 * container can work
1850 "/dev/random", "rwm",
1851 "/dev/urandom", "rwm",
1853 "/dev/net/tun", "rwm",
1854 /* Allow the container
1855 * access to ptys. However,
1857 * container to ever create
1858 * these device nodes. */
1859 "/dev/pts/ptmx", "rw",
1862 return bus_log_create_error(r
);
1864 for (j
= 0; j
< arg_n_custom_mounts
; j
++) {
1865 CustomMount
*cm
= &arg_custom_mounts
[j
];
1867 if (cm
->type
!= CUSTOM_MOUNT_BIND
)
1870 r
= is_device_node(cm
->source
);
1872 return log_error_errno(r
, "Failed to stat %s: %m", cm
->source
);
1875 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 1,
1876 cm
->source
, cm
->read_only
? "r" : "rw");
1878 return log_error_errno(r
, "Failed to append message arguments: %m");
1882 if (arg_kill_signal
!= 0) {
1883 r
= sd_bus_message_append(m
, "(sv)", "KillSignal", "i", arg_kill_signal
);
1885 return bus_log_create_error(r
);
1887 r
= sd_bus_message_append(m
, "(sv)", "KillMode", "s", "mixed");
1889 return bus_log_create_error(r
);
1892 STRV_FOREACH(i
, arg_property
) {
1893 r
= sd_bus_message_open_container(m
, 'r', "sv");
1895 return bus_log_create_error(r
);
1897 r
= bus_append_unit_property_assignment(m
, *i
);
1901 r
= sd_bus_message_close_container(m
);
1903 return bus_log_create_error(r
);
1906 r
= sd_bus_message_close_container(m
);
1908 return bus_log_create_error(r
);
1910 r
= sd_bus_call(bus
, m
, 0, &error
, NULL
);
1914 log_error("Failed to register machine: %s", bus_error_message(&error
, r
));
/* Asks org.freedesktop.machine1 to terminate the machine belonging to pid.
 * Opens the system bus, issues a Manager call whose reply carries the
 * machine's object path (read with signature "o"), then issues a call on
 * the org.freedesktop.machine1.Machine interface. Failures of both bus
 * calls are logged at debug level only, since the machine may already
 * have been cleaned up automatically.
 *
 * NOTE(review): the method names, the remaining call arguments and the
 * condition belonging to the "reusing the unit" comment are on lines not
 * visible in this chunk. */
1921 static int terminate_machine(pid_t pid
) {
1922 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
1923 _cleanup_bus_message_unref_ sd_bus_message
*reply
= NULL
;
1924 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
1931 /* If we are reusing the unit, then just exit, systemd will do
1932 * the right thing when we exit. */
1936 r
= sd_bus_default_system(&bus
);
1938 return log_error_errno(r
, "Failed to open system bus: %m");
1940 r
= sd_bus_call_method(
1942 "org.freedesktop.machine1",
1943 "/org/freedesktop/machine1",
1944 "org.freedesktop.machine1.Manager",
1951 /* Note that the machine might already have been
1952 * cleaned up automatically, hence don't consider it a
1953 * failure if we cannot get the machine object. */
1954 log_debug("Failed to get machine: %s", bus_error_message(&error
, r
));
1958 r
= sd_bus_message_read(reply
, "o", &path
);
1960 return bus_log_parse_error(r
);
1962 r
= sd_bus_call_method(
1964 "org.freedesktop.machine1",
1966 "org.freedesktop.machine1.Machine",
1972 log_debug("Failed to terminate machine: %s", bus_error_message(&error
, r
));
1979 static int reset_audit_loginuid(void) {
1980 _cleanup_free_
char *p
= NULL
;
1983 if (arg_share_system
)
1986 r
= read_one_line_file("/proc/self/loginuid", &p
);
1990 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1992 /* Already reset? */
1993 if (streq(p
, "4294967295"))
1996 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1999 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2000 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2001 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2002 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2003 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Fixed 128-bit keys handed to generate_mac() as hash_key: one for the
 * host side of the veth link (HOST_HASH_KEY), one for the container side
 * (CONTAINER_HASH_KEY) and one for MACVLAN interfaces (MACVLAN_HASH_KEY). */
2011 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2012 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2013 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a stable MAC address for *mac. The hashed buffer starts with
 * the host's machine ID (sd_id128_get_machine() writes into its head),
 * followed by the bytes of arg_machine; idx is copied in after the name
 * (the condition guarding that copy is not visible here). The buffer is
 * hashed with siphash24() under hash_key and the first ETH_ALEN bytes of
 * the result become the address, with the multicast bit cleared and the
 * locally-administered bit set.
 *
 * NOTE(review): the allocation of the buffer v and the error handling of
 * sd_id128_get_machine() are on lines not visible in this chunk. */
2015 static int generate_mac(struct ether_addr
*mac
, sd_id128_t hash_key
, uint64_t idx
) {
2021 l
= strlen(arg_machine
);
2022 sz
= sizeof(sd_id128_t
) + l
;
2028 /* fetch some persistent data unique to the host */
2029 r
= sd_id128_get_machine((sd_id128_t
*) v
);
2033 /* combine with some data unique (on this host) to this
2034 * container instance */
2035 i
= mempcpy(v
+ sizeof(sd_id128_t
), arg_machine
, l
);
2038 memcpy(i
, &idx
, sizeof(idx
));
2041 /* Let's hash the host machine ID plus the container name. We
2042 * use a fixed, but originally randomly created hash key here. */
2043 siphash24(result
, v
, sz
, hash_key
.bytes
);
2045 assert_cc(ETH_ALEN
<= sizeof(result
));
2046 memcpy(mac
->ether_addr_octet
, result
, ETH_ALEN
);
2048 /* see eth_random_addr in the kernel */
2049 mac
->ether_addr_octet
[0] &= 0xfe; /* clear multicast bit */
2050 mac
->ether_addr_octet
[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair for the container via rtnetlink (RTM_NEWLINK).
 *
 * The host side is named "vb-<machine>" in bridge mode and
 * "ve-<machine>" otherwise (written into iface_name, limited by
 * IFNAMSIZ); the peer is named "host0" and is placed into the network
 * namespace of pid through IFLA_NET_NS_PID. Both ends get predictable MAC
 * addresses from generate_mac(). After the link is created, the host
 * side's interface index is looked up with if_nametoindex().
 *
 * NOTE(review): the statements guarded by the arg_private_network and
 * arg_network_veth checks, and the store into *ifi, are on lines not
 * visible in this chunk; presumably the function is a no-op unless a
 * private network with veth was requested. */
2055 static int setup_veth(pid_t pid
, char iface_name
[IFNAMSIZ
], int *ifi
) {
2056 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2057 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2058 struct ether_addr mac_host
, mac_container
;
2061 if (!arg_private_network
)
2064 if (!arg_network_veth
)
2067 /* Use two different interface name prefixes depending whether
2068 * we are in bridge mode or not. */
2069 snprintf(iface_name
, IFNAMSIZ
- 1, "%s-%s",
2070 arg_network_bridge
? "vb" : "ve", arg_machine
);
2072 r
= generate_mac(&mac_container
, CONTAINER_HASH_KEY
, 0);
2074 return log_error_errno(r
, "Failed to generate predictable MAC address for container side: %m");
2076 r
= generate_mac(&mac_host
, HOST_HASH_KEY
, 0);
2078 return log_error_errno(r
, "Failed to generate predictable MAC address for host side: %m");
2080 r
= sd_netlink_open(&rtnl
);
2082 return log_error_errno(r
, "Failed to connect to netlink: %m");
2084 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2086 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2088 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, iface_name
);
2090 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2092 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_host
);
2094 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2096 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2098 return log_error_errno(r
, "Failed to open netlink container: %m");
2100 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "veth");
2102 return log_error_errno(r
, "Failed to open netlink container: %m");
2104 r
= sd_netlink_message_open_container(m
, VETH_INFO_PEER
);
2106 return log_error_errno(r
, "Failed to open netlink container: %m");
2108 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, "host0");
2110 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2112 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_container
);
2114 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2116 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2118 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2120 r
= sd_netlink_message_close_container(m
);
2122 return log_error_errno(r
, "Failed to close netlink container: %m");
2124 r
= sd_netlink_message_close_container(m
);
2126 return log_error_errno(r
, "Failed to close netlink container: %m");
2128 r
= sd_netlink_message_close_container(m
);
2130 return log_error_errno(r
, "Failed to close netlink container: %m");
2132 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2134 return log_error_errno(r
, "Failed to add new veth interfaces (host0, %s): %m", iface_name
);
2136 i
= (int) if_nametoindex(iface_name
);
2138 return log_error_errno(errno
, "Failed to resolve interface %s: %m", iface_name
);
/* Attaches the host side of the veth pair (veth_name) to the bridge named
 * by arg_network_bridge: resolves the bridge's interface index with
 * if_nametoindex(), then sends an RTM_SETLINK request that sets IFF_UP on
 * the veth interface and sets IFLA_MASTER to the bridge index.
 *
 * NOTE(review): the statements guarded by the arg_private_network,
 * arg_network_veth and arg_network_bridge checks, and the store into
 * *ifi, are on lines not visible in this chunk. */
2145 static int setup_bridge(const char veth_name
[], int *ifi
) {
2146 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2147 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2150 if (!arg_private_network
)
2153 if (!arg_network_veth
)
2156 if (!arg_network_bridge
)
2159 bridge
= (int) if_nametoindex(arg_network_bridge
);
2161 return log_error_errno(errno
, "Failed to resolve interface %s: %m", arg_network_bridge
);
2165 r
= sd_netlink_open(&rtnl
);
2167 return log_error_errno(r
, "Failed to connect to netlink: %m");
2169 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, 0);
2171 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2173 r
= sd_rtnl_message_link_set_flags(m
, IFF_UP
, IFF_UP
);
2175 return log_error_errno(r
, "Failed to set IFF_UP flag: %m");
2177 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, veth_name
);
2179 return log_error_errno(r
, "Failed to add netlink interface name field: %m");
2181 r
= sd_netlink_message_append_u32(m
, IFLA_MASTER
, bridge
);
2183 return log_error_errno(r
, "Failed to add netlink master field: %m");
2185 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2187 return log_error_errno(r
, "Failed to add veth interface to bridge: %m");
/* Resolves the network interface "name" to its interface index and checks
 * with udev that the device is ready: the index is formatted as the udev
 * device id "n<ifindex>", the device is looked up with
 * udev_device_new_from_device_id(), and an error is logged if udev does
 * not report it as initialized.
 *
 * NOTE(review): the return statements are on lines not visible in this
 * chunk; callers use the result as an interface index (see
 * move_network_interfaces()). */
2192 static int parse_interface(struct udev
*udev
, const char *name
) {
2193 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
2194 char ifi_str
[2 + DECIMAL_STR_MAX(int)];
2197 ifi
= (int) if_nametoindex(name
);
2199 return log_error_errno(errno
, "Failed to resolve interface %s: %m", name
);
2201 sprintf(ifi_str
, "n%i", ifi
);
2202 d
= udev_device_new_from_device_id(udev
, ifi_str
);
2204 return log_error_errno(errno
, "Failed to get udev device for interface %s: %m", name
);
2206 if (udev_device_get_is_initialized(d
) <= 0) {
2207 log_error("Network interface %s is not initialized yet.", name
);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of pid. For each name the interface index is obtained through
 * parse_interface() and an RTM_SETLINK request carrying IFLA_NET_NS_PID
 * is sent over rtnetlink.
 *
 * NOTE(review): the statements guarded by the arg_private_network and
 * strv_isempty() checks, and the creation of the udev context, are on
 * lines not visible in this chunk. */
2214 static int move_network_interfaces(pid_t pid
) {
2215 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2216 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2220 if (!arg_private_network
)
2223 if (strv_isempty(arg_network_interfaces
))
2226 r
= sd_netlink_open(&rtnl
);
2228 return log_error_errno(r
, "Failed to connect to netlink: %m");
2232 log_error("Failed to connect to udev.");
2236 STRV_FOREACH(i
, arg_network_interfaces
) {
2237 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2240 ifi
= parse_interface(udev
, *i
);
2244 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, ifi
);
2246 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2248 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2250 return log_error_errno(r
, "Failed to append namespace PID to netlink message: %m");
2252 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2254 return log_error_errno(r
, "Failed to move interface %s to namespace: %m", *i
);
/* Creates one MACVLAN interface per entry of arg_network_macvlan directly
 * inside the network namespace of pid. For each parent interface:
 *   - its index is resolved with parse_interface() and used as IFLA_LINK
 *   - a predictable MAC is generated with MACVLAN_HASH_KEY and a running
 *     index (idx is post-incremented per interface)
 *   - the new interface is named "mv-<parent>", shortened to IFNAMSIZ-1
 *   - the link is of kind "macvlan" with mode MACVLAN_MODE_BRIDGE
 *
 * NOTE(review): the statements guarded by the arg_private_network and
 * strv_isempty() checks, and the creation of the udev context, are on
 * lines not visible in this chunk. */
2260 static int setup_macvlan(pid_t pid
) {
2261 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2262 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2267 if (!arg_private_network
)
2270 if (strv_isempty(arg_network_macvlan
))
2273 r
= sd_netlink_open(&rtnl
);
2275 return log_error_errno(r
, "Failed to connect to netlink: %m");
2279 log_error("Failed to connect to udev.");
2283 STRV_FOREACH(i
, arg_network_macvlan
) {
2284 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2285 _cleanup_free_
char *n
= NULL
;
2286 struct ether_addr mac
;
2289 ifi
= parse_interface(udev
, *i
);
2293 r
= generate_mac(&mac
, MACVLAN_HASH_KEY
, idx
++);
2295 return log_error_errno(r
, "Failed to create MACVLAN MAC address: %m");
2297 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2299 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2301 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
2303 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2305 n
= strappend("mv-", *i
);
2309 strshorten(n
, IFNAMSIZ
-1);
2311 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
2313 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2315 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac
);
2317 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2319 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2321 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2323 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2325 return log_error_errno(r
, "Failed to open netlink container: %m");
2327 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "macvlan");
2329 return log_error_errno(r
, "Failed to open netlink container: %m");
2331 r
= sd_netlink_message_append_u32(m
, IFLA_MACVLAN_MODE
, MACVLAN_MODE_BRIDGE
);
2333 return log_error_errno(r
, "Failed to append macvlan mode: %m");
2335 r
= sd_netlink_message_close_container(m
);
2337 return log_error_errno(r
, "Failed to close netlink container: %m");
2339 r
= sd_netlink_message_close_container(m
);
2341 return log_error_errno(r
, "Failed to close netlink container: %m");
2343 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2345 return log_error_errno(r
, "Failed to add new macvlan interfaces: %m");
/* Creates one IPVLAN interface per entry of arg_network_ipvlan directly
 * inside the network namespace of pid. For each parent interface:
 *   - its index is resolved with parse_interface() and used as IFLA_LINK
 *   - the new interface is named "iv-<parent>", shortened to IFNAMSIZ-1
 *   - the link is of kind "ipvlan" with mode IPVLAN_MODE_L2
 * Unlike setup_macvlan(), no MAC address is set in the visible code.
 *
 * NOTE(review): the statements guarded by the arg_private_network and
 * strv_isempty() checks, and the creation of the udev context, are on
 * lines not visible in this chunk. */
2351 static int setup_ipvlan(pid_t pid
) {
2352 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2353 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2357 if (!arg_private_network
)
2360 if (strv_isempty(arg_network_ipvlan
))
2363 r
= sd_netlink_open(&rtnl
);
2365 return log_error_errno(r
, "Failed to connect to netlink: %m");
2369 log_error("Failed to connect to udev.");
2373 STRV_FOREACH(i
, arg_network_ipvlan
) {
2374 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2375 _cleanup_free_
char *n
= NULL
;
2378 ifi
= parse_interface(udev
, *i
);
2382 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2384 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2386 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
2388 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2390 n
= strappend("iv-", *i
);
2394 strshorten(n
, IFNAMSIZ
-1);
2396 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
2398 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2400 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2402 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2404 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2406 return log_error_errno(r
, "Failed to open netlink container: %m");
2408 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "ipvlan");
2410 return log_error_errno(r
, "Failed to open netlink container: %m");
2412 r
= sd_netlink_message_append_u16(m
, IFLA_IPVLAN_MODE
, IPVLAN_MODE_L2
);
2414 return log_error_errno(r
, "Failed to add ipvlan mode: %m");
2416 r
= sd_netlink_message_close_container(m
);
2418 return log_error_errno(r
, "Failed to close netlink container: %m");
2420 r
= sd_netlink_message_close_container(m
);
2422 return log_error_errno(r
, "Failed to close netlink container: %m");
2424 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2426 return log_error_errno(r
, "Failed to add new ipvlan interfaces: %m");
/* Installs the container's seccomp filter. The filter allows everything
 * by default (SCMP_ACT_ALLOW) and then:
 *   - walks a table pairing a capability with a system call; the
 *     arg_retain bit for the entry's capability is tested first, and a
 *     rule failing the call with EPERM is added for entries that reach
 *     seccomp_rule_add() (the statement guarded by the test is not
 *     visible; presumably retained capabilities are skipped)
 *   - adds a rule on AF_NETLINK/NETLINK_AUDIT that fails with
 *     EAFNOSUPPORT, so audit userspace concludes audit is disabled
 *   - clears the SCMP_FLTATR_CTL_NNP attribute (NO_NEW_PRIVS)
 *   - loads the filter; one failure case is only logged at debug level
 *     as "kernel probably lacks CONFIG_SECCOMP"
 * The filter context is released with seccomp_release() at the end.
 *
 * NOTE(review): the struct member holding the syscall number, the
 * conditions selecting between the log calls and the return statements
 * are on lines not visible in this chunk. */
2432 static int setup_seccomp(void) {
2435 static const struct {
2436 uint64_t capability
;
2439 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
2440 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
2441 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
2442 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
2443 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
2444 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
2445 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
2446 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
2447 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
2448 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
2451 scmp_filter_ctx seccomp
;
2455 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
2459 r
= seccomp_add_secondary_archs(seccomp
);
2461 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
2465 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
2466 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
2469 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
2471 continue; /* unknown syscall */
2473 log_error_errno(r
, "Failed to block syscall: %m");
2480 Audit is broken in containers, much of the userspace audit
2481 hookup will fail if running inside a container. We don't
2482 care and just turn off creation of audit sockets.
2484 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2485 with EAFNOSUPPORT which audit userspace uses as indication
2486 that audit is disabled in the kernel.
2489 r
= seccomp_rule_add(
2491 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
2494 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
2495 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
2497 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
2501 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
2503 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
2507 r
= seccomp_load(seccomp
);
2509 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
2514 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
2519 seccomp_release(seccomp
);
2527 static int setup_propagate(const char *root
) {
2530 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2531 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2532 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
2533 (void) mkdir_p(p
, 0600);
2535 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
2536 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
2538 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
2539 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
2541 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
2542 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
2544 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
2545 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
2546 return log_error_errno(errno
, "Failed to install propagation bind mount.");
2548 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
2549 return log_error_errno(errno
, "Failed to make propagation mount read-only");
2554 static int setup_image(char **device_path
, int *loop_nr
) {
2555 struct loop_info64 info
= {
2556 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
2558 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
2559 _cleanup_free_
char* loopdev
= NULL
;
2563 assert(device_path
);
2567 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
2569 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
2571 if (fstat(fd
, &st
) < 0)
2572 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
2574 if (S_ISBLK(st
.st_mode
)) {
2577 p
= strdup(arg_image
);
2591 if (!S_ISREG(st
.st_mode
)) {
2592 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
2596 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2598 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
2600 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
2602 return log_error_errno(errno
, "Failed to allocate loop device: %m");
2604 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
2607 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
2609 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
2611 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
2612 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
2615 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
2617 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
2618 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
2620 *device_path
= loopdev
;
/* Explanatory text appended to the error messages dissect_image() logs
 * when it cannot find a usable partition table in arg_image. */
2631 #define PARTITION_TABLE_BLURB \
2632 "Note that the disk image needs to either contain only a single MBR partition of\n" \
2633 "type 0x83 that is marked bootable, or a single GPT partition of type " \
2634 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
2635 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
2636 "to be bootable with systemd-nspawn."
2638 static int dissect_image(
2640 char **root_device
, bool *root_device_rw
,
2641 char **home_device
, bool *home_device_rw
,
2642 char **srv_device
, bool *srv_device_rw
,
2646 int home_nr
= -1, srv_nr
= -1;
2647 #ifdef GPT_ROOT_NATIVE
2650 #ifdef GPT_ROOT_SECONDARY
2651 int secondary_root_nr
= -1;
2653 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
2654 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
2655 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
2656 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
2657 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2658 struct udev_list_entry
*first
, *item
;
2659 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
2660 bool is_gpt
, is_mbr
, multiple_generic
= false;
2661 const char *pttype
= NULL
;
2668 assert(root_device
);
2669 assert(home_device
);
2674 b
= blkid_new_probe();
2679 r
= blkid_probe_set_device(b
, fd
, 0, 0);
2684 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
2688 blkid_probe_enable_partitions(b
, 1);
2689 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
2692 r
= blkid_do_safeprobe(b
);
2693 if (r
== -2 || r
== 1) {
2694 log_error("Failed to identify any partition table on\n"
2696 PARTITION_TABLE_BLURB
, arg_image
);
2698 } else if (r
!= 0) {
2701 log_error_errno(errno
, "Failed to probe: %m");
2705 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
2707 is_gpt
= streq_ptr(pttype
, "gpt");
2708 is_mbr
= streq_ptr(pttype
, "dos");
2710 if (!is_gpt
&& !is_mbr
) {
2711 log_error("No GPT or MBR partition table discovered on\n"
2713 PARTITION_TABLE_BLURB
, arg_image
);
2718 pl
= blkid_probe_get_partitions(b
);
2723 log_error("Failed to list partitions of %s", arg_image
);
2731 if (fstat(fd
, &st
) < 0)
2732 return log_error_errno(errno
, "Failed to stat block device: %m");
2734 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
2742 log_error("Kernel partitions never appeared.");
2746 e
= udev_enumerate_new(udev
);
2750 r
= udev_enumerate_add_match_parent(e
, d
);
2754 r
= udev_enumerate_scan_devices(e
);
2756 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
2758 /* Count the partitions enumerated by the kernel */
2760 first
= udev_enumerate_get_list_entry(e
);
2761 udev_list_entry_foreach(item
, first
)
2764 /* Count the partitions enumerated by blkid */
2765 m
= blkid_partlist_numof_partitions(pl
);
2769 log_error("blkid and kernel partition list do not match.");
2775 /* The kernel has probed fewer partitions than
2776 * blkid? Maybe the kernel prober is still
2777 * running or it got EBUSY because udev
2778 * already opened the device. Let's reprobe
2779 * the device, which is a synchronous call
2780 * that waits until probing is complete. */
2782 for (j
= 0; j
< 20; j
++) {
2784 r
= ioctl(fd
, BLKRRPART
, 0);
2787 if (r
>= 0 || r
!= -EBUSY
)
2790 /* If something else has the device
2791 * open, such as an udev rule, the
2792 * ioctl will return EBUSY. Since
2793 * there's no way to wait until it
2794 * isn't busy anymore, let's just wait
2795 * a bit, and try again.
2797 * This is really something they
2798 * should fix in the kernel! */
2800 usleep(50 * USEC_PER_MSEC
);
2804 return log_error_errno(r
, "Failed to reread partition table: %m");
2807 e
= udev_enumerate_unref(e
);
2810 first
= udev_enumerate_get_list_entry(e
);
2811 udev_list_entry_foreach(item
, first
) {
2812 _cleanup_udev_device_unref_
struct udev_device
*q
;
2814 unsigned long long flags
;
2820 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
2825 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
2829 qn
= udev_device_get_devnum(q
);
2833 if (st
.st_rdev
== qn
)
2836 node
= udev_device_get_devnode(q
);
2840 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
2844 flags
= blkid_partition_get_flags(pp
);
2846 nr
= blkid_partition_get_partno(pp
);
2854 if (flags
& GPT_FLAG_NO_AUTO
)
2857 stype
= blkid_partition_get_type_string(pp
);
2861 if (sd_id128_from_string(stype
, &type_id
) < 0)
2864 if (sd_id128_equal(type_id
, GPT_HOME
)) {
2866 if (home
&& nr
>= home_nr
)
2870 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2872 r
= free_and_strdup(&home
, node
);
2876 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
2878 if (srv
&& nr
>= srv_nr
)
2882 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2884 r
= free_and_strdup(&srv
, node
);
2888 #ifdef GPT_ROOT_NATIVE
2889 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
2891 if (root
&& nr
>= root_nr
)
2895 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2897 r
= free_and_strdup(&root
, node
);
2902 #ifdef GPT_ROOT_SECONDARY
2903 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
2905 if (secondary_root
&& nr
>= secondary_root_nr
)
2908 secondary_root_nr
= nr
;
2909 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2911 r
= free_and_strdup(&secondary_root
, node
);
2916 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2919 multiple_generic
= true;
2921 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2923 r
= free_and_strdup(&generic
, node
);
2929 } else if (is_mbr
) {
2932 if (flags
!= 0x80) /* Bootable flag */
2935 type
= blkid_partition_get_type(pp
);
2936 if (type
!= 0x83) /* Linux partition */
2940 multiple_generic
= true;
2944 r
= free_and_strdup(&root
, node
);
2952 *root_device
= root
;
2955 *root_device_rw
= root_rw
;
2957 } else if (secondary_root
) {
2958 *root_device
= secondary_root
;
2959 secondary_root
= NULL
;
2961 *root_device_rw
= secondary_root_rw
;
2963 } else if (generic
) {
2965 /* There were no partitions with precise meanings
2966 * around, but we found generic partitions. In this
2967 * case, if there's only one, we can go ahead and boot
2968 * it, otherwise we bail out, because we really cannot
2969 * make any sense of it. */
2971 if (multiple_generic
) {
2972 log_error("Identified multiple bootable Linux partitions on\n"
2974 PARTITION_TABLE_BLURB
, arg_image
);
2978 *root_device
= generic
;
2981 *root_device_rw
= generic_rw
;
2984 log_error("Failed to identify root partition in disk image\n"
2986 PARTITION_TABLE_BLURB
, arg_image
);
2991 *home_device
= home
;
2994 *home_device_rw
= home_rw
;
3001 *srv_device_rw
= srv_rw
;
3006 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the file system found on block device `what`.
 *
 * The mount point is `where` with `directory` appended (strjoina). libblkid
 * is asked for the superblock "TYPE" of `what`: a safeprobe result of -1 or 1
 * is reported as an undeterminable file system type, any other non-zero
 * result as a probe failure, and a "crypto_LUKS" type is rejected as
 * unsupported. The mount itself always uses MS_NODEV and adds MS_RDONLY
 * unless `rw` is true; a failing mount() is logged and its error returned. */
3011 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
3013 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
3014 const char *fstype
, *p
;
3024 p
= strjoina(where
, directory
);
3029 b
= blkid_new_probe_from_filename(what
);
3033 log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
/* Only the superblock type is of interest to the probe below. */
3037 blkid_probe_enable_superblocks(b
, 1);
3038 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
3041 r
= blkid_do_safeprobe(b
);
3042 if (r
== -1 || r
== 1) {
3043 log_error("Cannot determine file system type of %s", what
);
3045 } else if (r
!= 0) {
3048 log_error_errno(errno
, "Failed to probe %s: %m", what
);
3053 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
3056 log_error("Failed to determine file system type of %s", what
);
3060 if (streq(fstype
, "crypto_LUKS")) {
3061 log_error("nspawn currently does not support LUKS disk images.");
3065 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
3066 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
3070 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the root, home and server-data devices through mount_device().
 *
 * All three are mounted relative to arg_directory: the root device with a
 * NULL sub-directory, the home device at "/home" and the srv device at
 * "/srv". Each device has its own read-write flag. A failure is logged with
 * a message naming which of the three mounts failed and is returned. */
3075 static int mount_devices(
3077 const char *root_device
, bool root_device_rw
,
3078 const char *home_device
, bool home_device_rw
,
3079 const char *srv_device
, bool srv_device_rw
) {
3085 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
3087 return log_error_errno(r
, "Failed to mount root directory: %m");
3091 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
3093 return log_error_errno(r
, "Failed to mount home directory: %m");
3097 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
3099 return log_error_errno(r
, "Failed to mount server data directory: %m");
/* Detaches and removes loop device number `nr`.
 *
 * If `image_fd` points to an open descriptor, LOOP_CLR_FD is issued on it and
 * the descriptor is closed, with the result of safe_close() stored back into
 * *image_fd. Afterwards /dev/loop-control is opened and LOOP_CTL_REMOVE is
 * issued for `nr`. The function returns nothing; the failures visible here
 * are only logged (at debug or warning level). */
3105 static void loop_remove(int nr
, int *image_fd
) {
3106 _cleanup_close_
int control
= -1;
3112 if (image_fd
&& *image_fd
>= 0) {
3113 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
3115 log_debug_errno(errno
, "Failed to close loop image: %m");
3116 *image_fd
= safe_close(*image_fd
);
3119 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3121 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
3125 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
3127 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
/* Forks a child that runs "getent <database> <key>" with an empty
 * environment.
 *
 * In the child, the write end of a freshly created pipe becomes stdout,
 * /dev/null becomes stdin and stderr, signal handlers and the signal mask
 * are reset and all other descriptors are closed. /usr/bin/getent is tried
 * first, then /bin/getent; if neither can be executed the child exits with
 * EXIT_FAILURE. The parent closes its copy of the pipe's write end.
 *
 * NOTE(review): callers in this file fdopen() the return value and pass a
 * pid_t pointer as `rpid`, so this presumably returns the pipe's read end
 * and stores the child PID in *rpid -- confirm against the full function. */
3130 static int spawn_getent(const char *database
, const char *key
, pid_t
*rpid
) {
3138 if (pipe2(pipe_fds
, O_CLOEXEC
) < 0)
3139 return log_error_errno(errno
, "Failed to allocate pipe: %m");
3143 return log_error_errno(errno
, "Failed to fork getent child: %m");
3144 else if (pid
== 0) {
3146 char *empty_env
= NULL
;
3148 if (dup3(pipe_fds
[1], STDOUT_FILENO
, 0) < 0)
3149 _exit(EXIT_FAILURE
);
/* Close the original pipe descriptors unless they are one of 0, 1, 2. */
3151 if (pipe_fds
[0] > 2)
3152 safe_close(pipe_fds
[0]);
3153 if (pipe_fds
[1] > 2)
3154 safe_close(pipe_fds
[1]);
3156 nullfd
= open("/dev/null", O_RDWR
);
3158 _exit(EXIT_FAILURE
);
3160 if (dup3(nullfd
, STDIN_FILENO
, 0) < 0)
3161 _exit(EXIT_FAILURE
);
3163 if (dup3(nullfd
, STDERR_FILENO
, 0) < 0)
3164 _exit(EXIT_FAILURE
);
3169 (void) reset_all_signal_handlers();
3170 (void) reset_signal_mask();
3171 close_all_fds(NULL
, 0);
3173 execle("/usr/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3174 execle("/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3175 _exit(EXIT_FAILURE
);
3178 pipe_fds
[1] = safe_close(pipe_fds
[1]);
/* Switches the process to the user named by arg_user.
 *
 * If arg_user is unset, "root" or "0", the credentials are reset with
 * reset_uid_gid(). Otherwise "getent passwd <user>" is spawned and the first
 * line of its output is split at ':' separators and validated field by
 * field; UID and GID are parsed from it. Then "getent initgroups <user>" is
 * spawned and the words following the user name are parsed into the
 * supplementary group list (stored in `uids`). The home directory and its
 * parents are created (an existing home directory is not an error), stdin,
 * stdout and stderr are chown()ed to the user on a best-effort basis, and
 * finally setgroups(), setresgid() and setresuid() are applied, in that
 * order.
 *
 * The two final error messages now name the calls that actually failed
 * (setresgid()/setresuid()); they previously said setregid()/setreuid(). */
3185 static int change_uid_gid(char **_home
) {
3186 char line
[LINE_MAX
], *x
, *u
, *g
, *h
;
3187 const char *word
, *state
;
3188 _cleanup_free_ uid_t
*uids
= NULL
;
3189 _cleanup_free_
char *home
= NULL
;
3190 _cleanup_fclose_
FILE *f
= NULL
;
3191 _cleanup_close_
int fd
= -1;
3192 unsigned n_uids
= 0;
3201 if (!arg_user
|| streq(arg_user
, "root") || streq(arg_user
, "0")) {
3202 /* Reset everything fully to 0, just in case */
3204 r
= reset_uid_gid();
3206 return log_error_errno(r
, "Failed to become root: %m");
3212 /* First, get user credentials */
3213 fd
= spawn_getent("passwd", arg_user
, &pid
);
3217 f
= fdopen(fd
, "r");
3222 if (!fgets(line
, sizeof(line
), f
)) {
3225 log_error("Failed to resolve user %s.", arg_user
);
3229 log_error_errno(errno
, "Failed to read from getent: %m");
3235 wait_for_terminate_and_warn("getent passwd", pid
, true);
/* Walk the ':'-separated passwd fields one by one. */
3237 x
= strchr(line
, ':');
3239 log_error("/etc/passwd entry has invalid user field.");
3243 u
= strchr(x
+1, ':');
3245 log_error("/etc/passwd entry has invalid password field.");
3252 log_error("/etc/passwd entry has invalid UID field.");
3260 log_error("/etc/passwd entry has invalid GID field.");
3265 h
= strchr(x
+1, ':');
3267 log_error("/etc/passwd entry has invalid GECOS field.");
3274 log_error("/etc/passwd entry has invalid home directory field.");
3280 r
= parse_uid(u
, &uid
);
3282 log_error("Failed to parse UID of user.");
3286 r
= parse_gid(g
, &gid
);
3288 log_error("Failed to parse GID of user.");
3296 /* Second, get group memberships */
3297 fd
= spawn_getent("initgroups", arg_user
, &pid
);
3302 f
= fdopen(fd
, "r");
3307 if (!fgets(line
, sizeof(line
), f
)) {
3309 log_error("Failed to resolve user %s.", arg_user
);
3313 log_error_errno(errno
, "Failed to read from getent: %m");
3319 wait_for_terminate_and_warn("getent initgroups", pid
, true);
3321 /* Skip over the username and subsequent separator whitespace */
3323 x
+= strcspn(x
, WHITESPACE
);
3324 x
+= strspn(x
, WHITESPACE
);
3326 FOREACH_WORD(word
, l
, x
, state
) {
3332 if (!GREEDY_REALLOC(uids
, sz
, n_uids
+1))
3335 r
= parse_uid(c
, &uids
[n_uids
++]);
3337 log_error("Failed to parse group data from getent.");
3342 r
= mkdir_parents(home
, 0775);
3344 return log_error_errno(r
, "Failed to make home root directory: %m");
3346 r
= mkdir_safe(home
, 0755, uid
, gid
);
3347 if (r
< 0 && r
!= -EEXIST
)
3348 return log_error_errno(r
, "Failed to make home directory: %m");
/* Best effort: hand the standard streams over to the target user. */
3350 (void) fchown(STDIN_FILENO
, uid
, gid
);
3351 (void) fchown(STDOUT_FILENO
, uid
, gid
);
3352 (void) fchown(STDERR_FILENO
, uid
, gid
);
3354 if (setgroups(n_uids
, uids
) < 0)
3355 return log_error_errno(errno
, "Failed to set auxiliary groups: %m");
3357 if (setresgid(gid
, gid
, gid
) < 0)
3358 return log_error_errno(errno
, "setresgid() failed: %m");
3360 if (setresuid(uid
, uid
, uid
) < 0)
3361 return log_error_errno(errno
, "setresuid() failed: %m");
/* Return values:
3373 * < 0 : wait_for_terminate() failed to get the state of the
3374 * container, the container was terminated by a signal, or
3375 * failed for an unknown reason. No change is made to the
3376 * container argument.
3377 * > 0 : The program executed in the container terminated with an
3378 * error. The exit code of the program executed in the
3379 * container is returned. The container argument has been set
3380 * to CONTAINER_TERMINATED.
3381 * 0 : The container is being rebooted, has been shut down or exited
3382 * successfully. The container argument has been set to either
3383 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3385 * That is, success is indicated by a return value of zero, and an
3386 * error is indicated by a non-zero value.
 */
/* Waits for the container process `pid` to terminate and classifies how it
 * ended, based on the si_code/si_status reported by wait_for_terminate().
 *
 * A normal exit sets *container to CONTAINER_TERMINATED and returns the exit
 * status. Termination by SIGINT is logged as an orderly shutdown
 * (CONTAINER_TERMINATED) and by SIGHUP as a reboot (CONTAINER_REBOOTED);
 * other signals and unknown causes are logged as errors. Informational
 * messages are demoted to debug level when arg_quiet is set. */
3388 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
3392 r
= wait_for_terminate(pid
, &status
);
3394 return log_warning_errno(r
, "Failed to wait for container: %m");
3396 switch (status
.si_code
) {
3399 if (status
.si_status
== 0) {
3400 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
3403 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
3405 *container
= CONTAINER_TERMINATED
;
3406 return status
.si_status
;
3409 if (status
.si_status
== SIGINT
) {
3411 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
3412 *container
= CONTAINER_TERMINATED
;
3415 } else if (status
.si_status
== SIGHUP
) {
3417 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
3418 *container
= CONTAINER_REBOOTED
;
3422 /* CLD_KILLED fallthrough */
3425 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
3429 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Handler with a deliberately empty body: `sig` is accepted and ignored.
 * NOTE(review): presumably installed as a signal handler so that a signal
 * interrupts a blocking call without otherwise acting on it -- confirm at
 * the place where it is registered. */
3436 static void nop_handler(int sig
) {}
/* Event-source callback that asks the container to shut down.
 *
 * `userdata` carries the target PID (decoded with PTR_TO_UINT32). The
 * configured arg_kill_signal is sent to it; if that succeeds a hint is logged
 * and the source's userdata is cleared to NULL. The sd_event_exit() call at
 * the end requests termination of the event loop with exit code 0.
 * NOTE(review): the control flow between the two paths is not fully visible
 * here -- presumably the loop is exited only when no signal could be
 * delivered; confirm. */
3438 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
3441 pid
= PTR_TO_UINT32(userdata
);
3443 if (kill(pid
, arg_kill_signal
) >= 0) {
3444 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3445 sd_event_source_set_userdata(s
, NULL
);
3450 sd_event_exit(sd_event_source_get_event(s
), 0);
/* Derives arg_directory, arg_image and arg_machine from whichever of them
 * were given on the command line.
 *
 * With a template and a machine name but no directory, the directory becomes
 * "/var/lib/machines/<machine>". With neither image nor directory, the
 * machine name is looked up with image_find(); a raw image sets arg_image,
 * anything else sets arg_directory, and a read-only image forces
 * arg_read_only. The current working directory is the last fallback for
 * arg_directory. If no machine name was given it is taken from the host name
 * (when the directory is "/") or from the base name of the image/directory,
 * cleaned up and validated; ephemeral machines get a random 64-bit hex
 * suffix appended.
 *
 * The "No image for machine" message no longer ends in ": %m": it is logged
 * with log_error(), which is given no error code, so %m would have expanded
 * to whatever errno happened to contain. */
3454 static int determine_names(void) {
3457 if (arg_template
&& !arg_directory
&& arg_machine
) {
3459 /* If --template= was specified then we should not
3460 * search for a machine, but instead create a new one
3461 * in /var/lib/machines. */
3463 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
3468 if (!arg_image
&& !arg_directory
) {
3470 _cleanup_(image_unrefp
) Image
*i
= NULL
;
3472 r
= image_find(arg_machine
, &i
);
3474 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
3476 log_error("No image for machine '%s'.", arg_machine
);
3480 if (i
->type
== IMAGE_RAW
)
3481 r
= set_sanitized_path(&arg_image
, i
->path
);
3483 r
= set_sanitized_path(&arg_directory
, i
->path
);
3485 return log_error_errno(r
, "Invalid image directory: %m");
3488 arg_read_only
= arg_read_only
|| i
->read_only
;
3490 arg_directory
= get_current_dir_name();
3492 if (!arg_directory
&& !arg_machine
) {
3493 log_error("Failed to determine path, please use -D or -i.");
3499 if (arg_directory
&& path_equal(arg_directory
, "/"))
3500 arg_machine
= gethostname_malloc();
3502 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
3507 hostname_cleanup(arg_machine
);
3508 if (!machine_name_is_valid(arg_machine
)) {
3509 log_error("Failed to determine machine name automatically, please use -M.");
3513 if (arg_ephemeral
) {
3516 /* Add a random suffix when this is an
3517 * ephemeral machine, so that we can run many
3518 * instances at once without manually having
3519 * to specify -M each time. */
3521 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
/* Determines the UID/GID base used for the container's user namespace.
 *
 * When arg_uid_shift is still UID_INVALID it is derived from the owner of
 * `directory`: the upper 16 bits of st_uid become the shift, the upper 16
 * bits of st_gid must be identical, and arg_uid_range is set to 0x10000.
 * The shift is then checked against (uid_t) -1 - arg_uid_range so that
 * shift plus range cannot exceed the uid_t value space, and the resulting
 * base and range are logged. */
3532 static int determine_uid_shift(const char *directory
) {
3540 if (arg_uid_shift
== UID_INVALID
) {
3543 r
= stat(directory
, &st
);
3545 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
3547 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
3549 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
3550 log_error("UID and GID base of %s don't match.", directory
);
3554 arg_uid_range
= UINT32_C(0x10000);
3557 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
3558 log_error("UID base too high for UID range.");
3562 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
/* Final set-up inside the container, ending in exec of the payload.
 *
 * The visible steps, in order: synchronise with the parent over `barrier`
 * (points #1 and #2) so it can write the UID map; mount the API file systems
 * with mount_all(); synchronise again (#3) and make the systemd cgroup
 * writable; become root with reset_uid_gid(); set up the boot ID and kmsg;
 * hand over the rtnl socket when private networking is on; drop
 * capabilities; apply the requested personality (PER_LINUX32 for a
 * `secondary` root); set the SELinux exec context; switch user with
 * change_uid_gid(); and build the environment (PATH, container=, TERM, HOME,
 * USER, LOGNAME, optionally container_uuid and LISTEN_FDS/LISTEN_PID) merged
 * with arg_setenv. After barrier point #4 the remaining descriptors are
 * closed and the payload is executed: an init system searched at
 * /usr/lib/systemd/systemd, /lib/systemd/systemd and /sbin/init, or
 * arg_parameters via execvpe(), or a login shell (/bin/bash, then /bin/sh)
 * started from the user's home directory. Reaching the end therefore means
 * every exec attempt failed. */
3566 static int inner_child(
3568 const char *directory
,
3574 _cleanup_free_
char *home
= NULL
;
3576 const char *envp
[] = {
3577 "PATH=" DEFAULT_PATH_SPLIT_USR
,
3578 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3583 NULL
, /* container_uuid */
3584 NULL
, /* LISTEN_FDS */
3585 NULL
, /* LISTEN_PID */
3589 _cleanup_strv_free_
char **env_use
= NULL
;
3594 assert(kmsg_socket
>= 0);
3599 /* Tell the parent, that it now can write the UID map. */
3600 (void) barrier_place(barrier
); /* #1 */
3602 /* Wait until the parent wrote the UID map */
3603 if (!barrier_place_and_sync(barrier
)) { /* #2 */
3604 log_error("Parent died too early");
3609 r
= mount_all(NULL
, true, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
3613 /* Wait until we are cgroup-ified, so that we
3614 * can mount the right cgroup path writable */
3615 if (!barrier_place_and_sync(barrier
)) { /* #3 */
3616 log_error("Parent died too early");
3620 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
3624 r
= reset_uid_gid();
3626 return log_error_errno(r
, "Couldn't become new root: %m");
3628 r
= setup_boot_id(NULL
);
3632 r
= setup_kmsg(NULL
, kmsg_socket
);
3635 kmsg_socket
= safe_close(kmsg_socket
);
3640 return log_error_errno(errno
, "setsid() failed: %m");
3642 if (arg_private_network
)
3645 r
= send_rtnl(rtnl_socket
);
3648 rtnl_socket
= safe_close(rtnl_socket
);
3650 if (drop_capabilities() < 0)
3651 return log_error_errno(errno
, "drop_capabilities() failed: %m");
3655 if (arg_personality
!= PERSONALITY_INVALID
) {
3656 if (personality(arg_personality
) < 0)
3657 return log_error_errno(errno
, "personality() failed: %m");
3658 } else if (secondary
) {
3659 if (personality(PER_LINUX32
) < 0)
3660 return log_error_errno(errno
, "personality() failed: %m");
3664 if (arg_selinux_context
)
3665 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
3666 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
3669 r
= change_uid_gid(&home
);
3673 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
3677 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
3678 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
3679 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
3682 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
3685 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
3689 if (fdset_size(fds
) > 0) {
3690 r
= fdset_cloexec(fds
, false);
3692 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
3694 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
3695 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
3699 env_use
= strv_env_merge(2, envp
, arg_setenv
);
3703 /* Let the parent know that we are ready and
3704 * wait until the parent is ready with the
3706 if (!barrier_place_and_sync(barrier
)) { /* #4 */
3707 log_error("Parent died too early");
3711 /* Now, explicitly close the log, so that we
3712 * then can close all remaining fds. Closing
3713 * the log explicitly first has the benefit
3714 * that the logging subsystem knows about it,
3715 * and is thus ready to be reopened should we
3716 * need it again. Note that the other fds
3717 * closed here are at least the locking and
3720 (void) fdset_close_others(fds
);
3726 /* Automatically search for the init system */
3728 m
= 1 + strv_length(arg_parameters
);
3729 a
= newa(char*, m
+ 1);
3730 if (strv_isempty(arg_parameters
))
3733 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
3735 a
[0] = (char*) "/usr/lib/systemd/systemd";
3736 execve(a
[0], a
, env_use
);
3738 a
[0] = (char*) "/lib/systemd/systemd";
3739 execve(a
[0], a
, env_use
);
3741 a
[0] = (char*) "/sbin/init";
3742 execve(a
[0], a
, env_use
);
3743 } else if (!strv_isempty(arg_parameters
))
3744 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
3746 chdir(home
?: "/root");
3747 execle("/bin/bash", "-bash", NULL
, env_use
);
3748 execle("/bin/sh", "-sh", NULL
, env_use
);
3752 return log_error_errno(errno
, "execv() failed: %m");
/* Prepares the container's file system tree and then clones the inner child.
 *
 * The visible steps, in order: request SIGKILL on parent death; replace
 * stdin/stdout/stderr with the `console` terminal; reset the audit login
 * UID; mark all mounts as MS_SLAVE|MS_REC; mount the image partitions with
 * mount_devices(); determine the UID shift and send it over
 * `uid_shift_socket`; turn `directory` into a recursive bind mount; apply
 * the volatile modes and create the base file system; optionally remount the
 * tree read-only; mount the API file systems; populate /dev, pts, the
 * propagation directory and the console; set up seccomp, time zone,
 * resolv.conf and the journal; apply custom and cgroup mounts; and move the
 * root. raw_clone() then creates the inner child in a new mount namespace,
 * plus IPC/PID/UTS namespaces unless arg_share_system is set, a network
 * namespace when arg_private_network is set and a user namespace when
 * arg_userns is set. The child runs inner_child() and exits; this process
 * sends the child's PID over `pid_socket` and closes that socket. */
3755 static int outer_child(
3757 const char *directory
,
3758 const char *console
,
3759 const char *root_device
, bool root_device_rw
,
3760 const char *home_device
, bool home_device_rw
,
3761 const char *srv_device
, bool srv_device_rw
,
3767 int uid_shift_socket
,
3777 assert(pid_socket
>= 0);
3778 assert(kmsg_socket
>= 0);
3782 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
3783 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
/* With descriptors 0-2 closed, the terminal opened next is expected to land
 * on STDIN_FILENO; it is then duplicated onto stdout and stderr. */
3786 close_nointr(STDIN_FILENO
);
3787 close_nointr(STDOUT_FILENO
);
3788 close_nointr(STDERR_FILENO
);
3790 r
= open_terminal(console
, O_RDWR
);
3791 if (r
!= STDIN_FILENO
) {
3797 return log_error_errno(r
, "Failed to open console: %m");
3800 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
3801 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
3802 return log_error_errno(errno
, "Failed to duplicate console: %m");
3805 r
= reset_audit_loginuid();
3809 /* Mark everything as slave, so that we still
3810 * receive mounts from the real root, but don't
3811 * propagate mounts to the real root. */
3812 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
3813 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
3815 r
= mount_devices(directory
,
3816 root_device
, root_device_rw
,
3817 home_device
, home_device_rw
,
3818 srv_device
, srv_device_rw
);
3822 r
= determine_uid_shift(directory
);
3827 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
3829 return log_error_errno(errno
, "Failed to send UID shift: %m");
3830 if (l
!= sizeof(arg_uid_shift
)) {
3831 log_error("Short write while sending UID shift.");
3836 /* Turn directory into bind mount */
3837 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
3838 return log_error_errno(errno
, "Failed to make bind mount: %m");
3840 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
3844 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
3848 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
3852 if (arg_read_only
) {
3853 r
= bind_remount_recursive(directory
, true);
3855 return log_error_errno(r
, "Failed to make tree read-only: %m");
3858 r
= mount_all(directory
, false, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
3862 if (copy_devnodes(directory
) < 0)
3865 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
3867 if (setup_pts(directory
) < 0)
3870 r
= setup_propagate(directory
);
3874 r
= setup_dev_console(directory
, console
);
3878 r
= setup_seccomp();
3882 r
= setup_timezone(directory
);
3886 r
= setup_resolv_conf(directory
);
3890 r
= setup_journal(directory
);
3894 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
3898 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
3902 r
= mount_move_root(directory
);
3904 return log_error_errno(r
, "Failed to move root directory: %m");
3906 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
3907 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
3908 (arg_private_network
? CLONE_NEWNET
: 0) |
3909 (arg_userns
? CLONE_NEWUSER
: 0),
3912 return log_error_errno(errno
, "Failed to fork inner child: %m");
3914 pid_socket
= safe_close(pid_socket
);
3915 uid_shift_socket
= safe_close(uid_shift_socket
);
3917 /* The inner child has all namespaces that are
3918 * requested, so that we all are owned by the user if
3919 * user namespaces are turned on. */
3921 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
3923 _exit(EXIT_FAILURE
);
3925 _exit(EXIT_SUCCESS
);
3928 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
3930 return log_error_errno(errno
, "Failed to send PID: %m");
3931 if (l
!= sizeof(pid
)) {
3932 log_error("Short write while sending PID.");
3936 pid_socket
= safe_close(pid_socket
);
/* Writes the user-namespace ID mappings of process `pid`.
 *
 * A single line "0 <arg_uid_shift> <arg_uid_range>\n" is formatted once and
 * written first to /proc/<pid>/uid_map and then, unchanged, to
 * /proc/<pid>/gid_map, so UIDs and GIDs get the same mapping. Both buffers
 * are sized from DECIMAL_STR_MAX() and filled with xsprintf(). A failing
 * write is logged and its error returned. */
3941 static int setup_uid_map(pid_t pid
) {
3942 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
3947 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
3948 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
3949 r
= write_string_file(uid_map
, line
, 0);
3951 return log_error_errno(r
, "Failed to write UID map: %m");
3953 /* We always assign the same UID and GID ranges */
3954 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
3955 r
= write_string_file(uid_map
, line
, 0);
3957 return log_error_errno(r
, "Failed to write GID map: %m");
/* Hands ownership of the container's cgroup files to the container's root.
 *
 * The cgroup path of `pid` is resolved and translated into a file system
 * path in the SYSTEMD_CGROUP_CONTROLLER hierarchy, and that directory is
 * opened. Each listed attribute file is then fchownat()ed to arg_uid_shift
 * as both owner and group. A failing chown is logged and ignored: at debug
 * level when the file does not exist (ENOENT), as a warning otherwise. */
3962 static int chown_cgroup(pid_t pid
) {
3963 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
3964 _cleanup_close_
int fd
= -1;
3968 r
= cg_pid_get_path(NULL
, pid
, &path
);
3970 return log_error_errno(r
, "Failed to get container cgroup path: %m");
3972 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
3974 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
3976 fd
= open(fs
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
3978 return log_error_errno(errno
, "Failed to open %s: %m", fs
);
3983 "notify_on_release",
3985 "cgroup.clone_children",
3986 "cgroup.controllers",
3987 "cgroup.subtree_control",
3989 if (fchownat(fd
, fn
, arg_uid_shift
, arg_uid_shift
, 0) < 0)
3990 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
3991 "Failed to chown() cgroup file %s, ignoring: %m", fn
);
/* Mirrors the container's cgroup membership into the "other" hierarchy.
 *
 * Nothing needs to be done when the host's cgroup mode (cg_unified())
 * matches arg_unified_cgroup_hierarchy. Otherwise the cgroup path of `pid`
 * in the SYSTEMD_CGROUP_CONTROLLER hierarchy is looked up, the other
 * hierarchy is mounted on a temporary directory (template
 * "/tmp/unifiedXXXXXX") with either the legacy name=systemd options or
 * "__DEVEL__sane_behavior", the same path is created there, and the PID is
 * written to its cgroup.procs file. The temporary mount is unmounted again
 * afterwards.
 *
 * The PID is formatted with xsprintf(), like setup_uid_map() does, rather
 * than with an unchecked sprintf() into the fixed-size buffer. */
3996 static int sync_cgroup(pid_t pid
) {
3997 _cleanup_free_
char *cgroup
= NULL
;
3998 char tree
[] = "/tmp/unifiedXXXXXX", pid_string
[DECIMAL_STR_MAX(pid
) + 1];
3999 bool undo_mount
= false;
4003 unified
= cg_unified();
4005 return log_error_errno(unified
, "Failed to determine whether the unified hierachy is used: %m");
4007 if ((unified
> 0) == arg_unified_cgroup_hierarchy
)
4010 /* When the host uses the legacy cgroup setup, but the
4011 * container shall use the unified hierarchy, let's make sure
4012 * we copy the path from the name=systemd hierarchy into the
4013 * unified hierarchy. Similar for the reverse situation. */
4015 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
4017 return log_error_errno(r
, "Failed to get control group of " PID_FMT
": %m", pid
);
4019 /* In order to access the unified hierarchy we need to mount it */
4021 return log_error_errno(errno
, "Failed to generate temporary mount point for unified hierarchy: %m");
4024 r
= mount("cgroup", tree
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "none,name=systemd,xattr");
4026 r
= mount("cgroup", tree
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "__DEVEL__sane_behavior");
4028 r
= log_error_errno(errno
, "Failed to mount unified hierarchy: %m");
4034 fn
= strjoina(tree
, cgroup
, "/cgroup.procs");
4035 (void) mkdir_parents(fn
, 0755);
4037 xsprintf(pid_string
, PID_FMT
, pid
);
4038 r
= write_string_file(fn
, pid_string
, 0);
4040 log_error_errno(r
, "Failed to move process: %m");
4044 (void) umount(tree
);
/* Splits our own cgroup into a "payload" and a "supervisor" subgroup.
 *
 * This only applies when the container uses the unified hierarchy
 * (arg_unified_cgroup_hierarchy); cg_unified() is queried for the host as
 * well. The set of supported controllers and our own cgroup path are looked
 * up, `pid` is attached to "<cgroup>/payload" and the calling process
 * (PID 0) to "<cgroup>/supervisor". Enabling controllers for the subtree is
 * attempted afterwards on a best-effort basis (result ignored). */
4050 static int create_subcgroup(pid_t pid
) {
4051 _cleanup_free_
char *cgroup
= NULL
;
4054 CGroupMask supported
;
4056 /* In the unified hierarchy inner nodes may only contain
4057 * subgroups, but not processes. Hence, if we are running in the
4058 * unified hierarchy and the container does the same, and we
4059 * did not create a scope unit for the container move us and
4060 * the container into two separate subcgroups. */
4065 if (!arg_unified_cgroup_hierarchy
)
4068 unified
= cg_unified();
4070 return log_error_errno(unified
, "Failed to determine whether the unified hierachy is used: %m");
4074 r
= cg_mask_supported(&supported
);
4076 return log_error_errno(r
, "Failed to determine supported controllers: %m");
4078 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &cgroup
);
4080 return log_error_errno(r
, "Failed to get our control group: %m");
4082 child
= strjoina(cgroup
, "/payload");
4083 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, child
, pid
);
4085 return log_error_errno(r
, "Failed to create %s subcgroup: %m", child
);
4087 child
= strjoina(cgroup
, "/supervisor");
4088 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, child
, 0);
4090 return log_error_errno(r
, "Failed to create %s subcgroup: %m", child
);
4092 /* Try to enable as many controllers as possible for the new payload. */
4093 (void) cg_enable_everywhere(supported
, supported
, cgroup
);
/* Locates "<machine>.nspawn" and merges its settings into the arg_*
 * variables.
 *
 * Nothing is searched when every setting is masked in arg_settings_mask.
 * The file is looked up first in /etc/systemd/nspawn and /run/systemd/nspawn
 * (trusted by default) and then next to the image or directory being booted
 * (untrusted by default); an explicitly chosen arg_settings_trusted (>= 0)
 * is left alone. Each group of settings is copied only if its SETTING_* bit
 * is not set in the mask and the file actually provides a value; ownership
 * of strings, string lists, custom mounts and exposed ports is moved out of
 * `settings` by NULLing the source pointer. For an untrusted file a warning
 * is logged for Capability=, MachineID=, the mount settings, the network
 * settings and Port=. Any network setting implies arg_private_network. */
4097 static int load_settings(void) {
4098 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
4099 _cleanup_fclose_
FILE *f
= NULL
;
4100 _cleanup_free_
char *p
= NULL
;
4104 /* If all settings are masked, there's no point in looking for
4105 * the settings file */
4106 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
4109 fn
= strjoina(arg_machine
, ".nspawn");
4111 /* We first look in the admin's directories in /etc and /run */
4112 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4113 _cleanup_free_
char *j
= NULL
;
4115 j
= strjoin(i
, "/", fn
, NULL
);
4124 /* By default we trust configuration from /etc and /run */
4125 if (arg_settings_trusted
< 0)
4126 arg_settings_trusted
= true;
4131 if (errno
!= ENOENT
)
4132 return log_error_errno(errno
, "Failed to open %s: %m", j
);
4136 /* After that, let's look for a file next to the
4137 * actual image we shall boot. */
4140 p
= file_in_same_dir(arg_image
, fn
);
4143 } else if (arg_directory
) {
4144 p
= file_in_same_dir(arg_directory
, fn
);
4151 if (!f
&& errno
!= ENOENT
)
4152 return log_error_errno(errno
, "Failed to open %s: %m", p
);
4154 /* By default we do not trust configuration from /var/lib/machines */
4155 if (arg_settings_trusted
< 0)
4156 arg_settings_trusted
= false;
4163 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
4165 r
= settings_load(f
, p
, &settings
);
4169 /* Copy over bits from the settings, unless they have been
4170 * explicitly masked by command line switches. */
4172 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
4173 settings
->boot
>= 0) {
4174 arg_boot
= settings
->boot
;
4176 strv_free(arg_parameters
);
4177 arg_parameters
= settings
->parameters
;
4178 settings
->parameters
= NULL
;
4181 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
4182 settings
->environment
) {
4183 strv_free(arg_setenv
);
4184 arg_setenv
= settings
->environment
;
4185 settings
->environment
= NULL
;
4188 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
4191 arg_user
= settings
->user
;
4192 settings
->user
= NULL
;
4195 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
4197 if (!arg_settings_trusted
&& settings
->capability
!= 0)
4198 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
4200 arg_retain
|= settings
->capability
;
4202 arg_retain
&= ~settings
->drop_capability
;
4205 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
4206 settings
->kill_signal
> 0)
4207 arg_kill_signal
= settings
->kill_signal
;
4209 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
4210 settings
->personality
!= PERSONALITY_INVALID
)
4211 arg_personality
= settings
->personality
;
4213 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
4214 !sd_id128_is_null(settings
->machine_id
)) {
4216 if (!arg_settings_trusted
)
4217 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
4219 arg_uuid
= settings
->machine_id
;
4222 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
4223 settings
->read_only
>= 0)
4224 arg_read_only
= settings
->read_only
;
4226 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
4227 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
4228 arg_volatile_mode
= settings
->volatile_mode
;
4230 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
4231 settings
->n_custom_mounts
> 0) {
4233 if (!arg_settings_trusted
)
4234 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
4236 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
4237 arg_custom_mounts
= settings
->custom_mounts
;
4238 arg_n_custom_mounts
= settings
->n_custom_mounts
;
4240 settings
->custom_mounts
= NULL
;
4241 settings
->n_custom_mounts
= 0;
4245 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
4246 (settings
->private_network
>= 0 ||
4247 settings
->network_veth
>= 0 ||
4248 settings
->network_bridge
||
4249 settings
->network_interfaces
||
4250 settings
->network_macvlan
||
4251 settings
->network_ipvlan
)) {
4253 if (!arg_settings_trusted
)
4254 log_warning("Ignoring network settings, file %s is not trusted.", p
);
4256 strv_free(arg_network_interfaces
);
4257 arg_network_interfaces
= settings
->network_interfaces
;
4258 settings
->network_interfaces
= NULL
;
4260 strv_free(arg_network_macvlan
);
4261 arg_network_macvlan
= settings
->network_macvlan
;
4262 settings
->network_macvlan
= NULL
;
4264 strv_free(arg_network_ipvlan
);
4265 arg_network_ipvlan
= settings
->network_ipvlan
;
4266 settings
->network_ipvlan
= NULL
;
4268 free(arg_network_bridge
);
4269 arg_network_bridge
= settings
->network_bridge
;
4270 settings
->network_bridge
= NULL
;
4272 arg_network_veth
= settings
->network_veth
> 0 || settings
->network_bridge
;
4274 arg_private_network
= true; /* all these settings imply private networking */
4278 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
4279 settings
->expose_ports
) {
4281 if (!arg_settings_trusted
)
4282 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
4284 expose_port_free_all(arg_expose_ports
);
4285 arg_expose_ports
= settings
->expose_ports
;
4286 settings
->expose_ports
= NULL
;
/* Program entry point. Judging from the calls visible below it: parses the
 * command line, complains when not run as root, loads and verifies settings,
 * prepares the container tree (plain directory, ephemeral snapshot, template
 * copy, or disk image), allocates a pseudo TTY, clones the outer child,
 * performs the parent-side UID map/network/machine/cgroup setup for the inner
 * child, forwards the TTY through an sd-event loop, and finally releases the
 * acquired resources.
 * Returns EXIT_FAILURE when r is negative at the end, otherwise ret. */
4293 int main(int argc
, char *argv
[]) {
4295 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
4296 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
4297 _cleanup_close_
int master
= -1, image_fd
= -1;
4298 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
4299 int r
, n_fd_passed
, loop_nr
= -1;
4300 char veth_name
[IFNAMSIZ
];
4301 bool secondary
= false, remove_subvol
= false;
4304 int ret
= EXIT_SUCCESS
;
4305 union in_addr_union exposed
= {};
4306 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
4309 log_parse_environment();
4312 r
= parse_argv(argc
, argv
);
/* An effective UID other than 0 is reported as an error. */
4316 if (geteuid() != 0) {
4317 log_error("Need to be root.");
4321 r
= determine_names();
4325 r
= load_settings();
4329 r
= verify_arguments();
/* Pick up file descriptors handed to us by the invoker and keep them in fds. */
4333 n_fd_passed
= sd_listen_fds(false);
4334 if (n_fd_passed
> 0) {
4335 r
= fdset_new_listen_fds(&fds
, false);
4337 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory case: the container tree is a directory (arg_directory is set). */
4342 if (arg_directory
) {
4345 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
4346 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4351 if (arg_ephemeral
) {
4352 _cleanup_free_
char *np
= NULL
;
4354 /* If the specified path is a mount point we
4355 * generate the new snapshot immediately
4356 * inside it under a random name. However if
4357 * the specified path is not a mount point we
4358 * create the new snapshot in the parent
4359 * directory, just next to it. */
4360 r
= path_is_mount_point(arg_directory
, 0);
4362 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
4366 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
4368 r
= tempfn_random(arg_directory
, "machine.", &np
);
4370 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* Lock the snapshot path (shared when read-only, exclusive otherwise,
 * always non-blocking) before creating the snapshot itself. */
4374 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4376 log_error_errno(r
, "Failed to lock %s: %m", np
);
4380 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4382 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
4386 free(arg_directory
);
/* Remember that the snapshot has to be removed again during cleanup. */
4390 remove_subvol
= true;
4393 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4395 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
4399 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate the directory from the template tree via a snapshot (with copy fallback). */
4404 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4407 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
4409 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
4413 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks: the tree must look like an OS root and contain the binary to run. */
4419 if (path_is_os_tree(arg_directory
) <= 0) {
4420 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
4427 p
= strjoina(arg_directory
,
4428 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
4429 if (access(p
, F_OK
) < 0) {
4430 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
/* Disk image case: arg_directory becomes a freshly created temporary
 * directory (mkdtemp() on the template below). */
4437 char template[] = "/tmp/nspawn-root-XXXXXX";
4440 assert(!arg_template
);
4442 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4444 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
4448 r
= log_error_errno(r
, "Failed to create image lock: %m");
4452 if (!mkdtemp(template)) {
4453 log_error_errno(errno
, "Failed to create temporary directory: %m");
4458 arg_directory
= strdup(template);
4459 if (!arg_directory
) {
4464 image_fd
= setup_image(&device_path
, &loop_nr
);
4470 r
= dissect_image(image_fd
,
4471 &root_device
, &root_device_rw
,
4472 &home_device
, &home_device_rw
,
4473 &srv_device
, &srv_device_rw
,
4479 r
= custom_mounts_prepare();
4484 isatty(STDIN_FILENO
) > 0 &&
4485 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pseudo TTY master; the name of its slave side is stored in console. */
4487 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
4489 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
4493 r
= ptsname_malloc(master
, &console
);
4495 r
= log_error_errno(r
, "Failed to determine tty name: %m");
4499 if (unlockpt(master
) < 0) {
4500 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
4505 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4506 arg_machine
, arg_image
?: arg_directory
);
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT; mask_chld holds only SIGCHLD
 * so that it can be unblocked and re-blocked separately further down. */
4508 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
4510 assert_se(sigemptyset(&mask_chld
) == 0);
4511 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
4513 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
4514 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* Datagram socket pairs for talking to the child processes: the child side
 * closes its [0] ends and hands [1] ends to outer_child(), while the parent
 * reads the inner child PID and the UID shift from the [0] ends. */
4519 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
4520 uid_shift_socket_pair
[2] = { -1, -1 };
4521 ContainerStatus container_status
;
4522 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
4523 static const struct sigaction sa
= {
4524 .sa_handler
= nop_handler
,
4525 .sa_flags
= SA_NOCLDSTOP
,
4529 _cleanup_event_unref_ sd_event
*event
= NULL
;
4530 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
4531 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
4534 r
= barrier_create(&barrier
);
4536 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
4540 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
4541 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
4545 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
4546 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
4550 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
4551 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
4556 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
4557 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
4561 /* Child can be killed before execv(), so handle SIGCHLD
4562 * in order to interrupt parent's blocking calls and
4563 * give it a chance to call wait() and terminate. */
4564 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
4566 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
4570 r
= sigaction(SIGCHLD
, &sa
, NULL
);
4572 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
/* Create the outer child in a new mount namespace (CLONE_NEWNS). */
4576 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
4578 if (errno
== EINVAL
)
4579 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4581 r
= log_error_errno(errno
, "clone() failed: %m");
4587 /* The outer child only has a file system namespace. */
4588 barrier_set_role(&barrier
, BARRIER_CHILD
);
4590 master
= safe_close(master
);
4592 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
4593 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
4594 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
4595 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
4597 (void) reset_all_signal_handlers();
4598 (void) reset_signal_mask();
4600 r
= outer_child(&barrier
,
4603 root_device
, root_device_rw
,
4604 home_device
, home_device_rw
,
4605 srv_device
, srv_device_rw
,
4609 kmsg_socket_pair
[1],
4610 rtnl_socket_pair
[1],
4611 uid_shift_socket_pair
[1],
4614 _exit(EXIT_FAILURE
);
4616 _exit(EXIT_SUCCESS
);
/* Parent side from here on. */
4619 barrier_set_role(&barrier
, BARRIER_PARENT
);
4624 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
4625 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
4626 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
4628 /* Wait for the outer child. */
4629 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
4638 /* And now retrieve the PID of the inner child. */
4639 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
4641 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
/* NOTE(review): a short read is not an errno failure, so the trailing %m in
 * the message below presumably carries no useful error text - confirm and
 * consider dropping it. */
4644 if (l
!= sizeof(pid
)) {
4645 log_error("Short read while reading inner child PID: %m");
4650 log_debug("Init process invoked as PID " PID_FMT
, pid
);
4653 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
4654 log_error("Child died too early.");
4659 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
4661 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
/* NOTE(review): same concern as above - %m after a short read looks unintended. */
4664 if (l
!= sizeof(arg_uid_shift
)) {
4665 log_error("Short read while reading UID shift: %m");
4670 r
= setup_uid_map(pid
);
4674 (void) barrier_place(&barrier
); /* #2 */
/* Parent-side network, machine registration and cgroup setup, all keyed by
 * the inner child's PID. */
4677 r
= move_network_interfaces(pid
);
4681 r
= setup_veth(pid
, veth_name
, &ifi
);
4685 r
= setup_bridge(veth_name
, &ifi
);
4689 r
= setup_macvlan(pid
);
4693 r
= setup_ipvlan(pid
);
4697 r
= register_machine(pid
, ifi
);
4701 r
= sync_cgroup(pid
);
4705 r
= create_subcgroup(pid
);
4709 r
= chown_cgroup(pid
);
4713 /* Notify the child that the parent is ready with all
4714 * its setup (including cgroup-ification), and that
4715 * the child can now hand over control to the code to
4716 * run inside the container. */
4717 (void) barrier_place(&barrier
); /* #3 */
4719 /* Block SIGCHLD here, before notifying child.
4720 * process_pty() will handle it with the other signals. */
4721 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
4723 /* Reset signal to default */
4724 r
= default_signals(SIGCHLD
, -1);
4726 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
4730 /* Let the child know that we are ready and wait until the child is completely ready. */
4731 if (!barrier_place_and_sync(&barrier
)) { /* #5 */
4732 log_error("Client died too early.");
4739 "STATUS=Container running.\n"
4740 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
/* Event loop used for signal handling, rtnl watching and PTY forwarding. */
4742 r
= sd_event_new(&event
);
4744 log_error_errno(r
, "Failed to get default event source: %m");
4748 if (arg_kill_signal
> 0) {
4749 /* Try to kill the init system on SIGINT or SIGTERM */
4750 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4751 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4753 /* Immediately exit */
4754 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
4755 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
4758 /* simply exit on sigchld */
4759 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
4761 if (arg_expose_ports
) {
4762 r
= watch_rtnl(event
, rtnl_socket_pair
[0], &exposed
, &rtnl
);
4766 (void) expose_ports(rtnl
, &exposed
);
4769 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
4771 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
4773 log_error_errno(r
, "Failed to create PTY forwarder: %m");
4777 r
= sd_event_loop(event
);
4779 log_error_errno(r
, "Failed to run event loop: %m");
4783 pty_forward_get_last_char(forward
, &last_char
);
4785 forward
= pty_forward_free(forward
);
4787 if (!arg_quiet
&& last_char
!= '\n')
4790 /* Kill if it is not dead yet anyway */
4791 terminate_machine(pid
);
4793 /* Normally redundant, but better safe than sorry */
4796 r
= wait_for_container(pid
, &container_status
);
4800 /* We failed to wait for the container, or the
4801 * container exited abnormally */
4803 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
4804 /* The container exited with a non-zero
4805 * status, or with zero status and no reboot
4811 /* CONTAINER_REBOOTED, loop again */
4813 if (arg_keep_unit
) {
4814 /* Special handling if we are running as a
4815 * service: instead of simply restarting the
4816 * machine we want to restart the entire
4817 * service, so let's inform systemd about this
4818 * with the special exit code 133. The service
4819 * file uses RestartForceExitStatus=133 so
4820 * that this results in a full nspawn
4821 * restart. This is necessary since we might
4822 * have cgroup parameters set we want to have
4829 flush_ports(&exposed
);
4835 "STATUS=Terminating...");
4840 /* Try to flush whatever is still queued in the pty */
4842 (void) copy_bytes(master
, STDOUT_FILENO
, (off_t
) -1, false);
/* Cleanup: release the loop device and, when remove_subvol was set for an
 * ephemeral run, remove the snapshot subvolume again (failure is only warned about). */
4844 loop_remove(loop_nr
, &image_fd
);
4846 if (remove_subvol
&& arg_directory
) {
4849 k
= btrfs_subvol_remove(arg_directory
, true);
4851 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
4857 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
4858 (void) rm_rf(p
, REMOVE_ROOT
);
4861 flush_ports(&exposed
);
/* Release the global argument state. */
4863 free(arg_directory
);
4868 strv_free(arg_setenv
);
4869 free(arg_network_bridge
);
4870 strv_free(arg_network_interfaces
);
4871 strv_free(arg_network_macvlan
);
4872 strv_free(arg_network_ipvlan
);
4873 strv_free(arg_parameters
);
4874 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
4875 expose_port_free_all(arg_expose_ports
);
/* A negative r always maps to EXIT_FAILURE; otherwise ret is reported. */
4877 return r
< 0 ? EXIT_FAILURE
: ret
;