1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <sys/personality.h>
38 #include <linux/loop.h>
42 #include <selinux/selinux.h>
50 #include <blkid/blkid.h>
53 #include "sd-daemon.h"
56 #include "random-util.h"
63 #include "cgroup-util.h"
65 #include "path-util.h"
66 #include "loopback-setup.h"
67 #include "dev-setup.h"
72 #include "bus-error.h"
75 #include "netlink-util.h"
76 #include "udev-util.h"
77 #include "blkid-util.h"
80 #include "base-filesystem.h"
82 #include "event-util.h"
83 #include "capability.h"
85 #include "btrfs-util.h"
86 #include "machine-image.h"
88 #include "in-addr-util.h"
89 #include "formats-util.h"
90 #include "process-util.h"
91 #include "terminal-util.h"
92 #include "hostname-util.h"
93 #include "signal-util.h"
96 #include "seccomp-util.h"
100 #include "nspawn-settings.h"
101 #include "nspawn-mount.h"
102 #include "nspawn-network.h"
103 #include "nspawn-expose-ports.h"
105 typedef enum ContainerStatus
{
106 CONTAINER_TERMINATED
,
110 typedef enum LinkJournal
{
117 static char *arg_directory
= NULL
;
118 static char *arg_template
= NULL
;
119 static char *arg_user
= NULL
;
120 static sd_id128_t arg_uuid
= {};
121 static char *arg_machine
= NULL
;
122 static const char *arg_selinux_context
= NULL
;
123 static const char *arg_selinux_apifs_context
= NULL
;
124 static const char *arg_slice
= NULL
;
125 static bool arg_private_network
= false;
126 static bool arg_read_only
= false;
127 static bool arg_boot
= false;
128 static bool arg_ephemeral
= false;
129 static LinkJournal arg_link_journal
= LINK_AUTO
;
130 static bool arg_link_journal_try
= false;
131 static uint64_t arg_retain
=
132 (1ULL << CAP_CHOWN
) |
133 (1ULL << CAP_DAC_OVERRIDE
) |
134 (1ULL << CAP_DAC_READ_SEARCH
) |
135 (1ULL << CAP_FOWNER
) |
136 (1ULL << CAP_FSETID
) |
137 (1ULL << CAP_IPC_OWNER
) |
139 (1ULL << CAP_LEASE
) |
140 (1ULL << CAP_LINUX_IMMUTABLE
) |
141 (1ULL << CAP_NET_BIND_SERVICE
) |
142 (1ULL << CAP_NET_BROADCAST
) |
143 (1ULL << CAP_NET_RAW
) |
144 (1ULL << CAP_SETGID
) |
145 (1ULL << CAP_SETFCAP
) |
146 (1ULL << CAP_SETPCAP
) |
147 (1ULL << CAP_SETUID
) |
148 (1ULL << CAP_SYS_ADMIN
) |
149 (1ULL << CAP_SYS_CHROOT
) |
150 (1ULL << CAP_SYS_NICE
) |
151 (1ULL << CAP_SYS_PTRACE
) |
152 (1ULL << CAP_SYS_TTY_CONFIG
) |
153 (1ULL << CAP_SYS_RESOURCE
) |
154 (1ULL << CAP_SYS_BOOT
) |
155 (1ULL << CAP_AUDIT_WRITE
) |
156 (1ULL << CAP_AUDIT_CONTROL
) |
158 static CustomMount
*arg_custom_mounts
= NULL
;
159 static unsigned arg_n_custom_mounts
= 0;
160 static char **arg_setenv
= NULL
;
161 static bool arg_quiet
= false;
162 static bool arg_share_system
= false;
163 static bool arg_register
= true;
164 static bool arg_keep_unit
= false;
165 static char **arg_network_interfaces
= NULL
;
166 static char **arg_network_macvlan
= NULL
;
167 static char **arg_network_ipvlan
= NULL
;
168 static bool arg_network_veth
= false;
169 static char *arg_network_bridge
= NULL
;
170 static unsigned long arg_personality
= PERSONALITY_INVALID
;
171 static char *arg_image
= NULL
;
172 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
173 static ExposePort
*arg_expose_ports
= NULL
;
174 static char **arg_property
= NULL
;
175 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
176 static bool arg_userns
= false;
177 static int arg_kill_signal
= 0;
178 static bool arg_unified_cgroup_hierarchy
= false;
179 static SettingsMask arg_settings_mask
= 0;
180 static int arg_settings_trusted
= -1;
181 static char **arg_parameters
= NULL
;
183 static void help(void) {
184 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
185 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
186 " -h --help Show this help\n"
187 " --version Print version string\n"
188 " -q --quiet Do not show status information\n"
189 " -D --directory=PATH Root directory for the container\n"
190 " --template=PATH Initialize root directory from template directory,\n"
192 " -x --ephemeral Run container with snapshot of root directory, and\n"
193 " remove it after exit\n"
194 " -i --image=PATH File system device or disk image for the container\n"
195 " -b --boot Boot up full system (i.e. invoke init)\n"
196 " -u --user=USER Run the command under specified user or uid\n"
197 " -M --machine=NAME Set the machine name for the container\n"
198 " --uuid=UUID Set a specific machine UUID for the container\n"
199 " -S --slice=SLICE Place the container in the specified slice\n"
200 " --property=NAME=VALUE Set scope unit property\n"
201 " --private-users[=UIDBASE[:NUIDS]]\n"
202 " Run within user namespace\n"
203 " --private-network Disable network in container\n"
204 " --network-interface=INTERFACE\n"
205 " Assign an existing network interface to the\n"
207 " --network-macvlan=INTERFACE\n"
208 " Create a macvlan network interface based on an\n"
209 " existing network interface to the container\n"
210 " --network-ipvlan=INTERFACE\n"
211 " Create a ipvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " -n --network-veth Add a virtual ethernet connection between host\n"
215 " --network-bridge=INTERFACE\n"
216 " Add a virtual ethernet connection between host\n"
217 " and container and add it to an existing bridge on\n"
219 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
220 " Expose a container IP port on the host\n"
221 " -Z --selinux-context=SECLABEL\n"
222 " Set the SELinux security context to be used by\n"
223 " processes in the container\n"
224 " -L --selinux-apifs-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " API/tmpfs file systems in the container\n"
227 " --capability=CAP In addition to the default, retain specified\n"
229 " --drop-capability=CAP Drop the specified capability from the default set\n"
230 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
231 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
232 " try-guest, try-host\n"
233 " -j Equivalent to --link-journal=try-guest\n"
234 " --read-only Mount the root directory read-only\n"
235 " --bind=PATH[:PATH[:OPTIONS]]\n"
236 " Bind mount a file or directory from the host into\n"
238 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
239 " Similar, but creates a read-only bind mount\n"
240 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
241 " --overlay=PATH[:PATH...]:PATH\n"
242 " Create an overlay mount from the host to \n"
244 " --overlay-ro=PATH[:PATH...]:PATH\n"
245 " Similar, but creates a read-only overlay mount\n"
246 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
247 " --share-system Share system namespaces with host\n"
248 " --register=BOOLEAN Register container as machine\n"
249 " --keep-unit Do not register a scope for the machine, reuse\n"
250 " the service unit nspawn is running in\n"
251 " --volatile[=MODE] Run the system in volatile mode\n"
252 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
253 , program_invocation_short_name
);
257 static int custom_mounts_prepare(void) {
261 /* Ensure the mounts are applied prefix first. */
262 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
264 /* Allocate working directories for the overlay file systems that need it */
265 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
266 CustomMount
*m
= &arg_custom_mounts
[i
];
268 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
269 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
273 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
282 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
284 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
290 static int set_sanitized_path(char **b
, const char *path
) {
296 p
= canonicalize_file_name(path
);
301 p
= path_make_absolute_cwd(path
);
307 *b
= path_kill_slashes(p
);
311 static int detect_unified_cgroup_hierarchy(void) {
315 /* Allow the user to control whether the unified hierarchy is used */
316 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
318 r
= parse_boolean(e
);
320 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
322 arg_unified_cgroup_hierarchy
= r
;
326 /* Otherwise inherit the default from the host system */
329 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
331 arg_unified_cgroup_hierarchy
= r
;
335 static int parse_argv(int argc
, char *argv
[]) {
354 ARG_NETWORK_INTERFACE
,
367 static const struct option options
[] = {
368 { "help", no_argument
, NULL
, 'h' },
369 { "version", no_argument
, NULL
, ARG_VERSION
},
370 { "directory", required_argument
, NULL
, 'D' },
371 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
372 { "ephemeral", no_argument
, NULL
, 'x' },
373 { "user", required_argument
, NULL
, 'u' },
374 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
375 { "boot", no_argument
, NULL
, 'b' },
376 { "uuid", required_argument
, NULL
, ARG_UUID
},
377 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
378 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
379 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
380 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
381 { "bind", required_argument
, NULL
, ARG_BIND
},
382 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
383 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
384 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
385 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
386 { "machine", required_argument
, NULL
, 'M' },
387 { "slice", required_argument
, NULL
, 'S' },
388 { "setenv", required_argument
, NULL
, ARG_SETENV
},
389 { "selinux-context", required_argument
, NULL
, 'Z' },
390 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
391 { "quiet", no_argument
, NULL
, 'q' },
392 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
393 { "register", required_argument
, NULL
, ARG_REGISTER
},
394 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
395 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
396 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
397 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
398 { "network-veth", no_argument
, NULL
, 'n' },
399 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
400 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
401 { "image", required_argument
, NULL
, 'i' },
402 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
403 { "port", required_argument
, NULL
, 'p' },
404 { "property", required_argument
, NULL
, ARG_PROPERTY
},
405 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
406 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
407 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
412 uint64_t plus
= 0, minus
= 0;
413 bool mask_all_settings
= false, mask_no_settings
= false;
418 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
427 puts(PACKAGE_STRING
);
428 puts(SYSTEMD_FEATURES
);
432 r
= set_sanitized_path(&arg_directory
, optarg
);
434 return log_error_errno(r
, "Invalid root directory: %m");
439 r
= set_sanitized_path(&arg_template
, optarg
);
441 return log_error_errno(r
, "Invalid template directory: %m");
446 r
= set_sanitized_path(&arg_image
, optarg
);
448 return log_error_errno(r
, "Invalid image path: %m");
453 arg_ephemeral
= true;
457 r
= free_and_strdup(&arg_user
, optarg
);
461 arg_settings_mask
|= SETTING_USER
;
464 case ARG_NETWORK_BRIDGE
:
465 r
= free_and_strdup(&arg_network_bridge
, optarg
);
472 arg_network_veth
= true;
473 arg_private_network
= true;
474 arg_settings_mask
|= SETTING_NETWORK
;
477 case ARG_NETWORK_INTERFACE
:
478 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
481 arg_private_network
= true;
482 arg_settings_mask
|= SETTING_NETWORK
;
485 case ARG_NETWORK_MACVLAN
:
486 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
489 arg_private_network
= true;
490 arg_settings_mask
|= SETTING_NETWORK
;
493 case ARG_NETWORK_IPVLAN
:
494 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
499 case ARG_PRIVATE_NETWORK
:
500 arg_private_network
= true;
501 arg_settings_mask
|= SETTING_NETWORK
;
506 arg_settings_mask
|= SETTING_BOOT
;
510 r
= sd_id128_from_string(optarg
, &arg_uuid
);
512 log_error("Invalid UUID: %s", optarg
);
516 arg_settings_mask
|= SETTING_MACHINE_ID
;
525 arg_machine
= mfree(arg_machine
);
527 if (!machine_name_is_valid(optarg
)) {
528 log_error("Invalid machine name: %s", optarg
);
532 r
= free_and_strdup(&arg_machine
, optarg
);
540 arg_selinux_context
= optarg
;
544 arg_selinux_apifs_context
= optarg
;
548 arg_read_only
= true;
549 arg_settings_mask
|= SETTING_READ_ONLY
;
553 case ARG_DROP_CAPABILITY
: {
554 const char *state
, *word
;
557 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
558 _cleanup_free_
char *t
;
560 t
= strndup(word
, length
);
564 if (streq(t
, "all")) {
565 if (c
== ARG_CAPABILITY
)
566 plus
= (uint64_t) -1;
568 minus
= (uint64_t) -1;
572 cap
= capability_from_name(t
);
574 log_error("Failed to parse capability %s.", t
);
578 if (c
== ARG_CAPABILITY
)
579 plus
|= 1ULL << (uint64_t) cap
;
581 minus
|= 1ULL << (uint64_t) cap
;
585 arg_settings_mask
|= SETTING_CAPABILITY
;
590 arg_link_journal
= LINK_GUEST
;
591 arg_link_journal_try
= true;
594 case ARG_LINK_JOURNAL
:
595 if (streq(optarg
, "auto")) {
596 arg_link_journal
= LINK_AUTO
;
597 arg_link_journal_try
= false;
598 } else if (streq(optarg
, "no")) {
599 arg_link_journal
= LINK_NO
;
600 arg_link_journal_try
= false;
601 } else if (streq(optarg
, "guest")) {
602 arg_link_journal
= LINK_GUEST
;
603 arg_link_journal_try
= false;
604 } else if (streq(optarg
, "host")) {
605 arg_link_journal
= LINK_HOST
;
606 arg_link_journal_try
= false;
607 } else if (streq(optarg
, "try-guest")) {
608 arg_link_journal
= LINK_GUEST
;
609 arg_link_journal_try
= true;
610 } else if (streq(optarg
, "try-host")) {
611 arg_link_journal
= LINK_HOST
;
612 arg_link_journal_try
= true;
614 log_error("Failed to parse link journal mode %s", optarg
);
622 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
624 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
626 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
630 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
632 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
634 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
638 case ARG_OVERLAY_RO
: {
639 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
640 _cleanup_strv_free_
char **lower
= NULL
;
645 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
649 log_error("Invalid overlay specification: %s", optarg
);
653 STRV_FOREACH(i
, lower
) {
654 if (!path_is_absolute(*i
)) {
655 log_error("Overlay path %s is not absolute.", *i
);
663 log_error("--overlay= needs at least two colon-separated directories specified.");
668 /* If two parameters are specified,
669 * the first one is the lower, the
670 * second one the upper directory. And
671 * we'll also define the destination
672 * mount point the same as the upper. */
676 destination
= strdup(upper
);
681 upper
= lower
[n
- 2];
682 destination
= lower
[n
- 1];
686 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
690 m
->destination
= destination
;
693 m
->read_only
= c
== ARG_OVERLAY_RO
;
695 upper
= destination
= NULL
;
698 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
705 if (!env_assignment_is_valid(optarg
)) {
706 log_error("Environment variable assignment '%s' is not valid.", optarg
);
710 n
= strv_env_set(arg_setenv
, optarg
);
714 strv_free(arg_setenv
);
717 arg_settings_mask
|= SETTING_ENVIRONMENT
;
725 case ARG_SHARE_SYSTEM
:
726 arg_share_system
= true;
730 r
= parse_boolean(optarg
);
732 log_error("Failed to parse --register= argument: %s", optarg
);
740 arg_keep_unit
= true;
743 case ARG_PERSONALITY
:
745 arg_personality
= personality_from_string(optarg
);
746 if (arg_personality
== PERSONALITY_INVALID
) {
747 log_error("Unknown or unsupported personality '%s'.", optarg
);
751 arg_settings_mask
|= SETTING_PERSONALITY
;
757 arg_volatile_mode
= VOLATILE_YES
;
761 m
= volatile_mode_from_string(optarg
);
763 log_error("Failed to parse --volatile= argument: %s", optarg
);
766 arg_volatile_mode
= m
;
769 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
773 r
= expose_port_parse(&arg_expose_ports
, optarg
);
775 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
777 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
779 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
783 if (strv_extend(&arg_property
, optarg
) < 0)
788 case ARG_PRIVATE_USERS
:
790 _cleanup_free_
char *buffer
= NULL
;
791 const char *range
, *shift
;
793 range
= strchr(optarg
, ':');
795 buffer
= strndup(optarg
, range
- optarg
);
801 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
802 log_error("Failed to parse UID range: %s", range
);
808 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
809 log_error("Failed to parse UID: %s", optarg
);
817 case ARG_KILL_SIGNAL
:
818 arg_kill_signal
= signal_from_string_try_harder(optarg
);
819 if (arg_kill_signal
< 0) {
820 log_error("Cannot parse signal: %s", optarg
);
824 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
829 /* no → do not read files
830 * yes → read files, do not override cmdline, trust only subset
831 * override → read files, override cmdline, trust only subset
832 * trusted → read files, do not override cmdline, trust all
835 r
= parse_boolean(optarg
);
837 if (streq(optarg
, "trusted")) {
838 mask_all_settings
= false;
839 mask_no_settings
= false;
840 arg_settings_trusted
= true;
842 } else if (streq(optarg
, "override")) {
843 mask_all_settings
= false;
844 mask_no_settings
= true;
845 arg_settings_trusted
= -1;
847 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
850 mask_all_settings
= false;
851 mask_no_settings
= false;
852 arg_settings_trusted
= -1;
855 mask_all_settings
= true;
856 mask_no_settings
= false;
857 arg_settings_trusted
= false;
866 assert_not_reached("Unhandled option");
869 if (arg_share_system
)
870 arg_register
= false;
872 if (arg_boot
&& arg_share_system
) {
873 log_error("--boot and --share-system may not be combined.");
877 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
878 log_error("--keep-unit may not be used when invoked from a user session.");
882 if (arg_directory
&& arg_image
) {
883 log_error("--directory= and --image= may not be combined.");
887 if (arg_template
&& arg_image
) {
888 log_error("--template= and --image= may not be combined.");
892 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
893 log_error("--template= needs --directory= or --machine=.");
897 if (arg_ephemeral
&& arg_template
) {
898 log_error("--ephemeral and --template= may not be combined.");
902 if (arg_ephemeral
&& arg_image
) {
903 log_error("--ephemeral and --image= may not be combined.");
907 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
908 log_error("--ephemeral and --link-journal= may not be combined.");
912 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
913 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
916 arg_parameters
= strv_copy(argv
+ optind
);
920 arg_settings_mask
|= SETTING_BOOT
;
923 /* Load all settings from .nspawn files */
924 if (mask_no_settings
)
925 arg_settings_mask
= 0;
927 /* Don't load any settings from .nspawn files */
928 if (mask_all_settings
)
929 arg_settings_mask
= _SETTINGS_MASK_ALL
;
931 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
933 r
= detect_unified_cgroup_hierarchy();
940 static int verify_arguments(void) {
942 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
943 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
947 if (arg_expose_ports
&& !arg_private_network
) {
948 log_error("Cannot use --port= without private networking.");
952 if (arg_boot
&& arg_kill_signal
<= 0)
953 arg_kill_signal
= SIGRTMIN
+3;
958 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
964 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
967 if (uid
!= UID_INVALID
) {
968 uid
+= arg_uid_shift
;
970 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
974 if (gid
!= GID_INVALID
) {
975 gid
+= (gid_t
) arg_uid_shift
;
977 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
981 if (lchown(p
, uid
, gid
) < 0)
987 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
990 q
= prefix_roota(root
, path
);
991 if (mkdir(q
, mode
) < 0) {
997 return userns_lchown(q
, uid
, gid
);
1000 static int setup_timezone(const char *dest
) {
1001 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1002 const char *where
, *check
, *what
;
1008 /* Fix the timezone, if possible */
1009 r
= readlink_malloc("/etc/localtime", &p
);
1011 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1015 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1017 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1019 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1023 where
= prefix_roota(dest
, "/etc/localtime");
1024 r
= readlink_malloc(where
, &q
);
1026 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1028 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1030 /* Already pointing to the right place? Then do nothing .. */
1031 if (y
&& streq(y
, z
))
1035 check
= strjoina("/usr/share/zoneinfo/", z
);
1036 check
= prefix_root(dest
, check
);
1037 if (laccess(check
, F_OK
) < 0) {
1038 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1043 if (r
< 0 && errno
!= ENOENT
) {
1044 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1048 what
= strjoina("../usr/share/zoneinfo/", z
);
1049 if (symlink(what
, where
) < 0) {
1050 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1054 r
= userns_lchown(where
, 0, 0);
1056 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1061 static int setup_resolv_conf(const char *dest
) {
1062 const char *where
= NULL
;
1067 if (arg_private_network
)
1070 /* Fix resolv.conf, if possible */
1071 where
= prefix_roota(dest
, "/etc/resolv.conf");
1073 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1075 /* If the file already exists as symlink, let's
1076 * suppress the warning, under the assumption that
1077 * resolved or something similar runs inside and the
1078 * symlink points there.
1080 * If the disk image is read-only, there's also no
1081 * point in complaining.
1083 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1084 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1088 r
= userns_lchown(where
, 0, 0);
1090 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1095 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1099 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1100 SD_ID128_FORMAT_VAL(id
));
1105 static int setup_boot_id(const char *dest
) {
1106 const char *from
, *to
;
1107 sd_id128_t rnd
= {};
1111 if (arg_share_system
)
1114 /* Generate a new randomized boot ID, so that each boot-up of
1115 * the container gets a new one */
1117 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1118 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1120 r
= sd_id128_randomize(&rnd
);
1122 return log_error_errno(r
, "Failed to generate random boot id: %m");
1124 id128_format_as_uuid(rnd
, as_uuid
);
1126 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1128 return log_error_errno(r
, "Failed to write boot id: %m");
1130 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1131 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1132 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1133 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1139 static int copy_devnodes(const char *dest
) {
1141 static const char devnodes
[] =
1152 _cleanup_umask_ mode_t u
;
1158 /* Create /dev/net, so that we can create /dev/net/tun in it */
1159 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1160 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1162 NULSTR_FOREACH(d
, devnodes
) {
1163 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1166 from
= strappend("/dev/", d
);
1167 to
= prefix_root(dest
, from
);
1169 if (stat(from
, &st
) < 0) {
1171 if (errno
!= ENOENT
)
1172 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1174 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1176 log_error("%s is not a char or block device, cannot copy.", from
);
1180 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1182 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1184 /* Some systems abusively restrict mknod but
1185 * allow bind mounts. */
1188 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1189 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1190 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1193 r
= userns_lchown(to
, 0, 0);
1195 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1202 static int setup_pts(const char *dest
) {
1203 _cleanup_free_
char *options
= NULL
;
1207 if (arg_selinux_apifs_context
)
1208 (void) asprintf(&options
,
1209 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1210 arg_uid_shift
+ TTY_GID
,
1211 arg_selinux_apifs_context
);
1214 (void) asprintf(&options
,
1215 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1216 arg_uid_shift
+ TTY_GID
);
1221 /* Mount /dev/pts itself */
1222 p
= prefix_roota(dest
, "/dev/pts");
1223 if (mkdir(p
, 0755) < 0)
1224 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1225 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1226 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1227 if (userns_lchown(p
, 0, 0) < 0)
1228 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1230 /* Create /dev/ptmx symlink */
1231 p
= prefix_roota(dest
, "/dev/ptmx");
1232 if (symlink("pts/ptmx", p
) < 0)
1233 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1234 if (userns_lchown(p
, 0, 0) < 0)
1235 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1237 /* And fix /dev/pts/ptmx ownership */
1238 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1239 if (userns_lchown(p
, 0, 0) < 0)
1240 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
1245 static int setup_dev_console(const char *dest
, const char *console
) {
1246 _cleanup_umask_ mode_t u
;
1255 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1257 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1259 /* We need to bind mount the right tty to /dev/console since
1260 * ptys can only exist on pts file systems. To have something
1261 * to bind mount things on we create a empty regular file. */
1263 to
= prefix_roota(dest
, "/dev/console");
1266 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1268 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1269 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1274 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1275 const char *from
, *to
;
1276 _cleanup_umask_ mode_t u
;
1279 struct cmsghdr cmsghdr
;
1280 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1282 struct msghdr mh
= {
1283 .msg_control
= &control
,
1284 .msg_controllen
= sizeof(control
),
1286 struct cmsghdr
*cmsg
;
1288 assert(kmsg_socket
>= 0);
1292 /* We create the kmsg FIFO as /run/kmsg, but immediately
1293 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1294 * on the reading side behave very similar to /proc/kmsg,
1295 * their writing side behaves differently from /dev/kmsg in
1296 * that writing blocks when nothing is reading. In order to
1297 * avoid any problems with containers deadlocking due to this
1298 * we simply make /dev/kmsg unavailable to the container. */
1299 from
= prefix_roota(dest
, "/run/kmsg");
1300 to
= prefix_roota(dest
, "/proc/kmsg");
1302 if (mkfifo(from
, 0600) < 0)
1303 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1304 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1305 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1307 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1309 return log_error_errno(errno
, "Failed to open fifo: %m");
1311 cmsg
= CMSG_FIRSTHDR(&mh
);
1312 cmsg
->cmsg_level
= SOL_SOCKET
;
1313 cmsg
->cmsg_type
= SCM_RIGHTS
;
1314 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1315 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1317 mh
.msg_controllen
= cmsg
->cmsg_len
;
1319 /* Store away the fd in the socket, so that it stays open as
1320 * long as we run the child */
1321 k
= sendmsg(kmsg_socket
, &mh
, MSG_NOSIGNAL
);
1325 return log_error_errno(errno
, "Failed to send FIFO fd: %m");
1327 /* And now make the FIFO unavailable as /run/kmsg... */
1328 (void) unlink(from
);
1333 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
1334 union in_addr_union
*exposed
= userdata
;
1340 expose_port_execute(rtnl
, arg_expose_ports
, exposed
);
1344 static int setup_hostname(void) {
1346 if (arg_share_system
)
1349 if (sethostname_idempotent(arg_machine
) < 0)
1355 static int setup_journal(const char *directory
) {
1356 sd_id128_t machine_id
, this_id
;
1357 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
1358 const char *etc_machine_id
, *p
, *q
;
1362 /* Don't link journals in ephemeral mode */
1366 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
1368 r
= read_one_line_file(etc_machine_id
, &b
);
1369 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
1372 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
1375 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
1378 /* Verify validity */
1379 r
= sd_id128_from_string(id
, &machine_id
);
1381 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
1383 r
= sd_id128_get_machine(&this_id
);
1385 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
1387 if (sd_id128_equal(machine_id
, this_id
)) {
1388 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
1389 "Host and machine ids are equal (%s): refusing to link journals", id
);
1390 if (arg_link_journal
== LINK_AUTO
)
1395 if (arg_link_journal
== LINK_NO
)
1398 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
1400 return log_error_errno(r
, "Failed to create /var: %m");
1402 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
1404 return log_error_errno(r
, "Failed to create /var/log: %m");
1406 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
1408 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
1410 p
= strjoina("/var/log/journal/", id
);
1411 q
= prefix_roota(directory
, p
);
1413 if (path_is_mount_point(p
, 0) > 0) {
1414 if (arg_link_journal
!= LINK_AUTO
) {
1415 log_error("%s: already a mount point, refusing to use for journal", p
);
1422 if (path_is_mount_point(q
, 0) > 0) {
1423 if (arg_link_journal
!= LINK_AUTO
) {
1424 log_error("%s: already a mount point, refusing to use for journal", q
);
1431 r
= readlink_and_make_absolute(p
, &d
);
1433 if ((arg_link_journal
== LINK_GUEST
||
1434 arg_link_journal
== LINK_AUTO
) &&
1437 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1439 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1444 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
1445 } else if (r
== -EINVAL
) {
1447 if (arg_link_journal
== LINK_GUEST
&&
1450 if (errno
== ENOTDIR
) {
1451 log_error("%s already exists and is neither a symlink nor a directory", p
);
1454 log_error_errno(errno
, "Failed to remove %s: %m", p
);
1458 } else if (r
!= -ENOENT
) {
1459 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
1463 if (arg_link_journal
== LINK_GUEST
) {
1465 if (symlink(q
, p
) < 0) {
1466 if (arg_link_journal_try
) {
1467 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
1470 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
1475 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1477 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
1481 if (arg_link_journal
== LINK_HOST
) {
1482 /* don't create parents here -- if the host doesn't have
1483 * permanent journal set up, don't force it here */
1486 if (arg_link_journal_try
) {
1487 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
1490 log_error_errno(errno
, "Failed to create %s: %m", p
);
1495 } else if (access(p
, F_OK
) < 0)
1498 if (dir_is_empty(q
) == 0)
1499 log_warning("%s is not empty, proceeding anyway.", q
);
1501 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
1503 log_error_errno(errno
, "Failed to create %s: %m", q
);
1507 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1508 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
1513 static int drop_capabilities(void) {
1514 return capability_bounding_set_drop(~arg_retain
, false);
1517 static int register_machine(pid_t pid
, int local_ifindex
) {
1518 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
1519 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
1525 r
= sd_bus_default_system(&bus
);
1527 return log_error_errno(r
, "Failed to open system bus: %m");
1529 if (arg_keep_unit
) {
1530 r
= sd_bus_call_method(
1532 "org.freedesktop.machine1",
1533 "/org/freedesktop/machine1",
1534 "org.freedesktop.machine1.Manager",
1535 "RegisterMachineWithNetwork",
1540 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
1544 strempty(arg_directory
),
1545 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
1547 _cleanup_bus_message_unref_ sd_bus_message
*m
= NULL
;
1551 r
= sd_bus_message_new_method_call(
1554 "org.freedesktop.machine1",
1555 "/org/freedesktop/machine1",
1556 "org.freedesktop.machine1.Manager",
1557 "CreateMachineWithNetwork");
1559 return bus_log_create_error(r
);
1561 r
= sd_bus_message_append(
1565 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
1569 strempty(arg_directory
),
1570 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
1572 return bus_log_create_error(r
);
1574 r
= sd_bus_message_open_container(m
, 'a', "(sv)");
1576 return bus_log_create_error(r
);
1578 if (!isempty(arg_slice
)) {
1579 r
= sd_bus_message_append(m
, "(sv)", "Slice", "s", arg_slice
);
1581 return bus_log_create_error(r
);
1584 r
= sd_bus_message_append(m
, "(sv)", "DevicePolicy", "s", "strict");
1586 return bus_log_create_error(r
);
1588 /* If you make changes here, also make sure to update
1589 * systemd-nspawn@.service, to keep the device
1590 * policies in sync regardless if we are run with or
1591 * without the --keep-unit switch. */
1592 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 9,
1593 /* Allow the container to
1594 * access and create the API
1595 * device nodes, so that
1596 * PrivateDevices= in the
1597 * container can work
1602 "/dev/random", "rwm",
1603 "/dev/urandom", "rwm",
1605 "/dev/net/tun", "rwm",
1606 /* Allow the container
1607 * access to ptys. However,
1609 * container to ever create
1610 * these device nodes. */
1611 "/dev/pts/ptmx", "rw",
1614 return bus_log_create_error(r
);
1616 for (j
= 0; j
< arg_n_custom_mounts
; j
++) {
1617 CustomMount
*cm
= &arg_custom_mounts
[j
];
1619 if (cm
->type
!= CUSTOM_MOUNT_BIND
)
1622 r
= is_device_node(cm
->source
);
1624 return log_error_errno(r
, "Failed to stat %s: %m", cm
->source
);
1627 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 1,
1628 cm
->source
, cm
->read_only
? "r" : "rw");
1630 return log_error_errno(r
, "Failed to append message arguments: %m");
1634 if (arg_kill_signal
!= 0) {
1635 r
= sd_bus_message_append(m
, "(sv)", "KillSignal", "i", arg_kill_signal
);
1637 return bus_log_create_error(r
);
1639 r
= sd_bus_message_append(m
, "(sv)", "KillMode", "s", "mixed");
1641 return bus_log_create_error(r
);
1644 STRV_FOREACH(i
, arg_property
) {
1645 r
= sd_bus_message_open_container(m
, 'r', "sv");
1647 return bus_log_create_error(r
);
1649 r
= bus_append_unit_property_assignment(m
, *i
);
1653 r
= sd_bus_message_close_container(m
);
1655 return bus_log_create_error(r
);
1658 r
= sd_bus_message_close_container(m
);
1660 return bus_log_create_error(r
);
1662 r
= sd_bus_call(bus
, m
, 0, &error
, NULL
);
1666 log_error("Failed to register machine: %s", bus_error_message(&error
, r
));
1673 static int terminate_machine(pid_t pid
) {
1674 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
1675 _cleanup_bus_message_unref_ sd_bus_message
*reply
= NULL
;
1676 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
1683 /* If we are reusing the unit, then just exit, systemd will do
1684 * the right thing when we exit. */
1688 r
= sd_bus_default_system(&bus
);
1690 return log_error_errno(r
, "Failed to open system bus: %m");
1692 r
= sd_bus_call_method(
1694 "org.freedesktop.machine1",
1695 "/org/freedesktop/machine1",
1696 "org.freedesktop.machine1.Manager",
1703 /* Note that the machine might already have been
1704 * cleaned up automatically, hence don't consider it a
1705 * failure if we cannot get the machine object. */
1706 log_debug("Failed to get machine: %s", bus_error_message(&error
, r
));
1710 r
= sd_bus_message_read(reply
, "o", &path
);
1712 return bus_log_parse_error(r
);
1714 r
= sd_bus_call_method(
1716 "org.freedesktop.machine1",
1718 "org.freedesktop.machine1.Machine",
1724 log_debug("Failed to terminate machine: %s", bus_error_message(&error
, r
));
1731 static int reset_audit_loginuid(void) {
1732 _cleanup_free_
char *p
= NULL
;
1735 if (arg_share_system
)
1738 r
= read_one_line_file("/proc/self/loginuid", &p
);
1742 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
1744 /* Already reset? */
1745 if (streq(p
, "4294967295"))
1748 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
1751 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1752 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1753 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1754 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1755 "using systemd-nspawn. Sleeping for 5s... (%m)");
1763 static int setup_seccomp(void) {
1766 static const struct {
1767 uint64_t capability
;
1770 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
1771 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
1772 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
1773 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
1774 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
1775 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
1776 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
1777 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
1778 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
1779 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
1782 scmp_filter_ctx seccomp
;
1786 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1790 r
= seccomp_add_secondary_archs(seccomp
);
1792 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
1796 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
1797 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
1800 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
1802 continue; /* unknown syscall */
1804 log_error_errno(r
, "Failed to block syscall: %m");
1811 Audit is broken in containers, much of the userspace audit
1812 hookup will fail if running inside a container. We don't
1813 care and just turn off creation of audit sockets.
1815 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
1816 with EAFNOSUPPORT which audit userspace uses as indication
1817 that audit is disabled in the kernel.
1820 r
= seccomp_rule_add(
1822 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1825 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
1826 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
1828 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
1832 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1834 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
1838 r
= seccomp_load(seccomp
);
1840 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
1845 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
1850 seccomp_release(seccomp
);
1858 static int setup_propagate(const char *root
) {
1861 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1862 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1863 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
1864 (void) mkdir_p(p
, 0600);
1866 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
1867 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
1869 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1870 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
1872 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1873 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
1875 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
1876 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
1877 return log_error_errno(errno
, "Failed to install propagation bind mount.");
1879 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
1880 return log_error_errno(errno
, "Failed to make propagation mount read-only");
1885 static int setup_image(char **device_path
, int *loop_nr
) {
1886 struct loop_info64 info
= {
1887 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
1889 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
1890 _cleanup_free_
char* loopdev
= NULL
;
1894 assert(device_path
);
1898 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1900 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
1902 if (fstat(fd
, &st
) < 0)
1903 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
1905 if (S_ISBLK(st
.st_mode
)) {
1908 p
= strdup(arg_image
);
1922 if (!S_ISREG(st
.st_mode
)) {
1923 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
1927 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
1929 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
1931 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
1933 return log_error_errno(errno
, "Failed to allocate loop device: %m");
1935 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
1938 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
1940 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
1942 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
1943 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
1946 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
1948 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
1949 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
1951 *device_path
= loopdev
;
/* Explanatory text appended to the partition table related error messages of
 * dissect_image(). One string literal per line of output. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
1969 static int dissect_image(
1971 char **root_device
, bool *root_device_rw
,
1972 char **home_device
, bool *home_device_rw
,
1973 char **srv_device
, bool *srv_device_rw
,
1977 int home_nr
= -1, srv_nr
= -1;
1978 #ifdef GPT_ROOT_NATIVE
1981 #ifdef GPT_ROOT_SECONDARY
1982 int secondary_root_nr
= -1;
1984 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
1985 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
1986 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
1987 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
1988 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
1989 struct udev_list_entry
*first
, *item
;
1990 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
1991 bool is_gpt
, is_mbr
, multiple_generic
= false;
1992 const char *pttype
= NULL
;
1999 assert(root_device
);
2000 assert(home_device
);
2005 b
= blkid_new_probe();
2010 r
= blkid_probe_set_device(b
, fd
, 0, 0);
2015 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
2019 blkid_probe_enable_partitions(b
, 1);
2020 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
2023 r
= blkid_do_safeprobe(b
);
2024 if (r
== -2 || r
== 1) {
2025 log_error("Failed to identify any partition table on\n"
2027 PARTITION_TABLE_BLURB
, arg_image
);
2029 } else if (r
!= 0) {
2032 log_error_errno(errno
, "Failed to probe: %m");
2036 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
2038 is_gpt
= streq_ptr(pttype
, "gpt");
2039 is_mbr
= streq_ptr(pttype
, "dos");
2041 if (!is_gpt
&& !is_mbr
) {
2042 log_error("No GPT or MBR partition table discovered on\n"
2044 PARTITION_TABLE_BLURB
, arg_image
);
2049 pl
= blkid_probe_get_partitions(b
);
2054 log_error("Failed to list partitions of %s", arg_image
);
2062 if (fstat(fd
, &st
) < 0)
2063 return log_error_errno(errno
, "Failed to stat block device: %m");
2065 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
2073 log_error("Kernel partitions never appeared.");
2077 e
= udev_enumerate_new(udev
);
2081 r
= udev_enumerate_add_match_parent(e
, d
);
2085 r
= udev_enumerate_scan_devices(e
);
2087 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
2089 /* Count the partitions enumerated by the kernel */
2091 first
= udev_enumerate_get_list_entry(e
);
2092 udev_list_entry_foreach(item
, first
)
2095 /* Count the partitions enumerated by blkid */
2096 m
= blkid_partlist_numof_partitions(pl
);
2100 log_error("blkid and kernel partition list do not match.");
2106 /* The kernel has probed fewer partitions than
2107 * blkid? Maybe the kernel prober is still
2108 * running or it got EBUSY because udev
2109 * already opened the device. Let's reprobe
2110 * the device, which is a synchronous call
2111 * that waits until probing is complete. */
2113 for (j
= 0; j
< 20; j
++) {
2115 r
= ioctl(fd
, BLKRRPART
, 0);
2118 if (r
>= 0 || r
!= -EBUSY
)
2121 /* If something else has the device
2122 * open, such as an udev rule, the
2123 * ioctl will return EBUSY. Since
2124 * there's no way to wait until it
2125 * isn't busy anymore, let's just wait
2126 * a bit, and try again.
2128 * This is really something they
2129 * should fix in the kernel! */
2131 usleep(50 * USEC_PER_MSEC
);
2135 return log_error_errno(r
, "Failed to reread partition table: %m");
2138 e
= udev_enumerate_unref(e
);
2141 first
= udev_enumerate_get_list_entry(e
);
2142 udev_list_entry_foreach(item
, first
) {
2143 _cleanup_udev_device_unref_
struct udev_device
*q
;
2145 unsigned long long flags
;
2151 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
2156 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
2160 qn
= udev_device_get_devnum(q
);
2164 if (st
.st_rdev
== qn
)
2167 node
= udev_device_get_devnode(q
);
2171 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
2175 flags
= blkid_partition_get_flags(pp
);
2177 nr
= blkid_partition_get_partno(pp
);
2185 if (flags
& GPT_FLAG_NO_AUTO
)
2188 stype
= blkid_partition_get_type_string(pp
);
2192 if (sd_id128_from_string(stype
, &type_id
) < 0)
2195 if (sd_id128_equal(type_id
, GPT_HOME
)) {
2197 if (home
&& nr
>= home_nr
)
2201 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2203 r
= free_and_strdup(&home
, node
);
2207 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
2209 if (srv
&& nr
>= srv_nr
)
2213 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2215 r
= free_and_strdup(&srv
, node
);
2219 #ifdef GPT_ROOT_NATIVE
2220 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
2222 if (root
&& nr
>= root_nr
)
2226 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2228 r
= free_and_strdup(&root
, node
);
2233 #ifdef GPT_ROOT_SECONDARY
2234 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
2236 if (secondary_root
&& nr
>= secondary_root_nr
)
2239 secondary_root_nr
= nr
;
2240 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2242 r
= free_and_strdup(&secondary_root
, node
);
2247 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
2250 multiple_generic
= true;
2252 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
2254 r
= free_and_strdup(&generic
, node
);
2260 } else if (is_mbr
) {
2263 if (flags
!= 0x80) /* Bootable flag */
2266 type
= blkid_partition_get_type(pp
);
2267 if (type
!= 0x83) /* Linux partition */
2271 multiple_generic
= true;
2275 r
= free_and_strdup(&root
, node
);
2283 *root_device
= root
;
2286 *root_device_rw
= root_rw
;
2288 } else if (secondary_root
) {
2289 *root_device
= secondary_root
;
2290 secondary_root
= NULL
;
2292 *root_device_rw
= secondary_root_rw
;
2294 } else if (generic
) {
2296 /* There were no partitions with precise meanings
2297 * around, but we found generic partitions. In this
2298 * case, if there's only one, we can go ahead and boot
2299 * it, otherwise we bail out, because we really cannot
2300 * make any sense of it. */
2302 if (multiple_generic
) {
2303 log_error("Identified multiple bootable Linux partitions on\n"
2305 PARTITION_TABLE_BLURB
, arg_image
);
2309 *root_device
= generic
;
2312 *root_device_rw
= generic_rw
;
2315 log_error("Failed to identify root partition in disk image\n"
2317 PARTITION_TABLE_BLURB
, arg_image
);
2322 *home_device
= home
;
2325 *home_device_rw
= home_rw
;
2332 *srv_device_rw
= srv_rw
;
2337 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device 'what' below 'where'.
 *
 * The mount point is 'where' itself if 'directory' is NULL, otherwise the
 * concatenation of 'where' and 'directory'. The file system type is detected
 * with libblkid; LUKS containers are refused. The mount is read-only if 'rw'
 * is false or arg_read_only is set.
 *
 * Returns 0 on success, a negative errno-style value on failure. When built
 * without blkid support it always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                /* Return the value that was logged; errno might be
                 * overwritten while logging. */
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Same check as in dissect_image(): -2 is an ambivalent
                 * result, 1 means nothing was detected. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0)
                return log_error_errno(errno ?: EIO, "Failed to probe %s: %m", what);

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the error code before logging, for the same reason as above. */
                r = errno > 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2406 static int mount_devices(
2408 const char *root_device
, bool root_device_rw
,
2409 const char *home_device
, bool home_device_rw
,
2410 const char *srv_device
, bool srv_device_rw
) {
2416 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
2418 return log_error_errno(r
, "Failed to mount root directory: %m");
2422 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
2424 return log_error_errno(r
, "Failed to mount home directory: %m");
2428 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
2430 return log_error_errno(r
, "Failed to mount server data directory: %m");
2436 static void loop_remove(int nr
, int *image_fd
) {
2437 _cleanup_close_
int control
= -1;
2443 if (image_fd
&& *image_fd
>= 0) {
2444 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
2446 log_debug_errno(errno
, "Failed to close loop image: %m");
2447 *image_fd
= safe_close(*image_fd
);
2450 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
2452 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
2456 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
2458 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
2461 static int spawn_getent(const char *database
, const char *key
, pid_t
*rpid
) {
2469 if (pipe2(pipe_fds
, O_CLOEXEC
) < 0)
2470 return log_error_errno(errno
, "Failed to allocate pipe: %m");
2474 return log_error_errno(errno
, "Failed to fork getent child: %m");
2475 else if (pid
== 0) {
2477 char *empty_env
= NULL
;
2479 if (dup3(pipe_fds
[1], STDOUT_FILENO
, 0) < 0)
2480 _exit(EXIT_FAILURE
);
2482 if (pipe_fds
[0] > 2)
2483 safe_close(pipe_fds
[0]);
2484 if (pipe_fds
[1] > 2)
2485 safe_close(pipe_fds
[1]);
2487 nullfd
= open("/dev/null", O_RDWR
);
2489 _exit(EXIT_FAILURE
);
2491 if (dup3(nullfd
, STDIN_FILENO
, 0) < 0)
2492 _exit(EXIT_FAILURE
);
2494 if (dup3(nullfd
, STDERR_FILENO
, 0) < 0)
2495 _exit(EXIT_FAILURE
);
2500 (void) reset_all_signal_handlers();
2501 (void) reset_signal_mask();
2502 close_all_fds(NULL
, 0);
2504 execle("/usr/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
2505 execle("/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
2506 _exit(EXIT_FAILURE
);
2509 pipe_fds
[1] = safe_close(pipe_fds
[1]);
2516 static int change_uid_gid(char **_home
) {
2517 char line
[LINE_MAX
], *x
, *u
, *g
, *h
;
2518 const char *word
, *state
;
2519 _cleanup_free_ uid_t
*uids
= NULL
;
2520 _cleanup_free_
char *home
= NULL
;
2521 _cleanup_fclose_
FILE *f
= NULL
;
2522 _cleanup_close_
int fd
= -1;
2523 unsigned n_uids
= 0;
2532 if (!arg_user
|| streq(arg_user
, "root") || streq(arg_user
, "0")) {
2533 /* Reset everything fully to 0, just in case */
2535 r
= reset_uid_gid();
2537 return log_error_errno(r
, "Failed to become root: %m");
2543 /* First, get user credentials */
2544 fd
= spawn_getent("passwd", arg_user
, &pid
);
2548 f
= fdopen(fd
, "r");
2553 if (!fgets(line
, sizeof(line
), f
)) {
2556 log_error("Failed to resolve user %s.", arg_user
);
2560 log_error_errno(errno
, "Failed to read from getent: %m");
2566 wait_for_terminate_and_warn("getent passwd", pid
, true);
2568 x
= strchr(line
, ':');
2570 log_error("/etc/passwd entry has invalid user field.");
2574 u
= strchr(x
+1, ':');
2576 log_error("/etc/passwd entry has invalid password field.");
2583 log_error("/etc/passwd entry has invalid UID field.");
2591 log_error("/etc/passwd entry has invalid GID field.");
2596 h
= strchr(x
+1, ':');
2598 log_error("/etc/passwd entry has invalid GECOS field.");
2605 log_error("/etc/passwd entry has invalid home directory field.");
2611 r
= parse_uid(u
, &uid
);
2613 log_error("Failed to parse UID of user.");
2617 r
= parse_gid(g
, &gid
);
2619 log_error("Failed to parse GID of user.");
2627 /* Second, get group memberships */
2628 fd
= spawn_getent("initgroups", arg_user
, &pid
);
2633 f
= fdopen(fd
, "r");
2638 if (!fgets(line
, sizeof(line
), f
)) {
2640 log_error("Failed to resolve user %s.", arg_user
);
2644 log_error_errno(errno
, "Failed to read from getent: %m");
2650 wait_for_terminate_and_warn("getent initgroups", pid
, true);
2652 /* Skip over the username and subsequent separator whitespace */
2654 x
+= strcspn(x
, WHITESPACE
);
2655 x
+= strspn(x
, WHITESPACE
);
2657 FOREACH_WORD(word
, l
, x
, state
) {
2663 if (!GREEDY_REALLOC(uids
, sz
, n_uids
+1))
2666 r
= parse_uid(c
, &uids
[n_uids
++]);
2668 log_error("Failed to parse group data from getent.");
2673 r
= mkdir_parents(home
, 0775);
2675 return log_error_errno(r
, "Failed to make home root directory: %m");
2677 r
= mkdir_safe(home
, 0755, uid
, gid
);
2678 if (r
< 0 && r
!= -EEXIST
)
2679 return log_error_errno(r
, "Failed to make home directory: %m");
2681 (void) fchown(STDIN_FILENO
, uid
, gid
);
2682 (void) fchown(STDOUT_FILENO
, uid
, gid
);
2683 (void) fchown(STDERR_FILENO
, uid
, gid
);
2685 if (setgroups(n_uids
, uids
) < 0)
2686 return log_error_errno(errno
, "Failed to set auxiliary groups: %m");
2688 if (setresgid(gid
, gid
, gid
) < 0)
2689 return log_error_errno(errno
, "setregid() failed: %m");
2691 if (setresuid(uid
, uid
, uid
) < 0)
2692 return log_error_errno(errno
, "setreuid() failed: %m");
2704 * < 0 : wait_for_terminate() failed to get the state of the
2705 * container, the container was terminated by a signal, or
2706 * failed for an unknown reason. No change is made to the
2707 * container argument.
2708 * > 0 : The program executed in the container terminated with an
2709 * error. The exit code of the program executed in the
2710 * container is returned. The container argument has been set
2711 * to CONTAINER_TERMINATED.
2712 * 0 : The container is being rebooted, has been shut down or exited
2713 * successfully. The container argument has been set to either
2714 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2716 * That is, success is indicated by a return value of zero, and an
2717 * error is indicated by a non-zero value.
2719 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2723 r
= wait_for_terminate(pid
, &status
);
2725 return log_warning_errno(r
, "Failed to wait for container: %m");
2727 switch (status
.si_code
) {
2730 if (status
.si_status
== 0) {
2731 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2734 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2736 *container
= CONTAINER_TERMINATED
;
2737 return status
.si_status
;
2740 if (status
.si_status
== SIGINT
) {
2742 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2743 *container
= CONTAINER_TERMINATED
;
2746 } else if (status
.si_status
== SIGHUP
) {
2748 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2749 *container
= CONTAINER_REBOOTED
;
2753 /* CLD_KILLED fallthrough */
2756 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2760 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
2769 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2772 pid
= PTR_TO_UINT32(userdata
);
2774 if (kill(pid
, arg_kill_signal
) >= 0) {
2775 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2776 sd_event_source_set_userdata(s
, NULL
);
2781 sd_event_exit(sd_event_source_get_event(s
), 0);
2785 static int determine_names(void) {
2788 if (arg_template
&& !arg_directory
&& arg_machine
) {
2790 /* If --template= was specified then we should not
2791 * search for a machine, but instead create a new one
2792 * in /var/lib/machine. */
2794 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
2799 if (!arg_image
&& !arg_directory
) {
2801 _cleanup_(image_unrefp
) Image
*i
= NULL
;
2803 r
= image_find(arg_machine
, &i
);
2805 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
2807 log_error("No image for machine '%s': %m", arg_machine
);
2811 if (i
->type
== IMAGE_RAW
)
2812 r
= set_sanitized_path(&arg_image
, i
->path
);
2814 r
= set_sanitized_path(&arg_directory
, i
->path
);
2816 return log_error_errno(r
, "Invalid image directory: %m");
2819 arg_read_only
= arg_read_only
|| i
->read_only
;
2821 arg_directory
= get_current_dir_name();
2823 if (!arg_directory
&& !arg_machine
) {
2824 log_error("Failed to determine path, please use -D or -i.");
2830 if (arg_directory
&& path_equal(arg_directory
, "/"))
2831 arg_machine
= gethostname_malloc();
2833 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
2838 hostname_cleanup(arg_machine
);
2839 if (!machine_name_is_valid(arg_machine
)) {
2840 log_error("Failed to determine machine name automatically, please use -M.");
2844 if (arg_ephemeral
) {
2847 /* Add a random suffix when this is an
2848 * ephemeral machine, so that we can run many
2849 * instances at once without manually having
2850 * to specify -M each time. */
2852 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
2863 static int determine_uid_shift(const char *directory
) {
2871 if (arg_uid_shift
== UID_INVALID
) {
2874 r
= stat(directory
, &st
);
2876 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
2878 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
2880 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
2881 log_error("UID and GID base of %s don't match.", directory
);
2885 arg_uid_range
= UINT32_C(0x10000);
2888 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
2889 log_error("UID base too high for UID range.");
2893 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
/* inner_child(): code path of the process that ends up exec()ing the payload.
 * outer_child() calls it as
 * inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds).
 *
 * Visible sequence:
 *   - barrier #1 and #2: tell the parent it may write the UID map, then wait
 *     until it has done so;
 *   - mount_all() with a NULL first argument;
 *   - barrier #3: wait until the parent has set up the cgroup, then
 *     mount_systemd_cgroup_writable();
 *   - reset_uid_gid(), setup_boot_id(), setup_kmsg(); kmsg_socket is closed;
 *   - expose_port_send_rtnl() when arg_expose_ports is set; rtnl_socket is
 *     closed;
 *   - drop_capabilities(), personality(), setexeccon(), change_uid_gid();
 *   - the NULL slots of envp are filled (TERM, HOME, USER, LOGNAME,
 *     container_uuid, LISTEN_FDS, LISTEN_PID) and merged with arg_setenv;
 *   - barrier #4, fdset_close_others(), and finally one of the exec calls.
 *
 * The last visible statement logs an execv() failure, so that line is only
 * reached when none of the exec calls replaced the process image.
 *
 * NOTE(review): this listing drops lines (the embedded line numbers are not
 * contiguous). Most of the parameter list, several declarations, error
 * checks and return statements are not visible, and two comment blocks
 * further down have lost their closing lines. */
2897 static int inner_child(
2899 const char *directory
,
2905 _cleanup_free_
char *home
= NULL
;
2907 const char *envp
[] = {
2908 "PATH=" DEFAULT_PATH_SPLIT_USR
,
2909 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2914 NULL
, /* container_uuid */
2915 NULL
, /* LISTEN_FDS */
2916 NULL
, /* LISTEN_PID */
2920 _cleanup_strv_free_
char **env_use
= NULL
;
2925 assert(kmsg_socket
>= 0);
2930 /* Tell the parent, that it now can write the UID map. */
2931 (void) barrier_place(barrier
); /* #1 */
2933 /* Wait until the parent wrote the UID map */
2934 if (!barrier_place_and_sync(barrier
)) { /* #2 */
2935 log_error("Parent died too early");
2940 r
= mount_all(NULL
, true, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
2944 /* Wait until we are cgroup-ified, so that we
2945 * can mount the right cgroup path writable */
2946 if (!barrier_place_and_sync(barrier
)) { /* #3 */
2947 log_error("Parent died too early");
2951 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
2955 r
= reset_uid_gid();
2957 return log_error_errno(r
, "Couldn't become new root: %m");
2959 r
= setup_boot_id(NULL
);
2963 r
= setup_kmsg(NULL
, kmsg_socket
);
/* The kmsg socket is not used any further in this function. */
2966 kmsg_socket
= safe_close(kmsg_socket
);
2971 return log_error_errno(errno
, "setsid() failed: %m");
2973 if (arg_private_network
)
2976 if (arg_expose_ports
) {
2977 r
= expose_port_send_rtnl(rtnl_socket
);
2980 rtnl_socket
= safe_close(rtnl_socket
);
2983 if (drop_capabilities() < 0)
2984 return log_error_errno(errno
, "drop_capabilities() failed: %m");
/* An explicitly configured personality wins; otherwise PER_LINUX32 is
 * selected when the secondary flag is set. */
2988 if (arg_personality
!= PERSONALITY_INVALID
) {
2989 if (personality(arg_personality
) < 0)
2990 return log_error_errno(errno
, "personality() failed: %m");
2991 } else if (secondary
) {
2992 if (personality(PER_LINUX32
) < 0)
2993 return log_error_errno(errno
, "personality() failed: %m");
2997 if (arg_selinux_context
)
2998 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
2999 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
3002 r
= change_uid_gid(&home
);
/* From here on the NULL slots of envp are filled, indexed by n_env. */
3006 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
3010 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
3011 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
3012 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
3015 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
3018 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
/* Passed file descriptors are kept open across exec (O_CLOEXEC is unset)
 * and announced through LISTEN_FDS and LISTEN_PID. */
3022 if (fdset_size(fds
) > 0) {
3023 r
= fdset_cloexec(fds
, false);
3025 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
3027 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
3028 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
3032 env_use
= strv_env_merge(2, envp
, arg_setenv
);
3036 /* Let the parent know that we are ready and
3037 * wait until the parent is ready with the
3039 if (!barrier_place_and_sync(barrier
)) { /* #4 */
3040 log_error("Parent died too early");
3044 /* Now, explicitly close the log, so that we
3045 * then can close all remaining fds. Closing
3046 * the log explicitly first has the benefit
3047 * that the logging subsystem knows about it,
3048 * and is thus ready to be reopened should we
3049 * need it again. Note that the other fds
3050 * closed here are at least the locking and
3053 (void) fdset_close_others(fds
);
3059 /* Automatically search for the init system */
/* The argument vector a gets one slot for the binary path, all entries of
 * arg_parameters, and one more slot (m + 1 elements are allocated). */
3061 m
= 1 + strv_length(arg_parameters
);
3062 a
= newa(char*, m
+ 1);
3063 if (strv_isempty(arg_parameters
))
3066 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
/* Candidate init binaries are tried in this order; execve() only returns
 * on failure, in which case the next candidate is attempted. */
3068 a
[0] = (char*) "/usr/lib/systemd/systemd";
3069 execve(a
[0], a
, env_use
);
3071 a
[0] = (char*) "/lib/systemd/systemd";
3072 execve(a
[0], a
, env_use
);
3074 a
[0] = (char*) "/sbin/init";
3075 execve(a
[0], a
, env_use
);
3076 } else if (!strv_isempty(arg_parameters
))
3077 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
/* Shell invocation: change into the home directory (or /root) and start
 * bash, then sh, each with a leading dash in argv[0]. */
3079 chdir(home
?: "/root");
3080 execle("/bin/bash", "-bash", NULL
, env_use
);
3081 execle("/bin/sh", "-sh", NULL
, env_use
);
3085 return log_error_errno(errno
, "execv() failed: %m");
/* outer_child(): code path of the first child that main() creates with
 * raw_clone(SIGCHLD|CLONE_NEWNS). It prepares the directory tree and then
 * clones the inner child.
 *
 * Visible sequence:
 *   - PR_SET_PDEATHSIG is set to SIGKILL;
 *   - stdin, stdout and stderr are closed and reopened on console;
 *   - reset_audit_loginuid();
 *   - all mounts are marked MS_SLAVE|MS_REC;
 *   - mount_devices() for the root, home and srv devices;
 *   - determine_uid_shift(), after which arg_uid_shift is sent over
 *     uid_shift_socket;
 *   - directory is bind mounted onto itself, followed by the setup_*(),
 *     mount_*() and copy_devnodes()/dev_setup() helpers in the order below;
 *   - mount_move_root();
 *   - raw_clone() with the namespace flags selected by arg_share_system,
 *     arg_private_network and arg_userns; the clone runs inner_child() and
 *     leaves through _exit();
 *   - the PID returned by raw_clone() is sent over pid_socket.
 *
 * NOTE(review): this listing drops lines (the embedded line numbers are not
 * contiguous). Part of the parameter list, the declarations, most error
 * checks and the return statements are not visible here. */
3088 static int outer_child(
3090 const char *directory
,
3091 const char *console
,
3092 const char *root_device
, bool root_device_rw
,
3093 const char *home_device
, bool home_device_rw
,
3094 const char *srv_device
, bool srv_device_rw
,
3100 int uid_shift_socket
,
3110 assert(pid_socket
>= 0);
3111 assert(kmsg_socket
>= 0);
3115 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
3116 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
/* Drop the inherited stdio; it is reopened on the console right below. */
3119 close_nointr(STDIN_FILENO
);
3120 close_nointr(STDOUT_FILENO
);
3121 close_nointr(STDERR_FILENO
);
/* With fd 0 just closed, the terminal is expected to land on STDIN_FILENO. */
3123 r
= open_terminal(console
, O_RDWR
);
3124 if (r
!= STDIN_FILENO
) {
3130 return log_error_errno(r
, "Failed to open console: %m");
3133 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
3134 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
3135 return log_error_errno(errno
, "Failed to duplicate console: %m");
3138 r
= reset_audit_loginuid();
3142 /* Mark everything as slave, so that we still
3143 * receive mounts from the real root, but don't
3144 * propagate mounts to the real root. */
3145 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
3146 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
3148 r
= mount_devices(directory
,
3149 root_device
, root_device_rw
,
3150 home_device
, home_device_rw
,
3151 srv_device
, srv_device_rw
);
3155 r
= determine_uid_shift(directory
);
/* Pass the UID shift on through uid_shift_socket; main() reads it from the
 * other end of the socket pair. */
3160 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
3162 return log_error_errno(errno
, "Failed to send UID shift: %m");
3163 if (l
!= sizeof(arg_uid_shift
)) {
3164 log_error("Short write while sending UID shift.");
3169 /* Turn directory into bind mount */
3170 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
3171 return log_error_errno(errno
, "Failed to make bind mount: %m");
3173 r
= setup_volatile(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
3177 r
= setup_volatile_state(directory
, arg_volatile_mode
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_context
);
/* The UID shift is used as owner UID and, cast to gid_t, as owner GID. */
3181 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
3185 if (arg_read_only
) {
3186 r
= bind_remount_recursive(directory
, true);
3188 return log_error_errno(r
, "Failed to make tree read-only: %m");
3191 r
= mount_all(directory
, false, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
3195 if (copy_devnodes(directory
) < 0)
3198 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
3200 if (setup_pts(directory
) < 0)
3203 r
= setup_propagate(directory
);
3207 r
= setup_dev_console(directory
, console
);
3211 r
= setup_seccomp();
3215 r
= setup_timezone(directory
);
3219 r
= setup_resolv_conf(directory
);
3223 r
= setup_journal(directory
);
3227 r
= mount_custom(directory
, arg_custom_mounts
, arg_n_custom_mounts
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
3231 r
= mount_cgroups(directory
, arg_unified_cgroup_hierarchy
, arg_userns
, arg_uid_shift
, arg_uid_range
, arg_selinux_apifs_context
);
3235 r
= mount_move_root(directory
);
3237 return log_error_errno(r
, "Failed to move root directory: %m");
/* Clone the inner child. IPC, PID and UTS namespaces are added unless
 * arg_share_system is set; network and user namespaces are added when
 * arg_private_network and arg_userns are set, respectively. */
3239 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
3240 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
3241 (arg_private_network
? CLONE_NEWNET
: 0) |
3242 (arg_userns
? CLONE_NEWUSER
: 0),
3245 return log_error_errno(errno
, "Failed to fork inner child: %m");
3247 pid_socket
= safe_close(pid_socket
);
3248 uid_shift_socket
= safe_close(uid_shift_socket
);
3250 /* The inner child has all namespaces that are
3251 * requested, so that we all are owned by the user if
3252 * user namespaces are turned on. */
3254 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
3256 _exit(EXIT_FAILURE
);
3258 _exit(EXIT_SUCCESS
);
/* Report the PID of the inner child through pid_socket; main() reads it
 * from the other end of the socket pair. */
3261 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
3263 return log_error_errno(errno
, "Failed to send PID: %m");
3264 if (l
!= sizeof(pid
)) {
3265 log_error("Short write while sending PID.");
3269 pid_socket
= safe_close(pid_socket
);
/* setup_uid_map(): write the UID and GID maps of process pid.
 *
 * One line is formatted from the three numbers 0, arg_uid_shift and
 * arg_uid_range (separated by blanks, terminated by a newline) and written
 * with write_string_file() first to /proc/PID/uid_map and then, unchanged,
 * to /proc/PID/gid_map. The uid_map buffer is reused for the second path.
 * Failures are logged through log_error_errno() and its result is returned.
 *
 * NOTE(review): this listing drops lines (the embedded line numbers are not
 * contiguous). The declaration of r, the tests that guard the two error
 * returns and the final return are not visible here. */
3274 static int setup_uid_map(pid_t pid
) {
3275 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
3280 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
3281 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
3282 r
= write_string_file(uid_map
, line
, 0);
3284 return log_error_errno(r
, "Failed to write UID map: %m");
3286 /* We always assign the same UID and GID ranges */
3287 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
3288 r
= write_string_file(uid_map
, line
, 0);
3290 return log_error_errno(r
, "Failed to write GID map: %m");
/* chown_cgroup(): change the ownership of cgroup attribute files of the
 * cgroup that process pid is in.
 *
 * The cgroup path of pid is looked up with cg_pid_get_path() (NULL
 * controller), translated into a file system path for
 * SYSTEMD_CGROUP_CONTROLLER with cg_get_path(), and that directory is opened.
 * A list of file names relative to the directory is then passed to
 * fchownat(), with arg_uid_shift used both as the new UID and the new GID.
 * A failing fchownat() is only logged (LOG_DEBUG for ENOENT, LOG_WARNING
 * otherwise) and ignored.
 *
 * path, fs and fd carry cleanup attributes, so they are released when the
 * function is left.
 *
 * NOTE(review): this listing drops lines (the embedded line numbers are not
 * contiguous). The loop header, some entries of the file name list, the
 * error tests and the final return are not visible here. */
3295 static int chown_cgroup(pid_t pid
) {
3296 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
3297 _cleanup_close_
int fd
= -1;
3301 r
= cg_pid_get_path(NULL
, pid
, &path
);
3303 return log_error_errno(r
, "Failed to get container cgroup path: %m");
3305 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
3307 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
3309 fd
= open(fs
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
3311 return log_error_errno(errno
, "Failed to open %s: %m", fs
);
3316 "notify_on_release",
3318 "cgroup.clone_children",
3319 "cgroup.controllers",
3320 "cgroup.subtree_control",
/* Best effort: a file that cannot be chowned is reported and skipped. */
3322 if (fchownat(fd
, fn
, arg_uid_shift
, arg_uid_shift
, 0) < 0)
3323 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
3324 "Failed to chown() cgroup file %s, ignoring: %m", fn
);
/* sync_cgroup(): mirror the cgroup membership of process pid into the other
 * cgroup hierarchy when host and container disagree about the unified
 * hierarchy.
 *
 * cg_unified() is compared with arg_unified_cgroup_hierarchy. For the
 * mismatch case the cgroup path of pid in SYSTEMD_CGROUP_CONTROLLER is
 * fetched, a cgroup file system is mounted on a temporary directory (name
 * template /tmp/unifiedXXXXXX), the PID is written to cgroup.procs below the
 * same cgroup path inside that mount, and the mount is removed again with
 * umount().
 *
 * Two mount() calls are visible, one with the options
 * none,name=systemd,xattr and one with __DEVEL__sane_behavior.
 * NOTE(review): the condition that selects between them is not visible in
 * this listing; presumably it depends on the value of unified. Confirm.
 *
 * NOTE(review): the word hierarchy is misspelled in the first log message
 * below. It is a runtime string and therefore left untouched here.
 *
 * NOTE(review): this listing drops lines (the embedded line numbers are not
 * contiguous). Several declarations, the creation of the temporary
 * directory, the error tests and the return statements are not visible. */
3329 static int sync_cgroup(pid_t pid
) {
3330 _cleanup_free_
char *cgroup
= NULL
;
3331 char tree
[] = "/tmp/unifiedXXXXXX", pid_string
[DECIMAL_STR_MAX(pid
) + 1];
3332 bool undo_mount
= false;
3336 unified
= cg_unified();
3338 return log_error_errno(unified
, "Failed to determine whether the unified hierachy is used: %m");
3340 if ((unified
> 0) == arg_unified_cgroup_hierarchy
)
3343 /* When the host uses the legacy cgroup setup, but the
3344 * container shall use the unified hierarchy, let's make sure
3345 * we copy the path from the name=systemd hierarchy into the
3346 * unified hierarchy. Similar for the reverse situation. */
3348 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
3350 return log_error_errno(r
, "Failed to get control group of " PID_FMT
": %m", pid
);
3352 /* In order to access the unified hierarchy we need to mount it */
3354 return log_error_errno(errno
, "Failed to generate temporary mount point for unified hierarchy: %m");
3357 r
= mount("cgroup", tree
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "none,name=systemd,xattr");
3359 r
= mount("cgroup", tree
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "__DEVEL__sane_behavior");
3361 r
= log_error_errno(errno
, "Failed to mount unified hierarchy: %m");
/* Target file: the cgroup.procs attribute of the same cgroup path, below
 * the temporary mount point. Missing parent directories are created. */
3367 fn
= strjoina(tree
, cgroup
, "/cgroup.procs");
3368 (void) mkdir_parents(fn
, 0755);
3370 sprintf(pid_string
, PID_FMT
, pid
);
3371 r
= write_string_file(fn
, pid_string
, 0);
3373 log_error_errno(r
, "Failed to move process: %m");
3377 (void) umount(tree
);
/* create_subcgroup(): split the current cgroup into one subgroup for the
 * container payload and one for the supervising process.
 *
 * The function looks at arg_unified_cgroup_hierarchy and at cg_unified(),
 * queries the supported controllers with cg_mask_supported(), and fetches
 * the cgroup path for PID 0 in SYSTEMD_CGROUP_CONTROLLER. It then creates
 * the payload subgroup and attaches pid to it, creates the supervisor
 * subgroup and attaches PID 0 to it, and finally calls
 * cg_enable_everywhere() with the supported controller mask on the parent
 * cgroup; the result of that last call is ignored.
 * NOTE(review): PID 0 presumably designates the calling process in the cg_*
 * helpers. Confirm against cgroup-util.
 *
 * NOTE(review): this listing drops lines (the embedded line numbers are not
 * contiguous). Several declarations, the early exits and the error tests
 * are not visible here. */
3383 static int create_subcgroup(pid_t pid
) {
3384 _cleanup_free_
char *cgroup
= NULL
;
3387 CGroupMask supported
;
3389 /* In the unified hierarchy inner nodes may only contain
3390 * subgroups, but not processes. Hence, if we are running in the
3391 * unified hierarchy and the container does the same, and we
3392 * did not create a scope unit for the container move us and
3393 * the container into two separate subcgroups. */
3398 if (!arg_unified_cgroup_hierarchy
)
3401 unified
= cg_unified();
3403 return log_error_errno(unified
, "Failed to determine whether the unified hierachy is used: %m");
3407 r
= cg_mask_supported(&supported
);
3409 return log_error_errno(r
, "Failed to determine supported controllers: %m");
3411 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &cgroup
);
3413 return log_error_errno(r
, "Failed to get our control group: %m");
/* Subgroup for the container: pid is attached to it. */
3415 child
= strjoina(cgroup
, "/payload");
3416 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, child
, pid
);
3418 return log_error_errno(r
, "Failed to create %s subcgroup: %m", child
);
/* Subgroup for the supervisor: PID 0 is attached to it. */
3420 child
= strjoina(cgroup
, "/supervisor");
3421 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, child
, 0);
3423 return log_error_errno(r
, "Failed to create %s subcgroup: %m", child
);
3425 /* Try to enable as many controllers as possible for the new payload. */
3426 (void) cg_enable_everywhere(supported
, supported
, cgroup
);
/* load_settings(): locate the .nspawn settings file for arg_machine, parse
 * it with settings_load() and copy its values into the arg_* globals.
 *
 * Lookup order, as far as visible:
 *   1. /etc/systemd/nspawn and /run/systemd/nspawn; a file found there makes
 *      arg_settings_trusted default to true;
 *   2. a file next to arg_image or arg_directory (file_in_same_dir()); here
 *      arg_settings_trusted defaults to false.
 * The default is only applied while arg_settings_trusted is still negative.
 *
 * Each group of settings is copied only if its bit in arg_settings_mask is
 * clear and the settings object actually carries a value. Pointer valued
 * members (parameters, environment, user, custom mounts, network lists,
 * bridge name, exposed ports) are moved: the old arg_* value is freed, the
 * pointer is taken over, and the member in settings is reset to NULL so
 * that the cleanup of settings does not free it again.
 *
 * For capabilities, machine ID, custom mounts, network and port settings a
 * warning is logged when the file is not trusted.
 * NOTE(review): the branches that follow those warnings are not visible in
 * this listing; presumably the assignment sits in an else branch so that
 * untrusted values are skipped. Confirm.
 *
 * NOTE(review): this listing drops lines (the embedded line numbers are not
 * contiguous). Several declarations, the fopen() calls, early returns and
 * else keywords are not visible here. */
3430 static int load_settings(void) {
3431 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
3432 _cleanup_fclose_
FILE *f
= NULL
;
3433 _cleanup_free_
char *p
= NULL
;
3437 /* If all settings are masked, there's no point in looking for
3438 * the settings file */
3439 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
/* The settings file is named after the machine, with suffix .nspawn. */
3442 fn
= strjoina(arg_machine
, ".nspawn");
3444 /* We first look in the admin's directories in /etc and /run */
3445 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
3446 _cleanup_free_
char *j
= NULL
;
3448 j
= strjoin(i
, "/", fn
, NULL
);
3457 /* By default we trust configuration from /etc and /run */
3458 if (arg_settings_trusted
< 0)
3459 arg_settings_trusted
= true;
3464 if (errno
!= ENOENT
)
3465 return log_error_errno(errno
, "Failed to open %s: %m", j
);
3469 /* After that, let's look for a file next to the
3470 * actual image we shall boot. */
3473 p
= file_in_same_dir(arg_image
, fn
);
3476 } else if (arg_directory
) {
3477 p
= file_in_same_dir(arg_directory
, fn
);
3484 if (!f
&& errno
!= ENOENT
)
3485 return log_error_errno(errno
, "Failed to open %s: %m", p
);
3487 /* By default we do not trust configuration from /var/lib/machines */
3488 if (arg_settings_trusted
< 0)
3489 arg_settings_trusted
= false;
3496 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
3498 r
= settings_load(f
, p
, &settings
);
3502 /* Copy over bits from the settings, unless they have been
3503 * explicitly masked by command line switches. */
/* Boot flag and parameter list. */
3505 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
3506 settings
->boot
>= 0) {
3507 arg_boot
= settings
->boot
;
3509 strv_free(arg_parameters
);
3510 arg_parameters
= settings
->parameters
;
3511 settings
->parameters
= NULL
;
/* Environment block. */
3514 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
3515 settings
->environment
) {
3516 strv_free(arg_setenv
);
3517 arg_setenv
= settings
->environment
;
3518 settings
->environment
= NULL
;
/* User name. */
3521 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
3524 arg_user
= settings
->user
;
3525 settings
->user
= NULL
;
/* Capabilities: the retained set may be extended by capability and is
 * reduced by drop_capability. */
3528 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
3530 if (!arg_settings_trusted
&& settings
->capability
!= 0)
3531 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
3533 arg_retain
|= settings
->capability
;
3535 arg_retain
&= ~settings
->drop_capability
;
/* Kill signal. */
3538 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
3539 settings
->kill_signal
> 0)
3540 arg_kill_signal
= settings
->kill_signal
;
/* Personality. */
3542 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
3543 settings
->personality
!= PERSONALITY_INVALID
)
3544 arg_personality
= settings
->personality
;
/* Machine ID. */
3546 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
3547 !sd_id128_is_null(settings
->machine_id
)) {
3549 if (!arg_settings_trusted
)
3550 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
3552 arg_uuid
= settings
->machine_id
;
/* Read-only flag. */
3555 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
3556 settings
->read_only
>= 0)
3557 arg_read_only
= settings
->read_only
;
/* Volatile mode. */
3559 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
3560 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
3561 arg_volatile_mode
= settings
->volatile_mode
;
/* Custom mounts: array and element count are taken over together. */
3563 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
3564 settings
->n_custom_mounts
> 0) {
3566 if (!arg_settings_trusted
)
3567 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
3569 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
3570 arg_custom_mounts
= settings
->custom_mounts
;
3571 arg_n_custom_mounts
= settings
->n_custom_mounts
;
3573 settings
->custom_mounts
= NULL
;
3574 settings
->n_custom_mounts
= 0;
/* Network settings: taken over as a group as soon as any of them is set. */
3578 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
3579 (settings
->private_network
>= 0 ||
3580 settings
->network_veth
>= 0 ||
3581 settings
->network_bridge
||
3582 settings
->network_interfaces
||
3583 settings
->network_macvlan
||
3584 settings
->network_ipvlan
)) {
3586 if (!arg_settings_trusted
)
3587 log_warning("Ignoring network settings, file %s is not trusted.", p
);
3589 strv_free(arg_network_interfaces
);
3590 arg_network_interfaces
= settings
->network_interfaces
;
3591 settings
->network_interfaces
= NULL
;
3593 strv_free(arg_network_macvlan
);
3594 arg_network_macvlan
= settings
->network_macvlan
;
3595 settings
->network_macvlan
= NULL
;
3597 strv_free(arg_network_ipvlan
);
3598 arg_network_ipvlan
= settings
->network_ipvlan
;
3599 settings
->network_ipvlan
= NULL
;
3601 free(arg_network_bridge
);
3602 arg_network_bridge
= settings
->network_bridge
;
3603 settings
->network_bridge
= NULL
;
/* NOTE(review): settings->network_bridge has just been reset to NULL by the
 * statement above, so the second operand below is always false at this
 * point. Presumably arg_network_bridge was meant. Confirm. */
3605 arg_network_veth
= settings
->network_veth
> 0 || settings
->network_bridge
;
3607 arg_private_network
= true; /* all these settings imply private networking */
/* Exposed ports. */
3611 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
3612 settings
->expose_ports
) {
3614 if (!arg_settings_trusted
)
3615 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
3617 expose_port_free_all(arg_expose_ports
);
3618 arg_expose_ports
= settings
->expose_ports
;
3619 settings
->expose_ports
= NULL
;
3626 int main(int argc
, char *argv
[]) {
3628 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
3629 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
3630 _cleanup_close_
int master
= -1, image_fd
= -1;
3631 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
3632 int r
, n_fd_passed
, loop_nr
= -1;
3633 char veth_name
[IFNAMSIZ
];
3634 bool secondary
= false, remove_subvol
= false;
3637 int ret
= EXIT_SUCCESS
;
3638 union in_addr_union exposed
= {};
3639 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
3642 log_parse_environment();
3645 r
= parse_argv(argc
, argv
);
3649 if (geteuid() != 0) {
3650 log_error("Need to be root.");
3654 r
= determine_names();
3658 r
= load_settings();
3662 r
= verify_arguments();
3666 n_fd_passed
= sd_listen_fds(false);
3667 if (n_fd_passed
> 0) {
3668 r
= fdset_new_listen_fds(&fds
, false);
3670 log_error_errno(r
, "Failed to collect file descriptors: %m");
3675 if (arg_directory
) {
3678 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
3679 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3684 if (arg_ephemeral
) {
3685 _cleanup_free_
char *np
= NULL
;
3687 /* If the specified path is a mount point we
3688 * generate the new snapshot immediately
3689 * inside it under a random name. However if
3690 * the specified path is not a mount point we
3691 * create the new snapshot in the parent
3692 * directory, just next to it. */
3693 r
= path_is_mount_point(arg_directory
, 0);
3695 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
3699 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
3701 r
= tempfn_random(arg_directory
, "machine.", &np
);
3703 log_error_errno(r
, "Failed to generate name for snapshot: %m");
3707 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3709 log_error_errno(r
, "Failed to lock %s: %m", np
);
3713 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
3715 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
3719 free(arg_directory
);
3723 remove_subvol
= true;
3726 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3728 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
3732 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
3737 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
3740 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
3742 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
3746 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
3752 if (path_is_os_tree(arg_directory
) <= 0) {
3753 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
3760 p
= strjoina(arg_directory
,
3761 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
3762 if (access(p
, F_OK
) < 0) {
3763 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
3770 char template[] = "/tmp/nspawn-root-XXXXXX";
3773 assert(!arg_template
);
3775 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
3777 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
3781 r
= log_error_errno(r
, "Failed to create image lock: %m");
3785 if (!mkdtemp(template)) {
3786 log_error_errno(errno
, "Failed to create temporary directory: %m");
3791 arg_directory
= strdup(template);
3792 if (!arg_directory
) {
3797 image_fd
= setup_image(&device_path
, &loop_nr
);
3803 r
= dissect_image(image_fd
,
3804 &root_device
, &root_device_rw
,
3805 &home_device
, &home_device_rw
,
3806 &srv_device
, &srv_device_rw
,
3812 r
= custom_mounts_prepare();
3817 isatty(STDIN_FILENO
) > 0 &&
3818 isatty(STDOUT_FILENO
) > 0;
3820 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
3822 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
3826 r
= ptsname_malloc(master
, &console
);
3828 r
= log_error_errno(r
, "Failed to determine tty name: %m");
3832 if (unlockpt(master
) < 0) {
3833 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
3838 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3839 arg_machine
, arg_image
?: arg_directory
);
3841 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
3843 assert_se(sigemptyset(&mask_chld
) == 0);
3844 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
3846 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
3847 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
3852 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
3853 uid_shift_socket_pair
[2] = { -1, -1 };
3854 ContainerStatus container_status
;
3855 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
3856 static const struct sigaction sa
= {
3857 .sa_handler
= nop_handler
,
3858 .sa_flags
= SA_NOCLDSTOP
,
3862 _cleanup_event_unref_ sd_event
*event
= NULL
;
3863 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
3864 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3867 r
= barrier_create(&barrier
);
3869 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
3873 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
3874 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
3878 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
3879 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
3883 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
3884 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
3889 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
3890 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
3894 /* Child can be killed before execv(), so handle SIGCHLD
3895 * in order to interrupt parent's blocking calls and
3896 * give it a chance to call wait() and terminate. */
3897 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
3899 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
3903 r
= sigaction(SIGCHLD
, &sa
, NULL
);
3905 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
3909 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
3911 if (errno
== EINVAL
)
3912 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3914 r
= log_error_errno(errno
, "clone() failed: %m");
3920 /* The outer child only has a file system namespace. */
3921 barrier_set_role(&barrier
, BARRIER_CHILD
);
3923 master
= safe_close(master
);
3925 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
3926 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
3927 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
3928 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
3930 (void) reset_all_signal_handlers();
3931 (void) reset_signal_mask();
3933 r
= outer_child(&barrier
,
3936 root_device
, root_device_rw
,
3937 home_device
, home_device_rw
,
3938 srv_device
, srv_device_rw
,
3942 kmsg_socket_pair
[1],
3943 rtnl_socket_pair
[1],
3944 uid_shift_socket_pair
[1],
3947 _exit(EXIT_FAILURE
);
3949 _exit(EXIT_SUCCESS
);
3952 barrier_set_role(&barrier
, BARRIER_PARENT
);
3957 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
3958 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
3959 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
3961 /* Wait for the outer child. */
3962 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
3971 /* And now retrieve the PID of the inner child. */
3972 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
3974 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
3977 if (l
!= sizeof(pid
)) {
3978 log_error("Short read while reading inner child PID: %m");
3983 log_debug("Init process invoked as PID " PID_FMT
, pid
);
3986 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
3987 log_error("Child died too early.");
3992 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3994 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
3997 if (l
!= sizeof(arg_uid_shift
)) {
3998 log_error("Short read while reading UID shift: %m");
4003 r
= setup_uid_map(pid
);
4007 (void) barrier_place(&barrier
); /* #2 */
4010 if (arg_private_network
) {
4012 r
= move_network_interfaces(pid
, arg_network_interfaces
);
4016 if (arg_network_veth
) {
4017 r
= setup_veth(arg_machine
, pid
, veth_name
, !!arg_network_bridge
);
4023 if (arg_network_bridge
) {
4024 r
= setup_bridge(veth_name
, arg_network_bridge
);
4032 r
= setup_macvlan(arg_machine
, pid
, arg_network_macvlan
);
4036 r
= setup_ipvlan(arg_machine
, pid
, arg_network_ipvlan
);
4041 r
= register_machine(pid
, ifi
);
4045 r
= sync_cgroup(pid
);
4049 r
= create_subcgroup(pid
);
4053 r
= chown_cgroup(pid
);
4057 /* Notify the child that the parent is ready with all
4058 * its setup (including cgroup-ification), and that
4059 * the child can now hand over control to the code to
4060 * run inside the container. */
4061 (void) barrier_place(&barrier
); /* #3 */
4063 /* Block SIGCHLD here, before notifying child.
4064 * process_pty() will handle it with the other signals. */
4065 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
4067 /* Reset signal to default */
4068 r
= default_signals(SIGCHLD
, -1);
4070 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
4074 /* Let the child know that we are ready and wait until the child is completely ready, too. */
4075 if (!barrier_place_and_sync(&barrier
)) { /* #5 */
4076 log_error("Client died too early.");
4083 "STATUS=Container running.\n"
4084 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
4086 r
= sd_event_new(&event
);
4088 log_error_errno(r
, "Failed to get default event source: %m");
4092 if (arg_kill_signal
> 0) {
4093 /* Try to kill the init system on SIGINT or SIGTERM */
4094 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4095 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4097 /* Immediately exit */
4098 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
4099 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
4102 /* simply exit on sigchld */
4103 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
4105 if (arg_expose_ports
) {
4106 r
= expose_port_watch_rtnl(event
, rtnl_socket_pair
[0], on_address_change
, &exposed
, &rtnl
);
4110 (void) expose_port_execute(rtnl
, arg_expose_ports
, &exposed
);
4113 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
4115 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
4117 log_error_errno(r
, "Failed to create PTY forwarder: %m");
4121 r
= sd_event_loop(event
);
4123 log_error_errno(r
, "Failed to run event loop: %m");
4127 pty_forward_get_last_char(forward
, &last_char
);
4129 forward
= pty_forward_free(forward
);
4131 if (!arg_quiet
&& last_char
!= '\n')
4134 /* Kill if it is not dead yet anyway */
4135 terminate_machine(pid
);
4137 /* Normally redundant, but better safe than sorry */
4140 r
= wait_for_container(pid
, &container_status
);
4144 /* We failed to wait for the container, or the
4145 * container exited abnormally */
4147 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
4148 /* The container exited with a non-zero
4149 * status, or with zero status and no reboot
4155 /* CONTAINER_REBOOTED, loop again */
4157 if (arg_keep_unit
) {
4158 /* Special handling if we are running as a
4159 * service: instead of simply restarting the
4160 * machine we want to restart the entire
4161 * service, so let's inform systemd about this
4162 * with the special exit code 133. The service
4163 * file uses RestartForceExitStatus=133 so
4164 * that this results in a full nspawn
4165 * restart. This is necessary since we might
4166 * have cgroup parameters set we want to have
4173 expose_port_flush(arg_expose_ports
, &exposed
);
4179 "STATUS=Terminating...");
4184 /* Try to flush whatever is still queued in the pty */
4186 (void) copy_bytes(master
, STDOUT_FILENO
, (off_t
) -1, false);
4188 loop_remove(loop_nr
, &image_fd
);
4190 if (remove_subvol
&& arg_directory
) {
4193 k
= btrfs_subvol_remove(arg_directory
, true);
4195 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
4201 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
4202 (void) rm_rf(p
, REMOVE_ROOT
);
4205 expose_port_flush(arg_expose_ports
, &exposed
);
4207 free(arg_directory
);
4212 strv_free(arg_setenv
);
4213 free(arg_network_bridge
);
4214 strv_free(arg_network_interfaces
);
4215 strv_free(arg_network_macvlan
);
4216 strv_free(arg_network_ipvlan
);
4217 strv_free(arg_parameters
);
4218 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
4219 expose_port_free_all(arg_expose_ports
);
4221 return r
< 0 ? EXIT_FAILURE
: ret
;