1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
44 #include <selinux/selinux.h>
52 #include <blkid/blkid.h>
55 #include "sd-daemon.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
66 #include "cgroup-util.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
75 #include "bus-error.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
82 #include "siphash24.h"
84 #include "base-filesystem.h"
86 #include "event-util.h"
87 #include "capability.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
102 #include "seccomp-util.h"
106 #include "nspawn-settings.h"
/* Status values for a container; only CONTAINER_TERMINATED is visible in
 * this listing, the remaining enumerators are not shown here. */
108 typedef enum ContainerStatus
{
109 CONTAINER_TERMINATED
,
/* Journal linking mode, stored in arg_link_journal. The enumerator list is
 * not visible here; the values this file refers to are LINK_NO, LINK_AUTO,
 * LINK_GUEST and LINK_HOST. */
113 typedef enum LinkJournal
{
/* File-scope settings with their built-in defaults. Most of them are
 * assigned by parse_argv() from the command line. */
120 static char *arg_directory
= NULL
;
121 static char *arg_template
= NULL
;
122 static char *arg_user
= NULL
;
123 static sd_id128_t arg_uuid
= {};
124 static char *arg_machine
= NULL
;
125 static const char *arg_selinux_context
= NULL
;
126 static const char *arg_selinux_apifs_context
= NULL
;
127 static const char *arg_slice
= NULL
;
128 static bool arg_private_network
= false;
129 static bool arg_read_only
= false;
130 static bool arg_boot
= false;
131 static bool arg_ephemeral
= false;
132 static LinkJournal arg_link_journal
= LINK_AUTO
;
133 static bool arg_link_journal_try
= false;
/* Bit mask (1ULL << CAP_xxx) of capabilities retained by default.
 * parse_argv() later ORs in the --capability= bits (plus CAP_NET_ADMIN when
 * private networking is on) and clears the --drop-capability= bits. */
134 static uint64_t arg_retain
=
135 (1ULL << CAP_CHOWN
) |
136 (1ULL << CAP_DAC_OVERRIDE
) |
137 (1ULL << CAP_DAC_READ_SEARCH
) |
138 (1ULL << CAP_FOWNER
) |
139 (1ULL << CAP_FSETID
) |
140 (1ULL << CAP_IPC_OWNER
) |
142 (1ULL << CAP_LEASE
) |
143 (1ULL << CAP_LINUX_IMMUTABLE
) |
144 (1ULL << CAP_NET_BIND_SERVICE
) |
145 (1ULL << CAP_NET_BROADCAST
) |
146 (1ULL << CAP_NET_RAW
) |
147 (1ULL << CAP_SETGID
) |
148 (1ULL << CAP_SETFCAP
) |
149 (1ULL << CAP_SETPCAP
) |
150 (1ULL << CAP_SETUID
) |
151 (1ULL << CAP_SYS_ADMIN
) |
152 (1ULL << CAP_SYS_CHROOT
) |
153 (1ULL << CAP_SYS_NICE
) |
154 (1ULL << CAP_SYS_PTRACE
) |
155 (1ULL << CAP_SYS_TTY_CONFIG
) |
156 (1ULL << CAP_SYS_RESOURCE
) |
157 (1ULL << CAP_SYS_BOOT
) |
158 (1ULL << CAP_AUDIT_WRITE
) |
159 (1ULL << CAP_AUDIT_CONTROL
) |
/* Array of custom mounts (--bind, --tmpfs, --overlay) and its length. */
161 static CustomMount
*arg_custom_mounts
= NULL
;
162 static unsigned arg_n_custom_mounts
= 0;
163 static char **arg_setenv
= NULL
;
164 static bool arg_quiet
= false;
165 static bool arg_share_system
= false;
166 static bool arg_register
= true;
167 static bool arg_keep_unit
= false;
168 static char **arg_network_interfaces
= NULL
;
169 static char **arg_network_macvlan
= NULL
;
170 static char **arg_network_ipvlan
= NULL
;
171 static bool arg_network_veth
= false;
172 static char *arg_network_bridge
= NULL
;
173 static unsigned long arg_personality
= PERSONALITY_INVALID
;
174 static char *arg_image
= NULL
;
175 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
176 static ExposePort
*arg_expose_ports
= NULL
;
177 static char **arg_property
= NULL
;
/* UID_INVALID as shift is treated as "automatic UID shift" by
 * custom_mounts_prepare(); the default range is 0x10000 UIDs. */
178 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
179 static bool arg_userns
= false;
180 static int arg_kill_signal
= 0;
181 static bool arg_unified_cgroup_hierarchy
= false;
/* Bit mask of settings given explicitly on the command line. */
182 static SettingsMask arg_settings_mask
= 0;
/* Tri-state: parse_argv() assigns true, false or -1 depending on the
 * --settings= value; -1 is also the default. */
183 static int arg_settings_trusted
= -1;
184 static char **arg_parameters
= NULL
;
/* Print the usage summary to stdout; the leading %s is filled with
 * program_invocation_short_name.
 *
 * NOTE(review): two typos in the text below are worth fixing in a code
 * change: "--bind-ro=PATH[:PATH[:OPTIONS]" lacks its closing bracket, and
 * "Create a ipvlan" should read "Create an ipvlan". */
186 static void help(void) {
187 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
188 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
189 " -h --help Show this help\n"
190 " --version Print version string\n"
191 " -q --quiet Do not show status information\n"
192 " -D --directory=PATH Root directory for the container\n"
193 " --template=PATH Initialize root directory from template directory,\n"
195 " -x --ephemeral Run container with snapshot of root directory, and\n"
196 " remove it after exit\n"
197 " -i --image=PATH File system device or disk image for the container\n"
198 " -b --boot Boot up full system (i.e. invoke init)\n"
199 " -u --user=USER Run the command under specified user or uid\n"
200 " -M --machine=NAME Set the machine name for the container\n"
201 " --uuid=UUID Set a specific machine UUID for the container\n"
202 " -S --slice=SLICE Place the container in the specified slice\n"
203 " --property=NAME=VALUE Set scope unit property\n"
204 " --private-users[=UIDBASE[:NUIDS]]\n"
205 " Run within user namespace\n"
206 " --private-network Disable network in container\n"
207 " --network-interface=INTERFACE\n"
208 " Assign an existing network interface to the\n"
210 " --network-macvlan=INTERFACE\n"
211 " Create a macvlan network interface based on an\n"
212 " existing network interface to the container\n"
213 " --network-ipvlan=INTERFACE\n"
214 " Create a ipvlan network interface based on an\n"
215 " existing network interface to the container\n"
216 " -n --network-veth Add a virtual ethernet connection between host\n"
218 " --network-bridge=INTERFACE\n"
219 " Add a virtual ethernet connection between host\n"
220 " and container and add it to an existing bridge on\n"
222 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223 " Expose a container IP port on the host\n"
224 " -Z --selinux-context=SECLABEL\n"
225 " Set the SELinux security context to be used by\n"
226 " processes in the container\n"
227 " -L --selinux-apifs-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " API/tmpfs file systems in the container\n"
230 " --capability=CAP In addition to the default, retain specified\n"
232 " --drop-capability=CAP Drop the specified capability from the default set\n"
233 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
234 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
235 " try-guest, try-host\n"
236 " -j Equivalent to --link-journal=try-guest\n"
237 " --read-only Mount the root directory read-only\n"
238 " --bind=PATH[:PATH[:OPTIONS]]\n"
239 " Bind mount a file or directory from the host into\n"
241 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
242 " Similar, but creates a read-only bind mount\n"
243 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
244 " --overlay=PATH[:PATH...]:PATH\n"
245 " Create an overlay mount from the host to \n"
247 " --overlay-ro=PATH[:PATH...]:PATH\n"
248 " Similar, but creates a read-only overlay mount\n"
249 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
250 " --share-system Share system namespaces with host\n"
251 " --register=BOOLEAN Register container as machine\n"
252 " --keep-unit Do not register a scope for the machine, reuse\n"
253 " the service unit nspawn is running in\n"
254 " --volatile[=MODE] Run the system in volatile mode\n"
255 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
256 , program_invocation_short_name
);
/* Append one entry of type t to the custom mount array.
 *
 * l: address of the array pointer; the array is resized with realloc() to
 *    hold *n + 1 entries.
 * n: address of the current entry count.
 * t: mount type, asserted to be below _CUSTOM_MOUNT_TYPE_MAX.
 *
 * The new entry is initialised with only .type set (all other members zero).
 * NOTE(review): the realloc() failure check and the update of *l and *n are
 * on lines not visible in this listing. */
259 static CustomMount
* custom_mount_add(CustomMount
**l
, unsigned *n
, CustomMountType t
) {
260 CustomMount
*c
, *ret
;
265 assert(t
< _CUSTOM_MOUNT_TYPE_MAX
);
267 c
= realloc(*l
, (*n
+ 1) * sizeof(CustomMount
));
275 *ret
= (CustomMount
) { .type
= t
};
/* Release the resources held by the n entries of array l: for each entry the
 * destination string is freed and work_dir is removed with rm_rf()
 * (REMOVE_ROOT|REMOVE_PHYSICAL); the result of the removal is deliberately
 * ignored. Other members may be released on lines not visible here. */
280 void custom_mount_free_all(CustomMount
*l
, unsigned n
) {
283 for (i
= 0; i
< n
; i
++) {
284 CustomMount
*m
= l
+ i
;
287 free(m
->destination
);
291 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
/* Comparison callback for sorting CustomMount entries (passed to qsort_safe()
 * by custom_mounts_prepare()). Entries are compared by destination path via
 * path_compare() first, then by type; the return statements themselves are
 * not visible in this listing. */
301 static int custom_mount_compare(const void *a
, const void *b
) {
302 const CustomMount
*x
= a
, *y
= b
;
305 r
= path_compare(x
->destination
, y
->destination
);
309 if (x
->type
< y
->type
)
311 if (x
->type
> y
->type
)
/* Prepare arg_custom_mounts for mounting:
 *  - sort the array with custom_mount_compare();
 *  - log an error for a custom mount on "/" when --private-users is used
 *    with an automatic UID shift (arg_uid_shift == UID_INVALID);
 *  - for overlay mounts, generate a random work directory name derived from
 *    the source path via tempfn_random() and store it in work_dir.
 * Returns a negative errno-style value when tempfn_random() fails; other
 * return paths are not visible in this listing. */
317 static int custom_mounts_prepare(void) {
321 /* Ensure the mounts are applied prefix first. */
322 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
324 /* Allocate working directories for the overlay file systems that need it */
325 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
326 CustomMount
*m
= &arg_custom_mounts
[i
];
328 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
329 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
333 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
342 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
344 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Store a cleaned-up copy of path in *b. The path is resolved with
 * canonicalize_file_name(); path_make_absolute_cwd() is also used
 * (presumably as fallback when canonicalisation fails -- the condition is not
 * visible here). The result is passed through path_kill_slashes() before
 * being assigned to *b. */
350 static int set_sanitized_path(char **b
, const char *path
) {
356 p
= canonicalize_file_name(path
);
361 p
= path_make_absolute_cwd(path
);
367 *b
= path_kill_slashes(p
);
/* Set arg_unified_cgroup_hierarchy. The environment variable
 * $UNIFIED_CGROUP_HIERARCHY, parsed as a boolean, is consulted first; the
 * second assignment inherits the host's setting (the call producing that
 * value is not visible in this listing). Failures are logged and returned
 * via log_error_errno(). */
371 static int detect_unified_cgroup_hierarchy(void) {
375 /* Allow the user to control whether the unified hierarchy is used */
376 e
= getenv("UNIFIED_CGROUP_HIERARCHY");
378 r
= parse_boolean(e
);
380 return log_error_errno(r
, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
382 arg_unified_cgroup_hierarchy
= r
;
386 /* Otherwise inherit the default from the host system */
389 return log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
391 arg_unified_cgroup_hierarchy
= r
;
/* Convert string s to a VolatileMode. The string is first tried as a boolean
 * with parse_boolean(); the literal "state" maps to VOLATILE_STATE; anything
 * unrecognised yields _VOLATILE_MODE_INVALID. The mapping of the boolean
 * result is on lines not visible in this listing. */
395 VolatileMode
volatile_mode_from_string(const char *s
) {
399 return _VOLATILE_MODE_INVALID
;
401 b
= parse_boolean(s
);
407 if (streq(s
, "state"))
408 return VOLATILE_STATE
;
410 return _VOLATILE_MODE_INVALID
;
/* Parse a port specification s and prepend a new ExposePort to list *l.
 *
 * An optional "tcp:" or "udp:" prefix selects the protocol; IPPROTO_TCP is
 * also assigned when no prefix matched. If the remainder contains ':', the
 * part before it is parsed as host port and the part after it as container
 * port; otherwise the single number is used for both. Both ports are parsed
 * with safe_atou16() and a value of 0 is rejected.
 *
 * NOTE(review): the duplicate check walks arg_expose_ports rather than *l.
 * That is equivalent only while every caller passes &arg_expose_ports (as
 * parse_argv() does); iterating *l would match the parameter.
 * NOTE(review): "<= 0" on the uint16_t ports is effectively "== 0". */
413 int expose_port_parse(ExposePort
**l
, const char *s
) {
415 const char *split
, *e
;
416 uint16_t container_port
, host_port
;
421 if ((e
= startswith(s
, "tcp:")))
422 protocol
= IPPROTO_TCP
;
423 else if ((e
= startswith(s
, "udp:")))
424 protocol
= IPPROTO_UDP
;
427 protocol
= IPPROTO_TCP
;
430 split
= strchr(e
, ':');
432 char v
[split
- e
+ 1];
434 memcpy(v
, e
, split
- e
);
437 r
= safe_atou16(v
, &host_port
);
438 if (r
< 0 || host_port
<= 0)
441 r
= safe_atou16(split
+ 1, &container_port
);
443 r
= safe_atou16(e
, &container_port
);
444 host_port
= container_port
;
447 if (r
< 0 || container_port
<= 0)
450 LIST_FOREACH(ports
, p
, arg_expose_ports
)
451 if (p
->protocol
== protocol
&& p
->host_port
== host_port
)
454 p
= new(ExposePort
, 1);
458 p
->protocol
= protocol
;
459 p
->host_port
= host_port
;
460 p
->container_port
= container_port
;
462 LIST_PREPEND(ports
, *l
, p
);
/* Parse a --bind=/--bind-ro= argument s and append a CUSTOM_MOUNT_BIND entry
 * to the array *l / *n.
 *
 * The string is split on ':' (separators not coalesced) into source and
 * destination; the destination is a copy of the source in the strdup()
 * branch (presumably when none was given -- condition not visible). Text
 * remaining after two fields is handled in the "r == 2 && !isempty(p)"
 * branch, whose body is not visible here. Both paths must be absolute.
 *
 * read_only is stored in the new entry. After the strings have been handed
 * to the entry the locals are reset to NULL so that _cleanup_free_ does not
 * release them. */
467 int bind_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
, bool read_only
) {
468 _cleanup_free_
char *source
= NULL
, *destination
= NULL
, *opts
= NULL
;
476 r
= extract_many_words(&p
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
, &source
, &destination
, NULL
);
483 destination
= strdup(source
);
488 if (r
== 2 && !isempty(p
)) {
494 if (!path_is_absolute(source
))
497 if (!path_is_absolute(destination
))
500 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_BIND
);
505 m
->destination
= destination
;
506 m
->read_only
= read_only
;
509 source
= destination
= opts
= NULL
;
/* Parse a --tmpfs= argument s and append a CUSTOM_MOUNT_TMPFS entry to the
 * array *l / *n. The first ':'-separated word is the mount path, which must
 * be absolute. "mode=0755" is the option string used in the strdup() branch
 * (presumably when no options follow the path -- condition not visible). */
513 int tmpfs_mount_parse(CustomMount
**l
, unsigned *n
, const char *s
) {
514 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
523 r
= extract_first_word(&p
, &path
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
530 opts
= strdup("mode=0755");
536 if (!path_is_absolute(path
))
539 m
= custom_mount_add(l
, n
, CUSTOM_MOUNT_TMPFS
);
543 m
->destination
= path
;
/* Parse the command line into the arg_* settings.
 *
 * Options are read with getopt_long() using the table below and the short
 * option string "+hD:u:bL:M:jS:Z:qi:xp:n". Options that correspond to a
 * setting also set the matching SETTING_* bit in arg_settings_mask.
 *
 * After the option loop:
 *  - --share-system turns off registration (arg_register = false);
 *  - conflicting combinations are reported with log_error();
 *  - remaining arguments (argv + optind) are copied into arg_parameters;
 *  - the --settings= mode either clears arg_settings_mask
 *    (mask_no_settings) or sets it to _SETTINGS_MASK_ALL
 *    (mask_all_settings);
 *  - arg_retain is recomputed from the --capability= ("plus") and
 *    --drop-capability= ("minus") bit sets, adding CAP_NET_ADMIN when
 *    private networking is enabled;
 *  - detect_unified_cgroup_hierarchy() is called.
 *
 * NOTE(review): case labels, error returns and the final return value are
 * largely on lines not visible in this listing. */
550 static int parse_argv(int argc
, char *argv
[]) {
569 ARG_NETWORK_INTERFACE
,
582 static const struct option options
[] = {
583 { "help", no_argument
, NULL
, 'h' },
584 { "version", no_argument
, NULL
, ARG_VERSION
},
585 { "directory", required_argument
, NULL
, 'D' },
586 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
587 { "ephemeral", no_argument
, NULL
, 'x' },
588 { "user", required_argument
, NULL
, 'u' },
589 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
590 { "boot", no_argument
, NULL
, 'b' },
591 { "uuid", required_argument
, NULL
, ARG_UUID
},
592 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
593 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
594 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
595 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
596 { "bind", required_argument
, NULL
, ARG_BIND
},
597 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
598 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
599 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
600 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
601 { "machine", required_argument
, NULL
, 'M' },
602 { "slice", required_argument
, NULL
, 'S' },
603 { "setenv", required_argument
, NULL
, ARG_SETENV
},
604 { "selinux-context", required_argument
, NULL
, 'Z' },
605 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
606 { "quiet", no_argument
, NULL
, 'q' },
607 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
608 { "register", required_argument
, NULL
, ARG_REGISTER
},
609 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
610 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
611 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
612 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
613 { "network-veth", no_argument
, NULL
, 'n' },
614 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
615 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
616 { "image", required_argument
, NULL
, 'i' },
617 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
618 { "port", required_argument
, NULL
, 'p' },
619 { "property", required_argument
, NULL
, ARG_PROPERTY
},
620 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
621 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
622 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
627 uint64_t plus
= 0, minus
= 0;
628 bool mask_all_settings
= false, mask_no_settings
= false;
633 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
642 puts(PACKAGE_STRING
);
643 puts(SYSTEMD_FEATURES
);
647 r
= set_sanitized_path(&arg_directory
, optarg
);
649 return log_error_errno(r
, "Invalid root directory: %m");
654 r
= set_sanitized_path(&arg_template
, optarg
);
656 return log_error_errno(r
, "Invalid template directory: %m");
661 r
= set_sanitized_path(&arg_image
, optarg
);
663 return log_error_errno(r
, "Invalid image path: %m");
668 arg_ephemeral
= true;
672 r
= free_and_strdup(&arg_user
, optarg
);
676 arg_settings_mask
|= SETTING_USER
;
679 case ARG_NETWORK_BRIDGE
:
680 r
= free_and_strdup(&arg_network_bridge
, optarg
);
687 arg_network_veth
= true;
688 arg_private_network
= true;
689 arg_settings_mask
|= SETTING_NETWORK
;
692 case ARG_NETWORK_INTERFACE
:
693 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
696 arg_private_network
= true;
697 arg_settings_mask
|= SETTING_NETWORK
;
700 case ARG_NETWORK_MACVLAN
:
701 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
704 arg_private_network
= true;
705 arg_settings_mask
|= SETTING_NETWORK
;
708 case ARG_NETWORK_IPVLAN
:
709 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
714 case ARG_PRIVATE_NETWORK
:
715 arg_private_network
= true;
716 arg_settings_mask
|= SETTING_NETWORK
;
721 arg_settings_mask
|= SETTING_BOOT
;
725 r
= sd_id128_from_string(optarg
, &arg_uuid
);
727 log_error("Invalid UUID: %s", optarg
);
731 arg_settings_mask
|= SETTING_MACHINE_ID
;
740 arg_machine
= mfree(arg_machine
);
742 if (!machine_name_is_valid(optarg
)) {
743 log_error("Invalid machine name: %s", optarg
);
747 r
= free_and_strdup(&arg_machine
, optarg
);
755 arg_selinux_context
= optarg
;
759 arg_selinux_apifs_context
= optarg
;
763 arg_read_only
= true;
764 arg_settings_mask
|= SETTING_READ_ONLY
;
768 case ARG_DROP_CAPABILITY
: {
769 const char *state
, *word
;
772 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
773 _cleanup_free_
char *t
;
775 t
= strndup(word
, length
);
779 if (streq(t
, "all")) {
780 if (c
== ARG_CAPABILITY
)
781 plus
= (uint64_t) -1;
783 minus
= (uint64_t) -1;
787 cap
= capability_from_name(t
);
789 log_error("Failed to parse capability %s.", t
);
793 if (c
== ARG_CAPABILITY
)
794 plus
|= 1ULL << (uint64_t) cap
;
796 minus
|= 1ULL << (uint64_t) cap
;
800 arg_settings_mask
|= SETTING_CAPABILITY
;
805 arg_link_journal
= LINK_GUEST
;
806 arg_link_journal_try
= true;
809 case ARG_LINK_JOURNAL
:
810 if (streq(optarg
, "auto")) {
811 arg_link_journal
= LINK_AUTO
;
812 arg_link_journal_try
= false;
813 } else if (streq(optarg
, "no")) {
814 arg_link_journal
= LINK_NO
;
815 arg_link_journal_try
= false;
816 } else if (streq(optarg
, "guest")) {
817 arg_link_journal
= LINK_GUEST
;
818 arg_link_journal_try
= false;
819 } else if (streq(optarg
, "host")) {
820 arg_link_journal
= LINK_HOST
;
821 arg_link_journal_try
= false;
822 } else if (streq(optarg
, "try-guest")) {
823 arg_link_journal
= LINK_GUEST
;
824 arg_link_journal_try
= true;
825 } else if (streq(optarg
, "try-host")) {
826 arg_link_journal
= LINK_HOST
;
827 arg_link_journal_try
= true;
829 log_error("Failed to parse link journal mode %s", optarg
);
837 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
839 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
841 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
845 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
847 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
849 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
853 case ARG_OVERLAY_RO
: {
854 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
855 _cleanup_strv_free_
char **lower
= NULL
;
860 r
= strv_split_extract(&lower
, optarg
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
864 log_error("Invalid overlay specification: %s", optarg
);
868 STRV_FOREACH(i
, lower
) {
869 if (!path_is_absolute(*i
)) {
870 log_error("Overlay path %s is not absolute.", *i
);
878 log_error("--overlay= needs at least two colon-separated directories specified.");
883 /* If two parameters are specified,
884 * the first one is the lower, the
885 * second one the upper directory. And
886 * we'll also define the destination
887 * mount point the same as the upper. */
891 destination
= strdup(upper
);
896 upper
= lower
[n
- 2];
897 destination
= lower
[n
- 1];
901 m
= custom_mount_add(&arg_custom_mounts
, &arg_n_custom_mounts
, CUSTOM_MOUNT_OVERLAY
);
905 m
->destination
= destination
;
908 m
->read_only
= c
== ARG_OVERLAY_RO
;
910 upper
= destination
= NULL
;
913 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
920 if (!env_assignment_is_valid(optarg
)) {
921 log_error("Environment variable assignment '%s' is not valid.", optarg
);
925 n
= strv_env_set(arg_setenv
, optarg
);
929 strv_free(arg_setenv
);
932 arg_settings_mask
|= SETTING_ENVIRONMENT
;
940 case ARG_SHARE_SYSTEM
:
941 arg_share_system
= true;
945 r
= parse_boolean(optarg
);
947 log_error("Failed to parse --register= argument: %s", optarg
);
955 arg_keep_unit
= true;
958 case ARG_PERSONALITY
:
960 arg_personality
= personality_from_string(optarg
);
961 if (arg_personality
== PERSONALITY_INVALID
) {
962 log_error("Unknown or unsupported personality '%s'.", optarg
);
966 arg_settings_mask
|= SETTING_PERSONALITY
;
972 arg_volatile_mode
= VOLATILE_YES
;
976 m
= volatile_mode_from_string(optarg
);
978 log_error("Failed to parse --volatile= argument: %s", optarg
);
981 arg_volatile_mode
= m
;
984 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
988 r
= expose_port_parse(&arg_expose_ports
, optarg
);
990 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
992 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
994 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
998 if (strv_extend(&arg_property
, optarg
) < 0)
1003 case ARG_PRIVATE_USERS
:
1005 _cleanup_free_
char *buffer
= NULL
;
1006 const char *range
, *shift
;
1008 range
= strchr(optarg
, ':');
1010 buffer
= strndup(optarg
, range
- optarg
);
1016 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
1017 log_error("Failed to parse UID range: %s", range
);
1023 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
1024 log_error("Failed to parse UID: %s", optarg
);
1032 case ARG_KILL_SIGNAL
:
1033 arg_kill_signal
= signal_from_string_try_harder(optarg
);
1034 if (arg_kill_signal
< 0) {
1035 log_error("Cannot parse signal: %s", optarg
);
1039 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
1044 /* no → do not read files
1045 * yes → read files, do not override cmdline, trust only subset
1046 * override → read files, override cmdline, trust only subset
1047 * trusted → read files, do not override cmdline, trust all
1050 r
= parse_boolean(optarg
);
1052 if (streq(optarg
, "trusted")) {
1053 mask_all_settings
= false;
1054 mask_no_settings
= false;
1055 arg_settings_trusted
= true;
1057 } else if (streq(optarg
, "override")) {
1058 mask_all_settings
= false;
1059 mask_no_settings
= true;
1060 arg_settings_trusted
= -1;
1062 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
1065 mask_all_settings
= false;
1066 mask_no_settings
= false;
1067 arg_settings_trusted
= -1;
1070 mask_all_settings
= true;
1071 mask_no_settings
= false;
1072 arg_settings_trusted
= false;
1081 assert_not_reached("Unhandled option");
1084 if (arg_share_system
)
1085 arg_register
= false;
1087 if (arg_boot
&& arg_share_system
) {
1088 log_error("--boot and --share-system may not be combined.");
1092 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
1093 log_error("--keep-unit may not be used when invoked from a user session.");
1097 if (arg_directory
&& arg_image
) {
1098 log_error("--directory= and --image= may not be combined.");
1102 if (arg_template
&& arg_image
) {
1103 log_error("--template= and --image= may not be combined.");
1107 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
1108 log_error("--template= needs --directory= or --machine=.");
1112 if (arg_ephemeral
&& arg_template
) {
1113 log_error("--ephemeral and --template= may not be combined.");
1117 if (arg_ephemeral
&& arg_image
) {
1118 log_error("--ephemeral and --image= may not be combined.");
1122 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
1123 log_error("--ephemeral and --link-journal= may not be combined.");
1127 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
1128 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
1130 if (argc
> optind
) {
1131 arg_parameters
= strv_copy(argv
+ optind
);
1132 if (!arg_parameters
)
1135 arg_settings_mask
|= SETTING_BOOT
;
1138 /* Load all settings from .nspawn files */
1139 if (mask_no_settings
)
1140 arg_settings_mask
= 0;
1142 /* Don't load any settings from .nspawn files */
1143 if (mask_all_settings
)
1144 arg_settings_mask
= _SETTINGS_MASK_ALL
;
1146 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
1148 r
= detect_unified_cgroup_hierarchy();
/* Cross-check the combined settings:
 *  - --read-only together with a volatile mode other than VOLATILE_NO is
 *    reported as an error;
 *  - --port= without private networking is reported as an error;
 *  - when booting and no kill signal was chosen (arg_kill_signal <= 0), the
 *    kill signal defaults to SIGRTMIN+3.
 * The return statements are not visible in this listing. */
1155 static int verify_arguments(void) {
1157 if (arg_volatile_mode
!= VOLATILE_NO
&& arg_read_only
) {
1158 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1162 if (arg_expose_ports
&& !arg_private_network
) {
1163 log_error("Cannot use --port= without private networking.");
1167 if (arg_boot
&& arg_kill_signal
<= 0)
1168 arg_kill_signal
= SIGRTMIN
+3;
/* Build an adjusted tmpfs mount option string from options.
 *
 * With user namespacing enabled and a non-zero UID shift, "uid=" and "gid="
 * (both set to arg_uid_shift) are added; with an SELinux API file system
 * context configured, context="..." is added. Each addition is appended
 * after a comma, or forms the whole string in the variant that does not use
 * options (presumably when options is NULL -- condition not visible).
 *
 * The result is passed back through ret; mount_tmpfs() treats a return
 * value > 0 as "a new string was produced".
 *
 * NOTE(review): the asprintf() return values are discarded with (void);
 * whether buf is checked afterwards is on lines not visible here. */
1173 static int tmpfs_patch_options(const char *options
, char **ret
) {
1176 if (arg_userns
&& arg_uid_shift
!= 0) {
1177 assert(arg_uid_shift
!= UID_INVALID
);
1180 (void) asprintf(&buf
, "%s,uid=" UID_FMT
",gid=" UID_FMT
, options
, arg_uid_shift
, arg_uid_shift
);
1182 (void) asprintf(&buf
, "uid=" UID_FMT
",gid=" UID_FMT
, arg_uid_shift
, arg_uid_shift
);
1190 if (arg_selinux_apifs_context
) {
1194 t
= strjoin(options
, ",context=\"", arg_selinux_apifs_context
, "\"", NULL
);
1196 t
= strjoin("context=\"", arg_selinux_apifs_context
, "\"", NULL
);
/* Mount the API file systems listed in mount_table below the directory dest.
 *
 * Only table entries whose userns flag equals the userns argument are
 * processed. For each such entry the target path is dest + where; an entry
 * that names a source ("what") and whose target is already a mount point is
 * skipped. The target directory is created with mkdir_p(), tmpfs options are
 * adjusted via tmpfs_patch_options(), and mount() is called. For entries
 * marked fatal a failure is returned as error, otherwise it is only logged
 * as warning.
 *
 * Table columns, as used below: what, where, type, options, flags, fatal,
 * userns. */
1211 static int mount_all(const char *dest
, bool userns
) {
1213 typedef struct MountPoint
{
1217 const char *options
;
1218 unsigned long flags
;
1223 static const MountPoint mount_table
[] = {
1224 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, true },
1225 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, true, true }, /* Bind mount first */
1226 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, true, true }, /* Then, make it r/o */
1227 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, false },
1228 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true, false },
1229 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1230 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1231 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME
, true, false },
1233 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, false, false }, /* Bind mount first */
1234 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, false, false }, /* Then, make it r/o */
1241 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
1242 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
1245 if (userns
!= mount_table
[k
].userns
)
1248 where
= prefix_root(dest
, mount_table
[k
].where
);
1252 r
= path_is_mount_point(where
, AT_SYMLINK_FOLLOW
);
1253 if (r
< 0 && r
!= -ENOENT
)
1254 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
1256 /* Skip this entry if it is not a remount. */
1257 if (mount_table
[k
].what
&& r
> 0)
1260 r
= mkdir_p(where
, 0755);
1262 if (mount_table
[k
].fatal
)
1263 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
1265 log_warning_errno(r
, "Failed to create directory %s: %m", where
);
1269 o
= mount_table
[k
].options
;
1270 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
1271 r
= tmpfs_patch_options(o
, &options
);
1278 if (mount(mount_table
[k
].what
,
1280 mount_table
[k
].type
,
1281 mount_table
[k
].flags
,
1284 if (mount_table
[k
].fatal
)
1285 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1287 log_warning_errno(errno
, "mount(%s) failed, ignoring: %m", where
);
/* Parse the comma-separated bind mount option string options.
 *
 * Words are taken one at a time with extract_first_word(). "rbind" and
 * "norbind" are recognised (their effect on the flags is on lines not
 * visible here); any other word is logged as invalid. The resulting flags,
 * starting from the caller's *mount_flags, are written back to *mount_flags.
 * mount_opts is reserved for string options, per the comment below. */
1294 static int parse_mount_bind_options(const char *options
, unsigned long *mount_flags
, char **mount_opts
) {
1295 const char *p
= options
;
1296 unsigned long flags
= *mount_flags
;
1302 _cleanup_free_
char *word
= NULL
;
1303 int r
= extract_first_word(&p
, &word
, ",", 0);
1305 return log_error_errno(r
, "Failed to extract mount option: %m");
1309 if (streq(word
, "rbind"))
1311 else if (streq(word
, "norbind"))
1314 log_error("Invalid bind mount option: %s", word
);
1319 *mount_flags
= flags
;
1320 /* in the future mount_opts will hold string options for mount(2) */
/* Bind mount m->source onto dest + m->destination.
 *
 * The mount flags start as MS_BIND|MS_REC and may be adjusted by
 * parse_mount_bind_options() from m->options. Source and target are
 * stat()ed: mounting a directory onto a non-directory, or the reverse, is
 * reported as an error. If the target does not exist (ENOENT) its parent
 * directories are created, then the mount point itself is created
 * (mkdir_label() for directory sources; -EEXIST is tolerated). After the
 * mount(), bind_remount_recursive(where, true) is used for the read-only
 * case (its guarding condition is not visible in this listing). */
1326 static int mount_bind(const char *dest
, CustomMount
*m
) {
1327 struct stat source_st
, dest_st
;
1329 unsigned long mount_flags
= MS_BIND
| MS_REC
;
1330 _cleanup_free_
char *mount_opts
= NULL
;
1336 r
= parse_mount_bind_options(m
->options
, &mount_flags
, &mount_opts
);
1341 if (stat(m
->source
, &source_st
) < 0)
1342 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
1344 where
= prefix_roota(dest
, m
->destination
);
1346 if (stat(where
, &dest_st
) >= 0) {
1347 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
1348 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
1352 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
1353 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
1357 } else if (errno
== ENOENT
) {
1358 r
= mkdir_parents_label(where
, 0755);
1360 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
1362 log_error_errno(errno
, "Failed to stat %s: %m", where
);
1366 /* Create the mount point. Any non-directory file can be
1367 * mounted on any non-directory file (regular, fifo, socket,
1370 if (S_ISDIR(source_st
.st_mode
))
1371 r
= mkdir_label(where
, 0755);
1374 if (r
< 0 && r
!= -EEXIST
)
1375 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
1377 if (mount(m
->source
, where
, NULL
, mount_flags
, mount_opts
) < 0)
1378 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1381 r
= bind_remount_recursive(where
, true);
1383 return log_error_errno(r
, "Read-only bind mount failed: %m");
// Mount a tmpfs at dest + m->destination.
//
// The mount point is created with mkdir_p_label() (mode 0755; -EEXIST is
// tolerated). The option string is m->options, or the adjusted copy made by
// tmpfs_patch_options() when that returns > 0. The file system is mounted
// with MS_NODEV|MS_STRICTATIME; a mount() failure is logged and returned.
1389 static int mount_tmpfs(const char *dest
, CustomMount
*m
) {
1390 const char *where
, *options
;
1391 _cleanup_free_
char *buf
= NULL
;
1397 where
= prefix_roota(dest
, m
->destination
);
1399 r
= mkdir_p_label(where
, 0755);
1400 if (r
< 0 && r
!= -EEXIST
)
1401 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
1403 r
= tmpfs_patch_options(m
->options
, &buf
);
1406 options
= r
> 0 ? buf
: m
->options
;
1408 if (mount("tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
) < 0)
1409 return log_error_errno(errno
, "tmpfs mount to %s failed: %m", where
);
// Return a newly built string holding the entries of lower joined with ':'.
// The list is copied first, and the copy is passed through
// strv_shell_escape() with the character set ",:" before joining, so the
// caller's list is not modified by this function.
1414 static char *joined_and_escaped_lower_dirs(char * const *lower
) {
1415 _cleanup_strv_free_
char **sv
= NULL
;
1417 sv
= strv_copy(lower
);
1423 if (!strv_shell_escape(sv
, ",:"))
1426 return strv_join(sv
, ":");
// Mount an overlay file system at dest + m->destination.
//
// The mount point is created with mkdir_label() (-EEXIST tolerated) and
// m->source with mkdir_p_label() (result ignored). Two option layouts are
// built, with paths escaped for ',' and ':':
//   - "lowerdir=<source>:<lower dirs>"
//   - "lowerdir=<lower dirs>,upperdir=<source>,workdir=<work_dir>", which
//     requires m->work_dir and creates it with mode 0700.
// Which layout applies is decided on a line not visible in this listing
// (presumably m->read_only). MS_RDONLY is passed to mount() when
// m->read_only is set.
1429 static int mount_overlay(const char *dest
, CustomMount
*m
) {
1430 _cleanup_free_
char *lower
= NULL
;
1431 const char *where
, *options
;
1437 where
= prefix_roota(dest
, m
->destination
);
1439 r
= mkdir_label(where
, 0755);
1440 if (r
< 0 && r
!= -EEXIST
)
1441 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
1443 (void) mkdir_p_label(m
->source
, 0755);
1445 lower
= joined_and_escaped_lower_dirs(m
->lower
);
1450 _cleanup_free_
char *escaped_source
= NULL
;
1452 escaped_source
= shell_escape(m
->source
, ",:");
1453 if (!escaped_source
)
1456 options
= strjoina("lowerdir=", escaped_source
, ":", lower
);
1458 _cleanup_free_
char *escaped_source
= NULL
, *escaped_work_dir
= NULL
;
1460 assert(m
->work_dir
);
1461 (void) mkdir_label(m
->work_dir
, 0700);
1463 escaped_source
= shell_escape(m
->source
, ",:");
1464 if (!escaped_source
)
1466 escaped_work_dir
= shell_escape(m
->work_dir
, ",:");
1467 if (!escaped_work_dir
)
1470 options
= strjoina("lowerdir=", lower
, ",upperdir=", escaped_source
, ",workdir=", escaped_work_dir
);
1473 if (mount("overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
) < 0)
1474 return log_error_errno(errno
, "overlay mount to %s failed: %m", where
);
// Apply every entry of arg_custom_mounts below dest, in array order,
// dispatching on the entry type to mount_bind(), mount_tmpfs() or
// mount_overlay(). An unknown type trips assert_not_reached().
1479 static int mount_custom(const char *dest
) {
1485 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
1486 CustomMount
*m
= &arg_custom_mounts
[i
];
1490 case CUSTOM_MOUNT_BIND
:
1491 r
= mount_bind(dest
, m
);
1494 case CUSTOM_MOUNT_TMPFS
:
1495 r
= mount_tmpfs(dest
, m
);
1498 case CUSTOM_MOUNT_OVERLAY
:
1499 r
= mount_overlay(dest
, m
);
1503 assert_not_reached("Unknown custom mount type");
// Mount one legacy cgroup hierarchy at dest + "/sys/fs/cgroup/" + hierarchy.
//
// controller is passed as the mount data string. The hierarchy is mounted
// with MS_NOSUID|MS_NOEXEC|MS_NODEV; a second mount() call remounts it as a
// read-only bind mount (presumably only when read_only is set -- the
// condition is not visible in this listing). Failures are logged and
// returned as negative errno-style values.
1513 static int mount_legacy_cgroup_hierarchy(const char *dest
, const char *controller
, const char *hierarchy
, bool read_only
) {
1517 to
= strjoina(dest
, "/sys/fs/cgroup/", hierarchy
);
1519 r
= path_is_mount_point(to
, 0);
1520 if (r
< 0 && r
!= -ENOENT
)
1521 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
1527 /* The superblock mount options of the mount point need to be
1528 * identical to the hosts', and hence writable... */
1529 if (mount("cgroup", to
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, controller
) < 0)
1530 return log_error_errno(errno
, "Failed to mount to %s: %m", to
);
1532 /* ... hence let's only make the bind mount read-only, not the
1535 if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1536 return log_error_errno(errno
, "Failed to remount %s read-only: %m", to
);
// Set up the legacy cgroup tree below dest + "/sys/fs/cgroup".
//
// A tmpfs (mode=755, options adjusted by tmpfs_patch_options()) is mounted
// there when needed. Unless cg_unified() reports > 0, the kernel's
// controllers are enumerated with cg_kernel_controllers(); for each one,
// /sys/fs/cgroup/<controller> is inspected with readlink_malloc(): if it is
// not a symlink the controller is mounted as its own hierarchy, otherwise
// the link target names a combined hierarchy, which is mounted and a symlink
// to it is created in the container. Both kinds are mounted with
// read_only = true. The "systemd" named hierarchy
// ("none,name=systemd,xattr") is mounted with read_only = false, and the
// tmpfs is finally remounted with MS_RDONLY.
1541 static int mount_legacy_cgroups(const char *dest
) {
1542 _cleanup_set_free_free_ Set
*controllers
= NULL
;
1543 const char *cgroup_root
;
1546 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1548 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
1549 r
= path_is_mount_point(cgroup_root
, AT_SYMLINK_FOLLOW
);
1551 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
1553 _cleanup_free_
char *options
= NULL
;
1555 r
= tmpfs_patch_options("mode=755", &options
);
1559 if (mount("tmpfs", cgroup_root
, "tmpfs", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
) < 0)
1560 return log_error_errno(errno
, "Failed to mount /sys/fs/cgroup: %m");
1563 if (cg_unified() > 0)
1564 goto skip_controllers
;
1566 controllers
= set_new(&string_hash_ops
);
1570 r
= cg_kernel_controllers(controllers
);
1572 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1575 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
1577 controller
= set_steal_first(controllers
);
1581 origin
= prefix_root("/sys/fs/cgroup/", controller
);
1585 r
= readlink_malloc(origin
, &combined
);
1587 /* Not a symbolic link, but directly a single cgroup hierarchy */
1589 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, true);
1594 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
1596 _cleanup_free_
char *target
= NULL
;
1598 target
= prefix_root(dest
, origin
);
1602 /* A symbolic link, a combination of controllers in one hierarchy */
1604 if (!filename_is_valid(combined
)) {
1605 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
1609 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, true);
1613 r
= symlink_idempotent(combined
, target
);
1615 log_error("Invalid existing symlink for combined hierarchy");
1619 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1624 r
= mount_legacy_cgroup_hierarchy(dest
, "none,name=systemd,xattr", "systemd", false);
1628 if (mount(NULL
, cgroup_root
, NULL
, MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755") < 0)
1629 return log_error_errno(errno
, "Failed to remount %s read-only: %m", cgroup_root
);
1634 static int mount_unified_cgroups(const char *dest
) {
1640 p
= strjoina(dest
, "/sys/fs/cgroup");
1642 r
= path_is_mount_point(p
, AT_SYMLINK_FOLLOW
);
1644 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
1646 p
= strjoina(dest
, "/sys/fs/cgroup/cgroup.procs");
1647 if (access(p
, F_OK
) >= 0)
1649 if (errno
!= ENOENT
)
1650 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
1652 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
1656 if (mount("cgroup", p
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "__DEVEL__sane_behavior") < 0)
1657 return log_error_errno(errno
, "Failed to mount unified cgroup hierarchy to %s: %m", p
);
1662 static int mount_cgroups(const char *dest
) {
1663 if (arg_unified_cgroup_hierarchy
)
1664 return mount_unified_cgroups(dest
);
1666 return mount_legacy_cgroups(dest
);
1669 static int mount_systemd_cgroup_writable(const char *dest
) {
1670 _cleanup_free_
char *own_cgroup_path
= NULL
;
1671 const char *systemd_root
, *systemd_own
;
1676 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
1678 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
1680 /* If we are living in the top-level, then there's nothing to do... */
1681 if (path_equal(own_cgroup_path
, "/"))
1684 if (arg_unified_cgroup_hierarchy
) {
1685 systemd_own
= strjoina(dest
, "/sys/fs/cgroup", own_cgroup_path
);
1686 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1688 systemd_own
= strjoina(dest
, "/sys/fs/cgroup/systemd", own_cgroup_path
);
1689 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
1692 /* Make our own cgroup a (writable) bind mount */
1693 if (mount(systemd_own
, systemd_own
, NULL
, MS_BIND
, NULL
) < 0)
1694 return log_error_errno(errno
, "Failed to turn %s into a bind mount: %m", own_cgroup_path
);
1696 /* And then remount the systemd cgroup root read-only */
1697 if (mount(NULL
, systemd_root
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1698 return log_error_errno(errno
, "Failed to mount cgroup root read-only: %m");
1703 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1709 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1712 if (uid
!= UID_INVALID
) {
1713 uid
+= arg_uid_shift
;
1715 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1719 if (gid
!= GID_INVALID
) {
1720 gid
+= (gid_t
) arg_uid_shift
;
1722 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1726 if (lchown(p
, uid
, gid
) < 0)
/* Creates 'path' below 'root' with the given mode and hands ownership to the
 * (shifted) uid/gid via userns_lchown(). An already existing directory is
 * left alone and counts as success. Returns 0 or a negative errno value. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        if (errno == EEXIST)
                return 0;

        return -errno;
}
1745 static int setup_timezone(const char *dest
) {
1746 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1747 const char *where
, *check
, *what
;
1753 /* Fix the timezone, if possible */
1754 r
= readlink_malloc("/etc/localtime", &p
);
1756 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1760 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1762 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1764 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1768 where
= prefix_roota(dest
, "/etc/localtime");
1769 r
= readlink_malloc(where
, &q
);
1771 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1773 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1775 /* Already pointing to the right place? Then do nothing .. */
1776 if (y
&& streq(y
, z
))
1780 check
= strjoina("/usr/share/zoneinfo/", z
);
1781 check
= prefix_root(dest
, check
);
1782 if (laccess(check
, F_OK
) < 0) {
1783 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1788 if (r
< 0 && errno
!= ENOENT
) {
1789 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1793 what
= strjoina("../usr/share/zoneinfo/", z
);
1794 if (symlink(what
, where
) < 0) {
1795 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1799 r
= userns_lchown(where
, 0, 0);
1801 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1806 static int setup_resolv_conf(const char *dest
) {
1807 const char *where
= NULL
;
1812 if (arg_private_network
)
1815 /* Fix resolv.conf, if possible */
1816 where
= prefix_roota(dest
, "/etc/resolv.conf");
1818 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1820 /* If the file already exists as symlink, let's
1821 * suppress the warning, under the assumption that
1822 * resolved or something similar runs inside and the
1823 * symlink points there.
1825 * If the disk image is read-only, there's also no
1826 * point in complaining.
1828 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1829 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1833 r
= userns_lchown(where
, 0, 0);
1835 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1840 static int setup_volatile_state(const char *directory
) {
1841 _cleanup_free_
char *buf
= NULL
;
1842 const char *p
, *options
;
1847 if (arg_volatile_mode
!= VOLATILE_STATE
)
1850 /* --volatile=state means we simply overmount /var
1851 with a tmpfs, and the rest read-only. */
1853 r
= bind_remount_recursive(directory
, true);
1855 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
1857 p
= prefix_roota(directory
, "/var");
1859 if (r
< 0 && errno
!= EEXIST
)
1860 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1862 options
= "mode=755";
1863 r
= tmpfs_patch_options(options
, &buf
);
1869 if (mount("tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
) < 0)
1870 return log_error_errno(errno
, "Failed to mount tmpfs to /var: %m");
1875 static int setup_volatile(const char *directory
) {
1876 bool tmpfs_mounted
= false, bind_mounted
= false;
1877 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1878 _cleanup_free_
char *buf
= NULL
;
1879 const char *f
, *t
, *options
;
1884 if (arg_volatile_mode
!= VOLATILE_YES
)
1887 /* --volatile=yes means we mount a tmpfs to the root dir, and
1888 the original /usr to use inside it, and that read-only. */
1890 if (!mkdtemp(template))
1891 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1893 options
= "mode=755";
1894 r
= tmpfs_patch_options(options
, &buf
);
1900 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME
, options
) < 0) {
1901 r
= log_error_errno(errno
, "Failed to mount tmpfs for root directory: %m");
1905 tmpfs_mounted
= true;
1907 f
= prefix_roota(directory
, "/usr");
1908 t
= prefix_roota(template, "/usr");
1911 if (r
< 0 && errno
!= EEXIST
) {
1912 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1916 if (mount(f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
1917 r
= log_error_errno(errno
, "Failed to create /usr bind mount: %m");
1921 bind_mounted
= true;
1923 r
= bind_remount_recursive(t
, true);
1925 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
1929 if (mount(template, directory
, NULL
, MS_MOVE
, NULL
) < 0) {
1930 r
= log_error_errno(errno
, "Failed to move root mount: %m");
1934 (void) rmdir(template);
1943 (void) umount(template);
1944 (void) rmdir(template);
1948 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1952 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1953 SD_ID128_FORMAT_VAL(id
));
1958 static int setup_boot_id(const char *dest
) {
1959 const char *from
, *to
;
1960 sd_id128_t rnd
= {};
1964 if (arg_share_system
)
1967 /* Generate a new randomized boot ID, so that each boot-up of
1968 * the container gets a new one */
1970 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1971 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1973 r
= sd_id128_randomize(&rnd
);
1975 return log_error_errno(r
, "Failed to generate random boot id: %m");
1977 id128_format_as_uuid(rnd
, as_uuid
);
1979 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1981 return log_error_errno(r
, "Failed to write boot id: %m");
1983 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1984 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1985 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1986 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1992 static int copy_devnodes(const char *dest
) {
1994 static const char devnodes
[] =
2005 _cleanup_umask_ mode_t u
;
2011 /* Create /dev/net, so that we can create /dev/net/tun in it */
2012 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
2013 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
2015 NULSTR_FOREACH(d
, devnodes
) {
2016 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
2019 from
= strappend("/dev/", d
);
2020 to
= prefix_root(dest
, from
);
2022 if (stat(from
, &st
) < 0) {
2024 if (errno
!= ENOENT
)
2025 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
2027 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
2029 log_error("%s is not a char or block device, cannot copy.", from
);
2033 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
2035 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
2037 /* Some systems abusively restrict mknod but
2038 * allow bind mounts. */
2041 return log_error_errno(r
, "touch (%s) failed: %m", to
);
2042 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
2043 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
2046 r
= userns_lchown(to
, 0, 0);
2048 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
2055 static int setup_pts(const char *dest
) {
2056 _cleanup_free_
char *options
= NULL
;
2060 if (arg_selinux_apifs_context
)
2061 (void) asprintf(&options
,
2062 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
2063 arg_uid_shift
+ TTY_GID
,
2064 arg_selinux_apifs_context
);
2067 (void) asprintf(&options
,
2068 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
2069 arg_uid_shift
+ TTY_GID
);
2074 /* Mount /dev/pts itself */
2075 p
= prefix_roota(dest
, "/dev/pts");
2076 if (mkdir(p
, 0755) < 0)
2077 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
2078 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
2079 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
2080 if (userns_lchown(p
, 0, 0) < 0)
2081 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
2083 /* Create /dev/ptmx symlink */
2084 p
= prefix_roota(dest
, "/dev/ptmx");
2085 if (symlink("pts/ptmx", p
) < 0)
2086 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
2087 if (userns_lchown(p
, 0, 0) < 0)
2088 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
2090 /* And fix /dev/pts/ptmx ownership */
2091 p
= prefix_roota(dest
, "/dev/pts/ptmx");
2092 if (userns_lchown(p
, 0, 0) < 0)
2093 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
2098 static int setup_dev_console(const char *dest
, const char *console
) {
2099 _cleanup_umask_ mode_t u
;
2108 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
2110 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
2112 /* We need to bind mount the right tty to /dev/console since
2113 * ptys can only exist on pts file systems. To have something
2114 * to bind mount things on we create a empty regular file. */
2116 to
= prefix_roota(dest
, "/dev/console");
2119 return log_error_errno(r
, "touch() for /dev/console failed: %m");
2121 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
2122 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
2127 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
2128 const char *from
, *to
;
2129 _cleanup_umask_ mode_t u
;
2132 struct cmsghdr cmsghdr
;
2133 uint8_t buf
[CMSG_SPACE(sizeof(int))];
2135 struct msghdr mh
= {
2136 .msg_control
= &control
,
2137 .msg_controllen
= sizeof(control
),
2139 struct cmsghdr
*cmsg
;
2141 assert(kmsg_socket
>= 0);
2145 /* We create the kmsg FIFO as /run/kmsg, but immediately
2146 * delete it after bind mounting it to /proc/kmsg. While FIFOs
2147 * on the reading side behave very similar to /proc/kmsg,
2148 * their writing side behaves differently from /dev/kmsg in
2149 * that writing blocks when nothing is reading. In order to
2150 * avoid any problems with containers deadlocking due to this
2151 * we simply make /dev/kmsg unavailable to the container. */
2152 from
= prefix_roota(dest
, "/run/kmsg");
2153 to
= prefix_roota(dest
, "/proc/kmsg");
2155 if (mkfifo(from
, 0600) < 0)
2156 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
2157 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
2158 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
2160 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
2162 return log_error_errno(errno
, "Failed to open fifo: %m");
2164 cmsg
= CMSG_FIRSTHDR(&mh
);
2165 cmsg
->cmsg_level
= SOL_SOCKET
;
2166 cmsg
->cmsg_type
= SCM_RIGHTS
;
2167 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
2168 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
2170 mh
.msg_controllen
= cmsg
->cmsg_len
;
2172 /* Store away the fd in the socket, so that it stays open as
2173 * long as we run the child */
2174 k
= sendmsg(kmsg_socket
, &mh
, MSG_NOSIGNAL
);
2178 return log_error_errno(errno
, "Failed to send FIFO fd: %m");
2180 /* And now make the FIFO unavailable as /run/kmsg... */
2181 (void) unlink(from
);
2186 static int send_rtnl(int send_fd
) {
2188 struct cmsghdr cmsghdr
;
2189 uint8_t buf
[CMSG_SPACE(sizeof(int))];
2191 struct msghdr mh
= {
2192 .msg_control
= &control
,
2193 .msg_controllen
= sizeof(control
),
2195 struct cmsghdr
*cmsg
;
2196 _cleanup_close_
int fd
= -1;
2199 assert(send_fd
>= 0);
2201 if (!arg_expose_ports
)
2204 fd
= socket(PF_NETLINK
, SOCK_RAW
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, NETLINK_ROUTE
);
2206 return log_error_errno(errno
, "Failed to allocate container netlink: %m");
2208 cmsg
= CMSG_FIRSTHDR(&mh
);
2209 cmsg
->cmsg_level
= SOL_SOCKET
;
2210 cmsg
->cmsg_type
= SCM_RIGHTS
;
2211 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
2212 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
2214 mh
.msg_controllen
= cmsg
->cmsg_len
;
2216 /* Store away the fd in the socket, so that it stays open as
2217 * long as we run the child */
2218 k
= sendmsg(send_fd
, &mh
, MSG_NOSIGNAL
);
2220 return log_error_errno(errno
, "Failed to send netlink fd: %m");
2225 static int flush_ports(union in_addr_union
*exposed
) {
2227 int r
, af
= AF_INET
;
2231 if (!arg_expose_ports
)
2234 if (in_addr_is_null(af
, exposed
))
2237 log_debug("Lost IP address.");
2239 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
2240 r
= fw_add_local_dnat(false,
2251 log_warning_errno(r
, "Failed to modify firewall: %m");
2254 *exposed
= IN_ADDR_NULL
;
2258 static int expose_ports(sd_netlink
*rtnl
, union in_addr_union
*exposed
) {
2259 _cleanup_free_
struct local_address
*addresses
= NULL
;
2260 _cleanup_free_
char *pretty
= NULL
;
2261 union in_addr_union new_exposed
;
2264 int af
= AF_INET
, r
;
2268 /* Invoked each time an address is added or removed inside the
2271 if (!arg_expose_ports
)
2274 r
= local_addresses(rtnl
, 0, af
, &addresses
);
2276 return log_error_errno(r
, "Failed to enumerate local addresses: %m");
2279 addresses
[0].family
== af
&&
2280 addresses
[0].scope
< RT_SCOPE_LINK
;
2283 return flush_ports(exposed
);
2285 new_exposed
= addresses
[0].address
;
2286 if (in_addr_equal(af
, exposed
, &new_exposed
))
2289 in_addr_to_string(af
, &new_exposed
, &pretty
);
2290 log_debug("New container IP is %s.", strna(pretty
));
2292 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
2294 r
= fw_add_local_dnat(true,
2303 in_addr_is_null(af
, exposed
) ? NULL
: exposed
);
2305 log_warning_errno(r
, "Failed to modify firewall: %m");
2308 *exposed
= new_exposed
;
2312 void expose_port_free_all(ExposePort
*p
) {
2316 LIST_REMOVE(ports
, p
, q
);
2321 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
2322 union in_addr_union
*exposed
= userdata
;
2328 expose_ports(rtnl
, exposed
);
2332 static int watch_rtnl(sd_event
*event
, int recv_fd
, union in_addr_union
*exposed
, sd_netlink
**ret
) {
2334 struct cmsghdr cmsghdr
;
2335 uint8_t buf
[CMSG_SPACE(sizeof(int))];
2337 struct msghdr mh
= {
2338 .msg_control
= &control
,
2339 .msg_controllen
= sizeof(control
),
2341 struct cmsghdr
*cmsg
;
2342 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2347 assert(recv_fd
>= 0);
2350 if (!arg_expose_ports
)
2353 k
= recvmsg(recv_fd
, &mh
, MSG_NOSIGNAL
);
2355 return log_error_errno(errno
, "Failed to recv netlink fd: %m");
2357 cmsg
= CMSG_FIRSTHDR(&mh
);
2358 assert(cmsg
->cmsg_level
== SOL_SOCKET
);
2359 assert(cmsg
->cmsg_type
== SCM_RIGHTS
);
2360 assert(cmsg
->cmsg_len
== CMSG_LEN(sizeof(int)));
2361 memcpy(&fd
, CMSG_DATA(cmsg
), sizeof(int));
2363 r
= sd_netlink_open_fd(&rtnl
, fd
);
2366 return log_error_errno(r
, "Failed to create rtnl object: %m");
2369 r
= sd_netlink_add_match(rtnl
, RTM_NEWADDR
, on_address_change
, exposed
);
2371 return log_error_errno(r
, "Failed to subscribe to RTM_NEWADDR messages: %m");
2373 r
= sd_netlink_add_match(rtnl
, RTM_DELADDR
, on_address_change
, exposed
);
2375 return log_error_errno(r
, "Failed to subscribe to RTM_DELADDR messages: %m");
2377 r
= sd_netlink_attach_event(rtnl
, event
, 0);
2379 return log_error_errno(r
, "Failed to add to even loop: %m");
2387 static int setup_hostname(void) {
2389 if (arg_share_system
)
2392 if (sethostname_idempotent(arg_machine
) < 0)
2398 static int setup_journal(const char *directory
) {
2399 sd_id128_t machine_id
, this_id
;
2400 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
2401 const char *etc_machine_id
, *p
, *q
;
2405 /* Don't link journals in ephemeral mode */
2409 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
2411 r
= read_one_line_file(etc_machine_id
, &b
);
2412 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
2415 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
2418 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
2421 /* Verify validity */
2422 r
= sd_id128_from_string(id
, &machine_id
);
2424 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
2426 r
= sd_id128_get_machine(&this_id
);
2428 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
2430 if (sd_id128_equal(machine_id
, this_id
)) {
2431 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
2432 "Host and machine ids are equal (%s): refusing to link journals", id
);
2433 if (arg_link_journal
== LINK_AUTO
)
2438 if (arg_link_journal
== LINK_NO
)
2441 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
2443 return log_error_errno(r
, "Failed to create /var: %m");
2445 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
2447 return log_error_errno(r
, "Failed to create /var/log: %m");
2449 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
2451 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
2453 p
= strjoina("/var/log/journal/", id
);
2454 q
= prefix_roota(directory
, p
);
2456 if (path_is_mount_point(p
, 0) > 0) {
2457 if (arg_link_journal
!= LINK_AUTO
) {
2458 log_error("%s: already a mount point, refusing to use for journal", p
);
2465 if (path_is_mount_point(q
, 0) > 0) {
2466 if (arg_link_journal
!= LINK_AUTO
) {
2467 log_error("%s: already a mount point, refusing to use for journal", q
);
2474 r
= readlink_and_make_absolute(p
, &d
);
2476 if ((arg_link_journal
== LINK_GUEST
||
2477 arg_link_journal
== LINK_AUTO
) &&
2480 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2482 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2487 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
2488 } else if (r
== -EINVAL
) {
2490 if (arg_link_journal
== LINK_GUEST
&&
2493 if (errno
== ENOTDIR
) {
2494 log_error("%s already exists and is neither a symlink nor a directory", p
);
2497 log_error_errno(errno
, "Failed to remove %s: %m", p
);
2501 } else if (r
!= -ENOENT
) {
2502 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
2506 if (arg_link_journal
== LINK_GUEST
) {
2508 if (symlink(q
, p
) < 0) {
2509 if (arg_link_journal_try
) {
2510 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
2513 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
2518 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2520 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2524 if (arg_link_journal
== LINK_HOST
) {
2525 /* don't create parents here -- if the host doesn't have
2526 * permanent journal set up, don't force it here */
2529 if (arg_link_journal_try
) {
2530 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
2533 log_error_errno(errno
, "Failed to create %s: %m", p
);
2538 } else if (access(p
, F_OK
) < 0)
2541 if (dir_is_empty(q
) == 0)
2542 log_warning("%s is not empty, proceeding anyway.", q
);
2544 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2546 log_error_errno(errno
, "Failed to create %s: %m", q
);
2550 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
2551 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
2556 static int drop_capabilities(void) {
2557 return capability_bounding_set_drop(~arg_retain
, false);
2560 static int register_machine(pid_t pid
, int local_ifindex
) {
2561 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2562 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
2568 r
= sd_bus_default_system(&bus
);
2570 return log_error_errno(r
, "Failed to open system bus: %m");
2572 if (arg_keep_unit
) {
2573 r
= sd_bus_call_method(
2575 "org.freedesktop.machine1",
2576 "/org/freedesktop/machine1",
2577 "org.freedesktop.machine1.Manager",
2578 "RegisterMachineWithNetwork",
2583 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2587 strempty(arg_directory
),
2588 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2590 _cleanup_bus_message_unref_ sd_bus_message
*m
= NULL
;
2594 r
= sd_bus_message_new_method_call(
2597 "org.freedesktop.machine1",
2598 "/org/freedesktop/machine1",
2599 "org.freedesktop.machine1.Manager",
2600 "CreateMachineWithNetwork");
2602 return bus_log_create_error(r
);
2604 r
= sd_bus_message_append(
2608 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2612 strempty(arg_directory
),
2613 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2615 return bus_log_create_error(r
);
2617 r
= sd_bus_message_open_container(m
, 'a', "(sv)");
2619 return bus_log_create_error(r
);
2621 if (!isempty(arg_slice
)) {
2622 r
= sd_bus_message_append(m
, "(sv)", "Slice", "s", arg_slice
);
2624 return bus_log_create_error(r
);
2627 r
= sd_bus_message_append(m
, "(sv)", "DevicePolicy", "s", "strict");
2629 return bus_log_create_error(r
);
2631 /* If you make changes here, also make sure to update
2632 * systemd-nspawn@.service, to keep the device
2633 * policies in sync regardless if we are run with or
2634 * without the --keep-unit switch. */
2635 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 9,
2636 /* Allow the container to
2637 * access and create the API
2638 * device nodes, so that
2639 * PrivateDevices= in the
2640 * container can work
2645 "/dev/random", "rwm",
2646 "/dev/urandom", "rwm",
2648 "/dev/net/tun", "rwm",
2649 /* Allow the container
2650 * access to ptys. However,
2652 * container to ever create
2653 * these device nodes. */
2654 "/dev/pts/ptmx", "rw",
2657 return bus_log_create_error(r
);
2659 for (j
= 0; j
< arg_n_custom_mounts
; j
++) {
2660 CustomMount
*cm
= &arg_custom_mounts
[j
];
2662 if (cm
->type
!= CUSTOM_MOUNT_BIND
)
2665 r
= is_device_node(cm
->source
);
2667 return log_error_errno(r
, "Failed to stat %s: %m", cm
->source
);
2670 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 1,
2671 cm
->source
, cm
->read_only
? "r" : "rw");
2673 return log_error_errno(r
, "Failed to append message arguments: %m");
2677 if (arg_kill_signal
!= 0) {
2678 r
= sd_bus_message_append(m
, "(sv)", "KillSignal", "i", arg_kill_signal
);
2680 return bus_log_create_error(r
);
2682 r
= sd_bus_message_append(m
, "(sv)", "KillMode", "s", "mixed");
2684 return bus_log_create_error(r
);
2687 STRV_FOREACH(i
, arg_property
) {
2688 r
= sd_bus_message_open_container(m
, 'r', "sv");
2690 return bus_log_create_error(r
);
2692 r
= bus_append_unit_property_assignment(m
, *i
);
2696 r
= sd_bus_message_close_container(m
);
2698 return bus_log_create_error(r
);
2701 r
= sd_bus_message_close_container(m
);
2703 return bus_log_create_error(r
);
2705 r
= sd_bus_call(bus
, m
, 0, &error
, NULL
);
2709 log_error("Failed to register machine: %s", bus_error_message(&error
, r
));
2716 static int terminate_machine(pid_t pid
) {
2717 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2718 _cleanup_bus_message_unref_ sd_bus_message
*reply
= NULL
;
2719 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
2726 /* If we are reusing the unit, then just exit, systemd will do
2727 * the right thing when we exit. */
2731 r
= sd_bus_default_system(&bus
);
2733 return log_error_errno(r
, "Failed to open system bus: %m");
2735 r
= sd_bus_call_method(
2737 "org.freedesktop.machine1",
2738 "/org/freedesktop/machine1",
2739 "org.freedesktop.machine1.Manager",
2746 /* Note that the machine might already have been
2747 * cleaned up automatically, hence don't consider it a
2748 * failure if we cannot get the machine object. */
2749 log_debug("Failed to get machine: %s", bus_error_message(&error
, r
));
2753 r
= sd_bus_message_read(reply
, "o", &path
);
2755 return bus_log_parse_error(r
);
2757 r
= sd_bus_call_method(
2759 "org.freedesktop.machine1",
2761 "org.freedesktop.machine1.Machine",
2767 log_debug("Failed to terminate machine: %s", bus_error_message(&error
, r
));
2774 static int reset_audit_loginuid(void) {
2775 _cleanup_free_
char *p
= NULL
;
2778 if (arg_share_system
)
2781 r
= read_one_line_file("/proc/self/loginuid", &p
);
2785 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
2787 /* Already reset? */
2788 if (streq(p
, "4294967295"))
2791 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
2794 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2795 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2796 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2797 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2798 "using systemd-nspawn. Sleeping for 5s... (%m)");
2806 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2807 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2808 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2810 static int generate_mac(struct ether_addr
*mac
, sd_id128_t hash_key
, uint64_t idx
) {
2816 l
= strlen(arg_machine
);
2817 sz
= sizeof(sd_id128_t
) + l
;
2823 /* fetch some persistent data unique to the host */
2824 r
= sd_id128_get_machine((sd_id128_t
*) v
);
2828 /* combine with some data unique (on this host) to this
2829 * container instance */
2830 i
= mempcpy(v
+ sizeof(sd_id128_t
), arg_machine
, l
);
2833 memcpy(i
, &idx
, sizeof(idx
));
2836 /* Let's hash the host machine ID plus the container name. We
2837 * use a fixed, but originally randomly created hash key here. */
2838 siphash24(result
, v
, sz
, hash_key
.bytes
);
2840 assert_cc(ETH_ALEN
<= sizeof(result
));
2841 memcpy(mac
->ether_addr_octet
, result
, ETH_ALEN
);
2843 /* see eth_random_addr in the kernel */
2844 mac
->ether_addr_octet
[0] &= 0xfe; /* clear multicast bit */
2845 mac
->ether_addr_octet
[0] |= 0x02; /* set local assignment bit (IEEE802) */
2850 static int setup_veth(pid_t pid
, char iface_name
[IFNAMSIZ
], int *ifi
) {
2851 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2852 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2853 struct ether_addr mac_host
, mac_container
;
2856 if (!arg_private_network
)
2859 if (!arg_network_veth
)
2862 /* Use two different interface name prefixes depending whether
2863 * we are in bridge mode or not. */
2864 snprintf(iface_name
, IFNAMSIZ
- 1, "%s-%s",
2865 arg_network_bridge
? "vb" : "ve", arg_machine
);
2867 r
= generate_mac(&mac_container
, CONTAINER_HASH_KEY
, 0);
2869 return log_error_errno(r
, "Failed to generate predictable MAC address for container side: %m");
2871 r
= generate_mac(&mac_host
, HOST_HASH_KEY
, 0);
2873 return log_error_errno(r
, "Failed to generate predictable MAC address for host side: %m");
2875 r
= sd_netlink_open(&rtnl
);
2877 return log_error_errno(r
, "Failed to connect to netlink: %m");
2879 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2881 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2883 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, iface_name
);
2885 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2887 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_host
);
2889 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2891 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2893 return log_error_errno(r
, "Failed to open netlink container: %m");
2895 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "veth");
2897 return log_error_errno(r
, "Failed to open netlink container: %m");
2899 r
= sd_netlink_message_open_container(m
, VETH_INFO_PEER
);
2901 return log_error_errno(r
, "Failed to open netlink container: %m");
2903 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, "host0");
2905 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2907 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_container
);
2909 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2911 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2913 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2915 r
= sd_netlink_message_close_container(m
);
2917 return log_error_errno(r
, "Failed to close netlink container: %m");
2919 r
= sd_netlink_message_close_container(m
);
2921 return log_error_errno(r
, "Failed to close netlink container: %m");
2923 r
= sd_netlink_message_close_container(m
);
2925 return log_error_errno(r
, "Failed to close netlink container: %m");
2927 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2929 return log_error_errno(r
, "Failed to add new veth interfaces (host0, %s): %m", iface_name
);
2931 i
= (int) if_nametoindex(iface_name
);
2933 return log_error_errno(errno
, "Failed to resolve interface %s: %m", iface_name
);
2940 static int setup_bridge(const char veth_name
[], int *ifi
) {
2941 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2942 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2945 if (!arg_private_network
)
2948 if (!arg_network_veth
)
2951 if (!arg_network_bridge
)
2954 bridge
= (int) if_nametoindex(arg_network_bridge
);
2956 return log_error_errno(errno
, "Failed to resolve interface %s: %m", arg_network_bridge
);
2960 r
= sd_netlink_open(&rtnl
);
2962 return log_error_errno(r
, "Failed to connect to netlink: %m");
2964 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, 0);
2966 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2968 r
= sd_rtnl_message_link_set_flags(m
, IFF_UP
, IFF_UP
);
2970 return log_error_errno(r
, "Failed to set IFF_UP flag: %m");
2972 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, veth_name
);
2974 return log_error_errno(r
, "Failed to add netlink interface name field: %m");
2976 r
= sd_netlink_message_append_u32(m
, IFLA_MASTER
, bridge
);
2978 return log_error_errno(r
, "Failed to add netlink master field: %m");
2980 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2982 return log_error_errno(r
, "Failed to add veth interface to bridge: %m");
2987 static int parse_interface(struct udev
*udev
, const char *name
) {
2988 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
2989 char ifi_str
[2 + DECIMAL_STR_MAX(int)];
2992 ifi
= (int) if_nametoindex(name
);
2994 return log_error_errno(errno
, "Failed to resolve interface %s: %m", name
);
2996 sprintf(ifi_str
, "n%i", ifi
);
2997 d
= udev_device_new_from_device_id(udev
, ifi_str
);
2999 return log_error_errno(errno
, "Failed to get udev device for interface %s: %m", name
);
3001 if (udev_device_get_is_initialized(d
) <= 0) {
3002 log_error("Network interface %s is not initialized yet.", name
);
3009 static int move_network_interfaces(pid_t pid
) {
3010 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
3011 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3015 if (!arg_private_network
)
3018 if (strv_isempty(arg_network_interfaces
))
3021 r
= sd_netlink_open(&rtnl
);
3023 return log_error_errno(r
, "Failed to connect to netlink: %m");
3027 log_error("Failed to connect to udev.");
3031 STRV_FOREACH(i
, arg_network_interfaces
) {
3032 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
3035 ifi
= parse_interface(udev
, *i
);
3039 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, ifi
);
3041 return log_error_errno(r
, "Failed to allocate netlink message: %m");
3043 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
3045 return log_error_errno(r
, "Failed to append namespace PID to netlink message: %m");
3047 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
3049 return log_error_errno(r
, "Failed to move interface %s to namespace: %m", *i
);
3055 static int setup_macvlan(pid_t pid
) {
3056 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
3057 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3062 if (!arg_private_network
)
3065 if (strv_isempty(arg_network_macvlan
))
3068 r
= sd_netlink_open(&rtnl
);
3070 return log_error_errno(r
, "Failed to connect to netlink: %m");
3074 log_error("Failed to connect to udev.");
3078 STRV_FOREACH(i
, arg_network_macvlan
) {
3079 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
3080 _cleanup_free_
char *n
= NULL
;
3081 struct ether_addr mac
;
3084 ifi
= parse_interface(udev
, *i
);
3088 r
= generate_mac(&mac
, MACVLAN_HASH_KEY
, idx
++);
3090 return log_error_errno(r
, "Failed to create MACVLAN MAC address: %m");
3092 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
3094 return log_error_errno(r
, "Failed to allocate netlink message: %m");
3096 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
3098 return log_error_errno(r
, "Failed to add netlink interface index: %m");
3100 n
= strappend("mv-", *i
);
3104 strshorten(n
, IFNAMSIZ
-1);
3106 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
3108 return log_error_errno(r
, "Failed to add netlink interface name: %m");
3110 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac
);
3112 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
3114 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
3116 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
3118 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
3120 return log_error_errno(r
, "Failed to open netlink container: %m");
3122 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "macvlan");
3124 return log_error_errno(r
, "Failed to open netlink container: %m");
3126 r
= sd_netlink_message_append_u32(m
, IFLA_MACVLAN_MODE
, MACVLAN_MODE_BRIDGE
);
3128 return log_error_errno(r
, "Failed to append macvlan mode: %m");
3130 r
= sd_netlink_message_close_container(m
);
3132 return log_error_errno(r
, "Failed to close netlink container: %m");
3134 r
= sd_netlink_message_close_container(m
);
3136 return log_error_errno(r
, "Failed to close netlink container: %m");
3138 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
3140 return log_error_errno(r
, "Failed to add new macvlan interfaces: %m");
3146 static int setup_ipvlan(pid_t pid
) {
3147 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
3148 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
3152 if (!arg_private_network
)
3155 if (strv_isempty(arg_network_ipvlan
))
3158 r
= sd_netlink_open(&rtnl
);
3160 return log_error_errno(r
, "Failed to connect to netlink: %m");
3164 log_error("Failed to connect to udev.");
3168 STRV_FOREACH(i
, arg_network_ipvlan
) {
3169 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
3170 _cleanup_free_
char *n
= NULL
;
3173 ifi
= parse_interface(udev
, *i
);
3177 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
3179 return log_error_errno(r
, "Failed to allocate netlink message: %m");
3181 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
3183 return log_error_errno(r
, "Failed to add netlink interface index: %m");
3185 n
= strappend("iv-", *i
);
3189 strshorten(n
, IFNAMSIZ
-1);
3191 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
3193 return log_error_errno(r
, "Failed to add netlink interface name: %m");
3195 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
3197 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
3199 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
3201 return log_error_errno(r
, "Failed to open netlink container: %m");
3203 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "ipvlan");
3205 return log_error_errno(r
, "Failed to open netlink container: %m");
3207 r
= sd_netlink_message_append_u16(m
, IFLA_IPVLAN_MODE
, IPVLAN_MODE_L2
);
3209 return log_error_errno(r
, "Failed to add ipvlan mode: %m");
3211 r
= sd_netlink_message_close_container(m
);
3213 return log_error_errno(r
, "Failed to close netlink container: %m");
3215 r
= sd_netlink_message_close_container(m
);
3217 return log_error_errno(r
, "Failed to close netlink container: %m");
3219 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
3221 return log_error_errno(r
, "Failed to add new ipvlan interfaces: %m");
/* Installs the container's seccomp filter: everything is allowed by default,
 * a fixed list of system calls is rejected with EPERM unless the capability
 * guarding it is retained in arg_retain, and creation of NETLINK_AUDIT
 * sockets is rejected with EAFNOSUPPORT.
 *
 * Returns 0 on success (and when loading fails with -EINVAL, which is treated
 * as "kernel without seccomp"), a negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* System call -> capability that must be retained for it to stay usable. */
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int ret;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        ret = seccomp_add_secondary_archs(ctx);
        if (ret < 0) {
                log_error_errno(ret, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(blacklist); idx++) {
                /* A retained capability keeps its system calls available. */
                if (arg_retain & (1ULL << blacklist[idx].capability))
                        continue;

                ret = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[idx].syscall_num, 0);
                if (ret == -EFAULT)
                        continue; /* unknown syscall */
                if (ret < 0) {
                        log_error_errno(ret, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        ret = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0) {
                log_error_errno(ret, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        ret = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (ret < 0) {
                log_error_errno(ret, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        ret = seccomp_load(ctx);
        if (ret == -EINVAL) {
                /* Not fatal: continue without a filter. */
                log_debug_errno(ret, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                ret = 0;
                goto finish;
        }
        if (ret < 0) {
                log_error_errno(ret, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(ctx);
        return ret;
#else
        return 0;
#endif

}
3322 static int setup_propagate(const char *root
) {
3325 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3326 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3327 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3328 (void) mkdir_p(p
, 0600);
3330 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
3331 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
3333 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3334 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
3336 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3337 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
3339 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
3340 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
3341 return log_error_errno(errno
, "Failed to install propagation bind mount.");
3343 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
3344 return log_error_errno(errno
, "Failed to make propagation mount read-only");
3349 static int setup_image(char **device_path
, int *loop_nr
) {
3350 struct loop_info64 info
= {
3351 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
3353 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
3354 _cleanup_free_
char* loopdev
= NULL
;
3358 assert(device_path
);
3362 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3364 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
3366 if (fstat(fd
, &st
) < 0)
3367 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
3369 if (S_ISBLK(st
.st_mode
)) {
3372 p
= strdup(arg_image
);
3386 if (!S_ISREG(st
.st_mode
)) {
3387 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
3391 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3393 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
3395 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
3397 return log_error_errno(errno
, "Failed to allocate loop device: %m");
3399 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
3402 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3404 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
3406 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
3407 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
3410 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
3412 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
3413 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
3415 *device_path
= loopdev
;
3426 #define PARTITION_TABLE_BLURB \
3427 "Note that the disk image needs to either contain only a single MBR partition of\n" \
3428 "type 0x83 that is marked bootable, or a single GPT partition of type " \
3429 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
3430 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
3431 "to be bootable with systemd-nspawn."
3433 static int dissect_image(
3435 char **root_device
, bool *root_device_rw
,
3436 char **home_device
, bool *home_device_rw
,
3437 char **srv_device
, bool *srv_device_rw
,
3441 int home_nr
= -1, srv_nr
= -1;
3442 #ifdef GPT_ROOT_NATIVE
3445 #ifdef GPT_ROOT_SECONDARY
3446 int secondary_root_nr
= -1;
3448 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
3449 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
3450 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
3451 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
3452 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
3453 struct udev_list_entry
*first
, *item
;
3454 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
3455 bool is_gpt
, is_mbr
, multiple_generic
= false;
3456 const char *pttype
= NULL
;
3463 assert(root_device
);
3464 assert(home_device
);
3469 b
= blkid_new_probe();
3474 r
= blkid_probe_set_device(b
, fd
, 0, 0);
3479 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
3483 blkid_probe_enable_partitions(b
, 1);
3484 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
3487 r
= blkid_do_safeprobe(b
);
3488 if (r
== -2 || r
== 1) {
3489 log_error("Failed to identify any partition table on\n"
3491 PARTITION_TABLE_BLURB
, arg_image
);
3493 } else if (r
!= 0) {
3496 log_error_errno(errno
, "Failed to probe: %m");
3500 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
3502 is_gpt
= streq_ptr(pttype
, "gpt");
3503 is_mbr
= streq_ptr(pttype
, "dos");
3505 if (!is_gpt
&& !is_mbr
) {
3506 log_error("No GPT or MBR partition table discovered on\n"
3508 PARTITION_TABLE_BLURB
, arg_image
);
3513 pl
= blkid_probe_get_partitions(b
);
3518 log_error("Failed to list partitions of %s", arg_image
);
3526 if (fstat(fd
, &st
) < 0)
3527 return log_error_errno(errno
, "Failed to stat block device: %m");
3529 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
3537 log_error("Kernel partitions never appeared.");
3541 e
= udev_enumerate_new(udev
);
3545 r
= udev_enumerate_add_match_parent(e
, d
);
3549 r
= udev_enumerate_scan_devices(e
);
3551 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
3553 /* Count the partitions enumerated by the kernel */
3555 first
= udev_enumerate_get_list_entry(e
);
3556 udev_list_entry_foreach(item
, first
)
3559 /* Count the partitions enumerated by blkid */
3560 m
= blkid_partlist_numof_partitions(pl
);
3564 log_error("blkid and kernel partition list do not match.");
3570 /* The kernel has probed fewer partitions than
3571 * blkid? Maybe the kernel prober is still
3572 * running or it got EBUSY because udev
3573 * already opened the device. Let's reprobe
3574 * the device, which is a synchronous call
3575 * that waits until probing is complete. */
3577 for (j
= 0; j
< 20; j
++) {
3579 r
= ioctl(fd
, BLKRRPART
, 0);
3582 if (r
>= 0 || r
!= -EBUSY
)
3585 /* If something else has the device
3586 * open, such as an udev rule, the
3587 * ioctl will return EBUSY. Since
3588 * there's no way to wait until it
3589 * isn't busy anymore, let's just wait
3590 * a bit, and try again.
3592 * This is really something they
3593 * should fix in the kernel! */
3595 usleep(50 * USEC_PER_MSEC
);
3599 return log_error_errno(r
, "Failed to reread partition table: %m");
3602 e
= udev_enumerate_unref(e
);
3605 first
= udev_enumerate_get_list_entry(e
);
3606 udev_list_entry_foreach(item
, first
) {
3607 _cleanup_udev_device_unref_
struct udev_device
*q
;
3609 unsigned long long flags
;
3615 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
3620 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
3624 qn
= udev_device_get_devnum(q
);
3628 if (st
.st_rdev
== qn
)
3631 node
= udev_device_get_devnode(q
);
3635 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
3639 flags
= blkid_partition_get_flags(pp
);
3641 nr
= blkid_partition_get_partno(pp
);
3649 if (flags
& GPT_FLAG_NO_AUTO
)
3652 stype
= blkid_partition_get_type_string(pp
);
3656 if (sd_id128_from_string(stype
, &type_id
) < 0)
3659 if (sd_id128_equal(type_id
, GPT_HOME
)) {
3661 if (home
&& nr
>= home_nr
)
3665 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3667 r
= free_and_strdup(&home
, node
);
3671 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
3673 if (srv
&& nr
>= srv_nr
)
3677 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3679 r
= free_and_strdup(&srv
, node
);
3683 #ifdef GPT_ROOT_NATIVE
3684 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
3686 if (root
&& nr
>= root_nr
)
3690 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3692 r
= free_and_strdup(&root
, node
);
3697 #ifdef GPT_ROOT_SECONDARY
3698 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
3700 if (secondary_root
&& nr
>= secondary_root_nr
)
3703 secondary_root_nr
= nr
;
3704 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3706 r
= free_and_strdup(&secondary_root
, node
);
3711 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
3714 multiple_generic
= true;
3716 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3718 r
= free_and_strdup(&generic
, node
);
3724 } else if (is_mbr
) {
3727 if (flags
!= 0x80) /* Bootable flag */
3730 type
= blkid_partition_get_type(pp
);
3731 if (type
!= 0x83) /* Linux partition */
3735 multiple_generic
= true;
3739 r
= free_and_strdup(&root
, node
);
3747 *root_device
= root
;
3750 *root_device_rw
= root_rw
;
3752 } else if (secondary_root
) {
3753 *root_device
= secondary_root
;
3754 secondary_root
= NULL
;
3756 *root_device_rw
= secondary_root_rw
;
3758 } else if (generic
) {
3760 /* There were no partitions with precise meanings
3761 * around, but we found generic partitions. In this
3762 * case, if there's only one, we can go ahead and boot
3763 * it, otherwise we bail out, because we really cannot
3764 * make any sense of it. */
3766 if (multiple_generic
) {
3767 log_error("Identified multiple bootable Linux partitions on\n"
3769 PARTITION_TABLE_BLURB
, arg_image
);
3773 *root_device
= generic
;
3776 *root_device_rw
= generic_rw
;
3779 log_error("Failed to identify root partition in disk image\n"
3781 PARTITION_TABLE_BLURB
, arg_image
);
3786 *home_device
= home
;
3789 *home_device_rw
= home_rw
;
3796 *srv_device_rw
= srv_rw
;
3801 log_error("--image= is not supported, compiled without blkid support.");
/* Probes the file system type of block device what with libblkid and mounts
 * it at where, or at where + directory when directory is not NULL.
 *
 * The mount uses MS_NODEV and is read-only unless rw is true; arg_read_only
 * forces read-only regardless of rw. LUKS containers are rejected with
 * -EOPNOTSUPP. Without blkid support compiled in, the function always fails
 * with -EOPNOTSUPP.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* The failure paths below use errno to tell errors apart, so clear it before each call. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Either way no usable type;
                 * same test as in dissect_image(). */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0)
                /* Genuine probing error (-1). Fall back to EIO if errno was left unset. */
                return log_error_errno(errno != 0 ? errno : EIO, "Failed to probe %s: %m", what);

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3870 static int mount_devices(
3872 const char *root_device
, bool root_device_rw
,
3873 const char *home_device
, bool home_device_rw
,
3874 const char *srv_device
, bool srv_device_rw
) {
3880 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
3882 return log_error_errno(r
, "Failed to mount root directory: %m");
3886 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
3888 return log_error_errno(r
, "Failed to mount home directory: %m");
3892 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
3894 return log_error_errno(r
, "Failed to mount server data directory: %m");
3900 static void loop_remove(int nr
, int *image_fd
) {
3901 _cleanup_close_
int control
= -1;
3907 if (image_fd
&& *image_fd
>= 0) {
3908 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
3910 log_debug_errno(errno
, "Failed to close loop image: %m");
3911 *image_fd
= safe_close(*image_fd
);
3914 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3916 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
3920 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
3922 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
3925 static int spawn_getent(const char *database
, const char *key
, pid_t
*rpid
) {
3933 if (pipe2(pipe_fds
, O_CLOEXEC
) < 0)
3934 return log_error_errno(errno
, "Failed to allocate pipe: %m");
3938 return log_error_errno(errno
, "Failed to fork getent child: %m");
3939 else if (pid
== 0) {
3941 char *empty_env
= NULL
;
3943 if (dup3(pipe_fds
[1], STDOUT_FILENO
, 0) < 0)
3944 _exit(EXIT_FAILURE
);
3946 if (pipe_fds
[0] > 2)
3947 safe_close(pipe_fds
[0]);
3948 if (pipe_fds
[1] > 2)
3949 safe_close(pipe_fds
[1]);
3951 nullfd
= open("/dev/null", O_RDWR
);
3953 _exit(EXIT_FAILURE
);
3955 if (dup3(nullfd
, STDIN_FILENO
, 0) < 0)
3956 _exit(EXIT_FAILURE
);
3958 if (dup3(nullfd
, STDERR_FILENO
, 0) < 0)
3959 _exit(EXIT_FAILURE
);
3964 (void) reset_all_signal_handlers();
3965 (void) reset_signal_mask();
3966 close_all_fds(NULL
, 0);
3968 execle("/usr/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3969 execle("/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3970 _exit(EXIT_FAILURE
);
3973 pipe_fds
[1] = safe_close(pipe_fds
[1]);
3980 static int change_uid_gid(char **_home
) {
3981 char line
[LINE_MAX
], *x
, *u
, *g
, *h
;
3982 const char *word
, *state
;
3983 _cleanup_free_ uid_t
*uids
= NULL
;
3984 _cleanup_free_
char *home
= NULL
;
3985 _cleanup_fclose_
FILE *f
= NULL
;
3986 _cleanup_close_
int fd
= -1;
3987 unsigned n_uids
= 0;
3996 if (!arg_user
|| streq(arg_user
, "root") || streq(arg_user
, "0")) {
3997 /* Reset everything fully to 0, just in case */
3999 r
= reset_uid_gid();
4001 return log_error_errno(r
, "Failed to become root: %m");
4007 /* First, get user credentials */
4008 fd
= spawn_getent("passwd", arg_user
, &pid
);
4012 f
= fdopen(fd
, "r");
4017 if (!fgets(line
, sizeof(line
), f
)) {
4020 log_error("Failed to resolve user %s.", arg_user
);
4024 log_error_errno(errno
, "Failed to read from getent: %m");
4030 wait_for_terminate_and_warn("getent passwd", pid
, true);
4032 x
= strchr(line
, ':');
4034 log_error("/etc/passwd entry has invalid user field.");
4038 u
= strchr(x
+1, ':');
4040 log_error("/etc/passwd entry has invalid password field.");
4047 log_error("/etc/passwd entry has invalid UID field.");
4055 log_error("/etc/passwd entry has invalid GID field.");
4060 h
= strchr(x
+1, ':');
4062 log_error("/etc/passwd entry has invalid GECOS field.");
4069 log_error("/etc/passwd entry has invalid home directory field.");
4075 r
= parse_uid(u
, &uid
);
4077 log_error("Failed to parse UID of user.");
4081 r
= parse_gid(g
, &gid
);
4083 log_error("Failed to parse GID of user.");
4091 /* Second, get group memberships */
4092 fd
= spawn_getent("initgroups", arg_user
, &pid
);
4097 f
= fdopen(fd
, "r");
4102 if (!fgets(line
, sizeof(line
), f
)) {
4104 log_error("Failed to resolve user %s.", arg_user
);
4108 log_error_errno(errno
, "Failed to read from getent: %m");
4114 wait_for_terminate_and_warn("getent initgroups", pid
, true);
4116 /* Skip over the username and subsequent separator whitespace */
4118 x
+= strcspn(x
, WHITESPACE
);
4119 x
+= strspn(x
, WHITESPACE
);
4121 FOREACH_WORD(word
, l
, x
, state
) {
4127 if (!GREEDY_REALLOC(uids
, sz
, n_uids
+1))
4130 r
= parse_uid(c
, &uids
[n_uids
++]);
4132 log_error("Failed to parse group data from getent.");
4137 r
= mkdir_parents(home
, 0775);
4139 return log_error_errno(r
, "Failed to make home root directory: %m");
4141 r
= mkdir_safe(home
, 0755, uid
, gid
);
4142 if (r
< 0 && r
!= -EEXIST
)
4143 return log_error_errno(r
, "Failed to make home directory: %m");
4145 (void) fchown(STDIN_FILENO
, uid
, gid
);
4146 (void) fchown(STDOUT_FILENO
, uid
, gid
);
4147 (void) fchown(STDERR_FILENO
, uid
, gid
);
4149 if (setgroups(n_uids
, uids
) < 0)
4150 return log_error_errno(errno
, "Failed to set auxiliary groups: %m");
4152 if (setresgid(gid
, gid
, gid
) < 0)
4153 return log_error_errno(errno
, "setregid() failed: %m");
4155 if (setresuid(uid
, uid
, uid
) < 0)
4156 return log_error_errno(errno
, "setreuid() failed: %m");
/*
 * Return values:
 * < 0 : wait_for_terminate() failed to get the state of the
 *       container, the container was terminated by a signal, or
 *       failed for an unknown reason.  No change is made to the
 *       container argument.
 * > 0 : The program executed in the container terminated with an
 *       error.  The exit code of the program executed in the
 *       container is returned.  The container argument has been set
 *       to CONTAINER_TERMINATED.
 *   0 : The container is being rebooted, has been shut down or exited
 *       successfully.  The container argument has been set to either
 *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
 *
 * That is, success is indicated by a return value of zero, and an
 * error is indicated by a non-zero value.
 */
4183 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
4187 r
= wait_for_terminate(pid
, &status
);
4189 return log_warning_errno(r
, "Failed to wait for container: %m");
4191 switch (status
.si_code
) {
4194 if (status
.si_status
== 0) {
4195 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
4198 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
4200 *container
= CONTAINER_TERMINATED
;
4201 return status
.si_status
;
4204 if (status
.si_status
== SIGINT
) {
4206 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
4207 *container
= CONTAINER_TERMINATED
;
4210 } else if (status
.si_status
== SIGHUP
) {
4212 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
4213 *container
= CONTAINER_REBOOTED
;
4217 /* CLD_KILLED fallthrough */
4220 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
4224 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
4233 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
4236 pid
= PTR_TO_UINT32(userdata
);
4238 if (kill(pid
, arg_kill_signal
) >= 0) {
4239 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
4240 sd_event_source_set_userdata(s
, NULL
);
4245 sd_event_exit(sd_event_source_get_event(s
), 0);
4249 static int determine_names(void) {
4252 if (arg_template
&& !arg_directory
&& arg_machine
) {
4254 /* If --template= was specified then we should not
4255 * search for a machine, but instead create a new one
4256 * in /var/lib/machine. */
4258 arg_directory
= strjoin("/var/lib/machines/", arg_machine
, NULL
);
4263 if (!arg_image
&& !arg_directory
) {
4265 _cleanup_(image_unrefp
) Image
*i
= NULL
;
4267 r
= image_find(arg_machine
, &i
);
4269 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
4271 log_error("No image for machine '%s': %m", arg_machine
);
4275 if (i
->type
== IMAGE_RAW
)
4276 r
= set_sanitized_path(&arg_image
, i
->path
);
4278 r
= set_sanitized_path(&arg_directory
, i
->path
);
4280 return log_error_errno(r
, "Invalid image directory: %m");
4283 arg_read_only
= arg_read_only
|| i
->read_only
;
4285 arg_directory
= get_current_dir_name();
4287 if (!arg_directory
&& !arg_machine
) {
4288 log_error("Failed to determine path, please use -D or -i.");
4294 if (arg_directory
&& path_equal(arg_directory
, "/"))
4295 arg_machine
= gethostname_malloc();
4297 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
4302 hostname_cleanup(arg_machine
);
4303 if (!machine_name_is_valid(arg_machine
)) {
4304 log_error("Failed to determine machine name automatically, please use -M.");
4308 if (arg_ephemeral
) {
4311 /* Add a random suffix when this is an
4312 * ephemeral machine, so that we can run many
4313 * instances at once without manually having
4314 * to specify -M each time. */
4316 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
4327 static int determine_uid_shift(const char *directory
) {
4335 if (arg_uid_shift
== UID_INVALID
) {
4338 r
= stat(directory
, &st
);
4340 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
4342 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
4344 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
4345 log_error("UID and GID base of %s don't match.", directory
);
4349 arg_uid_range
= UINT32_C(0x10000);
4352 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
4353 log_error("UID base too high for UID range.");
4357 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
/* Body of the inner child process. In the order visible below it:
 * synchronizes with the parent through the barrier (steps #1 to #4), mounts
 * file systems, resets UID/GID, sets up the boot ID and kmsg, hands the rtnl
 * socket over, drops capabilities, applies the personality and the SELinux
 * exec context, changes to the target user, assembles the environment and
 * finally executes the init system, the requested command or a shell.
 *
 * NOTE(review): the embedded line numbers show gaps, so parts of the
 * parameter list, local declarations, conditions and error checks of the
 * original are not part of this listing. */
4361 static int inner_child(
4363 const char *directory
,
4369 _cleanup_free_
char *home
= NULL
;
/* Base environment for the payload. The trailing NULL entries are spare
 * slots that are filled in further down through envp + n_env. */
4371 const char *envp
[] = {
4372 "PATH=" DEFAULT_PATH_SPLIT_USR
,
4373 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4378 NULL
, /* container_uuid */
4379 NULL
, /* LISTEN_FDS */
4380 NULL
, /* LISTEN_PID */
4384 _cleanup_strv_free_
char **env_use
= NULL
;
4389 assert(kmsg_socket
>= 0);
4394 /* Tell the parent, that it now can write the UID map. */
4395 (void) barrier_place(barrier
); /* #1 */
4397 /* Wait until the parent wrote the UID map */
4398 if (!barrier_place_and_sync(barrier
)) { /* #2 */
4399 log_error("Parent died too early");
4404 r
= mount_all(NULL
, true);
4408 /* Wait until we are cgroup-ified, so that we
4409 * can mount the right cgroup path writable */
4410 if (!barrier_place_and_sync(barrier
)) { /* #3 */
4411 log_error("Parent died too early");
4415 r
= mount_systemd_cgroup_writable("");
4419 r
= reset_uid_gid();
4421 return log_error_errno(r
, "Couldn't become new root: %m");
4423 r
= setup_boot_id(NULL
);
4427 r
= setup_kmsg(NULL
, kmsg_socket
);
/* The kmsg socket is not used again in the visible code after this point. */
4430 kmsg_socket
= safe_close(kmsg_socket
);
4435 return log_error_errno(errno
, "setsid() failed: %m");
4437 if (arg_private_network
)
4440 r
= send_rtnl(rtnl_socket
);
4443 rtnl_socket
= safe_close(rtnl_socket
);
4445 if (drop_capabilities() < 0)
4446 return log_error_errno(errno
, "drop_capabilities() failed: %m");
/* An explicitly configured personality wins; otherwise the secondary flag
 * selects PER_LINUX32. */
4450 if (arg_personality
!= PERSONALITY_INVALID
) {
4451 if (personality(arg_personality
) < 0)
4452 return log_error_errno(errno
, "personality() failed: %m");
4453 } else if (secondary
) {
4454 if (personality(PER_LINUX32
) < 0)
4455 return log_error_errno(errno
, "personality() failed: %m");
4459 if (arg_selinux_context
)
4460 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
4461 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
/* change_uid_gid() receives the address of home; the HOME variable and the
 * later chdir() fall back to /root when home is still NULL. */
4464 r
= change_uid_gid(&home
);
/* TERM is taken over from the environment of this process. */
4468 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
4472 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
4473 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
4474 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
/* container_uuid is only exported when a non-null UUID is configured. */
4477 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
4480 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
/* When file descriptors are passed along, O_CLOEXEC is cleared on them and
 * LISTEN_FDS and LISTEN_PID are exported, with LISTEN_PID fixed to 1. */
4484 if (fdset_size(fds
) > 0) {
4485 r
= fdset_cloexec(fds
, false);
4487 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
4489 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
4490 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
/* The final environment merges the list built above with arg_setenv. */
4494 env_use
= strv_env_merge(2, envp
, arg_setenv
);
4498 /* Let the parent know that we are ready and
4499 * wait until the parent is ready with the
4501 if (!barrier_place_and_sync(barrier
)) { /* #4 */
4502 log_error("Parent died too early");
4506 /* Now, explicitly close the log, so that we
4507 * then can close all remaining fds. Closing
4508 * the log explicitly first has the benefit
4509 * that the logging subsystem knows about it,
4510 * and is thus ready to be reopened should we
4511 * need it again. Note that the other fds
4512 * closed here are at least the locking and
4515 (void) fdset_close_others(fds
);
4521 /* Automatically search for the init system */
/* The argument vector gets one slot for the init binary, the entries of
 * arg_parameters and a terminating entry. */
4523 m
= 1 + strv_length(arg_parameters
);
4524 a
= newa(char*, m
+ 1);
4525 if (strv_isempty(arg_parameters
))
4528 memcpy(a
+ 1, arg_parameters
, m
* sizeof(char*));
/* execve() only returns on failure, so each following call acts as the
 * fallback for the previous location. */
4530 a
[0] = (char*) "/usr/lib/systemd/systemd";
4531 execve(a
[0], a
, env_use
);
4533 a
[0] = (char*) "/lib/systemd/systemd";
4534 execve(a
[0], a
, env_use
);
4536 a
[0] = (char*) "/sbin/init";
4537 execve(a
[0], a
, env_use
);
4538 } else if (!strv_isempty(arg_parameters
))
4539 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
/* Shell fallback: change into the home directory and try bash, then sh.
 * NOTE(review): the return value of chdir() is not checked here. */
4541 chdir(home
?: "/root");
4542 execle("/bin/bash", "-bash", NULL
, env_use
);
4543 execle("/bin/sh", "-sh", NULL
, env_use
);
/* Reached only when the exec calls above have returned. */
4547 return log_error_errno(errno
, "execv() failed: %m");
/* Body of the outer child process. In the order visible below it: arranges
 * to be killed when the parent dies, reopens stdin/stdout/stderr on the
 * console, marks all mounts as slave, mounts the image devices, determines
 * the UID shift and sends it through uid_shift_socket, turns the directory
 * into a bind mount, prepares the tree (volatile modes, base file system,
 * optional read-only remount, API mounts, device nodes, pts, propagation
 * directory, console, seccomp, timezone, resolv.conf, journal, custom mounts,
 * cgroups), moves the root, clones the inner child with the requested
 * namespaces and sends the PID of that child through pid_socket.
 *
 * NOTE(review): the embedded line numbers show gaps, so parts of the
 * parameter list, local declarations, conditions and error checks of the
 * original are not part of this listing. */
4550 static int outer_child(
4552 const char *directory
,
4553 const char *console
,
4554 const char *root_device
, bool root_device_rw
,
4555 const char *home_device
, bool home_device_rw
,
4556 const char *srv_device
, bool srv_device_rw
,
4562 int uid_shift_socket
,
4572 assert(pid_socket
>= 0);
4573 assert(kmsg_socket
>= 0);
4577 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
4578 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
/* The standard descriptors are closed first; the check after
 * open_terminal() expects the console to come back as STDIN_FILENO. */
4581 close_nointr(STDIN_FILENO
);
4582 close_nointr(STDOUT_FILENO
);
4583 close_nointr(STDERR_FILENO
);
4585 r
= open_terminal(console
, O_RDWR
);
4586 if (r
!= STDIN_FILENO
) {
4592 return log_error_errno(r
, "Failed to open console: %m");
/* stdout and stderr become duplicates of the console opened on stdin. */
4595 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
4596 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
4597 return log_error_errno(errno
, "Failed to duplicate console: %m");
4600 r
= reset_audit_loginuid();
4604 /* Mark everything as slave, so that we still
4605 * receive mounts from the real root, but don't
4606 * propagate mounts to the real root. */
4607 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
4608 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
4610 r
= mount_devices(directory
,
4611 root_device
, root_device_rw
,
4612 home_device
, home_device_rw
,
4613 srv_device
, srv_device_rw
);
4617 r
= determine_uid_shift(directory
);
/* The shift is passed on as the raw bytes of arg_uid_shift; a result of any
 * other length is reported as a short write. */
4622 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
4624 return log_error_errno(errno
, "Failed to send UID shift: %m");
4625 if (l
!= sizeof(arg_uid_shift
)) {
4626 log_error("Short write while sending UID shift.");
4631 /* Turn directory into bind mount */
4632 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
4633 return log_error_errno(errno
, "Failed to make bind mount: %m");
4635 r
= setup_volatile(directory
);
4639 r
= setup_volatile_state(directory
);
/* The same shift value is passed as both the UID and the GID argument. */
4643 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
4647 if (arg_read_only
) {
4648 r
= bind_remount_recursive(directory
, true);
4650 return log_error_errno(r
, "Failed to make tree read-only: %m");
4653 r
= mount_all(directory
, false);
4657 if (copy_devnodes(directory
) < 0)
4660 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
4662 if (setup_pts(directory
) < 0)
4665 r
= setup_propagate(directory
);
4669 r
= setup_dev_console(directory
, console
);
4673 r
= setup_seccomp();
4677 r
= setup_timezone(directory
);
4681 r
= setup_resolv_conf(directory
);
4685 r
= setup_journal(directory
);
4689 r
= mount_custom(directory
);
4693 r
= mount_cgroups(directory
);
4697 r
= mount_move_root(directory
);
4699 return log_error_errno(r
, "Failed to move root directory: %m");
/* A new mount namespace is always requested. IPC, PID and UTS namespaces are
 * left out when arg_share_system is set, while the network and user
 * namespaces depend on arg_private_network and arg_userns. */
4701 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
4702 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
4703 (arg_private_network
? CLONE_NEWNET
: 0) |
4704 (arg_userns
? CLONE_NEWUSER
: 0),
4707 return log_error_errno(errno
, "Failed to fork inner child: %m");
4709 pid_socket
= safe_close(pid_socket
);
4710 uid_shift_socket
= safe_close(uid_shift_socket
);
4712 /* The inner child has all namespaces that are
4713 * requested, so that we all are owned by the user if
4714 * user namespaces are turned on. */
4716 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
);
4718 _exit(EXIT_FAILURE
);
4720 _exit(EXIT_SUCCESS
);
/* The PID of the inner child is sent as raw bytes; a result of any other
 * length is reported as a short write. */
4723 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
4725 return log_error_errno(errno
, "Failed to send PID: %m");
4726 if (l
!= sizeof(pid
)) {
4727 log_error("Short write while sending PID.");
4731 pid_socket
= safe_close(pid_socket
);
/* Writes the UID and GID maps of the process pid below /proc.
 *
 * Both maps receive the same single line made of 0, arg_uid_shift and
 * arg_uid_range, in that order.
 *
 * pid: process whose uid_map and gid_map files are written.
 * Visible failure paths return the result of log_error_errno().
 *
 * NOTE(review): the embedded line numbers show gaps, so declarations, error
 * checks and the final return of the original are not part of this listing. */
4736 static int setup_uid_map(pid_t pid
) {
/* uid_map holds the /proc path and is reused for the gid_map path, which has
 * the same length. line has room for three decimal numbers, the two
 * separating spaces, the newline and the terminating NUL. */
4737 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
4742 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
4743 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
4744 r
= write_string_file(uid_map
, line
, 0);
4746 return log_error_errno(r
, "Failed to write UID map: %m");
4748 /* We always assign the same UID and GID ranges */
4749 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
4750 r
= write_string_file(uid_map
, line
, 0);
4752 return log_error_errno(r
, "Failed to write GID map: %m");
/* Changes the ownership of files in the cgroup directory of the process pid
 * to arg_uid_shift, which is used for both the user and the group argument.
 *
 * The cgroup path of pid is looked up, translated into a file system path
 * for SYSTEMD_CGROUP_CONTROLLER and opened as a directory. The listed files
 * are then chowned relative to that directory. A failing chown is only
 * logged and ignored, at debug level for ENOENT and as a warning otherwise.
 *
 * NOTE(review): the embedded line numbers show gaps, so declarations, error
 * checks, the loop header and part of the file list of the original are not
 * part of this listing. */
4757 static int chown_cgroup(pid_t pid
) {
4758 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
4759 _cleanup_close_
int fd
= -1;
4763 r
= cg_pid_get_path(NULL
, pid
, &path
);
4765 return log_error_errno(r
, "Failed to get container cgroup path: %m");
4767 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
4769 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
/* The directory descriptor is the anchor for the fchownat() calls below. */
4771 fd
= open(fs
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
4773 return log_error_errno(errno
, "Failed to open %s: %m", fs
);
4778 "notify_on_release",
4780 "cgroup.clone_children",
4781 "cgroup.controllers",
4782 "cgroup.subtree_control",
4784 if (fchownat(fd
, fn
, arg_uid_shift
, arg_uid_shift
, 0) < 0)
4785 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
4786 "Failed to chown() cgroup file %s, ignoring: %m", fn
);
/* Copies the cgroup membership of the process pid into the other kind of
 * cgroup hierarchy when the host and the container disagree about using the
 * unified hierarchy.
 *
 * The cgroup path of pid is read from SYSTEMD_CGROUP_CONTROLLER, a cgroup
 * file system is mounted on a temporary directory, the PID is written to the
 * cgroup.procs file below the same path in that mount, and the mount is
 * removed again.
 *
 * NOTE(review): the embedded line numbers show gaps, so declarations,
 * conditions, error checks and the cleanup of the temporary directory are
 * not part of this listing.
 * NOTE(review): the two log messages about the unified hierarchy spell it
 * hierachy; they are runtime strings and are left untouched here. */
4791 static int sync_cgroup(pid_t pid
) {
4792 _cleanup_free_
char *cgroup
= NULL
;
/* tree is the name template of the temporary mount point. */
4793 char tree
[] = "/tmp/unifiedXXXXXX", pid_string
[DECIMAL_STR_MAX(pid
) + 1];
4794 bool undo_mount
= false;
4798 unified
= cg_unified();
4800 return log_error_errno(unified
, "Failed to determine whether the unified hierachy is used: %m");
/* Host and container agree on the hierarchy type in this case. */
4802 if ((unified
> 0) == arg_unified_cgroup_hierarchy
)
4805 /* When the host uses the legacy cgroup setup, but the
4806 * container shall use the unified hierarchy, let's make sure
4807 * we copy the path from the name=systemd hierarchy into the
4808 * unified hierarchy. Similar for the reverse situation. */
4810 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
4812 return log_error_errno(r
, "Failed to get control group of " PID_FMT
": %m", pid
);
4814 /* In order to access the unified hierarchy we need to mount it */
4816 return log_error_errno(errno
, "Failed to generate temporary mount point for unified hierarchy: %m");
/* Two alternative mounts follow: the first one names the name=systemd
 * hierarchy in its options, the second one passes __DEVEL__sane_behavior.
 * The condition selecting between them is not visible in this listing. */
4819 r
= mount("cgroup", tree
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "none,name=systemd,xattr");
4821 r
= mount("cgroup", tree
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "__DEVEL__sane_behavior");
4823 r
= log_error_errno(errno
, "Failed to mount unified hierarchy: %m");
/* The target is the cgroup.procs file of the same cgroup path below the
 * temporary mount; missing parent directories are created on a best-effort
 * basis. */
4829 fn
= strjoina(tree
, cgroup
, "/cgroup.procs");
4830 (void) mkdir_parents(fn
, 0755);
/* NOTE(review): plain sprintf is used here while nearby code in this file
 * formats fixed-size buffers with xsprintf; consider using it here too. */
4832 sprintf(pid_string
, PID_FMT
, pid
);
4833 r
= write_string_file(fn
, pid_string
, 0);
4835 log_error_errno(r
, "Failed to move process: %m");
4839 (void) umount(tree
);
/* Splits the current cgroup into a payload and a supervisor subgroup.
 *
 * The process pid is attached to the payload subgroup and the calling
 * process (PID argument 0) to the supervisor subgroup, both below the cgroup
 * this process is in. Afterwards the supported controllers are enabled for
 * that cgroup on a best-effort basis.
 *
 * NOTE(review): the embedded line numbers show gaps, so declarations,
 * conditions, error checks and return statements of the original are not
 * part of this listing.
 * NOTE(review): the log message about the unified hierarchy spells it
 * hierachy; it is a runtime string and is left untouched here. */
4845 static int create_subcgroup(pid_t pid
) {
4846 _cleanup_free_
char *cgroup
= NULL
;
4849 CGroupMask supported
;
4851 /* In the unified hierarchy inner nodes may only contain
4852 * subgroups, but not processes. Hence, if we are running in the
4853 * unified hierarchy and the container does the same, and we
4854 * did not create a scope unit for the container, move us and
4855 * the container into two separate subcgroups. */
4860 if (!arg_unified_cgroup_hierarchy
)
4863 unified
= cg_unified();
4865 return log_error_errno(unified
, "Failed to determine whether the unified hierachy is used: %m");
4869 r
= cg_mask_supported(&supported
);
4871 return log_error_errno(r
, "Failed to determine supported controllers: %m");
/* PID 0 asks for the cgroup of the calling process. */
4873 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &cgroup
);
4875 return log_error_errno(r
, "Failed to get our control group: %m");
/* The container process goes into the payload subgroup. */
4877 child
= strjoina(cgroup
, "/payload");
4878 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, child
, pid
);
4880 return log_error_errno(r
, "Failed to create %s subcgroup: %m", child
);
/* The calling process goes into the supervisor subgroup. */
4882 child
= strjoina(cgroup
, "/supervisor");
4883 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, child
, 0);
4885 return log_error_errno(r
, "Failed to create %s subcgroup: %m", child
);
4887 /* Try to enable as many controllers as possible for the new payload. */
4888 (void) cg_enable_everywhere(supported
, supported
, cgroup
);
/* Locates the settings file of the machine and copies its values into the
 * arg_* variables.
 *
 * The file name is arg_machine with the suffix .nspawn. It is searched in
 * /etc/systemd/nspawn and /run/systemd/nspawn first, then next to arg_image
 * or arg_directory. Unless arg_settings_trusted was already decided (it is
 * only assigned while negative), it becomes true for the first two locations
 * and false for the last. Each group of settings is only taken over when its
 * bit in arg_settings_mask is clear and the file actually sets it; pointer
 * values are moved into the arg_* variable and reset in the Settings object.
 *
 * NOTE(review): the embedded line numbers show gaps, so declarations, the
 * fopen() calls, else branches, error checks and return statements of the
 * original are not part of this listing. In particular the branches taken
 * after the warnings about untrusted files are not visible. */
4892 static int load_settings(void) {
4893 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
4894 _cleanup_fclose_
FILE *f
= NULL
;
4895 _cleanup_free_
char *p
= NULL
;
4899 /* If all settings are masked, there's no point in looking for
4900 * the settings file */
4901 if ((arg_settings_mask
& _SETTINGS_MASK_ALL
) == _SETTINGS_MASK_ALL
)
4904 fn
= strjoina(arg_machine
, ".nspawn");
4906 /* We first look in the admin's directories in /etc and /run */
4907 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4908 _cleanup_free_
char *j
= NULL
;
4910 j
= strjoin(i
, "/", fn
, NULL
);
4919 /* By default we trust configuration from /etc and /run */
4920 if (arg_settings_trusted
< 0)
4921 arg_settings_trusted
= true;
/* A missing file is not an error; any other errno aborts the search. */
4926 if (errno
!= ENOENT
)
4927 return log_error_errno(errno
, "Failed to open %s: %m", j
);
4931 /* After that, let's look for a file next to the
4932 * actual image we shall boot. */
4935 p
= file_in_same_dir(arg_image
, fn
);
4938 } else if (arg_directory
) {
4939 p
= file_in_same_dir(arg_directory
, fn
);
4946 if (!f
&& errno
!= ENOENT
)
4947 return log_error_errno(errno
, "Failed to open %s: %m", p
);
4949 /* By default we do not trust configuration from /var/lib/machines */
4950 if (arg_settings_trusted
< 0)
4951 arg_settings_trusted
= false;
4958 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
4960 r
= settings_load(f
, p
, &settings
);
4964 /* Copy over bits from the settings, unless they have been
4965 * explicitly masked by command line switches. */
/* Boot= also replaces the command line parameters. */
4967 if ((arg_settings_mask
& SETTING_BOOT
) == 0 &&
4968 settings
->boot
>= 0) {
4969 arg_boot
= settings
->boot
;
4971 strv_free(arg_parameters
);
4972 arg_parameters
= settings
->parameters
;
4973 settings
->parameters
= NULL
;
4976 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
4977 settings
->environment
) {
4978 strv_free(arg_setenv
);
4979 arg_setenv
= settings
->environment
;
4980 settings
->environment
= NULL
;
4983 if ((arg_settings_mask
& SETTING_USER
) == 0 &&
4986 arg_user
= settings
->user
;
4987 settings
->user
= NULL
;
/* Capabilities to add and capabilities to drop are applied separately: the
 * first set is ORed into arg_retain, the second set is masked out of it. */
4990 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
4992 if (!arg_settings_trusted
&& settings
->capability
!= 0)
4993 log_warning("Ignoring Capability= setting, file %s is not trusted.", p
);
4995 arg_retain
|= settings
->capability
;
4997 arg_retain
&= ~settings
->drop_capability
;
5000 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
5001 settings
->kill_signal
> 0)
5002 arg_kill_signal
= settings
->kill_signal
;
5004 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
5005 settings
->personality
!= PERSONALITY_INVALID
)
5006 arg_personality
= settings
->personality
;
5008 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
5009 !sd_id128_is_null(settings
->machine_id
)) {
5011 if (!arg_settings_trusted
)
5012 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p
);
5014 arg_uuid
= settings
->machine_id
;
5017 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
5018 settings
->read_only
>= 0)
5019 arg_read_only
= settings
->read_only
;
5021 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
5022 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
5023 arg_volatile_mode
= settings
->volatile_mode
;
5025 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
5026 settings
->n_custom_mounts
> 0) {
5028 if (!arg_settings_trusted
)
5029 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p
);
/* The mount array and its element count are moved together. */
5031 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
5032 arg_custom_mounts
= settings
->custom_mounts
;
5033 arg_n_custom_mounts
= settings
->n_custom_mounts
;
5035 settings
->custom_mounts
= NULL
;
5036 settings
->n_custom_mounts
= 0;
/* The network group is taken over as a whole as soon as any one of its
 * settings is present in the file. */
5040 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
5041 (settings
->private_network
>= 0 ||
5042 settings
->network_veth
>= 0 ||
5043 settings
->network_bridge
||
5044 settings
->network_interfaces
||
5045 settings
->network_macvlan
||
5046 settings
->network_ipvlan
)) {
5048 if (!arg_settings_trusted
)
5049 log_warning("Ignoring network settings, file %s is not trusted.", p
);
5051 strv_free(arg_network_interfaces
);
5052 arg_network_interfaces
= settings
->network_interfaces
;
5053 settings
->network_interfaces
= NULL
;
5055 strv_free(arg_network_macvlan
);
5056 arg_network_macvlan
= settings
->network_macvlan
;
5057 settings
->network_macvlan
= NULL
;
5059 strv_free(arg_network_ipvlan
);
5060 arg_network_ipvlan
= settings
->network_ipvlan
;
5061 settings
->network_ipvlan
= NULL
;
5063 free(arg_network_bridge
);
5064 arg_network_bridge
= settings
->network_bridge
;
5065 settings
->network_bridge
= NULL
;
/* NOTE(review): settings->network_bridge was reset to NULL in the statement
 * right above, after its value was moved into arg_network_bridge. As shown
 * here the second operand below is therefore always false, so a bridge
 * setting alone does not turn on arg_network_veth. Confirm whether
 * arg_network_bridge was meant, or whether this assignment should come
 * before the move. */
5067 arg_network_veth
= settings
->network_veth
> 0 || settings
->network_bridge
;
5069 arg_private_network
= true; /* all these settings imply private networking */
5073 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
5074 settings
->expose_ports
) {
5076 if (!arg_settings_trusted
)
5077 log_warning("Ignoring Port= setting, file %s is not trusted.", p
);
5079 expose_port_free_all(arg_expose_ports
);
5080 arg_expose_ports
= settings
->expose_ports
;
5081 settings
->expose_ports
= NULL
;
/* Program entry point. In the order visible below it:
 *  - parses the command line, insists on running as root, determines the
 *    machine names, loads the settings file and verifies the arguments;
 *  - collects file descriptors passed in by the service manager;
 *  - prepares the root: either a directory (optionally an ephemeral snapshot
 *    or a tree populated from a template) or a disk image that is locked,
 *    set up and dissected;
 *  - allocates a pseudo tty, blocks signals and becomes a subreaper;
 *  - creates the barrier and the socket pairs, clones the outer child and
 *    receives the inner child PID and the UID shift from it;
 *  - writes the UID map, sets up networking, registers the machine and
 *    arranges the cgroups, stepping through the barrier with the child;
 *  - runs the event loop with the PTY forwarder, waits for the container and
 *    cleans up.
 *
 * Returns EXIT_FAILURE when r is negative at the end, ret otherwise.
 *
 * NOTE(review): the embedded line numbers show gaps, so declarations,
 * conditions, else branches, goto statements, labels and the loop construct
 * of the original are not part of this listing. */
5088 int main(int argc
, char *argv
[]) {
5090 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
5091 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
5092 _cleanup_close_
int master
= -1, image_fd
= -1;
5093 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
5094 int r
, n_fd_passed
, loop_nr
= -1;
5095 char veth_name
[IFNAMSIZ
];
5096 bool secondary
= false, remove_subvol
= false;
5099 int ret
= EXIT_SUCCESS
;
5100 union in_addr_union exposed
= {};
5101 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
5104 log_parse_environment();
5107 r
= parse_argv(argc
, argv
);
5111 if (geteuid() != 0) {
5112 log_error("Need to be root.");
5116 r
= determine_names();
5120 r
= load_settings();
5124 r
= verify_arguments();
/* File descriptors handed over by the service manager are collected in fds. */
5128 n_fd_passed
= sd_listen_fds(false);
5129 if (n_fd_passed
> 0) {
5130 r
= fdset_new_listen_fds(&fds
, false);
5132 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory mode. */
5137 if (arg_directory
) {
5140 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
5141 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
5146 if (arg_ephemeral
) {
5147 _cleanup_free_
char *np
= NULL
;
5149 /* If the specified path is a mount point we
5150 * generate the new snapshot immediately
5151 * inside it under a random name. However if
5152 * the specified is not a mount point we
5153 * create the new snapshot in the parent
5154 * directory, just next to it. */
5155 r
= path_is_mount_point(arg_directory
, 0);
5157 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
5161 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
5163 r
= tempfn_random(arg_directory
, "machine.", &np
);
5165 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* The lock is shared for read-only operation and exclusive otherwise, and it
 * is always requested without blocking. */
5169 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
5171 log_error_errno(r
, "Failed to lock %s: %m", np
);
5175 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
5177 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
5181 free(arg_directory
);
/* The snapshot is scheduled for removal during cleanup at the end. */
5185 remove_subvol
= true;
5188 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
5190 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
5194 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
5199 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
5202 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
5204 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
5208 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree before anything is started in it. */
5214 if (path_is_os_tree(arg_directory
) <= 0) {
5215 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
/* The path probed is the first positional argument when it is absolute,
 * /usr/bin/ otherwise, each taken relative to the directory. */
5222 p
= strjoina(arg_directory
,
5223 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
5224 if (access(p
, F_OK
) < 0) {
5225 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
/* Image mode: lock the image, create a temporary directory that serves as
 * arg_directory, then set up and dissect the image. */
5232 char template[] = "/tmp/nspawn-root-XXXXXX";
5235 assert(!arg_template
);
5237 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
5239 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
5243 r
= log_error_errno(r
, "Failed to create image lock: %m");
5247 if (!mkdtemp(template)) {
5248 log_error_errno(errno
, "Failed to create temporary directory: %m");
5253 arg_directory
= strdup(template);
5254 if (!arg_directory
) {
5259 image_fd
= setup_image(&device_path
, &loop_nr
);
5265 r
= dissect_image(image_fd
,
5266 &root_device
, &root_device_rw
,
5267 &home_device
, &home_device_rw
,
5268 &srv_device
, &srv_device_rw
,
5274 r
= custom_mounts_prepare();
5279 isatty(STDIN_FILENO
) > 0 &&
5280 isatty(STDOUT_FILENO
) > 0;
/* Allocate the pseudo tty whose slave path is kept in console. */
5282 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
5284 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
5288 r
= ptsname_malloc(master
, &console
);
5290 r
= log_error_errno(r
, "Failed to determine tty name: %m");
5294 if (unlockpt(master
) < 0) {
5295 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
5300 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5301 arg_machine
, arg_image
?: arg_directory
);
5303 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
/* mask_chld contains SIGCHLD only; it is used further down to unblock and
 * later block that signal again. */
5305 assert_se(sigemptyset(&mask_chld
) == 0);
5306 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
5308 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
5309 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* One socket pair per kind of data exchanged with the child processes. */
5314 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
5315 uid_shift_socket_pair
[2] = { -1, -1 };
5316 ContainerStatus container_status
;
5317 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
5318 static const struct sigaction sa
= {
5319 .sa_handler
= nop_handler
,
5320 .sa_flags
= SA_NOCLDSTOP
,
5324 _cleanup_event_unref_ sd_event
*event
= NULL
;
5325 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
5326 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
5329 r
= barrier_create(&barrier
);
5331 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
5335 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
5336 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
5340 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
5341 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
5345 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
5346 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
5351 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
5352 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
5356 /* Child can be killed before execv(), so handle SIGCHLD
5357 * in order to interrupt parent's blocking calls and
5358 * give it a chance to call wait() and terminate. */
5359 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
5361 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
5365 r
= sigaction(SIGCHLD
, &sa
, NULL
);
5367 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
5371 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
5373 if (errno
== EINVAL
)
5374 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
5376 r
= log_error_errno(errno
, "clone() failed: %m");
5382 /* The outer child only has a file system namespace. */
5383 barrier_set_role(&barrier
, BARRIER_CHILD
);
/* The child side closes the pty master and the [0] ends of the socket pairs
 * and passes the [1] ends on to outer_child(). */
5385 master
= safe_close(master
);
5387 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
5388 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
5389 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
5390 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
5392 (void) reset_all_signal_handlers();
5393 (void) reset_signal_mask();
5395 r
= outer_child(&barrier
,
5398 root_device
, root_device_rw
,
5399 home_device
, home_device_rw
,
5400 srv_device
, srv_device_rw
,
5404 kmsg_socket_pair
[1],
5405 rtnl_socket_pair
[1],
5406 uid_shift_socket_pair
[1],
5409 _exit(EXIT_FAILURE
);
5411 _exit(EXIT_SUCCESS
);
5414 barrier_set_role(&barrier
, BARRIER_PARENT
);
/* The parent side closes the [1] ends and keeps the [0] ends. */
5419 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
5420 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
5421 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
5423 /* Wait for the outer child. */
5424 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
5433 /* And now retrieve the PID of the inner child. */
5434 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
5436 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
5439 if (l
!= sizeof(pid
)) {
/* NOTE(review): the message below ends in %m, but a short read does not set
 * errno, so the text appended for %m is unrelated to this condition.
 * Consider dropping the %m part. */
5440 log_error("Short read while reading inner child PID: %m");
5445 log_debug("Init process invoked as PID " PID_FMT
, pid
);
5448 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
5449 log_error("Child died too early.");
5454 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
5456 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
5459 if (l
!= sizeof(arg_uid_shift
)) {
/* NOTE(review): same remark as for the PID message above, %m has no
 * meaningful errno to print for a short read. */
5460 log_error("Short read while reading UID shift: %m");
5465 r
= setup_uid_map(pid
);
5469 (void) barrier_place(&barrier
); /* #2 */
/* Network setup for the container process, followed by machine registration
 * and cgroup arrangement. */
5472 r
= move_network_interfaces(pid
);
5476 r
= setup_veth(pid
, veth_name
, &ifi
);
5480 r
= setup_bridge(veth_name
, &ifi
);
5484 r
= setup_macvlan(pid
);
5488 r
= setup_ipvlan(pid
);
5492 r
= register_machine(pid
, ifi
);
5496 r
= sync_cgroup(pid
);
5500 r
= create_subcgroup(pid
);
5504 r
= chown_cgroup(pid
);
5508 /* Notify the child that the parent is ready with all
5509 * its setup (including cgroup-ification), and that
5510 * the child can now hand over control to the code to
5511 * run inside the container. */
5512 (void) barrier_place(&barrier
); /* #3 */
5514 /* Block SIGCHLD here, before notifying child.
5515 * process_pty() will handle it with the other signals. */
5516 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
5518 /* Reset signal to default */
5519 r
= default_signals(SIGCHLD
, -1);
5521 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
5525 /* Let the child know that we are ready and wait that the child is completely ready now. */
5526 if (!barrier_place_and_sync(&barrier
)) { /* #5 */
5527 log_error("Client died too early.");
5534 "STATUS=Container running.\n"
5535 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
5537 r
= sd_event_new(&event
);
5539 log_error_errno(r
, "Failed to get default event source: %m");
5543 if (arg_kill_signal
> 0) {
5544 /* Try to kill the init system on SIGINT or SIGTERM */
5545 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
5546 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
5548 /* Immediately exit */
5549 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
5550 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
5553 /* simply exit on sigchld */
5554 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
5556 if (arg_expose_ports
) {
5557 r
= watch_rtnl(event
, rtnl_socket_pair
[0], &exposed
, &rtnl
);
5561 (void) expose_ports(rtnl
, &exposed
);
5564 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
5566 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
5568 log_error_errno(r
, "Failed to create PTY forwarder: %m");
5572 r
= sd_event_loop(event
);
5574 log_error_errno(r
, "Failed to run event loop: %m");
/* The last character forwarded is remembered before the forwarder is freed;
 * it is compared against a newline below unless arg_quiet is set. */
5578 pty_forward_get_last_char(forward
, &last_char
);
5580 forward
= pty_forward_free(forward
);
5582 if (!arg_quiet
&& last_char
!= '\n')
5585 /* Kill if it is not dead yet anyway */
5586 terminate_machine(pid
);
5588 /* Normally redundant, but better safe than sorry */
5591 r
= wait_for_container(pid
, &container_status
);
5595 /* We failed to wait for the container, or the
5596 * container exited abnormally */
5598 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
5599 /* The container exited with a non-zero
5600 * status, or with zero status and no reboot
5606 /* CONTAINER_REBOOTED, loop again */
5608 if (arg_keep_unit
) {
5609 /* Special handling if we are running as a
5610 * service: instead of simply restarting the
5611 * machine we want to restart the entire
5612 * service, so let's inform systemd about this
5613 * with the special exit code 133. The service
5614 * file uses RestartForceExitStatus=133 so
5615 * that this results in a full nspawn
5616 * restart. This is necessary since we might
5617 * have cgroup parameters set we want to have
5624 flush_ports(&exposed
);
5630 "STATUS=Terminating...");
5635 /* Try to flush whatever is still queued in the pty */
5637 (void) copy_bytes(master
, STDOUT_FILENO
, (off_t
) -1, false);
/* Cleanup: detach the loop device, remove an ephemeral snapshot, remove the
 * propagation directory of the machine and flush exposed ports. */
5639 loop_remove(loop_nr
, &image_fd
);
5641 if (remove_subvol
&& arg_directory
) {
5644 k
= btrfs_subvol_remove(arg_directory
, true);
5646 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
5652 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
5653 (void) rm_rf(p
, REMOVE_ROOT
);
5656 flush_ports(&exposed
);
/* Release the heap-allocated argument values. */
5658 free(arg_directory
);
5663 strv_free(arg_setenv
);
5664 free(arg_network_bridge
);
5665 strv_free(arg_network_interfaces
);
5666 strv_free(arg_network_macvlan
);
5667 strv_free(arg_network_ipvlan
);
5668 strv_free(arg_parameters
);
5669 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
5670 expose_port_free_all(arg_expose_ports
);
/* A negative r always maps to EXIT_FAILURE; otherwise ret is returned, which
 * presumably carries the exit status of the container (its assignments are
 * not visible in this listing). */
5672 return r
< 0 ? EXIT_FAILURE
: ret
;