1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
9 #include "sd-messages.h"
11 #include "alloc-util.h"
12 #include "bus-error.h"
13 #include "bus-internal.h"
14 #include "bus-locator.h"
15 #include "bus-unit-util.h"
17 #include "errno-util.h"
19 #include "extract-word.h"
22 #include "format-util.h"
27 #include "machine-dbus.h"
29 #include "mkdir-label.h"
30 #include "namespace-util.h"
31 #include "operation.h"
32 #include "parse-util.h"
33 #include "path-util.h"
34 #include "process-util.h"
35 #include "serialize.h"
36 #include "socket-util.h"
38 #include "stdio-util.h"
39 #include "string-table.h"
40 #include "string-util.h"
42 #include "terminal-util.h"
43 #include "tmpfile-util.h"
44 #include "uid-range.h"
45 #include "unit-name.h"
46 #include "user-util.h"
48 int machine_new(MachineClass
class, const char *name
, Machine
**ret
) {
49 _cleanup_(machine_freep
) Machine
*m
= NULL
;
51 assert(class < _MACHINE_CLASS_MAX
);
54 /* Passing class == _MACHINE_CLASS_INVALID here is fine. It
55 * means as much as "we don't know yet", and that we'll figure
56 * it out later when loading the state file. */
64 .leader
= PIDREF_NULL
,
65 .vsock_cid
= VMADDR_CID_ANY
,
69 m
->name
= strdup(name
);
78 int machine_link(Manager
*manager
, Machine
*machine
) {
89 if (machine
->class != MACHINE_HOST
) {
90 char *temp
= path_join("/run/systemd/machines", machine
->name
);
94 free_and_replace(machine
->state_file
, temp
);
97 r
= hashmap_put(manager
->machines
, machine
->name
, machine
);
101 machine
->manager
= manager
;
106 Machine
* machine_free(Machine
*m
) {
110 while (m
->operations
)
111 operation_free(m
->operations
);
113 if (m
->in_gc_queue
) {
115 LIST_REMOVE(gc_queue
, m
->manager
->machine_gc_queue
, m
);
119 machine_release_unit(m
);
121 (void) hashmap_remove(m
->manager
->machines
, m
->name
);
123 if (m
->manager
->host_machine
== m
)
124 m
->manager
->host_machine
= NULL
;
127 m
->leader_pidfd_event_source
= sd_event_source_disable_unref(m
->leader_pidfd_event_source
);
128 if (pidref_is_set(&m
->leader
)) {
130 (void) hashmap_remove_value(m
->manager
->machines_by_leader
, &m
->leader
, m
);
131 pidref_done(&m
->leader
);
134 sd_bus_message_unref(m
->create_message
);
140 free(m
->root_directory
);
142 free(m
->ssh_address
);
143 free(m
->ssh_private_key_path
);
147 int machine_save(Machine
*m
) {
158 _cleanup_(unlink_and_freep
) char *sl
= NULL
; /* auto-unlink! */
160 sl
= strjoin("/run/systemd/machines/unit:", m
->unit
);
165 r
= mkdir_safe_label("/run/systemd/machines", 0755, 0, 0, MKDIR_WARN_MODE
);
167 return log_error_errno(r
, "Failed to create /run/systemd/machines/: %m");
169 _cleanup_(unlink_and_freep
) char *temp_path
= NULL
;
170 _cleanup_fclose_
FILE *f
= NULL
;
171 r
= fopen_tmpfile_linkable(m
->state_file
, O_WRONLY
|O_CLOEXEC
, &temp_path
, &f
);
173 return log_error_errno(r
, "Failed to create state file '%s': %m", m
->state_file
);
175 if (fchmod(fileno(f
), 0644) < 0)
176 return log_error_errno(errno
, "Failed to set access mode for state file '%s' to 0644: %m", m
->state_file
);
179 "# This is private data. Do not parse.\n"
183 /* We continue to call this "SCOPE=" because it is internal only, and we want to stay compatible with old files */
184 env_file_fputs_assignment(f
, "SCOPE=", m
->unit
);
185 env_file_fputs_assignment(f
, "SCOPE_JOB=", m
->scope_job
);
187 env_file_fputs_assignment(f
, "SERVICE=", m
->service
);
188 env_file_fputs_assignment(f
, "ROOT=", m
->root_directory
);
190 if (!sd_id128_is_null(m
->id
))
191 fprintf(f
, "ID=" SD_ID128_FORMAT_STR
"\n", SD_ID128_FORMAT_VAL(m
->id
));
193 if (pidref_is_set(&m
->leader
)) {
194 fprintf(f
, "LEADER="PID_FMT
"\n", m
->leader
.pid
);
195 (void) pidref_acquire_pidfd_id(&m
->leader
);
196 if (m
->leader
.fd_id
!= 0)
197 fprintf(f
, "LEADER_PIDFDID=%" PRIu64
"\n", m
->leader
.fd_id
);
200 if (m
->class != _MACHINE_CLASS_INVALID
)
201 fprintf(f
, "CLASS=%s\n", machine_class_to_string(m
->class));
203 if (dual_timestamp_is_set(&m
->timestamp
))
205 "REALTIME="USEC_FMT
"\n"
206 "MONOTONIC="USEC_FMT
"\n",
207 m
->timestamp
.realtime
,
208 m
->timestamp
.monotonic
);
210 if (m
->n_netif
> 0) {
211 fputs("NETIF=\"", f
);
212 FOREACH_ARRAY(ifi
, m
->netif
, m
->n_netif
) {
215 fprintf(f
, "%i", *ifi
);
220 if (m
->vsock_cid
!= 0)
221 fprintf(f
, "VSOCK_CID=%u\n", m
->vsock_cid
);
223 env_file_fputs_assignment(f
, "SSH_ADDRESS=", m
->ssh_address
);
224 env_file_fputs_assignment(f
, "SSH_PRIVATE_KEY_PATH=", m
->ssh_private_key_path
);
226 r
= flink_tmpfile(f
, temp_path
, m
->state_file
, LINK_TMPFILE_REPLACE
);
228 return log_error_errno(r
, "Failed to move '%s' into place: %m", m
->state_file
);
230 temp_path
= mfree(temp_path
); /* disarm auto-destroy: temporary file does not exist anymore */
233 /* Create a symlink from the unit name to the machine name, so that we can quickly find the machine
234 * for each given unit. Ignore error. */
235 (void) symlink(m
->name
, sl
);
237 /* disarm auto-removal */
244 static void machine_unlink(Machine
*m
) {
248 const char *sl
= strjoina("/run/systemd/machines/unit:", m
->unit
);
253 (void) unlink(m
->state_file
);
256 int machine_load(Machine
*m
) {
257 _cleanup_free_
char *name
= NULL
, *realtime
= NULL
, *monotonic
= NULL
, *id
= NULL
, *leader
= NULL
, *leader_pidfdid
= NULL
,
258 *class = NULL
, *netif
= NULL
, *vsock_cid
= NULL
;
266 r
= parse_env_file(NULL
, m
->state_file
,
269 "SCOPE_JOB", &m
->scope_job
,
270 "SERVICE", &m
->service
,
271 "ROOT", &m
->root_directory
,
274 "LEADER_PIDFDID", &leader_pidfdid
,
276 "REALTIME", &realtime
,
277 "MONOTONIC", &monotonic
,
279 "VSOCK_CID", &vsock_cid
,
280 "SSH_ADDRESS", &m
->ssh_address
,
281 "SSH_PRIVATE_KEY_PATH", &m
->ssh_private_key_path
);
285 return log_error_errno(r
, "Failed to read %s: %m", m
->state_file
);
287 if (!streq_ptr(name
, m
->name
))
288 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "State file '%s' for machine '%s' reports a different name '%s', refusing", m
->state_file
, m
->name
, name
);
291 (void) sd_id128_from_string(id
, &m
->id
);
293 pidref_done(&m
->leader
);
295 r
= pidref_set_pidstr(&m
->leader
, leader
);
297 log_debug_errno(r
, "Failed to set leader PID to '%s', ignoring: %m", leader
);
298 else if (leader_pidfdid
) {
300 r
= safe_atou64(leader_pidfdid
, &fd_id
);
302 log_warning_errno(r
, "Failed to parse leader pidfd ID, ignoring: %s", leader_pidfdid
);
304 (void) pidref_acquire_pidfd_id(&m
->leader
);
306 if (fd_id
!= m
->leader
.fd_id
) {
307 log_debug("Leader PID got recycled, ignoring.");
308 pidref_done(&m
->leader
);
315 MachineClass c
= machine_class_from_string(class);
321 (void) deserialize_usec(realtime
, &m
->timestamp
.realtime
);
323 (void) deserialize_usec(monotonic
, &m
->timestamp
.monotonic
);
325 m
->netif
= mfree(m
->netif
);
328 _cleanup_free_
int *ni
= NULL
;
331 for (const char *p
= netif
;;) {
332 _cleanup_free_
char *word
= NULL
;
334 r
= extract_first_word(&p
, &word
, NULL
, 0);
340 log_warning_errno(r
, "Failed to parse NETIF: %s", netif
);
344 r
= parse_ifindex(word
);
348 if (!GREEDY_REALLOC(ni
, nr
+ 1))
354 m
->netif
= TAKE_PTR(ni
);
360 r
= safe_atou(vsock_cid
, &m
->vsock_cid
);
362 log_warning_errno(r
, "Failed to parse AF_VSOCK CID, ignoring: %s", vsock_cid
);
368 static int machine_start_scope(
371 sd_bus_message
*more_properties
,
372 sd_bus_error
*error
) {
374 _cleanup_(sd_bus_message_unrefp
) sd_bus_message
*m
= NULL
, *reply
= NULL
;
375 _cleanup_(sd_bus_error_free
) sd_bus_error e
= SD_BUS_ERROR_NULL
;
376 _cleanup_free_
char *escaped
= NULL
, *unit
= NULL
;
377 const char *description
;
381 assert(pidref_is_set(&machine
->leader
));
382 assert(!machine
->unit
);
384 escaped
= unit_name_escape(machine
->name
);
388 unit
= strjoin("machine-", escaped
, ".scope");
392 r
= bus_message_new_method_call(
393 machine
->manager
->bus
,
396 "StartTransientUnit");
400 r
= sd_bus_message_append(m
, "ss", unit
, "fail");
404 r
= sd_bus_message_open_container(m
, 'a', "(sv)");
408 r
= sd_bus_message_append(m
, "(sv)", "Slice", "s", SPECIAL_MACHINE_SLICE
);
412 description
= strjoina(machine
->class == MACHINE_VM
? "Virtual Machine " : "Container ", machine
->name
);
413 r
= sd_bus_message_append(m
, "(sv)", "Description", "s", description
);
417 r
= bus_append_scope_pidref(m
, &machine
->leader
, allow_pidfd
);
421 r
= sd_bus_message_append(m
, "(sv)(sv)(sv)(sv)",
423 "CollectMode", "s", "inactive-or-failed",
425 "TasksMax", "t", UINT64_C(16384));
429 if (more_properties
) {
430 r
= sd_bus_message_copy(m
, more_properties
, true);
435 r
= sd_bus_message_close_container(m
);
439 r
= sd_bus_message_append(m
, "a(sa(sv))", 0);
443 r
= sd_bus_call(NULL
, m
, 0, &e
, &reply
);
445 /* If this failed with a property we couldn't write, this is quite likely because the server
446 * doesn't support PIDFDs yet, let's try without. */
448 sd_bus_error_has_names(&e
, SD_BUS_ERROR_UNKNOWN_PROPERTY
, SD_BUS_ERROR_PROPERTY_READ_ONLY
))
449 return machine_start_scope(machine
, /* allow_pidfd = */ false, more_properties
, error
);
451 return sd_bus_error_move(error
, &e
);
454 machine
->unit
= TAKE_PTR(unit
);
455 machine
->referenced
= true;
458 r
= sd_bus_message_read(reply
, "o", &job
);
462 return free_and_strdup(&machine
->scope_job
, job
);
465 static int machine_ensure_scope(Machine
*m
, sd_bus_message
*properties
, sd_bus_error
*error
) {
469 assert(m
->class != MACHINE_HOST
);
472 r
= machine_start_scope(m
, /* allow_pidfd = */ true, properties
, error
);
474 return log_error_errno(r
, "Failed to start machine scope: %s", bus_error_message(error
, r
));
479 r
= hashmap_ensure_put(&m
->manager
->machines_by_unit
, &string_hash_ops
, m
->unit
, m
);
486 static int machine_dispatch_leader_pidfd(sd_event_source
*s
, int fd
, unsigned revents
, void *userdata
) {
487 Machine
*m
= ASSERT_PTR(userdata
);
489 m
->leader_pidfd_event_source
= sd_event_source_disable_unref(m
->leader_pidfd_event_source
);
490 machine_add_to_gc_queue(m
);
495 static int machine_watch_pidfd(Machine
*m
) {
500 assert(pidref_is_set(&m
->leader
));
501 assert(!m
->leader_pidfd_event_source
);
503 if (m
->leader
.fd
< 0)
506 /* If we have a pidfd for the leader, let's also track it for POLLIN, and GC the machine
507 * automatically if it dies */
509 r
= sd_event_add_io(m
->manager
->event
, &m
->leader_pidfd_event_source
, m
->leader
.fd
, EPOLLIN
, machine_dispatch_leader_pidfd
, m
);
513 (void) sd_event_source_set_description(m
->leader_pidfd_event_source
, "machine-pidfd");
518 int machine_start(Machine
*m
, sd_bus_message
*properties
, sd_bus_error
*error
) {
523 if (!IN_SET(m
->class, MACHINE_CONTAINER
, MACHINE_VM
))
529 r
= hashmap_ensure_put(&m
->manager
->machines_by_leader
, &pidref_hash_ops
, &m
->leader
, m
);
533 r
= machine_watch_pidfd(m
);
538 r
= machine_ensure_scope(m
, properties
, error
);
543 LOG_MESSAGE_ID(SD_MESSAGE_MACHINE_START_STR
),
544 LOG_ITEM("NAME=%s", m
->name
),
545 LOG_ITEM("LEADER="PID_FMT
, m
->leader
.pid
),
546 LOG_MESSAGE("New machine %s.", m
->name
));
548 if (!dual_timestamp_is_set(&m
->timestamp
))
549 dual_timestamp_now(&m
->timestamp
);
553 /* Save new machine data */
556 machine_send_signal(m
, true);
561 int machine_stop(Machine
*m
) {
566 if (!IN_SET(m
->class, MACHINE_CONTAINER
, MACHINE_VM
))
570 _cleanup_(sd_bus_error_free
) sd_bus_error error
= SD_BUS_ERROR_NULL
;
573 r
= manager_stop_unit(m
->manager
, m
->unit
, &error
, &job
);
575 return log_error_errno(r
, "Failed to stop machine unit: %s", bus_error_message(&error
, r
));
577 free_and_replace(m
->scope_job
, job
);
587 int machine_finalize(Machine
*m
) {
592 LOG_MESSAGE_ID(SD_MESSAGE_MACHINE_STOP_STR
),
593 LOG_ITEM("NAME=%s", m
->name
),
594 LOG_ITEM("LEADER="PID_FMT
, m
->leader
.pid
),
595 LOG_MESSAGE("Machine %s terminated.", m
->name
));
597 m
->stopping
= true; /* The machine is supposed to be going away. Don't try to kill it. */
601 machine_add_to_gc_queue(m
);
604 machine_send_signal(m
, false);
611 bool machine_may_gc(Machine
*m
, bool drop_not_started
) {
616 if (m
->class == MACHINE_HOST
)
619 if (drop_not_started
&& !m
->started
)
622 r
= pidref_is_alive(&m
->leader
);
626 log_debug_errno(r
, "Unable to determine if leader PID " PID_FMT
" is still alive, assuming not: %m", m
->leader
.pid
);
631 _cleanup_(sd_bus_error_free
) sd_bus_error error
= SD_BUS_ERROR_NULL
;
633 r
= manager_job_is_active(m
->manager
, m
->scope_job
, &error
);
635 log_debug_errno(r
, "Failed to determine whether job '%s' is active, assuming it is: %s", m
->scope_job
, bus_error_message(&error
, r
));
641 _cleanup_(sd_bus_error_free
) sd_bus_error error
= SD_BUS_ERROR_NULL
;
643 r
= manager_unit_is_active(m
->manager
, m
->unit
, &error
);
645 log_debug_errno(r
, "Failed to determine whether unit '%s' is active, assuming it is: %s", m
->unit
, bus_error_message(&error
, r
));
653 void machine_add_to_gc_queue(Machine
*m
) {
659 LIST_PREPEND(gc_queue
, m
->manager
->machine_gc_queue
, m
);
660 m
->in_gc_queue
= true;
662 manager_enqueue_gc(m
->manager
);
665 MachineState
machine_get_state(Machine
*s
) {
668 if (s
->class == MACHINE_HOST
)
669 return MACHINE_RUNNING
;
672 return MACHINE_CLOSING
;
675 return MACHINE_OPENING
;
677 return MACHINE_RUNNING
;
680 int machine_kill(Machine
*m
, KillWhom whom
, int signo
) {
683 if (!IN_SET(m
->class, MACHINE_VM
, MACHINE_CONTAINER
))
689 if (whom
== KILL_LEADER
) /* If we shall simply kill the leader, do so directly */
690 return pidref_kill(&m
->leader
, signo
);
692 /* Otherwise, make PID 1 do it for us, for the entire cgroup */
693 return manager_kill_unit(m
->manager
, m
->unit
, signo
, NULL
);
696 int machine_openpt(Machine
*m
, int flags
, char **ret_peer
) {
702 return openpt_allocate(flags
, ret_peer
);
704 case MACHINE_CONTAINER
:
705 if (!pidref_is_set(&m
->leader
))
708 return openpt_allocate_in_namespace(&m
->leader
, flags
, ret_peer
);
715 static int machine_bus_new(Machine
*m
, sd_bus_error
*error
, sd_bus
**ret
) {
727 case MACHINE_CONTAINER
: {
728 _cleanup_(sd_bus_close_unrefp
) sd_bus
*bus
= NULL
;
731 r
= sd_bus_new(&bus
);
733 return log_debug_errno(r
, "Failed to allocate new DBus: %m");
735 if (asprintf(&address
, "x-machine-unix:pid=%" PID_PRI
, m
->leader
.pid
) < 0)
738 bus
->address
= address
;
739 bus
->bus_client
= true;
740 bus
->trusted
= false;
741 bus
->runtime_scope
= RUNTIME_SCOPE_SYSTEM
;
743 r
= sd_bus_start(bus
);
745 return sd_bus_error_set_errnof(error
, r
, "There is no system bus in container %s.", m
->name
);
749 *ret
= TAKE_PTR(bus
);
758 int machine_start_getty(Machine
*m
, const char *ptmx_name
, sd_bus_error
*error
) {
759 _cleanup_(sd_bus_flush_close_unrefp
) sd_bus
*allocated_bus
= NULL
;
760 sd_bus
*container_bus
= NULL
;
761 const char *p
, *getty
;
767 p
= path_startswith(ptmx_name
, "/dev/pts/");
769 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
), "Path of pseudo TTY has unexpected prefix");
771 r
= machine_bus_new(m
, error
, &allocated_bus
);
773 return log_debug_errno(r
, "Failed to create DBus to machine: %m");
775 container_bus
= allocated_bus
?: m
->manager
->bus
;
776 getty
= strjoina("container-getty@", p
, ".service");
778 r
= bus_call_method(container_bus
, bus_systemd_mgr
, "StartUnit", error
, /* ret_reply = */ NULL
, "ss", getty
, "replace");
780 return log_debug_errno(r
, "Failed to StartUnit '%s' in container '%s': %m", getty
, m
->name
);
785 int machine_start_shell(
788 const char *ptmx_name
,
793 sd_bus_error
*error
) {
794 _cleanup_close_
int pty_fd
= -EBADF
;
795 _cleanup_(sd_bus_message_unrefp
) sd_bus_message
*tm
= NULL
;
796 _cleanup_(sd_bus_flush_close_unrefp
) sd_bus
*allocated_bus
= NULL
;
797 const char *p
, *utmp_id
, *unit
, *description
;
798 sd_bus
*container_bus
= NULL
;
802 assert(ptmx_fd
>= 0);
805 if (isempty(user
) || isempty(path
) || strv_isempty(args
))
808 p
= path_startswith(ptmx_name
, "/dev/pts/");
809 utmp_id
= path_startswith(ptmx_name
, "/dev/");
811 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
), "Path of pseudo TTY has unexpected prefix");
813 pty_fd
= pty_open_peer(ptmx_fd
, O_RDWR
|O_NOCTTY
|O_CLOEXEC
);
815 return log_debug_errno(pty_fd
, "Failed to open terminal: %m");
817 r
= machine_bus_new(m
, error
, &allocated_bus
);
819 return log_debug_errno(r
, "Failed to create DBus to machine: %m");
821 container_bus
= allocated_bus
?: m
->manager
->bus
;
822 r
= bus_message_new_method_call(container_bus
, &tm
, bus_systemd_mgr
, "StartTransientUnit");
827 unit
= strjoina("container-shell@", p
, ".service");
828 r
= sd_bus_message_append(tm
, "ss", unit
, "fail");
833 r
= sd_bus_message_open_container(tm
, 'a', "(sv)");
837 description
= strjoina("Shell for User ", user
);
838 r
= sd_bus_message_append(tm
,
839 "(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)(sv)",
840 "Description", "s", description
,
841 "StandardInputFileDescriptor", "h", pty_fd
,
842 "StandardOutputFileDescriptor", "h", pty_fd
,
843 "StandardErrorFileDescriptor", "h", pty_fd
,
844 "SendSIGHUP", "b", true,
845 "IgnoreSIGPIPE", "b", false,
846 "KillMode", "s", "mixed",
847 "TTYPath", "s", ptmx_name
,
848 "TTYReset", "b", true,
849 "UtmpIdentifier", "s", utmp_id
,
850 "UtmpMode", "s", "user",
851 "PAMName", "s", "login",
852 "WorkingDirectory", "s", "-~");
856 r
= sd_bus_message_append(tm
, "(sv)", "User", "s", user
);
860 if (!strv_isempty(env
)) {
861 r
= sd_bus_message_open_container(tm
, 'r', "sv");
865 r
= sd_bus_message_append(tm
, "s", "Environment");
869 r
= sd_bus_message_open_container(tm
, 'v', "as");
873 r
= sd_bus_message_append_strv(tm
, env
);
877 r
= sd_bus_message_close_container(tm
);
881 r
= sd_bus_message_close_container(tm
);
887 r
= sd_bus_message_open_container(tm
, 'r', "sv");
891 r
= sd_bus_message_append(tm
, "s", "ExecStart");
895 r
= sd_bus_message_open_container(tm
, 'v', "a(sasb)");
899 r
= sd_bus_message_open_container(tm
, 'a', "(sasb)");
903 r
= sd_bus_message_open_container(tm
, 'r', "sasb");
907 r
= sd_bus_message_append(tm
, "s", path
);
911 r
= sd_bus_message_append_strv(tm
, args
);
915 r
= sd_bus_message_append(tm
, "b", true);
919 r
= sd_bus_message_close_container(tm
);
923 r
= sd_bus_message_close_container(tm
);
927 r
= sd_bus_message_close_container(tm
);
931 r
= sd_bus_message_close_container(tm
);
935 r
= sd_bus_message_close_container(tm
);
939 /* Auxiliary units */
940 r
= sd_bus_message_append(tm
, "a(sa(sv))", 0);
944 r
= sd_bus_call(container_bus
, tm
, 0, error
, NULL
);
951 char** machine_default_shell_args(const char *user
) {
952 _cleanup_strv_free_
char **args
= NULL
;
957 args
= new0(char*, 3 + 1);
961 args
[0] = strdup("sh");
965 args
[1] = strdup("-c");
969 r
= asprintf(&args
[2],
970 "shell=$(getent passwd %s 2>/dev/null | { IFS=: read _ _ _ _ _ _ x; echo \"$x\"; })\n"\
971 "exec \"${shell:-/bin/sh}\" -l", /* -l means --login */
978 return TAKE_PTR(args
);
981 int machine_copy_from_to_operation(
984 const char *host_path
,
985 const char *container_path
,
986 bool copy_from_container
,
987 CopyFlags copy_flags
,
990 _cleanup_close_
int host_fd
= -EBADF
, target_mntns_fd
= -EBADF
, source_mntns_fd
= -EBADF
;
991 _cleanup_close_pair_
int errno_pipe_fd
[2] = EBADF_PAIR
;
992 _cleanup_free_
char *host_basename
= NULL
, *container_basename
= NULL
;
993 _cleanup_(sigkill_waitp
) pid_t child
= 0;
1001 if (isempty(host_path
) || isempty(container_path
))
1004 r
= path_extract_filename(host_path
, &host_basename
);
1006 return log_debug_errno(r
, "Failed to extract file name of '%s' path: %m", host_path
);
1008 r
= path_extract_filename(container_path
, &container_basename
);
1010 return log_debug_errno(r
, "Failed to extract file name of '%s' path: %m", container_path
);
1012 host_fd
= open_parent(host_path
, O_CLOEXEC
, 0);
1014 return log_debug_errno(host_fd
, "Failed to open host directory '%s': %m", host_path
);
1016 r
= machine_get_uid_shift(machine
, &uid_shift
);
1018 return log_debug_errno(r
, "Failed to get UID shift of machine '%s': %m", machine
->name
);
1020 target_mntns_fd
= pidref_namespace_open_by_type(&machine
->leader
, NAMESPACE_MOUNT
);
1021 if (target_mntns_fd
< 0)
1022 return log_debug_errno(target_mntns_fd
, "Failed to open mount namespace of machine '%s': %m", machine
->name
);
1024 source_mntns_fd
= namespace_open_by_type(NAMESPACE_MOUNT
);
1025 if (source_mntns_fd
< 0)
1026 return log_debug_errno(source_mntns_fd
, "Failed to open our own mount namespace: %m");
1028 if (pipe2(errno_pipe_fd
, O_CLOEXEC
|O_NONBLOCK
) < 0)
1029 return log_debug_errno(errno
, "Failed to create pipe: %m");
1031 r
= namespace_fork("(sd-copyns)",
1033 /* except_fds = */ NULL
,
1034 /* n_except_fds = */ 0,
1035 FORK_RESET_SIGNALS
|FORK_DEATHSIG_SIGKILL
,
1036 /* pidns_fd = */ -EBADF
,
1038 /* netns_fd = */ -EBADF
,
1039 /* userns_fd = */ -EBADF
,
1040 /* root_fd = */ -EBADF
,
1043 return log_debug_errno(r
, "Failed to fork into mount namespace of machine '%s': %m", machine
->name
);
1045 errno_pipe_fd
[0] = safe_close(errno_pipe_fd
[0]);
1047 _cleanup_close_
int container_fd
= -EBADF
;
1048 container_fd
= open_parent(container_path
, O_CLOEXEC
, 0);
1049 if (container_fd
< 0) {
1050 log_debug_errno(container_fd
, "Failed to open container directory: %m");
1051 report_errno_and_exit(errno_pipe_fd
[1], container_fd
);
1054 /* Rejoin the host namespace, so that /proc/self/fd/… works, which copy_tree_at() relies on
1055 * in some cases (by means of fd_reopen()) */
1056 if (setns(source_mntns_fd
, CLONE_NEWNS
) < 0) {
1057 r
= log_debug_errno(errno
, "Failed to rejoin namespace of host: %m");
1058 report_errno_and_exit(errno_pipe_fd
[1], r
);
1061 /* Run the actual copy operation. Note that when a UID shift is set we'll either clamp the UID/GID to
1062 * 0 or to the actual UID shift depending on the direction we copy. If no UID shift is set we'll copy
1063 * the UID/GIDs as they are. */
1064 if (copy_from_container
)
1070 uid_shift
== 0 ? UID_INVALID
: 0,
1071 uid_shift
== 0 ? GID_INVALID
: 0,
1073 /* denylist = */ NULL
,
1074 /* subvolumes = */ NULL
);
1081 uid_shift
== 0 ? UID_INVALID
: uid_shift
,
1082 uid_shift
== 0 ? GID_INVALID
: uid_shift
,
1084 /* denylist = */ NULL
,
1085 /* subvolumes = */ NULL
);
1087 log_debug_errno(r
, "Failed to copy tree: %m");
1089 report_errno_and_exit(errno_pipe_fd
[1], r
);
1092 errno_pipe_fd
[1] = safe_close(errno_pipe_fd
[1]);
1094 Operation
*operation
;
1095 r
= operation_new(manager
, machine
, child
, errno_pipe_fd
[0], &operation
);
1099 TAKE_FD(errno_pipe_fd
[0]);
1106 void machine_release_unit(Machine
*m
) {
1114 if (m
->referenced
) {
1115 _cleanup_(sd_bus_error_free
) sd_bus_error error
= SD_BUS_ERROR_NULL
;
1118 r
= manager_unref_unit(m
->manager
, m
->unit
, &error
);
1120 log_full_errno(ERRNO_IS_DISCONNECT(r
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1121 "Failed to drop reference to machine scope, ignoring: %s",
1122 bus_error_message(&error
, r
));
1124 m
->referenced
= false;
1127 (void) hashmap_remove_value(m
->manager
->machines_by_unit
, m
->unit
, m
);
1128 m
->unit
= mfree(m
->unit
);
1131 int machine_get_uid_shift(Machine
*m
, uid_t
*ret
) {
1132 char p
[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(pid_t
) + 1];
1133 uid_t uid_base
, uid_shift
, uid_range
;
1134 gid_t gid_base
, gid_shift
, gid_range
;
1135 _cleanup_fclose_
FILE *f
= NULL
;
1141 /* Return the base UID/GID of the specified machine. Note that this only works for containers with simple
1142 * mappings. In most cases setups should be simple like this, and administrators should only care about the
1143 * basic offset a container has relative to the host. This is what this function exposes.
1145 * If we encounter any more complex mappings we politely refuse this with ENXIO. */
1147 if (m
->class == MACHINE_HOST
) {
1152 if (m
->class != MACHINE_CONTAINER
)
1155 xsprintf(p
, "/proc/" PID_FMT
"/uid_map", m
->leader
.pid
);
1158 if (errno
== ENOENT
) {
1159 /* If the file doesn't exist, user namespacing is off in the kernel, return a zero mapping hence. */
1167 /* Read the first line. There's at least one. */
1168 r
= uid_map_read_one(f
, &uid_base
, &uid_shift
, &uid_range
);
1172 /* Not a mapping starting at 0? Then it's a complex mapping we can't expose here. */
1175 /* Insist that at least the nobody user is mapped, everything else is weird, and hence complex, and we don't support it */
1176 if (uid_range
< UID_NOBODY
)
1179 /* If there's more than one line, then we don't support this mapping. */
1180 r
= safe_fgetc(f
, NULL
);
1183 if (r
!= 0) /* Insist on EOF */
1188 xsprintf(p
, "/proc/" PID_FMT
"/gid_map", m
->leader
.pid
);
1193 /* Read the first line. There's at least one. */
1195 r
= fscanf(f
, GID_FMT
" " GID_FMT
" " GID_FMT
"\n", &gid_base
, &gid_shift
, &gid_range
);
1197 return errno_or_else(ENOMSG
);
1202 /* If there's more than one line, then we don't support this file. */
1203 r
= safe_fgetc(f
, NULL
);
1206 if (r
!= 0) /* Insist on EOF */
1209 /* If the UID and GID mapping doesn't match, we don't support this mapping. */
1210 if (uid_base
!= (uid_t
) gid_base
)
1212 if (uid_shift
!= (uid_t
) gid_shift
)
1214 if (uid_range
!= (uid_t
) gid_range
)
1217 r
= pidref_verify(&m
->leader
);
1225 static int machine_owns_uid_internal(
1227 const char *map_file
, /* "uid_map" or "gid_map" */
1229 uid_t
*ret_internal_uid
) {
1231 _cleanup_fclose_
FILE *f
= NULL
;
1235 /* This is a generic implementation for both uids and gids, under the assumption that they have the same types and semantics. */
1236 assert_cc(sizeof(uid_t
) == sizeof(gid_t
));
1240 /* Checks if the specified host UID is owned by the machine, and returns the UID it maps to
1241 * internally in the machine */
1243 if (machine
->class != MACHINE_CONTAINER
)
1246 p
= procfs_file_alloca(machine
->leader
.pid
, map_file
);
1249 log_debug_errno(errno
, "Failed to open %s, ignoring.", p
);
1254 uid_t uid_base
, uid_shift
, uid_range
, converted
;
1256 r
= uid_map_read_one(f
, &uid_base
, &uid_shift
, &uid_range
);
1262 /* The private user namespace is disabled, ignoring. */
1266 if (uid
< uid_shift
|| uid
>= uid_shift
+ uid_range
)
1269 converted
= (uid
- uid_shift
+ uid_base
);
1270 if (!uid_is_valid(converted
))
1273 r
= pidref_verify(&machine
->leader
);
1277 if (ret_internal_uid
)
1278 *ret_internal_uid
= converted
;
1284 if (ret_internal_uid
)
1285 *ret_internal_uid
= UID_INVALID
;
1290 int machine_owns_uid(Machine
*machine
, uid_t uid
, uid_t
*ret_internal_uid
) {
1291 return machine_owns_uid_internal(machine
, "uid_map", uid
, ret_internal_uid
);
1294 int machine_owns_gid(Machine
*machine
, gid_t gid
, gid_t
*ret_internal_gid
) {
1295 return machine_owns_uid_internal(machine
, "gid_map", (uid_t
) gid
, (uid_t
*) ret_internal_gid
);
1298 static int machine_translate_uid_internal(
1300 const char *map_file
, /* "uid_map" or "gid_map" */
1302 uid_t
*ret_host_uid
) {
1304 _cleanup_fclose_
FILE *f
= NULL
;
1308 /* This is a generic implementation for both uids and gids, under the assumption that they have the same types and semantics. */
1309 assert_cc(sizeof(uid_t
) == sizeof(gid_t
));
1312 assert(uid_is_valid(uid
));
1314 if (machine
->class != MACHINE_CONTAINER
)
1317 /* Translates a machine UID into a host UID */
1319 p
= procfs_file_alloca(machine
->leader
.pid
, map_file
);
1325 uid_t uid_base
, uid_shift
, uid_range
, converted
;
1327 r
= uid_map_read_one(f
, &uid_base
, &uid_shift
, &uid_range
);
1333 if (uid
< uid_base
|| uid
>= uid_base
+ uid_range
)
1336 converted
= uid
- uid_base
+ uid_shift
;
1337 if (!uid_is_valid(converted
))
1340 r
= pidref_verify(&machine
->leader
);
1345 *ret_host_uid
= converted
;
1353 int machine_translate_uid(Machine
*machine
, gid_t uid
, gid_t
*ret_host_uid
) {
1354 return machine_translate_uid_internal(machine
, "uid_map", uid
, ret_host_uid
);
1357 int machine_translate_gid(Machine
*machine
, gid_t gid
, gid_t
*ret_host_gid
) {
1358 return machine_translate_uid_internal(machine
, "gid_map", (uid_t
) gid
, (uid_t
*) ret_host_gid
);
1361 int machine_open_root_directory(Machine
*machine
) {
1366 switch (machine
->class) {
1367 case MACHINE_HOST
: {
1368 int fd
= open("/", O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
1370 return log_debug_errno(errno
, "Failed to open host root directory: %m");
1375 case MACHINE_CONTAINER
: {
1376 _cleanup_close_
int mntns_fd
= -EBADF
, root_fd
= -EBADF
;
1377 _cleanup_close_pair_
int errno_pipe_fd
[2] = EBADF_PAIR
, fd_pass_socket
[2] = EBADF_PAIR
;
1380 r
= pidref_namespace_open(&machine
->leader
,
1381 /* ret_pidns_fd = */ NULL
,
1383 /* ret_netns_fd = */ NULL
,
1384 /* ret_userns_fd = */ NULL
,
1387 return log_debug_errno(r
, "Failed to open mount namespace of machine '%s': %m", machine
->name
);
1389 if (pipe2(errno_pipe_fd
, O_CLOEXEC
|O_NONBLOCK
) < 0)
1390 return log_debug_errno(errno
, "Failed to open pipe: %m");
1392 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, fd_pass_socket
) < 0)
1393 return log_debug_errno(errno
, "Failed to create socket pair: %m");
1398 /* except_fds = */ NULL
,
1399 /* n_except_fds = */ 0,
1400 FORK_RESET_SIGNALS
|FORK_DEATHSIG_SIGKILL
,
1401 /* pidns_fd = */ -EBADF
,
1403 /* netns_fd = */ -EBADF
,
1404 /* userns_fd = */ -EBADF
,
1408 return log_debug_errno(r
, "Failed to fork into mount namespace of machine '%s': %m", machine
->name
);
1410 _cleanup_close_
int dfd
= -EBADF
;
1412 errno_pipe_fd
[0] = safe_close(errno_pipe_fd
[0]);
1413 fd_pass_socket
[0] = safe_close(fd_pass_socket
[0]);
1415 dfd
= open("/", O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
1417 log_debug_errno(errno
, "Failed to open root directory of machine '%s': %m", machine
->name
);
1418 report_errno_and_exit(errno_pipe_fd
[1], -errno
);
1421 r
= send_one_fd(fd_pass_socket
[1], dfd
, /* flags = */ 0);
1422 dfd
= safe_close(dfd
);
1424 log_debug_errno(r
, "Failed to send FD over socket: %m");
1425 report_errno_and_exit(errno_pipe_fd
[1], r
);
1428 _exit(EXIT_SUCCESS
);
1431 errno_pipe_fd
[1] = safe_close(errno_pipe_fd
[1]);
1432 fd_pass_socket
[1] = safe_close(fd_pass_socket
[1]);
1434 r
= wait_for_terminate_and_check("(sd-openrootns)", child
, /* flags = */ 0);
1436 return log_debug_errno(r
, "Failed to wait for child: %m");
1438 r
= read_errno(errno_pipe_fd
[0]); /* the function does debug reporting */
1442 int fd
= receive_one_fd(fd_pass_socket
[0], MSG_DONTWAIT
);
1444 return log_debug_errno(fd
, "Failed to receive FD from child: %m");
1454 static const char* const machine_class_table
[_MACHINE_CLASS_MAX
] = {
1455 [MACHINE_CONTAINER
] = "container",
1456 [MACHINE_VM
] = "vm",
1457 [MACHINE_HOST
] = "host",
1460 DEFINE_STRING_TABLE_LOOKUP(machine_class
, MachineClass
);
1462 static const char* const machine_state_table
[_MACHINE_STATE_MAX
] = {
1463 [MACHINE_OPENING
] = "opening",
1464 [MACHINE_RUNNING
] = "running",
1465 [MACHINE_CLOSING
] = "closing"
1468 DEFINE_STRING_TABLE_LOOKUP(machine_state
, MachineState
);
1470 static const char* const kill_whom_table
[_KILL_WHOM_MAX
] = {
1471 [KILL_LEADER
] = "leader",
1475 DEFINE_STRING_TABLE_LOOKUP(kill_whom
, KillWhom
);
1477 static const char* const acquire_metadata_table
[_ACQUIRE_METADATA_MAX
] = {
1478 [ACQUIRE_METADATA_NO
] = "no",
1479 [ACQUIRE_METADATA_YES
] = "yes",
1480 [ACQUIRE_METADATA_GRACEFUL
] = "graceful"
1483 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(acquire_metadata
, AcquireMetadata
, ACQUIRE_METADATA_YES
);