git.ipfire.org Git - thirdparty/systemd.git/blob - src/vmspawn/vmspawn.c
test: add a test for #31384
[thirdparty/systemd.git] / src / vmspawn / vmspawn.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <getopt.h>
4 #include <stdint.h>
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <sys/stat.h>
9 #include <unistd.h>
10
11 #include "bootspec.h"
12 #include "chase.h"
13 #include "dirent-util.h"
14 #include "fd-util.h"
15 #include "discover-image.h"
16 #include "sd-daemon.h"
17 #include "sd-event.h"
18 #include "sd-id128.h"
19
20 #include "alloc-util.h"
21 #include "architecture.h"
22 #include "build.h"
23 #include "common-signal.h"
24 #include "copy.h"
25 #include "creds-util.h"
26 #include "dissect-image.h"
27 #include "escape.h"
28 #include "event-util.h"
29 #include "extract-word.h"
30 #include "fileio.h"
31 #include "format-util.h"
32 #include "fs-util.h"
33 #include "gpt.h"
34 #include "hexdecoct.h"
35 #include "hostname-util.h"
36 #include "kernel-image.h"
37 #include "log.h"
38 #include "machine-credential.h"
39 #include "macro.h"
40 #include "main-func.h"
41 #include "mkdir.h"
42 #include "pager.h"
43 #include "parse-argument.h"
44 #include "parse-util.h"
45 #include "path-lookup.h"
46 #include "path-util.h"
47 #include "pretty-print.h"
48 #include "process-util.h"
49 #include "random-util.h"
50 #include "rm-rf.h"
51 #include "signal-util.h"
52 #include "socket-util.h"
53 #include "stat-util.h"
54 #include "string-util.h"
55 #include "strv.h"
56 #include "tmpfile-util.h"
57 #include "unit-name.h"
58 #include "vmspawn-mount.h"
59 #include "vmspawn-scope.h"
60 #include "vmspawn-settings.h"
61 #include "vmspawn-util.h"
62
63 static bool arg_quiet = false;
64 static PagerFlags arg_pager_flags = 0;
65 static char *arg_directory = NULL;
66 static char *arg_image = NULL;
67 static char *arg_machine = NULL;
68 static char *arg_qemu_smp = NULL;
69 static uint64_t arg_qemu_mem = UINT64_C(2) * U64_GB;
70 static int arg_qemu_kvm = -1;
71 static int arg_qemu_vsock = -1;
72 static unsigned arg_vsock_cid = VMADDR_CID_ANY;
73 static int arg_tpm = -1;
74 static char *arg_linux = NULL;
75 static char **arg_initrds = NULL;
76 static bool arg_qemu_gui = false;
77 static QemuNetworkStack arg_network_stack = QEMU_NET_NONE;
78 static int arg_secure_boot = -1;
79 static MachineCredentialContext arg_credentials = {};
80 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
81 static RuntimeMountContext arg_runtime_mounts = {};
82 static SettingsMask arg_settings_mask = 0;
83 static char *arg_firmware = NULL;
84 static char *arg_runtime_directory = NULL;
85 static char *arg_forward_journal = NULL;
86 static bool arg_runtime_directory_created = false;
87 static bool arg_privileged = false;
88 static char **arg_kernel_cmdline_extra = NULL;
89
90 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
91 STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
92 STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
93 STATIC_DESTRUCTOR_REGISTER(arg_qemu_smp, freep);
94 STATIC_DESTRUCTOR_REGISTER(arg_runtime_directory, freep);
95 STATIC_DESTRUCTOR_REGISTER(arg_credentials, machine_credential_context_done);
96 STATIC_DESTRUCTOR_REGISTER(arg_firmware, freep);
97 STATIC_DESTRUCTOR_REGISTER(arg_linux, freep);
98 STATIC_DESTRUCTOR_REGISTER(arg_initrds, strv_freep);
99 STATIC_DESTRUCTOR_REGISTER(arg_runtime_mounts, runtime_mount_context_done);
100 STATIC_DESTRUCTOR_REGISTER(arg_forward_journal, freep);
101 STATIC_DESTRUCTOR_REGISTER(arg_kernel_cmdline_extra, strv_freep);
102
103 static int help(void) {
104 _cleanup_free_ char *link = NULL;
105 int r;
106
107 pager_open(arg_pager_flags);
108
109 r = terminal_urlify_man("systemd-vmspawn", "1", &link);
110 if (r < 0)
111 return log_oom();
112
113 printf("%1$s [OPTIONS...] [ARGUMENTS...]\n\n"
114 "%5$sSpawn a command or OS in a virtual machine.%6$s\n\n"
115 " -h --help Show this help\n"
116 " --version Print version string\n"
117 " -q --quiet Do not show status information\n"
118 " --no-pager Do not pipe output into a pager\n"
119 "\n%3$sImage:%4$s\n"
120 " -D --directory=PATH Root directory for the container\n"
121 " -i --image=PATH Root file system disk image (or device node) for\n"
122 " the virtual machine\n"
123 "\n%3$sHost Configuration:%4$s\n"
124 " --qemu-smp=SMP Configure guest's SMP settings\n"
125 " --qemu-mem=MEM Configure guest's RAM size\n"
126 " --qemu-kvm=BOOL Configure whether to use KVM or not\n"
127 " --qemu-vsock=BOOL Configure whether to use qemu with a vsock or not\n"
128 " --vsock-cid= Specify the CID to use for the qemu guest's vsock\n"
129 " --tpm=BOOL Configure whether to use a virtual TPM or not\n"
130 " --linux=PATH Specify the linux kernel for direct kernel boot\n"
131 " --initrd=PATH Specify the initrd for direct kernel boot\n"
132 " --qemu-gui Start QEMU in graphical mode\n"
133 " -n --network-tap Create a TAP device for networking with QEMU.\n"
134 " --network-user-mode Use user mode networking with QEMU.\n"
135 " --secure-boot=BOOL Configure whether to search for firmware which\n"
136 " supports Secure Boot\n"
137 " --firmware=PATH|list Select firmware definition file (or list available)\n"
138 "\n%3$sSystem Identity:%4$s\n"
139 " -M --machine=NAME Set the machine name for the virtual machine\n"
140 "\n%3$sUser Namespacing:%4$s\n"
141 " --private-users=UIDBASE[:NUIDS]\n"
142 " Configure the UID/GID range to map into the\n"
143 " virtiofsd namespace\n"
144 "\n%3$sMounts:%4$s\n"
145 " --bind=SOURCE[:TARGET]\n"
146 " Mount a file or directory from the host into\n"
147 " the VM.\n"
148 " --bind-ro=SOURCE[:TARGET]\n"
149 " Similar, but creates a read-only mount\n"
150 "\n%3$sIntegration:%4$s\n"
151 " --forward-journal=FILE|DIR\n"
152 " Forward the virtual machine's journal entries to\n"
153 " the host.\n"
154 "\n%3$sCredentials:%4$s\n"
155 " --set-credential=ID:VALUE\n"
156 " Pass a credential with literal value to the\n"
157 " virtual machine\n"
158 " --load-credential=ID:PATH\n"
159 " Load credential to pass to the virtual machine from\n"
160 " file or AF_UNIX stream socket.\n"
161 "\nSee the %2$s for details.\n",
162 program_invocation_short_name,
163 link,
164 ansi_underline(),
165 ansi_normal(),
166 ansi_highlight(),
167 ansi_normal());
168
169 return 0;
170 }
171
172 static int parse_argv(int argc, char *argv[]) {
173 enum {
174 ARG_VERSION = 0x100,
175 ARG_NO_PAGER,
176 ARG_QEMU_SMP,
177 ARG_QEMU_MEM,
178 ARG_QEMU_KVM,
179 ARG_QEMU_VSOCK,
180 ARG_VSOCK_CID,
181 ARG_TPM,
182 ARG_LINUX,
183 ARG_INITRD,
184 ARG_QEMU_GUI,
185 ARG_NETWORK_USER_MODE,
186 ARG_BIND,
187 ARG_BIND_RO,
188 ARG_SECURE_BOOT,
189 ARG_PRIVATE_USERS,
190 ARG_FORWARD_JOURNAL,
191 ARG_SET_CREDENTIAL,
192 ARG_LOAD_CREDENTIAL,
193 ARG_FIRMWARE,
194 };
195
196 static const struct option options[] = {
197 { "help", no_argument, NULL, 'h' },
198 { "version", no_argument, NULL, ARG_VERSION },
199 { "quiet", no_argument, NULL, 'q' },
200 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
201 { "image", required_argument, NULL, 'i' },
202 { "directory", required_argument, NULL, 'D' },
203 { "machine", required_argument, NULL, 'M' },
204 { "qemu-smp", required_argument, NULL, ARG_QEMU_SMP },
205 { "qemu-mem", required_argument, NULL, ARG_QEMU_MEM },
206 { "qemu-kvm", required_argument, NULL, ARG_QEMU_KVM },
207 { "qemu-vsock", required_argument, NULL, ARG_QEMU_VSOCK },
208 { "vsock-cid", required_argument, NULL, ARG_VSOCK_CID },
209 { "tpm", required_argument, NULL, ARG_TPM },
210 { "linux", required_argument, NULL, ARG_LINUX },
211 { "initrd", required_argument, NULL, ARG_INITRD },
212 { "qemu-gui", no_argument, NULL, ARG_QEMU_GUI },
213 { "network-tap", no_argument, NULL, 'n' },
214 { "network-user-mode", no_argument, NULL, ARG_NETWORK_USER_MODE },
215 { "bind", required_argument, NULL, ARG_BIND },
216 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
217 { "secure-boot", required_argument, NULL, ARG_SECURE_BOOT },
218 { "private-users", required_argument, NULL, ARG_PRIVATE_USERS },
219 { "forward-journal", required_argument, NULL, ARG_FORWARD_JOURNAL },
220 { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
221 { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
222 { "firmware", required_argument, NULL, ARG_FIRMWARE },
223 {}
224 };
225
226 int c, r;
227
228 assert(argc >= 0);
229 assert(argv);
230
231 optind = 0;
232 while ((c = getopt_long(argc, argv, "+hD:i:M:nq", options, NULL)) >= 0)
233 switch (c) {
234 case 'h':
235 return help();
236
237 case ARG_VERSION:
238 return version();
239
240 case 'q':
241 arg_quiet = true;
242 break;
243
244 case 'D':
245 r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_directory);
246 if (r < 0)
247 return r;
248
249 arg_settings_mask |= SETTING_DIRECTORY;
250 break;
251
252 case 'i':
253 r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image);
254 if (r < 0)
255 return r;
256
257 arg_settings_mask |= SETTING_DIRECTORY;
258 break;
259
260 case 'M':
261 if (isempty(optarg))
262 arg_machine = mfree(arg_machine);
263 else {
264 if (!hostname_is_valid(optarg, 0))
265 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
266 "Invalid machine name: %s", optarg);
267
268 r = free_and_strdup(&arg_machine, optarg);
269 if (r < 0)
270 return log_oom();
271 }
272 break;
273
274 case ARG_NO_PAGER:
275 arg_pager_flags |= PAGER_DISABLE;
276 break;
277
278 case ARG_QEMU_SMP:
279 r = free_and_strdup_warn(&arg_qemu_smp, optarg);
280 if (r < 0)
281 return r;
282 break;
283
284 case ARG_QEMU_MEM:
285 r = parse_size(optarg, 1024, &arg_qemu_mem);
286 if (r < 0)
287 return log_error_errno(r, "Failed to parse --qemu-mem=%s: %m", optarg);
288 break;
289
290 case ARG_QEMU_KVM:
291 r = parse_tristate(optarg, &arg_qemu_kvm);
292 if (r < 0)
293 return log_error_errno(r, "Failed to parse --qemu-kvm=%s: %m", optarg);
294 break;
295
296 case ARG_QEMU_VSOCK:
297 r = parse_tristate(optarg, &arg_qemu_vsock);
298 if (r < 0)
299 return log_error_errno(r, "Failed to parse --qemu-vsock=%s: %m", optarg);
300 break;
301
302 case ARG_VSOCK_CID:
303 if (isempty(optarg))
304 arg_vsock_cid = VMADDR_CID_ANY;
305 else {
306 unsigned cid;
307
308 r = vsock_parse_cid(optarg, &cid);
309 if (r < 0)
310 return log_error_errno(r, "Failed to parse --vsock-cid: %s", optarg);
311 if (!VSOCK_CID_IS_REGULAR(cid))
312 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Specified CID is not regular, refusing: %u", cid);
313
314 arg_vsock_cid = cid;
315 }
316 break;
317
318 case ARG_TPM:
319 r = parse_tristate(optarg, &arg_tpm);
320 if (r < 0)
321 return log_error_errno(r, "Failed to parse --tpm=%s: %m", optarg);
322 break;
323
324 case ARG_LINUX:
325 r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_linux);
326 if (r < 0)
327 return r;
328 break;
329
330 case ARG_INITRD: {
331 _cleanup_free_ char *initrd_path = NULL;
332 r = parse_path_argument(optarg, /* suppress_root= */ false, &initrd_path);
333 if (r < 0)
334 return r;
335
336 r = strv_consume(&arg_initrds, TAKE_PTR(initrd_path));
337 if (r < 0)
338 return log_oom();
339
340 break;
341 }
342
343 case ARG_QEMU_GUI:
344 arg_qemu_gui = true;
345 break;
346
347 case 'n':
348 arg_network_stack = QEMU_NET_TAP;
349 break;
350
351 case ARG_NETWORK_USER_MODE:
352 arg_network_stack = QEMU_NET_USER;
353 break;
354
355 case ARG_BIND:
356 case ARG_BIND_RO:
357 r = runtime_mount_parse(&arg_runtime_mounts, optarg, c == ARG_BIND_RO);
358 if (r < 0)
359 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
360
361 arg_settings_mask |= SETTING_BIND_MOUNTS;
362 break;
363
364 case ARG_SECURE_BOOT:
365 r = parse_tristate(optarg, &arg_secure_boot);
366 if (r < 0)
367 return log_error_errno(r, "Failed to parse --secure-boot=%s: %m", optarg);
368 break;
369
370 case ARG_PRIVATE_USERS:
371 r = parse_userns_uid_range(optarg, &arg_uid_shift, &arg_uid_range);
372 if (r < 0)
373 return r;
374 break;
375
376 case ARG_FORWARD_JOURNAL:
377 r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_forward_journal);
378 if (r < 0)
379 return r;
380 break;
381
382 case ARG_SET_CREDENTIAL: {
383 r = machine_credential_set(&arg_credentials, optarg);
384 if (r < 0)
385 return r;
386 arg_settings_mask |= SETTING_CREDENTIALS;
387 break;
388 }
389
390 case ARG_LOAD_CREDENTIAL: {
391 r = machine_credential_load(&arg_credentials, optarg);
392 if (r < 0)
393 return r;
394
395 arg_settings_mask |= SETTING_CREDENTIALS;
396 break;
397 }
398
399 case ARG_FIRMWARE:
400 if (streq(optarg, "list")) {
401 _cleanup_strv_free_ char **l = NULL;
402
403 r = list_ovmf_config(&l);
404 if (r < 0)
405 return log_error_errno(r, "Failed to list firmwares: %m");
406
407 bool nl = false;
408 fputstrv(stdout, l, "\n", &nl);
409 if (nl)
410 putchar('\n');
411
412 return 0;
413 }
414
415 if (!isempty(optarg) && !path_is_absolute(optarg) && !startswith(optarg, "./"))
416 return log_error_errno(SYNTHETIC_ERRNO(errno), "Absolute path or path starting with './' required.");
417
418 r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_firmware);
419 if (r < 0)
420 return r;
421
422 break;
423
424 case '?':
425 return -EINVAL;
426
427 default:
428 assert_not_reached();
429 }
430
431 if (argc > optind) {
432 arg_kernel_cmdline_extra = strv_copy(argv + optind);
433 if (!arg_kernel_cmdline_extra)
434 return log_oom();
435
436 arg_settings_mask |= SETTING_START_MODE;
437 }
438
439 return 1;
440 }
441
442 static int open_vsock(void) {
443 _cleanup_close_ int vsock_fd = -EBADF;
444 int r;
445 static const union sockaddr_union bind_addr = {
446 .vm.svm_family = AF_VSOCK,
447 .vm.svm_cid = VMADDR_CID_ANY,
448 .vm.svm_port = VMADDR_PORT_ANY,
449 };
450
451 vsock_fd = socket(AF_VSOCK, SOCK_STREAM|SOCK_CLOEXEC, 0);
452 if (vsock_fd < 0)
453 return log_error_errno(errno, "Failed to open AF_VSOCK socket: %m");
454
455 r = bind(vsock_fd, &bind_addr.sa, sizeof(bind_addr.vm));
456 if (r < 0)
457 return log_error_errno(errno, "Failed to bind to vsock to address %u:%u: %m", bind_addr.vm.svm_cid, bind_addr.vm.svm_port);
458
459 r = listen(vsock_fd, SOMAXCONN_DELUXE);
460 if (r < 0)
461 return log_error_errno(errno, "Failed to listen on vsock: %m");
462
463 return TAKE_FD(vsock_fd);
464 }
465
466 static int vmspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
467 char buf[NOTIFY_BUFFER_MAX+1];
468 const char *p = NULL;
469 struct iovec iovec = {
470 .iov_base = buf,
471 .iov_len = sizeof(buf)-1,
472 };
473 struct msghdr msghdr = {
474 .msg_iov = &iovec,
475 .msg_iovlen = 1,
476 };
477 ssize_t n;
478 _cleanup_strv_free_ char **tags = NULL;
479 int r, *exit_status = ASSERT_PTR(userdata);
480
481 n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT);
482 if (ERRNO_IS_NEG_TRANSIENT(n))
483 return 0;
484 if (n == -EXFULL) {
485 log_warning_errno(n, "Got message with truncated control data, ignoring: %m");
486 return 0;
487 }
488 if (n < 0)
489 return log_warning_errno(n, "Couldn't read notification socket: %m");
490
491 if ((size_t) n >= sizeof(buf)) {
492 log_warning("Received notify message exceeded maximum size. Ignoring.");
493 return 0;
494 }
495
496 buf[n] = 0;
497 tags = strv_split(buf, "\n\r");
498 if (!tags)
499 return log_oom();
500
501 STRV_FOREACH(s, tags)
502 log_debug("Received tag %s from notify socket", *s);
503
504 if (strv_contains(tags, "READY=1")) {
505 r = sd_notify(false, "READY=1\n");
506 if (r < 0)
507 log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
508 }
509
510 p = strv_find_startswith(tags, "STATUS=");
511 if (p)
512 (void) sd_notifyf(false, "STATUS=VM running: %s", p);
513
514 p = strv_find_startswith(tags, "EXIT_STATUS=");
515 if (p) {
516 r = safe_atoi(p, exit_status);
517 if (r < 0)
518 log_warning_errno(r, "Failed to parse exit status from %s, ignoring: %m", p);
519 }
520
521 /* we will only receive one message from each connection so disable this source once one is received */
522 source = sd_event_source_disable_unref(source);
523
524 return 0;
525 }
526
527 static int vmspawn_dispatch_vsock_connections(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
528 int r;
529 sd_event *event;
530 _cleanup_close_ int conn_fd = -EBADF;
531
532 assert(userdata);
533
534 if (revents != EPOLLIN) {
535 log_warning("Got unexpected poll event for vsock fd.");
536 return 0;
537 }
538
539 conn_fd = accept4(fd, NULL, NULL, SOCK_CLOEXEC|SOCK_NONBLOCK);
540 if (conn_fd < 0) {
541 log_warning_errno(errno, "Failed to accept connection from vsock fd (%m), ignoring...");
542 return 0;
543 }
544
545 event = sd_event_source_get_event(source);
546 if (!event)
547 return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Failed to retrieve event from event source, exiting task");
548
549 /* add a new floating task to read from the connection */
550 r = sd_event_add_io(event, NULL, conn_fd, revents, vmspawn_dispatch_notify_fd, userdata);
551 if (r < 0)
552 return log_error_errno(r, "Failed to allocate notify connection event source: %m");
553
554 /* conn_fd is now owned by the event loop so don't clean it up */
555 TAKE_FD(conn_fd);
556
557 return 0;
558 }
559
560 static int setup_notify_parent(sd_event *event, int fd, int *exit_status, sd_event_source **ret_notify_event_source) {
561 int r;
562
563 assert(event);
564 assert(fd >= 0);
565 assert(exit_status);
566 assert(ret_notify_event_source);
567
568 r = sd_event_add_io(event, ret_notify_event_source, fd, EPOLLIN, vmspawn_dispatch_vsock_connections, exit_status);
569 if (r < 0)
570 return log_error_errno(r, "Failed to allocate notify socket event source: %m");
571
572 (void) sd_event_source_set_description(*ret_notify_event_source, "vmspawn-notify-sock");
573
574 return 0;
575 }
576
577 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
578 PidRef *pidref = userdata;
579 int r;
580
581 /* TODO: actually talk to qemu and ask the guest to shutdown here */
582
583 if (pidref) {
584 r = pidref_kill(pidref, SIGKILL);
585 if (r < 0)
586 log_warning_errno(r, "Failed to kill qemu, terminating: %m");
587 else {
588 log_info("Trying to halt qemu. Send SIGTERM again to trigger vmspawn to immediately terminate.");
589 sd_event_source_set_userdata(s, NULL);
590 return 0;
591 }
592 }
593
594 sd_event_exit(sd_event_source_get_event(s), 0);
595 return 0;
596 }
597
598 static int on_child_exit(sd_event_source *s, const siginfo_t *si, void *userdata) {
599 sd_event_exit(sd_event_source_get_event(s), 0);
600 return 0;
601 }
602
603 static int cmdline_add_vsock(char ***cmdline, int vsock_fd) {
604 int r;
605
606 r = strv_extend(cmdline, "-smbios");
607 if (r < 0)
608 return r;
609
610 union sockaddr_union addr;
611 socklen_t addr_len = sizeof addr.vm;
612 r = getsockname(vsock_fd, &addr.sa, &addr_len);
613 if (r < 0)
614 return -errno;
615 assert(addr_len >= sizeof addr.vm);
616 assert(addr.vm.svm_family == AF_VSOCK);
617
618 r = strv_extendf(cmdline, "type=11,value=io.systemd.credential:vmm.notify_socket=vsock-stream:%u:%u", (unsigned) VMADDR_CID_HOST, addr.vm.svm_port);
619 if (r < 0)
620 return r;
621
622 return 0;
623 }
624
625 static int start_tpm(sd_bus *bus, const char *scope, const char *tpm, const char **ret_state_tempdir) {
626 _cleanup_(rm_rf_physical_and_freep) char *state_dir = NULL;
627 _cleanup_free_ char *scope_prefix = NULL;
628 _cleanup_(socket_service_pair_done) SocketServicePair ssp = {
629 .socket_type = SOCK_STREAM,
630 };
631 int r;
632
633 assert(bus);
634 assert(scope);
635 assert(tpm);
636 assert(ret_state_tempdir);
637
638 r = unit_name_to_prefix(scope, &scope_prefix);
639 if (r < 0)
640 return log_error_errno(r, "Failed to strip .scope suffix from scope: %m");
641
642 ssp.unit_name_prefix = strjoin(scope_prefix, "-tpm");
643 if (!ssp.unit_name_prefix)
644 return log_oom();
645
646 state_dir = path_join(arg_runtime_directory, ssp.unit_name_prefix);
647 if (!state_dir)
648 return log_oom();
649
650 if (arg_runtime_directory_created) {
651 ssp.runtime_directory = path_join("systemd/vmspawn", ssp.unit_name_prefix);
652 if (!ssp.runtime_directory)
653 return log_oom();
654 }
655
656 ssp.listen_address = path_join(state_dir, "sock");
657 if (!ssp.listen_address)
658 return log_oom();
659
660 ssp.exec_start = strv_new(tpm, "socket", "--tpm2", "--tpmstate");
661 if (!ssp.exec_start)
662 return log_oom();
663
664 r = strv_extendf(&ssp.exec_start, "dir=%s", state_dir);
665 if (r < 0)
666 return log_oom();
667
668 r = strv_extend_many(&ssp.exec_start, "--ctrl", "type=unixio,fd=3");
669 if (r < 0)
670 return log_oom();
671
672 r = start_socket_service_pair(bus, scope, &ssp);
673 if (r < 0)
674 return r;
675
676 *ret_state_tempdir = TAKE_PTR(state_dir);
677
678 return 0;
679 }
680
681 static int start_systemd_journal_remote(sd_bus *bus, const char *scope, unsigned port, const char *sd_journal_remote, char **listen_address) {
682 _cleanup_free_ char *scope_prefix = NULL;
683 _cleanup_(socket_service_pair_done) SocketServicePair ssp = {
684 .socket_type = SOCK_STREAM,
685 };
686 int r;
687
688 assert(bus);
689 assert(scope);
690 assert(sd_journal_remote);
691
692 r = unit_name_to_prefix(scope, &scope_prefix);
693 if (r < 0)
694 return log_error_errno(r, "Failed to strip .scope suffix from scope: %m");
695
696 ssp.unit_name_prefix = strjoin(scope_prefix, "-forward-journal");
697 if (!ssp.unit_name_prefix)
698 return log_oom();
699
700 r = asprintf(&ssp.listen_address, "vsock:2:%u", port);
701 if (r < 0)
702 return log_oom();
703
704 ssp.exec_start = strv_new(sd_journal_remote,
705 "--output", arg_forward_journal,
706 "--split-mode", endswith(arg_forward_journal, ".journal") ? "none" : "host");
707 if (!ssp.exec_start)
708 return log_oom();
709
710 r = start_socket_service_pair(bus, scope, &ssp);
711 if (r < 0)
712 return r;
713
714 if (listen_address)
715 *listen_address = TAKE_PTR(ssp.listen_address);
716
717 return 0;
718 }
719
720 static int discover_root(char **ret) {
721 int r;
722 _cleanup_(dissected_image_unrefp) DissectedImage *image = NULL;
723 _cleanup_free_ char *root = NULL;
724
725 assert(ret);
726
727 r = dissect_image_file_and_warn(
728 arg_image,
729 /* verity= */ NULL,
730 /* mount_options= */ NULL,
731 /* image_policy= */ NULL,
732 /* flags= */ 0,
733 &image);
734 if (r < 0)
735 return r;
736
737 if (image->partitions[PARTITION_ROOT].found)
738 root = strjoin("root=PARTUUID=", SD_ID128_TO_UUID_STRING(image->partitions[PARTITION_ROOT].uuid));
739 else if (image->partitions[PARTITION_USR].found)
740 root = strjoin("mount.usr=PARTUUID=", SD_ID128_TO_UUID_STRING(image->partitions[PARTITION_USR].uuid));
741 else
742 return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Cannot perform a direct kernel boot without a root or usr partition, refusing");
743
744 if (!root)
745 return log_oom();
746
747 *ret = TAKE_PTR(root);
748 return 0;
749 }
750
751 static int find_virtiofsd(char **ret) {
752 int r;
753 _cleanup_free_ char *virtiofsd = NULL;
754
755 assert(ret);
756
757 r = find_executable("virtiofsd", &virtiofsd);
758 if (r < 0 && r != -ENOENT)
759 return log_error_errno(r, "Error while searching for virtiofsd: %m");
760
761 if (!virtiofsd) {
762 FOREACH_STRING(file, "/usr/libexec/virtiofsd", "/usr/lib/virtiofsd") {
763 if (access(file, X_OK) >= 0) {
764 virtiofsd = strdup(file);
765 if (!virtiofsd)
766 return log_oom();
767 break;
768 }
769
770 if (!IN_SET(errno, ENOENT, EACCES))
771 return log_error_errno(errno, "Error while searching for virtiofsd: %m");
772 }
773 }
774
775 if (!virtiofsd)
776 return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Failed to find virtiofsd binary.");
777
778 *ret = TAKE_PTR(virtiofsd);
779 return 0;
780 }
781
782 static int start_virtiofsd(sd_bus *bus, const char *scope, const char *directory, bool uidmap, char **ret_state_tempdir, char **ret_sock_name) {
783 _cleanup_(rm_rf_physical_and_freep) char *state_dir = NULL;
784 _cleanup_free_ char *virtiofsd = NULL, *sock_name = NULL, *scope_prefix = NULL;
785 _cleanup_(socket_service_pair_done) SocketServicePair ssp = {
786 .socket_type = SOCK_STREAM,
787 };
788 static unsigned virtiofsd_instance = 0;
789 int r;
790
791 assert(bus);
792 assert(scope);
793 assert(directory);
794 assert(ret_state_tempdir);
795 assert(ret_sock_name);
796
797 r = find_virtiofsd(&virtiofsd);
798 if (r < 0)
799 return r;
800
801 r = unit_name_to_prefix(scope, &scope_prefix);
802 if (r < 0)
803 return log_error_errno(r, "Failed to strip .scope suffix from scope: %m");
804
805 if (asprintf(&ssp.unit_name_prefix, "%s-virtiofsd-%u", scope_prefix, virtiofsd_instance++) < 0)
806 return log_oom();
807
808 state_dir = path_join(arg_runtime_directory, ssp.unit_name_prefix);
809 if (!state_dir)
810 return log_oom();
811
812 if (arg_runtime_directory_created) {
813 ssp.runtime_directory = strjoin("systemd/vmspawn/", ssp.unit_name_prefix);
814 if (!ssp.runtime_directory)
815 return log_oom();
816 }
817
818 if (asprintf(&sock_name, "sock-%"PRIx64, random_u64()) < 0)
819 return log_oom();
820
821 ssp.listen_address = path_join(state_dir, sock_name);
822 if (!ssp.listen_address)
823 return log_oom();
824
825 /* QEMU doesn't support submounts so don't announce them */
826 ssp.exec_start = strv_new(virtiofsd, "--shared-dir", directory, "--xattr", "--fd", "3", "--no-announce-submounts");
827 if (!ssp.exec_start)
828 return log_oom();
829
830 if (uidmap && arg_uid_shift != UID_INVALID) {
831 r = strv_extend(&ssp.exec_start, "--uid-map");
832 if (r < 0)
833 return log_oom();
834
835 r = strv_extendf(&ssp.exec_start, ":0:" UID_FMT ":" UID_FMT ":", arg_uid_shift, arg_uid_range);
836 if (r < 0)
837 return log_oom();
838
839 r = strv_extend(&ssp.exec_start, "--gid-map");
840 if (r < 0)
841 return log_oom();
842
843 r = strv_extendf(&ssp.exec_start, ":0:" GID_FMT ":" GID_FMT ":", arg_uid_shift, arg_uid_range);
844 if (r < 0)
845 return log_oom();
846 }
847
848 r = start_socket_service_pair(bus, scope, &ssp);
849 if (r < 0)
850 return r;
851
852 *ret_state_tempdir = TAKE_PTR(state_dir);
853 *ret_sock_name = TAKE_PTR(sock_name);
854
855 return 0;
856 }
857
858 static int kernel_cmdline_maybe_append_root(void) {
859 int r;
860 bool cmdline_contains_root = strv_find_startswith(arg_kernel_cmdline_extra, "root=")
861 || strv_find_startswith(arg_kernel_cmdline_extra, "mount.usr=");
862
863 if (!cmdline_contains_root) {
864 _cleanup_free_ char *root = NULL;
865
866 r = discover_root(&root);
867 if (r < 0)
868 return r;
869
870 log_debug("Determined root file system %s from dissected image", root);
871
872 r = strv_consume(&arg_kernel_cmdline_extra, TAKE_PTR(root));
873 if (r < 0)
874 return log_oom();
875 }
876
877 return 0;
878 }
879
880 static int discover_boot_entry(const char *root, char **ret_linux, char ***ret_initrds) {
881 _cleanup_(boot_config_free) BootConfig config = BOOT_CONFIG_NULL;
882 _cleanup_free_ char *esp_path = NULL, *xbootldr_path = NULL;
883 int r;
884
885 assert(root);
886 assert(ret_linux);
887 assert(ret_initrds);
888
889 esp_path = path_join(root, "efi");
890 if (!esp_path)
891 return log_oom();
892
893 xbootldr_path = path_join(root, "boot");
894 if (!xbootldr_path)
895 return log_oom();
896
897 r = boot_config_load(&config, esp_path, xbootldr_path);
898 if (r < 0)
899 return r;
900
901 r = boot_config_select_special_entries(&config, /* skip_efivars= */ true);
902 if (r < 0)
903 return log_error_errno(r, "Failed to find special boot config entries: %m");
904
905 const BootEntry *boot_entry = boot_config_default_entry(&config);
906
907 if (boot_entry && !IN_SET(boot_entry->type, BOOT_ENTRY_UNIFIED, BOOT_ENTRY_CONF))
908 boot_entry = NULL;
909
910 /* If we cannot determine a default entry search for UKIs (Type #2 EFI Unified Kernel Images)
911 * then .conf files (Type #1 Boot Loader Specification Entries).
912 * https://uapi-group.org/specifications/specs/boot_loader_specification */
913 if (!boot_entry)
914 FOREACH_ARRAY(entry, config.entries, config.n_entries)
915 if (entry->type == BOOT_ENTRY_UNIFIED) {
916 boot_entry = entry;
917 break;
918 }
919
920 if (!boot_entry)
921 FOREACH_ARRAY(entry, config.entries, config.n_entries)
922 if (entry->type == BOOT_ENTRY_CONF) {
923 boot_entry = entry;
924 break;
925 }
926
927 if (!boot_entry)
928 return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Failed to discover any boot entries.");
929
930 log_debug("Discovered boot entry %s (%s)", boot_entry->id, boot_entry_type_to_string(boot_entry->type));
931
932 _cleanup_free_ char *linux_kernel = NULL;
933 _cleanup_strv_free_ char **initrds = NULL;
934 if (boot_entry->type == BOOT_ENTRY_UNIFIED) {
935 linux_kernel = path_join(boot_entry->root, boot_entry->kernel);
936 if (!linux_kernel)
937 return log_oom();
938 } else if (boot_entry->type == BOOT_ENTRY_CONF) {
939 linux_kernel = path_join(boot_entry->root, boot_entry->kernel);
940 if (!linux_kernel)
941 return log_oom();
942
943 STRV_FOREACH(initrd, boot_entry->initrd) {
944 _cleanup_free_ char *initrd_path = path_join(boot_entry->root, *initrd);
945 if (!initrd_path)
946 return log_oom();
947
948 r = strv_consume(&initrds, TAKE_PTR(initrd_path));
949 if (r < 0)
950 return log_oom();
951 }
952 } else
953 assert_not_reached();
954
955 *ret_linux = TAKE_PTR(linux_kernel);
956 *ret_initrds = TAKE_PTR(initrds);
957
958 return 0;
959 }
960
961 static int merge_initrds(char **ret) {
962 _cleanup_(rm_rf_physical_and_freep) char *merged_initrd = NULL;
963 _cleanup_close_ int ofd = -EBADF;
964 int r;
965
966 assert(ret);
967
968 r = tempfn_random_child(NULL, "vmspawn-initrd-", &merged_initrd);
969 if (r < 0)
970 return log_error_errno(r, "Failed to create temporary file: %m");
971
972 ofd = open(merged_initrd, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
973 if (ofd < 0)
974 return log_error_errno(errno, "Failed to create regular file %s: %m", merged_initrd);
975
976 STRV_FOREACH(i, arg_initrds) {
977 _cleanup_close_ int ifd = -EBADF;
978 off_t off, to_seek;
979
980 off = lseek(ofd, 0, SEEK_CUR);
981 if (off < 0)
982 return log_error_errno(errno, "Failed to get file offset of %s: %m", merged_initrd);
983
984 to_seek = (4 - (off % 4)) % 4;
985
986 /* seek to assure 4 byte alignment for each initrd */
987 if (to_seek != 0 && lseek(ofd, to_seek, SEEK_CUR) < 0)
988 return log_error_errno(errno, "Failed to seek %s: %m", merged_initrd);
989
990 ifd = open(*i, O_RDONLY|O_CLOEXEC);
991 if (ifd < 0)
992 return log_error_errno(errno, "Failed to open %s: %m", *i);
993
994 r = copy_bytes(ifd, ofd, UINT64_MAX, COPY_REFLINK);
995 if (r < 0)
996 return log_error_errno(r, "Failed to copy bytes from %s to %s: %m", *i, merged_initrd);
997 }
998
999 *ret = TAKE_PTR(merged_initrd);
1000 return 0;
1001 }
1002
1003 static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) {
1004 _cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL;
1005 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
1006 _cleanup_free_ char *machine = NULL, *qemu_binary = NULL, *mem = NULL, *trans_scope = NULL, *kernel = NULL;
1007 _cleanup_close_ int notify_sock_fd = -EBADF;
1008 _cleanup_strv_free_ char **cmdline = NULL;
1009 _cleanup_free_ int *pass_fds = NULL;
1010 size_t n_pass_fds = 0;
1011 const char *accel, *shm;
1012 int r;
1013
1014 if (arg_privileged)
1015 r = sd_bus_default_system(&bus);
1016 else
1017 r = sd_bus_default_user(&bus);
1018 if (r < 0)
1019 return log_error_errno(r, "Failed to connect to systemd bus: %m");
1020
1021 r = start_transient_scope(bus, arg_machine, /* allow_pidfd= */ true, &trans_scope);
1022 if (r < 0)
1023 return r;
1024
1025 bool use_kvm = arg_qemu_kvm > 0;
1026 if (arg_qemu_kvm < 0) {
1027 r = qemu_check_kvm_support();
1028 if (r < 0)
1029 return log_error_errno(r, "Failed to check for KVM support: %m");
1030 use_kvm = r;
1031 }
1032
1033 if (arg_firmware)
1034 r = load_ovmf_config(arg_firmware, &ovmf_config);
1035 else
1036 r = find_ovmf_config(arg_secure_boot, &ovmf_config);
1037 if (r < 0)
1038 return log_error_errno(r, "Failed to find OVMF config: %m");
1039
1040 /* only warn if the user hasn't disabled secureboot */
1041 if (!ovmf_config->supports_sb && arg_secure_boot)
1042 log_warning("Couldn't find OVMF firmware blob with Secure Boot support, "
1043 "falling back to OVMF firmware blobs without Secure Boot support.");
1044
1045 shm = arg_directory ? ",memory-backend=mem" : "";
1046 if (ARCHITECTURE_SUPPORTS_SMM)
1047 machine = strjoin("type=" QEMU_MACHINE_TYPE ",smm=", on_off(ovmf_config->supports_sb), shm);
1048 else
1049 machine = strjoin("type=" QEMU_MACHINE_TYPE, shm);
1050 if (!machine)
1051 return log_oom();
1052
1053 if (arg_linux) {
1054 kernel = strdup(arg_linux);
1055 if (!kernel)
1056 return log_oom();
1057 } else if (arg_directory) {
1058 /* a kernel is required for directory type images so attempt to locate a UKI under /boot and /efi */
1059 r = discover_boot_entry(arg_directory, &kernel, &arg_initrds);
1060 if (r < 0)
1061 return log_error_errno(r, "Failed to locate UKI in directory type image, please specify one with --linux=.");
1062
1063 log_debug("Discovered UKI image at %s", kernel);
1064 }
1065
1066 r = find_qemu_binary(&qemu_binary);
1067 if (r == -EOPNOTSUPP)
1068 return log_error_errno(r, "Native architecture is not supported by qemu.");
1069 if (r < 0)
1070 return log_error_errno(r, "Failed to find QEMU binary: %m");
1071
1072 if (asprintf(&mem, "%" PRIu64 "M", DIV_ROUND_UP(arg_qemu_mem, U64_MB)) < 0)
1073 return log_oom();
1074
1075 cmdline = strv_new(
1076 qemu_binary,
1077 "-machine", machine,
1078 "-smp", arg_qemu_smp ?: "1",
1079 "-m", mem,
1080 "-object", "rng-random,filename=/dev/urandom,id=rng0",
1081 "-device", "virtio-rng-pci,rng=rng0,id=rng-device0"
1082 );
1083 if (!cmdline)
1084 return log_oom();
1085
1086 /* if we are going to be starting any units with state then create our runtime dir */
1087 if (arg_tpm != 0 || arg_directory || arg_runtime_mounts.n_mounts != 0) {
1088 r = runtime_directory(&arg_runtime_directory, arg_privileged ? RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER, "systemd/vmspawn");
1089 if (r < 0)
1090 return log_error_errno(r, "Failed to lookup runtime directory: %m");
1091 if (r) {
1092 /* r > 0 means we need to create our own runtime dir */
1093 r = mkdir_p(arg_runtime_directory, 0755);
1094 if (r < 0)
1095 return log_error_errno(r, "Failed to create runtime directory: %m");
1096 arg_runtime_directory_created = true;
1097 }
1098 }
1099
1100 if (arg_network_stack == QEMU_NET_TAP)
1101 r = strv_extend_many(&cmdline, "-nic", "tap,script=no,model=virtio-net-pci");
1102 else if (arg_network_stack == QEMU_NET_USER)
1103 r = strv_extend_many(&cmdline, "-nic", "user,model=virtio-net-pci");
1104 else
1105 r = strv_extend_many(&cmdline, "-nic", "none");
1106 if (r < 0)
1107 return log_oom();
1108
1109 /* A shared memory backend might increase ram usage so only add one if actually necessary for virtiofsd. */
1110 if (arg_directory || arg_runtime_mounts.n_mounts != 0) {
1111 r = strv_extend(&cmdline, "-object");
1112 if (r < 0)
1113 return log_oom();
1114
1115 r = strv_extendf(&cmdline, "memory-backend-memfd,id=mem,size=%s,share=on", mem);
1116 if (r < 0)
1117 return log_oom();
1118 }
1119
1120 bool use_vsock = arg_qemu_vsock > 0 && ARCHITECTURE_SUPPORTS_SMBIOS;
1121 if (arg_qemu_vsock < 0) {
1122 r = qemu_check_vsock_support();
1123 if (r < 0)
1124 return log_error_errno(r, "Failed to check for VSock support: %m");
1125
1126 use_vsock = r;
1127 }
1128
1129 if (!use_kvm && kvm_device_fd >= 0) {
1130 log_warning("KVM is disabled but fd for /dev/kvm was passed, closing fd and ignoring");
1131 kvm_device_fd = safe_close(kvm_device_fd);
1132 }
1133
1134 if (use_kvm && kvm_device_fd >= 0) {
1135 /* /dev/fdset/1 is magic string to tell qemu where to find the fd for /dev/kvm
1136 * we use this so that we can take a fd to /dev/kvm and then give qemu that fd */
1137 accel = "kvm,device=/dev/fdset/1";
1138
1139 r = strv_extend(&cmdline, "--add-fd");
1140 if (r < 0)
1141 return log_oom();
1142
1143 r = strv_extendf(&cmdline, "fd=%d,set=1,opaque=/dev/kvm", kvm_device_fd);
1144 if (r < 0)
1145 return log_oom();
1146
1147 if (!GREEDY_REALLOC(pass_fds, n_pass_fds + 1))
1148 return log_oom();
1149
1150 pass_fds[n_pass_fds++] = kvm_device_fd;
1151 } else if (use_kvm)
1152 accel = "kvm";
1153 else
1154 accel = "tcg";
1155
1156 r = strv_extend_many(&cmdline, "-accel", accel);
1157 if (r < 0)
1158 return log_oom();
1159
1160 _cleanup_close_ int child_vsock_fd = -EBADF;
1161 unsigned child_cid = arg_vsock_cid;
1162 if (use_vsock) {
1163 int device_fd = vhost_device_fd;
1164
1165 if (device_fd < 0) {
1166 child_vsock_fd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC);
1167 if (child_vsock_fd < 0)
1168 return log_error_errno(errno, "Failed to open /dev/vhost-vsock as read/write: %m");
1169
1170 device_fd = child_vsock_fd;
1171 }
1172
1173 r = vsock_fix_child_cid(device_fd, &child_cid, arg_machine);
1174 if (r < 0)
1175 return log_error_errno(r, "Failed to fix CID for the guest vsock socket: %m");
1176
1177 r = strv_extend(&cmdline, "-device");
1178 if (r < 0)
1179 return log_oom();
1180
1181 r = strv_extendf(&cmdline, "vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, device_fd);
1182 if (r < 0)
1183 return log_oom();
1184
1185 if (!GREEDY_REALLOC(pass_fds, n_pass_fds + 1))
1186 return log_oom();
1187
1188 pass_fds[n_pass_fds++] = device_fd;
1189 }
1190
1191 r = strv_extend_many(&cmdline, "-cpu", "max");
1192 if (r < 0)
1193 return log_oom();
1194
1195 if (arg_qemu_gui)
1196 r = strv_extend_many(
1197 &cmdline,
1198 "-vga",
1199 "virtio");
1200 else
1201 r = strv_extend_many(
1202 &cmdline,
1203 "-nographic",
1204 "-nodefaults",
1205 "-chardev", "stdio,mux=on,id=console,signal=off",
1206 "-serial", "chardev:console",
1207 "-mon", "console");
1208 if (r < 0)
1209 return log_oom();
1210
1211 r = strv_extend(&cmdline, "-drive");
1212 if (r < 0)
1213 return log_oom();
1214
1215 r = strv_extendf(&cmdline, "if=pflash,format=%s,readonly=on,file=%s", ovmf_config_format(ovmf_config), ovmf_config->path);
1216 if (r < 0)
1217 return log_oom();
1218
1219 _cleanup_(unlink_and_freep) char *ovmf_vars_to = NULL;
1220 if (ovmf_config->supports_sb) {
1221 const char *ovmf_vars_from = ovmf_config->vars;
1222 _cleanup_close_ int source_fd = -EBADF, target_fd = -EBADF;
1223
1224 r = tempfn_random_child(NULL, "vmspawn-", &ovmf_vars_to);
1225 if (r < 0)
1226 return r;
1227
1228 source_fd = open(ovmf_vars_from, O_RDONLY|O_CLOEXEC);
1229 if (source_fd < 0)
1230 return log_error_errno(source_fd, "Failed to open OVMF vars file %s: %m", ovmf_vars_from);
1231
1232 target_fd = open(ovmf_vars_to, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
1233 if (target_fd < 0)
1234 return log_error_errno(errno, "Failed to create regular file for OVMF vars at %s: %m", ovmf_vars_to);
1235
1236 r = copy_bytes(source_fd, target_fd, UINT64_MAX, COPY_REFLINK);
1237 if (r < 0)
1238 return log_error_errno(r, "Failed to copy bytes from %s to %s: %m", ovmf_vars_from, ovmf_vars_to);
1239
1240 /* These aren't always available so don't raise an error if they fail */
1241 (void) copy_xattr(source_fd, NULL, target_fd, NULL, 0);
1242 (void) copy_access(source_fd, target_fd);
1243 (void) copy_times(source_fd, target_fd, 0);
1244
1245 r = strv_extend_many(
1246 &cmdline,
1247 "-global", "ICH9-LPC.disable_s3=1",
1248 "-global", "driver=cfi.pflash01,property=secure,value=on",
1249 "-drive");
1250 if (r < 0)
1251 return log_oom();
1252
1253 r = strv_extendf(&cmdline, "file=%s,if=pflash,format=%s", ovmf_vars_to, ovmf_config_format(ovmf_config));
1254 if (r < 0)
1255 return log_oom();
1256 }
1257
1258 if (kernel) {
1259 r = strv_extend_many(&cmdline, "-kernel", kernel);
1260 if (r < 0)
1261 return log_oom();
1262
1263 /* We can't rely on gpt-auto-generator when direct kernel booting so synthesize a root=
1264 * kernel argument instead. */
1265 if (arg_image) {
1266 r = kernel_cmdline_maybe_append_root();
1267 if (r < 0)
1268 return r;
1269 }
1270 }
1271
1272 if (arg_image) {
1273 assert(!arg_directory);
1274
1275 r = strv_extend(&cmdline, "-drive");
1276 if (r < 0)
1277 return log_oom();
1278
1279 r = strv_extendf(&cmdline, "if=none,id=mkosi,file=%s,format=raw", arg_image);
1280 if (r < 0)
1281 return log_oom();
1282
1283 r = strv_extend_many(&cmdline,
1284 "-device", "virtio-scsi-pci,id=scsi",
1285 "-device", "scsi-hd,drive=mkosi,bootindex=1");
1286 if (r < 0)
1287 return log_oom();
1288 }
1289
1290 if (arg_directory) {
1291 _cleanup_free_ char *sock_path = NULL, *sock_name = NULL;
1292 r = start_virtiofsd(bus, trans_scope, arg_directory, /* uidmap= */ true, &sock_path, &sock_name);
1293 if (r < 0)
1294 return r;
1295
1296 r = strv_extend(&cmdline, "-chardev");
1297 if (r < 0)
1298 return log_oom();
1299
1300 r = strv_extendf(&cmdline, "socket,id=%1$s,path=%2$s/%1$s", sock_name, sock_path);
1301 if (r < 0)
1302 return log_oom();
1303
1304 r = strv_extend(&cmdline, "-device");
1305 if (r < 0)
1306 return log_oom();
1307
1308 r = strv_extendf(&cmdline, "vhost-user-fs-pci,queue-size=1024,chardev=%s,tag=root", sock_name);
1309 if (r < 0)
1310 return log_oom();
1311
1312 r = strv_extend(&arg_kernel_cmdline_extra, "root=root rootfstype=virtiofs rw");
1313 if (r < 0)
1314 return log_oom();
1315 }
1316
1317 r = strv_prepend(&arg_kernel_cmdline_extra, "console=" DEFAULT_SERIAL_TTY);
1318 if (r < 0)
1319 return log_oom();
1320
1321 FOREACH_ARRAY(mount, arg_runtime_mounts.mounts, arg_runtime_mounts.n_mounts) {
1322 _cleanup_free_ char *sock_path = NULL, *sock_name = NULL, *clean_target = NULL;
1323 r = start_virtiofsd(bus, trans_scope, mount->source, /* uidmap= */ false, &sock_path, &sock_name);
1324 if (r < 0)
1325 return r;
1326
1327 r = strv_extend(&cmdline, "-chardev");
1328 if (r < 0)
1329 return log_oom();
1330
1331 r = strv_extendf(&cmdline, "socket,id=%1$s,path=%2$s/%1$s", sock_name, sock_path);
1332 if (r < 0)
1333 return log_oom();
1334
1335 r = strv_extend(&cmdline, "-device");
1336 if (r < 0)
1337 return log_oom();
1338
1339 r = strv_extendf(&cmdline, "vhost-user-fs-pci,queue-size=1024,chardev=%1$s,tag=%1$s", sock_name);
1340 if (r < 0)
1341 return log_oom();
1342
1343 clean_target = xescape(mount->target, "\":");
1344 if (!clean_target)
1345 return log_oom();
1346
1347 r = strv_extendf(&arg_kernel_cmdline_extra, "systemd.mount-extra=\"%s:%s:virtiofs:%s\"",
1348 sock_name, clean_target, mount->read_only ? "ro" : "rw");
1349 if (r < 0)
1350 return log_oom();
1351 }
1352
1353 if (ARCHITECTURE_SUPPORTS_SMBIOS) {
1354 _cleanup_free_ char *kcl = strv_join(arg_kernel_cmdline_extra, " ");
1355 if (!kcl)
1356 return log_oom();
1357
1358 if (kernel) {
1359 r = strv_extend_many(&cmdline, "-append", kcl);
1360 if (r < 0)
1361 return log_oom();
1362 } else {
1363 if (ARCHITECTURE_SUPPORTS_SMBIOS) {
1364 r = strv_extend(&cmdline, "-smbios");
1365 if (r < 0)
1366 return log_oom();
1367
1368 r = strv_extendf(&cmdline, "type=11,value=io.systemd.stub.kernel-cmdline-extra=%s", kcl);
1369 if (r < 0)
1370 return log_oom();
1371 } else
1372 log_warning("Cannot append extra args to kernel cmdline, native architecture doesn't support SMBIOS, ignoring");
1373 }
1374 } else
1375 log_warning("Cannot append extra args to kernel cmdline, native architecture doesn't support SMBIOS");
1376
1377 /* disable TPM autodetection if the user's hardware doesn't support it */
1378 if (!ARCHITECTURE_SUPPORTS_TPM) {
1379 if (arg_tpm < 0) {
1380 arg_tpm = 0;
1381 log_debug("TPM not support on %s, disabling tpm autodetection and continuing", architecture_to_string(native_architecture()));
1382 } else if (arg_tpm > 0)
1383 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM not supported on %s, refusing", architecture_to_string(native_architecture()));
1384 }
1385
1386 _cleanup_free_ char *swtpm = NULL;
1387 if (arg_tpm != 0) {
1388 r = find_executable("swtpm", &swtpm);
1389 if (r < 0) {
1390 /* log if the user asked for swtpm and we cannot find it */
1391 if (arg_tpm > 0)
1392 return log_error_errno(r, "Failed to find swtpm binary: %m");
1393 /* also log if we got an error other than ENOENT from find_executable */
1394 if (r != -ENOENT && arg_tpm < 0)
1395 return log_error_errno(r, "Error detecting swtpm: %m");
1396 }
1397 }
1398
1399 _cleanup_free_ const char *tpm_state_tempdir = NULL;
1400 if (swtpm) {
1401 r = start_tpm(bus, trans_scope, swtpm, &tpm_state_tempdir);
1402 if (r < 0) {
1403 /* only bail if the user asked for a tpm */
1404 if (arg_tpm > 0)
1405 return log_error_errno(r, "Failed to start tpm: %m");
1406 log_debug_errno(r, "Failed to start tpm, ignoring: %m");
1407 }
1408
1409 r = strv_extend(&cmdline, "-chardev");
1410 if (r < 0)
1411 return log_oom();
1412
1413 r = strv_extendf(&cmdline, "socket,id=chrtpm,path=%s/sock", tpm_state_tempdir);
1414 if (r < 0)
1415 return log_oom();
1416
1417 r = strv_extend_many(&cmdline, "-tpmdev", "emulator,id=tpm0,chardev=chrtpm");
1418 if (r < 0)
1419 return log_oom();
1420
1421 if (native_architecture() == ARCHITECTURE_X86_64)
1422 r = strv_extend_many(&cmdline, "-device", "tpm-tis,tpmdev=tpm0");
1423 else if (IN_SET(native_architecture(), ARCHITECTURE_ARM64, ARCHITECTURE_ARM64_BE))
1424 r = strv_extend_many(&cmdline, "-device", "tpm-tis-device,tpmdev=tpm0");
1425 if (r < 0)
1426 return log_oom();
1427 }
1428
1429 char *initrd = NULL;
1430 _cleanup_(rm_rf_physical_and_freep) char *merged_initrd = NULL;
1431 size_t n_initrds = strv_length(arg_initrds);
1432
1433 if (n_initrds == 1)
1434 initrd = arg_initrds[0];
1435 else if (n_initrds > 1) {
1436 r = merge_initrds(&merged_initrd);
1437 if (r < 0)
1438 return r;
1439
1440 initrd = merged_initrd;
1441 }
1442
1443 if (initrd) {
1444 r = strv_extend_many(&cmdline, "-initrd", initrd);
1445 if (r < 0)
1446 return log_oom();
1447 }
1448
1449 if (arg_forward_journal) {
1450 _cleanup_free_ char *sd_journal_remote = NULL, *listen_address = NULL, *cred = NULL;
1451 r = find_executable("systemd-journal-remote", &sd_journal_remote);
1452 if (r < 0)
1453 return log_error_errno(r, "Failed to find systemd-journal-remote binary: %m");
1454
1455 r = start_systemd_journal_remote(bus, trans_scope, child_cid, sd_journal_remote, &listen_address);
1456 if (r < 0)
1457 return r;
1458
1459 cred = strjoin("journal.forward_to_socket:", listen_address);
1460 if (!cred)
1461 return log_oom();
1462
1463 r = machine_credential_set(&arg_credentials, cred);
1464 if (r < 0)
1465 return r;
1466 }
1467
1468 if (ARCHITECTURE_SUPPORTS_SMBIOS)
1469 FOREACH_ARRAY(cred, arg_credentials.credentials, arg_credentials.n_credentials) {
1470 _cleanup_free_ char *cred_data_b64 = NULL;
1471 ssize_t n;
1472
1473 n = base64mem(cred->data, cred->size, &cred_data_b64);
1474 if (n < 0)
1475 return log_oom();
1476
1477 r = strv_extend(&cmdline, "-smbios");
1478 if (r < 0)
1479 return log_oom();
1480
1481 r = strv_extendf(&cmdline, "type=11,value=io.systemd.credential.binary:%s=%s", cred->id, cred_data_b64);
1482 if (r < 0)
1483 return log_oom();
1484 }
1485
1486 if (use_vsock) {
1487 notify_sock_fd = open_vsock();
1488 if (notify_sock_fd < 0)
1489 return log_error_errno(notify_sock_fd, "Failed to open vsock: %m");
1490
1491 r = cmdline_add_vsock(&cmdline, notify_sock_fd);
1492 if (r == -ENOMEM)
1493 return log_oom();
1494 if (r < 0)
1495 return log_error_errno(r, "Failed to call getsockname on vsock: %m");
1496 }
1497
1498 if (DEBUG_LOGGING) {
1499 _cleanup_free_ char *joined = quote_command_line(cmdline, SHELL_ESCAPE_EMPTY);
1500 if (!joined)
1501 return log_oom();
1502
1503 log_debug("Executing: %s", joined);
1504 }
1505
1506 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0);
1507
1508 _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
1509 _cleanup_(sd_event_unrefp) sd_event *event = NULL;
1510 r = sd_event_new(&event);
1511 if (r < 0)
1512 return log_error_errno(r, "Failed to get default event source: %m");
1513
1514 (void) sd_event_set_watchdog(event, true);
1515
1516 _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL;
1517
1518 r = pidref_safe_fork_full(
1519 qemu_binary,
1520 /* stdio_fds= */ NULL,
1521 &child_vsock_fd, 1, /* pass the vsock fd to qemu */
1522 FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_CLOEXEC_OFF|FORK_RLIMIT_NOFILE_SAFE,
1523 &child_pidref);
1524 if (r < 0)
1525 return r;
1526 if (r == 0) {
1527 /* set TERM and LANG if they are missing */
1528 if (setenv("TERM", "vt220", 0) < 0)
1529 return log_oom();
1530
1531 if (setenv("LANG", "C.UTF-8", 0) < 0)
1532 return log_oom();
1533
1534 execv(qemu_binary, cmdline);
1535 log_error_errno(errno, "Failed to execve %s: %m", qemu_binary);
1536 _exit(EXIT_FAILURE);
1537 }
1538
1539 /* Close the vsock fd we passed to qemu in the parent. We don't need it anymore. */
1540 child_vsock_fd = safe_close(child_vsock_fd);
1541
1542 int exit_status = INT_MAX;
1543 if (use_vsock) {
1544 r = setup_notify_parent(event, notify_sock_fd, &exit_status, &notify_event_source);
1545 if (r < 0)
1546 return log_error_errno(r, "Failed to setup event loop to handle vsock notify events: %m");
1547 }
1548
1549 /* shutdown qemu when we are shutdown */
1550 (void) sd_event_add_signal(event, NULL, SIGINT | SD_EVENT_SIGNAL_PROCMASK, on_orderly_shutdown, &child_pidref);
1551 (void) sd_event_add_signal(event, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, on_orderly_shutdown, &child_pidref);
1552
1553 (void) sd_event_add_signal(event, NULL, (SIGRTMIN+18) | SD_EVENT_SIGNAL_PROCMASK, sigrtmin18_handler, NULL);
1554
1555 /* Exit when the child exits */
1556 (void) event_add_child_pidref(event, NULL, &child_pidref, WEXITED, on_child_exit, NULL);
1557
1558 r = sd_event_loop(event);
1559 if (r < 0)
1560 return log_error_errno(r, "Failed to run event loop: %m");
1561
1562 if (use_vsock) {
1563 if (exit_status == INT_MAX) {
1564 log_debug("Couldn't retrieve inner EXIT_STATUS from vsock");
1565 return EXIT_SUCCESS;
1566 }
1567 if (exit_status != 0)
1568 log_warning("Non-zero exit code received: %d", exit_status);
1569 return exit_status;
1570 }
1571
1572 return 0;
1573 }
1574
1575 static int determine_names(void) {
1576 int r;
1577
1578 if (!arg_directory && !arg_image) {
1579 if (arg_machine) {
1580 _cleanup_(image_unrefp) Image *i = NULL;
1581
1582 r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
1583 if (r == -ENOENT)
1584 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
1585 if (r < 0)
1586 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
1587
1588 if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
1589 r = free_and_strdup(&arg_image, i->path);
1590 else if (IN_SET(i->type, IMAGE_DIRECTORY, IMAGE_SUBVOLUME))
1591 r = free_and_strdup(&arg_directory, i->path);
1592 else
1593 assert_not_reached();
1594 if (r < 0)
1595 return log_oom();
1596 } else {
1597 r = safe_getcwd(&arg_directory);
1598 if (r < 0)
1599 return log_error_errno(r, "Failed to determine current directory: %m");
1600 }
1601 }
1602
1603 if (!arg_machine) {
1604 if (arg_directory && path_equal(arg_directory, "/")) {
1605 arg_machine = gethostname_malloc();
1606 if (!arg_machine)
1607 return log_oom();
1608 } else if (arg_image) {
1609 char *e;
1610
1611 r = path_extract_filename(arg_image, &arg_machine);
1612 if (r < 0)
1613 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
1614
1615 /* Truncate suffix if there is one */
1616 e = endswith(arg_machine, ".raw");
1617 if (e)
1618 *e = 0;
1619 } else {
1620 r = path_extract_filename(arg_directory, &arg_machine);
1621 if (r < 0)
1622 return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory);
1623 }
1624
1625 hostname_cleanup(arg_machine);
1626 if (!hostname_is_valid(arg_machine, 0))
1627 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
1628 }
1629
1630 return 0;
1631 }
1632
1633 static int verify_arguments(void) {
1634 if (arg_network_stack == QEMU_NET_TAP && !arg_privileged)
1635 return log_error_errno(SYNTHETIC_ERRNO(EPERM), "--network-tap requires root privileges, refusing.");
1636
1637 if (!strv_isempty(arg_initrds) && !arg_linux)
1638 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Option --initrd= cannot be used without --linux=.");
1639
1640 return 0;
1641 }
1642
1643 static int run(int argc, char *argv[]) {
1644 int r, kvm_device_fd = -EBADF, vhost_device_fd = -EBADF;
1645 _cleanup_strv_free_ char **names = NULL;
1646
1647 log_setup();
1648
1649 arg_privileged = getuid() == 0;
1650
1651 r = parse_argv(argc, argv);
1652 if (r <= 0)
1653 return r;
1654
1655 r = determine_names();
1656 if (r < 0)
1657 return r;
1658
1659 r = verify_arguments();
1660 if (r < 0)
1661 return r;
1662
1663 if (!arg_quiet) {
1664 _cleanup_free_ char *u = NULL;
1665 const char *vm_path = arg_image ?: arg_directory;
1666 (void) terminal_urlify_path(vm_path, vm_path, &u);
1667
1668 log_info("%s %sSpawning VM %s on %s.%s\n"
1669 "%s %sPress %sCtrl-a x%s to kill VM.%s",
1670 special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), arg_machine, u ?: vm_path, ansi_normal(),
1671 special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), ansi_highlight(), ansi_grey(), ansi_normal());
1672 }
1673
1674 r = sd_listen_fds_with_names(true, &names);
1675 if (r < 0)
1676 return log_error_errno(r, "Failed to get passed file descriptors: %m");
1677
1678 for (int i = 0; i < r; i++) {
1679 int fd = SD_LISTEN_FDS_START + i;
1680 if (streq(names[i], "kvm"))
1681 kvm_device_fd = fd;
1682 else if (streq(names[i], "vhost-vsock"))
1683 vhost_device_fd = fd;
1684 else {
1685 log_notice("Couldn't recognize passed fd %d (%s), closing fd and ignoring...", fd, names[i]);
1686 safe_close(fd);
1687 }
1688 }
1689
1690 return run_virtual_machine(kvm_device_fd, vhost_device_fd);
1691 }
1692
/* NOTE(review): presumably expands to the program's main(), dispatching to run() above; check main-func.h to
 * confirm how run()'s return value is mapped to the process exit status. */
DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);