]> git.ipfire.org Git - thirdparty/systemd.git/blame_incremental - src/basic/virt.c
Reapply "network: add "mac" to alternatives name policy by default"
[thirdparty/systemd.git] / src / basic / virt.c
... / ...
CommitLineData
1/* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3#if defined(__i386__) || defined(__x86_64__)
4#include <cpuid.h>
5#endif
6#include <errno.h>
7#include <stdint.h>
8#include <stdlib.h>
9#include <unistd.h>
10
11#include "alloc-util.h"
12#include "cgroup-util.h"
13#include "dirent-util.h"
14#include "env-util.h"
15#include "errno-util.h"
16#include "fd-util.h"
17#include "fileio.h"
18#include "macro.h"
19#include "missing_threads.h"
20#include "process-util.h"
21#include "stat-util.h"
22#include "string-table.h"
23#include "string-util.h"
24#include "uid-range.h"
25#include "virt.h"
26
/* Tri-state outcome of inspecting the "virtual machine" bit of the SMBIOS BIOS characteristics. */
enum {
        SMBIOS_VM_BIT_SET     = 0, /* the bit could be read and is set */
        SMBIOS_VM_BIT_UNSET   = 1, /* the bit could be read and is clear */
        SMBIOS_VM_BIT_UNKNOWN = 2, /* the table could not be read or was too short */
};
32
33static Virtualization detect_vm_cpuid(void) {
34
35 /* CPUID is an x86 specific interface. */
36#if defined(__i386__) || defined(__x86_64__)
37
38 static const struct {
39 const char sig[13];
40 Virtualization id;
41 } vm_table[] = {
42 { "XenVMMXenVMM", VIRTUALIZATION_XEN },
43 { "KVMKVMKVM", VIRTUALIZATION_KVM }, /* qemu with KVM */
44 { "Linux KVM Hv", VIRTUALIZATION_KVM }, /* qemu with KVM + HyperV Enlightenments */
45 { "TCGTCGTCGTCG", VIRTUALIZATION_QEMU }, /* qemu without KVM */
46 /* http://kb.vmware.com/selfservice/microsites/search.do?language=en_US&cmd=displayKC&externalId=1009458 */
47 { "VMwareVMware", VIRTUALIZATION_VMWARE },
48 /* https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs */
49 { "Microsoft Hv", VIRTUALIZATION_MICROSOFT },
50 /* https://wiki.freebsd.org/bhyve */
51 { "bhyve bhyve ", VIRTUALIZATION_BHYVE },
52 { "QNXQVMBSQG", VIRTUALIZATION_QNX },
53 /* https://projectacrn.org */
54 { "ACRNACRNACRN", VIRTUALIZATION_ACRN },
55 /* https://www.lockheedmartin.com/en-us/products/Hardened-Security-for-Intel-Processors.html */
56 { "SRESRESRESRE", VIRTUALIZATION_SRE },
57 { "Apple VZ", VIRTUALIZATION_APPLE },
58 };
59
60 uint32_t eax, ebx, ecx, edx;
61 bool hypervisor;
62
63 /* http://lwn.net/Articles/301888/ */
64
65 /* First detect whether there is a hypervisor */
66 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0)
67 return VIRTUALIZATION_NONE;
68
69 hypervisor = ecx & 0x80000000U;
70
71 if (hypervisor) {
72 union {
73 uint32_t sig32[3];
74 char text[13];
75 } sig = {};
76
77 /* There is a hypervisor, see what it is */
78 __cpuid(0x40000000U, eax, ebx, ecx, edx);
79
80 sig.sig32[0] = ebx;
81 sig.sig32[1] = ecx;
82 sig.sig32[2] = edx;
83
84 log_debug("Virtualization found, CPUID=%s", sig.text);
85
86 for (size_t i = 0; i < ELEMENTSOF(vm_table); i++)
87 if (memcmp_nn(sig.text, sizeof(sig.text),
88 vm_table[i].sig, sizeof(vm_table[i].sig)) == 0)
89 return vm_table[i].id;
90
91 log_debug("Unknown virtualization with CPUID=%s. Add to vm_table[]?", sig.text);
92 return VIRTUALIZATION_VM_OTHER;
93 }
94#endif
95 log_debug("No virtualization found in CPUID");
96
97 return VIRTUALIZATION_NONE;
98}
99
100static Virtualization detect_vm_device_tree(void) {
101#if defined(__arm__) || defined(__aarch64__) || defined(__powerpc__) || defined(__powerpc64__) || defined(__riscv)
102 _cleanup_free_ char *hvtype = NULL;
103 int r;
104
105 r = read_one_line_file("/proc/device-tree/hypervisor/compatible", &hvtype);
106 if (r == -ENOENT) {
107 _cleanup_closedir_ DIR *dir = NULL;
108 _cleanup_free_ char *compat = NULL;
109
110 if (access("/proc/device-tree/ibm,partition-name", F_OK) == 0 &&
111 access("/proc/device-tree/hmc-managed?", F_OK) == 0 &&
112 access("/proc/device-tree/chosen/qemu,graphic-width", F_OK) != 0)
113 return VIRTUALIZATION_POWERVM;
114
115 dir = opendir("/proc/device-tree");
116 if (!dir) {
117 if (errno == ENOENT) {
118 log_debug_errno(errno, "/proc/device-tree: %m");
119 return VIRTUALIZATION_NONE;
120 }
121 return -errno;
122 }
123
124 FOREACH_DIRENT(de, dir, return -errno)
125 if (strstr(de->d_name, "fw-cfg")) {
126 log_debug("Virtualization QEMU: \"fw-cfg\" present in /proc/device-tree/%s", de->d_name);
127 return VIRTUALIZATION_QEMU;
128 }
129
130 r = read_one_line_file("/proc/device-tree/compatible", &compat);
131 if (r < 0 && r != -ENOENT)
132 return r;
133 if (r >= 0 && streq(compat, "qemu,pseries")) {
134 log_debug("Virtualization %s found in /proc/device-tree/compatible", compat);
135 return VIRTUALIZATION_QEMU;
136 }
137
138 log_debug("No virtualization found in /proc/device-tree/*");
139 return VIRTUALIZATION_NONE;
140 } else if (r < 0)
141 return r;
142
143 log_debug("Virtualization %s found in /proc/device-tree/hypervisor/compatible", hvtype);
144 if (streq(hvtype, "linux,kvm"))
145 return VIRTUALIZATION_KVM;
146 else if (strstr(hvtype, "xen"))
147 return VIRTUALIZATION_XEN;
148 else if (strstr(hvtype, "vmware"))
149 return VIRTUALIZATION_VMWARE;
150 else
151 return VIRTUALIZATION_VM_OTHER;
152#else
153 log_debug("This platform does not support /proc/device-tree");
154 return VIRTUALIZATION_NONE;
155#endif
156}
157
158#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || defined(__riscv)
159static Virtualization detect_vm_dmi_vendor(void) {
160 static const char* const dmi_vendors[] = {
161 "/sys/class/dmi/id/product_name", /* Test this before sys_vendor to detect KVM over QEMU */
162 "/sys/class/dmi/id/sys_vendor",
163 "/sys/class/dmi/id/board_vendor",
164 "/sys/class/dmi/id/bios_vendor",
165 "/sys/class/dmi/id/product_version", /* For Hyper-V VMs test */
166 NULL
167 };
168
169 static const struct {
170 const char *vendor;
171 Virtualization id;
172 } dmi_vendor_table[] = {
173 { "KVM", VIRTUALIZATION_KVM },
174 { "OpenStack", VIRTUALIZATION_KVM }, /* Detect OpenStack instance as KVM in non x86 architecture */
175 { "KubeVirt", VIRTUALIZATION_KVM }, /* Detect KubeVirt instance as KVM in non x86 architecture */
176 { "Amazon EC2", VIRTUALIZATION_AMAZON },
177 { "QEMU", VIRTUALIZATION_QEMU },
178 { "VMware", VIRTUALIZATION_VMWARE }, /* https://kb.vmware.com/s/article/1009458 */
179 { "VMW", VIRTUALIZATION_VMWARE },
180 { "innotek GmbH", VIRTUALIZATION_ORACLE },
181 { "VirtualBox", VIRTUALIZATION_ORACLE },
182 { "Oracle Corporation", VIRTUALIZATION_ORACLE }, /* Detect VirtualBox on some proprietary systems via the board_vendor */
183 { "Xen", VIRTUALIZATION_XEN },
184 { "Bochs", VIRTUALIZATION_BOCHS },
185 { "Parallels", VIRTUALIZATION_PARALLELS },
186 /* https://wiki.freebsd.org/bhyve */
187 { "BHYVE", VIRTUALIZATION_BHYVE },
188 { "Hyper-V", VIRTUALIZATION_MICROSOFT },
189 { "Apple Virtualization", VIRTUALIZATION_APPLE },
190 { "Google Compute Engine", VIRTUALIZATION_GOOGLE }, /* https://cloud.google.com/run/docs/container-contract#sandbox */
191 };
192 int r;
193
194 STRV_FOREACH(vendor, dmi_vendors) {
195 _cleanup_free_ char *s = NULL;
196
197 r = read_one_line_file(*vendor, &s);
198 if (r < 0) {
199 if (r == -ENOENT)
200 continue;
201
202 return r;
203 }
204
205 for (size_t i = 0; i < ELEMENTSOF(dmi_vendor_table); i++)
206 if (startswith(s, dmi_vendor_table[i].vendor)) {
207 log_debug("Virtualization %s found in DMI (%s)", s, *vendor);
208 return dmi_vendor_table[i].id;
209 }
210 }
211 log_debug("No virtualization found in DMI vendor table.");
212 return VIRTUALIZATION_NONE;
213}
214
215static int detect_vm_smbios(void) {
216 /* The SMBIOS BIOS Characteristics Extension Byte 2 (Section 2.1.2.2 of
217 * https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.4.0.pdf), specifies that
218 * the 4th bit being set indicates a VM. The BIOS Characteristics table is exposed via the kernel in
219 * /sys/firmware/dmi/entries/0-0. Note that in the general case, this bit being unset should not
220 * imply that the system is running on bare-metal. For example, QEMU 3.1.0 (with or without KVM)
221 * with SeaBIOS does not set this bit. */
222 _cleanup_free_ char *s = NULL;
223 size_t readsize;
224 int r;
225
226 r = read_full_virtual_file("/sys/firmware/dmi/entries/0-0/raw", &s, &readsize);
227 if (r < 0) {
228 log_debug_errno(r, "Unable to read /sys/firmware/dmi/entries/0-0/raw, "
229 "using the virtualization information found in DMI vendor table, ignoring: %m");
230 return SMBIOS_VM_BIT_UNKNOWN;
231 }
232 if (readsize < 20 || s[1] < 20) {
233 /* The spec indicates that byte 1 contains the size of the table, 0x12 + the number of
234 * extension bytes. The data we're interested in is in extension byte 2, which would be at
235 * 0x13. If we didn't read that much data, or if the BIOS indicates that we don't have that
236 * much data, we don't infer anything from the SMBIOS. */
237 log_debug("Only read %zu bytes from /sys/firmware/dmi/entries/0-0/raw (expected 20). "
238 "Using the virtualization information found in DMI vendor table.", readsize);
239 return SMBIOS_VM_BIT_UNKNOWN;
240 }
241
242 uint8_t byte = (uint8_t) s[19];
243 if (byte & (1U<<4)) {
244 log_debug("DMI BIOS Extension table indicates virtualization.");
245 return SMBIOS_VM_BIT_SET;
246 }
247 log_debug("DMI BIOS Extension table does not indicate virtualization.");
248 return SMBIOS_VM_BIT_UNSET;
249}
#endif /* defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || defined(__riscv) */
251
252static Virtualization detect_vm_dmi(void) {
253#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64)
254
255 int r;
256 r = detect_vm_dmi_vendor();
257
258 /* The DMI vendor tables in /sys/class/dmi/id don't help us distinguish between Amazon EC2
259 * virtual machines and bare-metal instances, so we need to look at SMBIOS. */
260 if (r == VIRTUALIZATION_AMAZON) {
261 switch (detect_vm_smbios()) {
262 case SMBIOS_VM_BIT_SET:
263 return VIRTUALIZATION_AMAZON;
264 case SMBIOS_VM_BIT_UNSET:
265 return VIRTUALIZATION_NONE;
266 case SMBIOS_VM_BIT_UNKNOWN: {
267 /* The DMI information we are after is only accessible to the root user,
268 * so we fallback to using the product name which is less restricted
269 * to distinguish metal systems from virtualized instances */
270 _cleanup_free_ char *s = NULL;
271 const char *e;
272
273 r = read_full_virtual_file("/sys/class/dmi/id/product_name", &s, NULL);
274 /* In EC2, virtualized is much more common than metal, so if for some reason
275 * we fail to read the DMI data, assume we are virtualized. */
276 if (r < 0) {
277 log_debug_errno(r, "Can't read /sys/class/dmi/id/product_name,"
278 " assuming virtualized: %m");
279 return VIRTUALIZATION_AMAZON;
280 }
281 e = strstrafter(truncate_nl(s), ".metal");
282 if (e && IN_SET(*e, 0, '-')) {
283 log_debug("DMI product name has '.metal', assuming no virtualization");
284 return VIRTUALIZATION_NONE;
285 } else
286 return VIRTUALIZATION_AMAZON;
287 }
288 default:
289 assert_not_reached();
290 }
291 }
292
293 /* If we haven't identified a VM, but the firmware indicates that there is one, indicate as much. We
294 * have no further information about what it is. */
295 if (r == VIRTUALIZATION_NONE && detect_vm_smbios() == SMBIOS_VM_BIT_SET)
296 return VIRTUALIZATION_VM_OTHER;
297 return r;
298#else
299 return VIRTUALIZATION_NONE;
300#endif
301}
302
303#define XENFEAT_dom0 11 /* xen/include/public/features.h */
304#define PATH_FEATURES "/sys/hypervisor/properties/features"
305/* Returns -errno, or 0 for domU, or 1 for dom0 */
306static int detect_vm_xen_dom0(void) {
307 _cleanup_free_ char *domcap = NULL;
308 int r;
309
310 r = read_one_line_file(PATH_FEATURES, &domcap);
311 if (r < 0 && r != -ENOENT)
312 return r;
313 if (r >= 0) {
314 unsigned long features;
315
316 /* Here, we need to use sscanf() instead of safe_atoul()
317 * as the string lacks the leading "0x". */
318 r = sscanf(domcap, "%lx", &features);
319 if (r == 1) {
320 r = !!(features & (1U << XENFEAT_dom0));
321 log_debug("Virtualization XEN, found %s with value %08lx, "
322 "XENFEAT_dom0 (indicating the 'hardware domain') is%s set.",
323 PATH_FEATURES, features, r ? "" : " not");
324 return r;
325 }
326 log_debug("Virtualization XEN, found %s, unhandled content '%s'",
327 PATH_FEATURES, domcap);
328 }
329
330 r = read_one_line_file("/proc/xen/capabilities", &domcap);
331 if (r == -ENOENT) {
332 log_debug("Virtualization XEN because /proc/xen/capabilities does not exist");
333 return 0;
334 }
335 if (r < 0)
336 return r;
337
338 for (const char *i = domcap;;) {
339 _cleanup_free_ char *cap = NULL;
340
341 r = extract_first_word(&i, &cap, ",", 0);
342 if (r < 0)
343 return r;
344 if (r == 0) {
345 log_debug("Virtualization XEN DomU found (/proc/xen/capabilities)");
346 return 0;
347 }
348
349 if (streq(cap, "control_d")) {
350 log_debug("Virtualization XEN Dom0 ignored (/proc/xen/capabilities)");
351 return 1;
352 }
353 }
354}
355
356static Virtualization detect_vm_xen(void) {
357 /* The presence of /proc/xen indicates some form of a Xen domain
358 The check for Dom0 is handled outside this function */
359 if (access("/proc/xen", F_OK) < 0) {
360 log_debug("Virtualization XEN not found, /proc/xen does not exist");
361 return VIRTUALIZATION_NONE;
362 }
363 log_debug("Virtualization XEN found (/proc/xen exists)");
364 return VIRTUALIZATION_XEN;
365}
366
367static Virtualization detect_vm_hypervisor(void) {
368 _cleanup_free_ char *hvtype = NULL;
369 int r;
370
371 r = read_one_line_file("/sys/hypervisor/type", &hvtype);
372 if (r == -ENOENT)
373 return VIRTUALIZATION_NONE;
374 if (r < 0)
375 return r;
376
377 log_debug("Virtualization %s found in /sys/hypervisor/type", hvtype);
378
379 if (streq(hvtype, "xen"))
380 return VIRTUALIZATION_XEN;
381 else
382 return VIRTUALIZATION_VM_OTHER;
383}
384
385static Virtualization detect_vm_uml(void) {
386 _cleanup_fclose_ FILE *f = NULL;
387 int r;
388
389 /* Detect User-Mode Linux by reading /proc/cpuinfo */
390 f = fopen("/proc/cpuinfo", "re");
391 if (!f) {
392 if (errno == ENOENT) {
393 log_debug("/proc/cpuinfo not found, assuming no UML virtualization.");
394 return VIRTUALIZATION_NONE;
395 }
396 return -errno;
397 }
398
399 for (;;) {
400 _cleanup_free_ char *line = NULL;
401 const char *t;
402
403 r = read_line(f, LONG_LINE_MAX, &line);
404 if (r < 0)
405 return r;
406 if (r == 0)
407 break;
408
409 t = startswith(line, "vendor_id\t: ");
410 if (t) {
411 if (startswith(t, "User Mode Linux")) {
412 log_debug("UML virtualization found in /proc/cpuinfo");
413 return VIRTUALIZATION_UML;
414 }
415
416 break;
417 }
418 }
419
420 log_debug("UML virtualization not found in /proc/cpuinfo.");
421 return VIRTUALIZATION_NONE;
422}
423
424static Virtualization detect_vm_zvm(void) {
425
426#if defined(__s390__)
427 _cleanup_free_ char *t = NULL;
428 int r;
429
430 r = get_proc_field("/proc/sysinfo", "VM00 Control Program", WHITESPACE, &t);
431 if (r == -ENOENT)
432 return VIRTUALIZATION_NONE;
433 if (r < 0)
434 return r;
435
436 log_debug("Virtualization %s found in /proc/sysinfo", t);
437 if (streq(t, "z/VM"))
438 return VIRTUALIZATION_ZVM;
439 else
440 return VIRTUALIZATION_KVM;
441#else
442 log_debug("This platform does not support /proc/sysinfo");
443 return VIRTUALIZATION_NONE;
444#endif
445}
446
447/* Returns a short identifier for the various VM implementations */
448Virtualization detect_vm(void) {
449 static thread_local Virtualization cached_found = _VIRTUALIZATION_INVALID;
450 bool other = false, hyperv = false;
451 int xen_dom0 = 0;
452 Virtualization v, dmi;
453
454 if (cached_found >= 0)
455 return cached_found;
456
457 /* We have to use the correct order here:
458 *
459 * → First, try to detect Oracle Virtualbox, Amazon EC2 Nitro, Parallels, and Google Compute Engine,
460 * even if they use KVM, as well as Xen, even if it cloaks as Microsoft Hyper-V. Attempt to detect
461 * UML at this stage too, since it runs as a user-process nested inside other VMs. Also check for
462 * Xen now, because Xen PV mode does not override CPUID when nested inside another hypervisor.
463 *
464 * → Second, try to detect from CPUID. This will report KVM for whatever software is used even if
465 * info in DMI is overwritten.
466 *
467 * → Third, try to detect from DMI. */
468
469 dmi = detect_vm_dmi();
470 if (IN_SET(dmi,
471 VIRTUALIZATION_ORACLE,
472 VIRTUALIZATION_XEN,
473 VIRTUALIZATION_AMAZON,
474 VIRTUALIZATION_PARALLELS,
475 VIRTUALIZATION_GOOGLE)) {
476 v = dmi;
477 goto finish;
478 }
479
480 /* Detect UML */
481 v = detect_vm_uml();
482 if (v < 0)
483 return v;
484 if (v != VIRTUALIZATION_NONE)
485 goto finish;
486
487 /* Detect Xen */
488 v = detect_vm_xen();
489 if (v < 0)
490 return v;
491 if (v == VIRTUALIZATION_XEN) {
492 /* If we are Dom0, then we expect to not report as a VM. However, as we might be nested
493 * inside another hypervisor which can be detected via the CPUID check, wait to report this
494 * until after the CPUID check. */
495 xen_dom0 = detect_vm_xen_dom0();
496 if (xen_dom0 < 0)
497 return xen_dom0;
498 if (xen_dom0 == 0)
499 goto finish;
500 } else if (v != VIRTUALIZATION_NONE)
501 assert_not_reached();
502
503 /* Detect from CPUID */
504 v = detect_vm_cpuid();
505 if (v < 0)
506 return v;
507 if (v == VIRTUALIZATION_MICROSOFT)
508 /* QEMU sets the CPUID string to hyperv's, in case it provides hyperv enlightenments. Let's
509 * hence not return Microsoft here but just use the other mechanisms first to make a better
510 * decision. */
511 hyperv = true;
512 else if (v == VIRTUALIZATION_VM_OTHER)
513 other = true;
514 else if (v != VIRTUALIZATION_NONE)
515 goto finish;
516
517 /* If we are in Dom0 and have not yet finished, finish with the result of detect_vm_cpuid */
518 if (xen_dom0 > 0)
519 goto finish;
520
521 /* Now, let's get back to DMI */
522 if (dmi < 0)
523 return dmi;
524 if (dmi == VIRTUALIZATION_VM_OTHER)
525 other = true;
526 else if (dmi != VIRTUALIZATION_NONE) {
527 v = dmi;
528 goto finish;
529 }
530
531 /* Check high-level hypervisor sysfs file */
532 v = detect_vm_hypervisor();
533 if (v < 0)
534 return v;
535 if (v == VIRTUALIZATION_VM_OTHER)
536 other = true;
537 else if (v != VIRTUALIZATION_NONE)
538 goto finish;
539
540 v = detect_vm_device_tree();
541 if (v < 0)
542 return v;
543 if (v == VIRTUALIZATION_VM_OTHER)
544 other = true;
545 else if (v != VIRTUALIZATION_NONE)
546 goto finish;
547
548 v = detect_vm_zvm();
549 if (v < 0)
550 return v;
551
552finish:
553 /* None of the checks above gave us a clear answer, hence let's now use fallback logic: if hyperv
554 * enlightenments are available but the VMM wasn't recognized as anything yet, it's probably
555 * Microsoft. */
556 if (v == VIRTUALIZATION_NONE) {
557 if (hyperv)
558 v = VIRTUALIZATION_MICROSOFT;
559 else if (other)
560 v = VIRTUALIZATION_VM_OTHER;
561 }
562
563 cached_found = v;
564 log_debug("Found VM virtualization %s", virtualization_to_string(v));
565 return v;
566}
567
568static const char *const container_table[_VIRTUALIZATION_MAX] = {
569 [VIRTUALIZATION_LXC] = "lxc",
570 [VIRTUALIZATION_LXC_LIBVIRT] = "lxc-libvirt",
571 [VIRTUALIZATION_SYSTEMD_NSPAWN] = "systemd-nspawn",
572 [VIRTUALIZATION_DOCKER] = "docker",
573 [VIRTUALIZATION_PODMAN] = "podman",
574 [VIRTUALIZATION_RKT] = "rkt",
575 [VIRTUALIZATION_WSL] = "wsl",
576 [VIRTUALIZATION_PROOT] = "proot",
577 [VIRTUALIZATION_POUCH] = "pouch",
578};
579
580DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(container, int);
581
/* Heuristically determines whether we are inside a cgroup namespace, i.e. whether /sys/fs/cgroup shows a
 * nested cgroup rather than the root one.
 *
 * Returns > 0 if we appear to be in a cgroup namespace, 0 if not (or if cgroup namespaces are not
 * supported), and a negative errno-style value on unexpected failure. */
static int running_in_cgroupns(void) {
        int r;

        if (!cg_ns_supported())
                return false;

        r = cg_all_unified();
        if (r < 0)
                return r;

        if (r == 0) {
                /* cgroup v1 */

                /* If systemd controller is not mounted, do not even bother. */
                if (access("/sys/fs/cgroup/systemd", F_OK) < 0)
                        return errno == ENOENT ? false : -errno;

                /* release_agent only exists in the root cgroup. */
                if (access("/sys/fs/cgroup/systemd/release_agent", F_OK) < 0)
                        return errno == ENOENT ? true : -errno;

                return false;
        }

        /* cgroup v2 */

        /* All kernel versions have cgroup.events in nested cgroups, so its absence means root cgroup. */
        if (access("/sys/fs/cgroup/cgroup.events", F_OK) < 0)
                return errno == ENOENT ? false : -errno;

        /* There's no cgroup.type in the root cgroup, and future kernel versions
         * are unlikely to add it since cgroup.type is something that makes no sense
         * whatsoever in the root cgroup. */
        if (access("/sys/fs/cgroup/cgroup.type", F_OK) >= 0)
                return true;
        if (errno != ENOENT)
                return -errno;

        /* On older kernel versions, there's no cgroup.type. If /sys/kernel/cgroup/features is missing
         * too, this is an old kernel that we know for sure has cgroup.events only in nested cgroups. */
        if (access("/sys/kernel/cgroup/features", F_OK) < 0)
                return errno == ENOENT ? true : -errno;

        /* This is a recent kernel, and cgroup.type doesn't exist, so we must be
         * in the root cgroup. */
        return false;
}
647
648static Virtualization detect_container_files(void) {
649 static const struct {
650 const char *file_path;
651 Virtualization id;
652 } container_file_table[] = {
653 /* https://github.com/containers/podman/issues/6192 */
654 /* https://github.com/containers/podman/issues/3586#issuecomment-661918679 */
655 { "/run/.containerenv", VIRTUALIZATION_PODMAN },
656 /* https://github.com/moby/moby/issues/18355 */
657 /* Docker must be the last in this table, see below. */
658 { "/.dockerenv", VIRTUALIZATION_DOCKER },
659 };
660
661 for (size_t i = 0; i < ELEMENTSOF(container_file_table); i++) {
662 if (access(container_file_table[i].file_path, F_OK) >= 0)
663 return container_file_table[i].id;
664
665 if (errno != ENOENT)
666 log_debug_errno(errno,
667 "Checking if %s exists failed, ignoring: %m",
668 container_file_table[i].file_path);
669 }
670
671 return VIRTUALIZATION_NONE;
672}
673
674Virtualization detect_container(void) {
675 static thread_local Virtualization cached_found = _VIRTUALIZATION_INVALID;
676 _cleanup_free_ char *m = NULL, *o = NULL, *p = NULL;
677 const char *e = NULL;
678 Virtualization v;
679 int r;
680
681 if (cached_found >= 0)
682 return cached_found;
683
684 /* /proc/vz exists in container and outside of the container, /proc/bc only outside of the container. */
685 if (access("/proc/vz", F_OK) < 0) {
686 if (errno != ENOENT)
687 log_debug_errno(errno, "Failed to check if /proc/vz exists, ignoring: %m");
688 } else if (access("/proc/bc", F_OK) < 0) {
689 if (errno == ENOENT) {
690 v = VIRTUALIZATION_OPENVZ;
691 goto finish;
692 }
693
694 log_debug_errno(errno, "Failed to check if /proc/bc exists, ignoring: %m");
695 }
696
697 /* "Official" way of detecting WSL https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364 */
698 r = read_one_line_file("/proc/sys/kernel/osrelease", &o);
699 if (r < 0)
700 log_debug_errno(r, "Failed to read /proc/sys/kernel/osrelease, ignoring: %m");
701 else if (strstr(o, "Microsoft") || strstr(o, "WSL")) {
702 v = VIRTUALIZATION_WSL;
703 goto finish;
704 }
705
706 /* proot doesn't use PID namespacing, so we can just check if we have a matching tracer for this
707 * invocation without worrying about it being elsewhere.
708 */
709 r = get_proc_field("/proc/self/status", "TracerPid", WHITESPACE, &p);
710 if (r < 0)
711 log_debug_errno(r, "Failed to read our own trace PID, ignoring: %m");
712 else if (!streq(p, "0")) {
713 pid_t ptrace_pid;
714
715 r = parse_pid(p, &ptrace_pid);
716 if (r < 0)
717 log_debug_errno(r, "Failed to parse our own tracer PID, ignoring: %m");
718 else {
719 _cleanup_free_ char *ptrace_comm = NULL;
720 const char *pf;
721
722 pf = procfs_file_alloca(ptrace_pid, "comm");
723 r = read_one_line_file(pf, &ptrace_comm);
724 if (r < 0)
725 log_debug_errno(r, "Failed to read %s, ignoring: %m", pf);
726 else if (startswith(ptrace_comm, "proot")) {
727 v = VIRTUALIZATION_PROOT;
728 goto finish;
729 }
730 }
731 }
732
733 /* The container manager might have placed this in the /run/host/ hierarchy for us, which is best
734 * because we can be consumed just like that, without special privileges. */
735 r = read_one_line_file("/run/host/container-manager", &m);
736 if (r > 0) {
737 e = m;
738 goto translate_name;
739 }
740 if (!IN_SET(r, -ENOENT, 0))
741 return log_debug_errno(r, "Failed to read /run/host/container-manager: %m");
742
743 if (getpid_cached() == 1) {
744 /* If we are PID 1 we can just check our own environment variable, and that's authoritative.
745 * We distinguish three cases:
746 * - the variable is not defined → we jump to other checks
747 * - the variable is defined to an empty value → we are not in a container
748 * - anything else → some container, either one of the known ones or "container-other"
749 */
750 e = getenv("container");
751 if (!e)
752 goto check_files;
753 if (isempty(e)) {
754 v = VIRTUALIZATION_NONE;
755 goto finish;
756 }
757
758 goto translate_name;
759 }
760
761 /* Otherwise, PID 1 might have dropped this information into a file in /run. This is better than accessing
762 * /proc/1/environ, since we don't need CAP_SYS_PTRACE for that. */
763 r = read_one_line_file("/run/systemd/container", &m);
764 if (r > 0) {
765 e = m;
766 goto translate_name;
767 }
768 if (!IN_SET(r, -ENOENT, 0))
769 return log_debug_errno(r, "Failed to read /run/systemd/container: %m");
770
771 /* Fallback for cases where PID 1 was not systemd (for example, cases where init=/bin/sh is used. */
772 r = getenv_for_pid(1, "container", &m);
773 if (r > 0) {
774 e = m;
775 goto translate_name;
776 }
777 if (r < 0) /* This only works if we have CAP_SYS_PTRACE, hence let's better ignore failures here */
778 log_debug_errno(r, "Failed to read $container of PID 1, ignoring: %m");
779
780check_files:
781 /* Check for existence of some well-known files. We only do this after checking
782 * for other specific container managers, otherwise we risk mistaking another
783 * container manager for Docker: the /.dockerenv file could inadvertently end up
784 * in a file system image. */
785 v = detect_container_files();
786 if (v < 0)
787 return v;
788 if (v != VIRTUALIZATION_NONE)
789 goto finish;
790
791 r = running_in_cgroupns();
792 if (r > 0) {
793 v = VIRTUALIZATION_CONTAINER_OTHER;
794 goto finish;
795 }
796 if (r < 0)
797 log_debug_errno(r, "Failed to detect cgroup namespace: %m");
798
799 /* If none of that worked, give up, assume no container manager. */
800 v = VIRTUALIZATION_NONE;
801 goto finish;
802
803translate_name:
804 if (streq(e, "oci")) {
805 /* Some images hardcode container=oci, but OCI is not a specific container manager.
806 * Try to detect one based on well-known files. */
807 v = detect_container_files();
808 if (v == VIRTUALIZATION_NONE)
809 v = VIRTUALIZATION_CONTAINER_OTHER;
810 goto finish;
811 }
812 v = container_from_string(e);
813 if (v < 0)
814 v = VIRTUALIZATION_CONTAINER_OTHER;
815
816finish:
817 log_debug("Found container virtualization %s.", virtualization_to_string(v));
818 cached_found = v;
819 return v;
820}
821
822Virtualization detect_virtualization(void) {
823 int v;
824
825 v = detect_container();
826 if (v != VIRTUALIZATION_NONE)
827 return v;
828
829 return detect_vm();
830}
831
832static int userns_has_mapping(const char *name) {
833 _cleanup_fclose_ FILE *f = NULL;
834 uid_t base, shift, range;
835 int r;
836
837 f = fopen(name, "re");
838 if (!f) {
839 log_debug_errno(errno, "Failed to open %s: %m", name);
840 return errno == ENOENT ? false : -errno;
841 }
842
843 r = uid_map_read_one(f, &base, &shift, &range);
844 if (r == -ENOMSG) {
845 log_debug("%s is empty, we're in an uninitialized user namespace.", name);
846 return true;
847 }
848 if (r < 0)
849 return log_debug_errno(r, "Failed to read %s: %m", name);
850
851 if (base == 0 && shift == 0 && range == UINT32_MAX) {
852 /* The kernel calls mappings_overlap() and does not allow overlaps */
853 log_debug("%s has a full 1:1 mapping", name);
854 return false;
855 }
856
857 /* Anything else implies that we are in a user namespace */
858 log_debug("Mapping found in %s, we're in a user namespace.", name);
859 return true;
860}
861
862int running_in_userns(void) {
863 _cleanup_free_ char *line = NULL;
864 int r;
865
866 r = userns_has_mapping("/proc/self/uid_map");
867 if (r != 0)
868 return r;
869
870 r = userns_has_mapping("/proc/self/gid_map");
871 if (r != 0)
872 return r;
873
874 /* "setgroups" file was added in kernel v3.18-rc6-15-g9cc46516dd. It is also possible to compile a
875 * kernel without CONFIG_USER_NS, in which case "setgroups" also does not exist. We cannot
876 * distinguish those two cases, so assume that we're running on a stripped-down recent kernel, rather
877 * than on an old one, and if the file is not found, return false. */
878 r = read_virtual_file("/proc/self/setgroups", SIZE_MAX, &line, NULL);
879 if (r < 0) {
880 log_debug_errno(r, "/proc/self/setgroups: %m");
881 return r == -ENOENT ? false : r;
882 }
883
884 strstrip(line); /* remove trailing newline */
885
886 r = streq(line, "deny");
887 /* See user_namespaces(7) for a description of this "setgroups" contents. */
888 log_debug("/proc/self/setgroups contains \"%s\", %s user namespace", line, r ? "in" : "not in");
889 return r;
890}
891
/* Determines whether we run in a chroot() environment by comparing our root directory with PID 1's.
 *
 * Returns > 0 if in a chroot, 0 if not (or if $SYSTEMD_IGNORE_CHROOT is set to true), -ENOSYS if /proc
 * is mounted but /proc/1/root is missing, and other negative errno-style values on failure. */
int running_in_chroot(void) {
        int r;

        /* If we're PID1, /proc may not be mounted (and most likely we're not in a chroot). But PID1 will
         * mount /proc, so all other programs can assume that if /proc is *not* available, we're in some
         * chroot. */

        if (getenv_bool("SYSTEMD_IGNORE_CHROOT") > 0)
                return 0;

        r = inode_same("/proc/1/root", "/", 0);
        if (r == -ENOENT) {
                r = proc_mounted();
                if (r > 0) /* If we have fake /proc/, we can't do the check properly. */
                        return -ENOSYS;
                if (r == 0) {
                        if (getpid_cached() == 1)
                                return false; /* We will mount /proc, assuming we're not in a chroot. */

                        log_debug("/proc is not mounted, assuming we're in a chroot.");
                        return true;
                }
                /* proc_mounted() itself failed: fall through and report that error. */
        }
        if (r < 0)
                return r;

        /* inode_same() says "same" with a positive value, hence "not the same" means chroot. */
        return r == 0;
}
920
921#if defined(__i386__) || defined(__x86_64__)
/* Associates one bit of a CPUID output register with the name of the CPU flag it stands for. */
struct cpuid_table_entry {
        uint32_t flag_bit; /* bit position within the register, used as a shift count */
        const char *name;  /* flag name that callers ask for */
};
926
927static const struct cpuid_table_entry leaf1_edx[] = {
928 { 0, "fpu" },
929 { 1, "vme" },
930 { 2, "de" },
931 { 3, "pse" },
932 { 4, "tsc" },
933 { 5, "msr" },
934 { 6, "pae" },
935 { 7, "mce" },
936 { 8, "cx8" },
937 { 9, "apic" },
938 { 11, "sep" },
939 { 12, "mtrr" },
940 { 13, "pge" },
941 { 14, "mca" },
942 { 15, "cmov" },
943 { 16, "pat" },
944 { 17, "pse36" },
945 { 19, "clflush" },
946 { 23, "mmx" },
947 { 24, "fxsr" },
948 { 25, "sse" },
949 { 26, "sse2" },
950 { 28, "ht" },
951};
952
953static const struct cpuid_table_entry leaf1_ecx[] = {
954 { 0, "pni" },
955 { 1, "pclmul" },
956 { 3, "monitor" },
957 { 9, "ssse3" },
958 { 12, "fma3" },
959 { 13, "cx16" },
960 { 19, "sse4_1" },
961 { 20, "sse4_2" },
962 { 22, "movbe" },
963 { 23, "popcnt" },
964 { 25, "aes" },
965 { 26, "xsave" },
966 { 27, "osxsave" },
967 { 28, "avx" },
968 { 29, "f16c" },
969 { 30, "rdrand" },
970};
971
972static const struct cpuid_table_entry leaf7_ebx[] = {
973 { 3, "bmi1" },
974 { 5, "avx2" },
975 { 8, "bmi2" },
976 { 18, "rdseed" },
977 { 19, "adx" },
978 { 29, "sha_ni" },
979};
980
981static const struct cpuid_table_entry leaf81_edx[] = {
982 { 11, "syscall" },
983 { 27, "rdtscp" },
984 { 29, "lm" },
985};
986
987static const struct cpuid_table_entry leaf81_ecx[] = {
988 { 0, "lahf_lm" },
989 { 5, "abm" },
990};
991
992static const struct cpuid_table_entry leaf87_edx[] = {
993 { 8, "constant_tsc" },
994};
995
996static bool given_flag_in_set(const char *flag, const struct cpuid_table_entry *set, size_t set_size, uint32_t val) {
997 for (size_t i = 0; i < set_size; i++) {
998 if ((UINT32_C(1) << set[i].flag_bit) & val &&
999 streq(flag, set[i].name))
1000 return true;
1001 }
1002 return false;
1003}
1004
1005static bool real_has_cpu_with_flag(const char *flag) {
1006 uint32_t eax, ebx, ecx, edx;
1007
1008 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
1009 if (given_flag_in_set(flag, leaf1_ecx, ELEMENTSOF(leaf1_ecx), ecx))
1010 return true;
1011
1012 if (given_flag_in_set(flag, leaf1_edx, ELEMENTSOF(leaf1_edx), edx))
1013 return true;
1014 }
1015
1016 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
1017 if (given_flag_in_set(flag, leaf7_ebx, ELEMENTSOF(leaf7_ebx), ebx))
1018 return true;
1019 }
1020
1021 if (__get_cpuid(0x80000001U, &eax, &ebx, &ecx, &edx)) {
1022 if (given_flag_in_set(flag, leaf81_ecx, ELEMENTSOF(leaf81_ecx), ecx))
1023 return true;
1024
1025 if (given_flag_in_set(flag, leaf81_edx, ELEMENTSOF(leaf81_edx), edx))
1026 return true;
1027 }
1028
1029 if (__get_cpuid(0x80000007U, &eax, &ebx, &ecx, &edx))
1030 if (given_flag_in_set(flag, leaf87_edx, ELEMENTSOF(leaf87_edx), edx))
1031 return true;
1032
1033 return false;
1034}
1035#endif
1036
/* Reports whether the CPU advertises the named feature flag via CPUID.
 *
 * CPUID is an x86 specific interface, so on every other architecture the answer is always false. */
bool has_cpu_with_flag(const char *flag) {
#if !defined(__i386__) && !defined(__x86_64__)
        return false;
#else
        return real_has_cpu_with_flag(flag);
#endif
}
1045
1046static const char *const virtualization_table[_VIRTUALIZATION_MAX] = {
1047 [VIRTUALIZATION_NONE] = "none",
1048 [VIRTUALIZATION_KVM] = "kvm",
1049 [VIRTUALIZATION_AMAZON] = "amazon",
1050 [VIRTUALIZATION_QEMU] = "qemu",
1051 [VIRTUALIZATION_BOCHS] = "bochs",
1052 [VIRTUALIZATION_XEN] = "xen",
1053 [VIRTUALIZATION_UML] = "uml",
1054 [VIRTUALIZATION_VMWARE] = "vmware",
1055 [VIRTUALIZATION_ORACLE] = "oracle",
1056 [VIRTUALIZATION_MICROSOFT] = "microsoft",
1057 [VIRTUALIZATION_ZVM] = "zvm",
1058 [VIRTUALIZATION_PARALLELS] = "parallels",
1059 [VIRTUALIZATION_BHYVE] = "bhyve",
1060 [VIRTUALIZATION_QNX] = "qnx",
1061 [VIRTUALIZATION_ACRN] = "acrn",
1062 [VIRTUALIZATION_POWERVM] = "powervm",
1063 [VIRTUALIZATION_APPLE] = "apple",
1064 [VIRTUALIZATION_SRE] = "sre",
1065 [VIRTUALIZATION_GOOGLE] = "google",
1066 [VIRTUALIZATION_VM_OTHER] = "vm-other",
1067
1068 [VIRTUALIZATION_SYSTEMD_NSPAWN] = "systemd-nspawn",
1069 [VIRTUALIZATION_LXC_LIBVIRT] = "lxc-libvirt",
1070 [VIRTUALIZATION_LXC] = "lxc",
1071 [VIRTUALIZATION_OPENVZ] = "openvz",
1072 [VIRTUALIZATION_DOCKER] = "docker",
1073 [VIRTUALIZATION_PODMAN] = "podman",
1074 [VIRTUALIZATION_RKT] = "rkt",
1075 [VIRTUALIZATION_WSL] = "wsl",
1076 [VIRTUALIZATION_PROOT] = "proot",
1077 [VIRTUALIZATION_POUCH] = "pouch",
1078 [VIRTUALIZATION_CONTAINER_OTHER] = "container-other",
1079};
1080
1081DEFINE_STRING_TABLE_LOOKUP(virtualization, Virtualization);