From: Greg Kroah-Hartman Date: Thu, 6 Jun 2024 13:14:03 +0000 (+0200) Subject: 6.9-stable patches X-Git-Tag: v6.1.93~20 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=a43b832ba21f38a0a059ae2363bd68b432c46123;p=thirdparty%2Fkernel%2Fstable-queue.git 6.9-stable patches added patches: efi-libstub-only-free-priv.runtime_map-when-allocated.patch genirq-cpuhotplug-x86-vector-prevent-vector-leak-during-cpu-offline.patch kvm-x86-don-t-advertise-guest.maxphyaddr-as-host.maxphyaddr-in-cpuid.patch platform-x86-intel-tpmi-handle-error-from-tpmi_process_info.patch platform-x86-intel-uncore-freq-don-t-present-root-domain-on-error.patch x86-efistub-omit-physical-kaslr-when-memory-reservations-exist.patch x86-pci-skip-early-e820-check-for-ecam-region.patch x86-topology-handle-bogus-acpi-tables-correctly.patch x86-topology-intel-unlock-cpuid-before-evaluating-anything.patch --- diff --git a/queue-6.9/efi-libstub-only-free-priv.runtime_map-when-allocated.patch b/queue-6.9/efi-libstub-only-free-priv.runtime_map-when-allocated.patch new file mode 100644 index 00000000000..96f85f9573b --- /dev/null +++ b/queue-6.9/efi-libstub-only-free-priv.runtime_map-when-allocated.patch @@ -0,0 +1,39 @@ +From 4b2543f7e1e6b91cfc8dd1696e3cdf01c3ac8974 Mon Sep 17 00:00:00 2001 +From: Hagar Hemdan +Date: Tue, 23 Apr 2024 13:59:26 +0000 +Subject: efi: libstub: only free priv.runtime_map when allocated + +From: Hagar Hemdan + +commit 4b2543f7e1e6b91cfc8dd1696e3cdf01c3ac8974 upstream. + +priv.runtime_map is only allocated when efi_novamap is not set. +Otherwise, it is an uninitialized value. In the error path, it is freed +unconditionally. Avoid passing an uninitialized value to free_pool. +Free priv.runtime_map only when it was allocated. + +This bug was discovered and resolved using Coverity Static Analysis +Security Testing (SAST) by Synopsys, Inc. 
+ +Fixes: f80d26043af9 ("efi: libstub: avoid efi_get_memory_map() for allocating the virt map") +Cc: +Signed-off-by: Hagar Hemdan +Signed-off-by: Ard Biesheuvel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/firmware/efi/libstub/fdt.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/firmware/efi/libstub/fdt.c ++++ b/drivers/firmware/efi/libstub/fdt.c +@@ -335,8 +335,8 @@ fail_free_new_fdt: + + fail: + efi_free(fdt_size, fdt_addr); +- +- efi_bs_call(free_pool, priv.runtime_map); ++ if (!efi_novamap) ++ efi_bs_call(free_pool, priv.runtime_map); + + return EFI_LOAD_ERROR; + } diff --git a/queue-6.9/genirq-cpuhotplug-x86-vector-prevent-vector-leak-during-cpu-offline.patch b/queue-6.9/genirq-cpuhotplug-x86-vector-prevent-vector-leak-during-cpu-offline.patch new file mode 100644 index 00000000000..c6c9cd45639 --- /dev/null +++ b/queue-6.9/genirq-cpuhotplug-x86-vector-prevent-vector-leak-during-cpu-offline.patch @@ -0,0 +1,123 @@ +From a6c11c0a5235fb144a65e0cb2ffd360ddc1f6c32 Mon Sep 17 00:00:00 2001 +From: Dongli Zhang +Date: Wed, 22 May 2024 15:02:18 -0700 +Subject: genirq/cpuhotplug, x86/vector: Prevent vector leak during CPU offline + +From: Dongli Zhang + +commit a6c11c0a5235fb144a65e0cb2ffd360ddc1f6c32 upstream. + +The absence of IRQD_MOVE_PCNTXT prevents immediate effectiveness of +interrupt affinity reconfiguration via procfs. Instead, the change is +deferred until the next instance of the interrupt being triggered on the +original CPU. + +When the interrupt next triggers on the original CPU, the new affinity is +enforced within __irq_move_irq(). A vector is allocated from the new CPU, +but the old vector on the original CPU remains and is not immediately +reclaimed. Instead, apicd->move_in_progress is flagged, and the reclaiming +process is delayed until the next trigger of the interrupt on the new CPU. 
+ +Upon the subsequent triggering of the interrupt on the new CPU, +irq_complete_move() adds a task to the old CPU's vector_cleanup list if it +remains online. Subsequently, the timer on the old CPU iterates over its +vector_cleanup list, reclaiming old vectors. + +However, a rare scenario arises if the old CPU is outgoing before the +interrupt triggers again on the new CPU. + +In that case irq_force_complete_move() is not invoked on the outgoing CPU +to reclaim the old apicd->prev_vector because the interrupt isn't currently +affine to the outgoing CPU, and irq_needs_fixup() returns false. Even +though __vector_schedule_cleanup() is later called on the new CPU, it +doesn't reclaim apicd->prev_vector; instead, it simply resets both +apicd->move_in_progress and apicd->prev_vector to 0. + +As a result, the vector remains unreclaimed in vector_matrix, leading to a +CPU vector leak. + +To address this issue, move the invocation of irq_force_complete_move() +before the irq_needs_fixup() call to reclaim apicd->prev_vector, if the +interrupt is currently or used to be affine to the outgoing CPU. + +Additionally, reclaim the vector in __vector_schedule_cleanup() as well, +following a warning message, although theoretically it should never see +apicd->move_in_progress with apicd->prev_cpu pointing to an offline CPU. 
+ +Fixes: f0383c24b485 ("genirq/cpuhotplug: Add support for cleaning up move in progress") +Signed-off-by: Dongli Zhang +Signed-off-by: Thomas Gleixner +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20240522220218.162423-1-dongli.zhang@oracle.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/apic/vector.c | 9 ++++++--- + kernel/irq/cpuhotplug.c | 16 ++++++++-------- + 2 files changed, 14 insertions(+), 11 deletions(-) + +--- a/arch/x86/kernel/apic/vector.c ++++ b/arch/x86/kernel/apic/vector.c +@@ -1036,7 +1036,8 @@ static void __vector_schedule_cleanup(st + add_timer_on(&cl->timer, cpu); + } + } else { +- apicd->prev_vector = 0; ++ pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu); ++ free_moved_vector(apicd); + } + raw_spin_unlock(&vector_lock); + } +@@ -1073,6 +1074,7 @@ void irq_complete_move(struct irq_cfg *c + */ + void irq_force_complete_move(struct irq_desc *desc) + { ++ unsigned int cpu = smp_processor_id(); + struct apic_chip_data *apicd; + struct irq_data *irqd; + unsigned int vector; +@@ -1097,10 +1099,11 @@ void irq_force_complete_move(struct irq_ + goto unlock; + + /* +- * If prev_vector is empty, no action required. ++ * If prev_vector is empty or the descriptor is neither currently ++ * nor previously on the outgoing CPU no action required. + */ + vector = apicd->prev_vector; +- if (!vector) ++ if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu)) + goto unlock; + + /* +--- a/kernel/irq/cpuhotplug.c ++++ b/kernel/irq/cpuhotplug.c +@@ -70,6 +70,14 @@ static bool migrate_one_irq(struct irq_d + } + + /* ++ * Complete an eventually pending irq move cleanup. If this ++ * interrupt was moved in hard irq context, then the vectors need ++ * to be cleaned up. It can't wait until this interrupt actually ++ * happens and this CPU was involved. 
++ */ ++ irq_force_complete_move(desc); ++ ++ /* + * No move required, if: + * - Interrupt is per cpu + * - Interrupt is not started +@@ -88,14 +96,6 @@ static bool migrate_one_irq(struct irq_d + } + + /* +- * Complete an eventually pending irq move cleanup. If this +- * interrupt was moved in hard irq context, then the vectors need +- * to be cleaned up. It can't wait until this interrupt actually +- * happens and this CPU was involved. +- */ +- irq_force_complete_move(desc); +- +- /* + * If there is a setaffinity pending, then try to reuse the pending + * mask, so the last change of the affinity does not get lost. If + * there is no move pending or the pending mask does not contain diff --git a/queue-6.9/kvm-x86-don-t-advertise-guest.maxphyaddr-as-host.maxphyaddr-in-cpuid.patch b/queue-6.9/kvm-x86-don-t-advertise-guest.maxphyaddr-as-host.maxphyaddr-in-cpuid.patch new file mode 100644 index 00000000000..5294510e073 --- /dev/null +++ b/queue-6.9/kvm-x86-don-t-advertise-guest.maxphyaddr-as-host.maxphyaddr-in-cpuid.patch @@ -0,0 +1,74 @@ +From 6f5c9600621b4efb5c61b482d767432eb1ad3a9c Mon Sep 17 00:00:00 2001 +From: Gerd Hoffmann +Date: Wed, 13 Mar 2024 13:58:42 +0100 +Subject: KVM: x86: Don't advertise guest.MAXPHYADDR as host.MAXPHYADDR in CPUID + +From: Gerd Hoffmann + +commit 6f5c9600621b4efb5c61b482d767432eb1ad3a9c upstream. + +Drop KVM's propagation of GuestPhysBits (CPUID leaf 80000008, EAX[23:16]) +to HostPhysBits (same leaf, EAX[7:0]) when advertising the address widths +to userspace via KVM_GET_SUPPORTED_CPUID. + +Per AMD, GuestPhysBits is intended for software use, and physical CPUs do +not set that field. I.e. GuestPhysBits will be non-zero if and only if +KVM is running as a nested hypervisor, and in that case, GuestPhysBits is +NOT guaranteed to capture the CPU's effective MAXPHYADDR when running with +TDP enabled. + +E.g. 
KVM will soon use GuestPhysBits to communicate the CPU's maximum +*addressable* guest physical address, which would result in KVM under- +reporting PhysBits when running as an L1 on a CPU with MAXPHYADDR=52, +but without 5-level paging. + +Signed-off-by: Gerd Hoffmann +Cc: stable@vger.kernel.org +Reviewed-by: Xiaoyao Li +Link: https://lore.kernel.org/r/20240313125844.912415-2-kraxel@redhat.com +[sean: rewrite changelog with --verbose, Cc stable@] +Signed-off-by: Sean Christopherson +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/cpuid.c | 21 ++++++++++----------- + 1 file changed, 10 insertions(+), 11 deletions(-) + +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -1232,9 +1232,8 @@ static inline int __do_cpuid_func(struct + entry->eax = entry->ebx = entry->ecx = 0; + break; + case 0x80000008: { +- unsigned g_phys_as = (entry->eax >> 16) & 0xff; +- unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); +- unsigned phys_as = entry->eax & 0xff; ++ unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U); ++ unsigned int phys_as; + + /* + * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as +@@ -1242,16 +1241,16 @@ static inline int __do_cpuid_func(struct + * reductions in MAXPHYADDR for memory encryption affect shadow + * paging, too. + * +- * If TDP is enabled but an explicit guest MAXPHYADDR is not +- * provided, use the raw bare metal MAXPHYADDR as reductions to +- * the HPAs do not affect GPAs. ++ * If TDP is enabled, use the raw bare metal MAXPHYADDR as ++ * reductions to the HPAs do not affect GPAs. 
+ */ +- if (!tdp_enabled) +- g_phys_as = boot_cpu_data.x86_phys_bits; +- else if (!g_phys_as) +- g_phys_as = phys_as; ++ if (!tdp_enabled) { ++ phys_as = boot_cpu_data.x86_phys_bits; ++ } else { ++ phys_as = entry->eax & 0xff; ++ } + +- entry->eax = g_phys_as | (virt_as << 8); ++ entry->eax = phys_as | (virt_as << 8); + entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); + entry->edx = 0; + cpuid_entry_override(entry, CPUID_8000_0008_EBX); diff --git a/queue-6.9/platform-x86-intel-tpmi-handle-error-from-tpmi_process_info.patch b/queue-6.9/platform-x86-intel-tpmi-handle-error-from-tpmi_process_info.patch new file mode 100644 index 00000000000..6f4d9ba4dbb --- /dev/null +++ b/queue-6.9/platform-x86-intel-tpmi-handle-error-from-tpmi_process_info.patch @@ -0,0 +1,42 @@ +From 2920141fc149f71bad22361946417bc43783ed7f Mon Sep 17 00:00:00 2001 +From: Srinivas Pandruvada +Date: Tue, 23 Apr 2024 13:46:10 -0700 +Subject: platform/x86/intel/tpmi: Handle error from tpmi_process_info() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Srinivas Pandruvada + +commit 2920141fc149f71bad22361946417bc43783ed7f upstream. + +When tpmi_process_info() returns error, fail to load the driver. +This can happen if call to ioremap() returns error. + +Signed-off-by: Srinivas Pandruvada +Reviewed-by: Ilpo Järvinen +Cc: stable@vger.kernel.org # v6.3+ +Link: https://lore.kernel.org/r/20240423204619.3946901-2-srinivas.pandruvada@linux.intel.com +Reviewed-by: Hans de Goede +Signed-off-by: Hans de Goede +Signed-off-by: Greg Kroah-Hartman +--- + drivers/platform/x86/intel/tpmi.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/drivers/platform/x86/intel/tpmi.c ++++ b/drivers/platform/x86/intel/tpmi.c +@@ -763,8 +763,11 @@ static int intel_vsec_tpmi_init(struct a + * when actual device nodes created outside this + * loop via tpmi_create_devices(). 
+ */ +- if (pfs->pfs_header.tpmi_id == TPMI_INFO_ID) +- tpmi_process_info(tpmi_info, pfs); ++ if (pfs->pfs_header.tpmi_id == TPMI_INFO_ID) { ++ ret = tpmi_process_info(tpmi_info, pfs); ++ if (ret) ++ return ret; ++ } + + if (pfs->pfs_header.tpmi_id == TPMI_CONTROL_ID) + tpmi_set_control_base(auxdev, tpmi_info, pfs); diff --git a/queue-6.9/platform-x86-intel-uncore-freq-don-t-present-root-domain-on-error.patch b/queue-6.9/platform-x86-intel-uncore-freq-don-t-present-root-domain-on-error.patch new file mode 100644 index 00000000000..2930a70e33c --- /dev/null +++ b/queue-6.9/platform-x86-intel-uncore-freq-don-t-present-root-domain-on-error.patch @@ -0,0 +1,50 @@ +From db643cb7ebe524d17b4b13583dda03485d4a1bc0 Mon Sep 17 00:00:00 2001 +From: Srinivas Pandruvada +Date: Mon, 15 Apr 2024 14:52:10 -0700 +Subject: platform/x86/intel-uncore-freq: Don't present root domain on error + +From: Srinivas Pandruvada + +commit db643cb7ebe524d17b4b13583dda03485d4a1bc0 upstream. + +If none of the clusters are added because of some error, fail to load +driver without presenting root domain. In this case root domain will +present invalid data. 
+ +Signed-off-by: Srinivas Pandruvada +Fixes: 01c10f88c9b7 ("platform/x86/intel-uncore-freq: tpmi: Provide cluster level control") +Cc: # 6.5+ +Link: https://lore.kernel.org/r/20240415215210.2824868-1-srinivas.pandruvada@linux.intel.com +Reviewed-by: Hans de Goede +Signed-off-by: Hans de Goede +Signed-off-by: Greg Kroah-Hartman +--- + drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c ++++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c +@@ -240,6 +240,7 @@ static int uncore_probe(struct auxiliary + bool read_blocked = 0, write_blocked = 0; + struct intel_tpmi_plat_info *plat_info; + struct tpmi_uncore_struct *tpmi_uncore; ++ bool uncore_sysfs_added = false; + int ret, i, pkg = 0; + int num_resources; + +@@ -384,9 +385,15 @@ static int uncore_probe(struct auxiliary + } + /* Point to next cluster offset */ + cluster_offset >>= UNCORE_MAX_CLUSTER_PER_DOMAIN; ++ uncore_sysfs_added = true; + } + } + ++ if (!uncore_sysfs_added) { ++ ret = -ENODEV; ++ goto remove_clusters; ++ } ++ + auxiliary_set_drvdata(auxdev, tpmi_uncore); + + tpmi_uncore->root_cluster.root_domain = true; diff --git a/queue-6.9/series b/queue-6.9/series index 1b90c113cb8..60916d71a7c 100644 --- a/queue-6.9/series +++ b/queue-6.9/series @@ -363,3 +363,12 @@ hwmon-shtc1-fix-property-misspelling.patch riscv-prevent-pt_regs-corruption-for-secondary-idle-.patch alsa-seq-ump-fix-swapped-song-position-pointer-data.patch revert-drm-make-drivers-depends-on-drm_dw_hdmi.patch +x86-efistub-omit-physical-kaslr-when-memory-reservations-exist.patch +efi-libstub-only-free-priv.runtime_map-when-allocated.patch +x86-topology-handle-bogus-acpi-tables-correctly.patch +x86-pci-skip-early-e820-check-for-ecam-region.patch +kvm-x86-don-t-advertise-guest.maxphyaddr-as-host.maxphyaddr-in-cpuid.patch +x86-topology-intel-unlock-cpuid-before-evaluating-anything.patch 
+genirq-cpuhotplug-x86-vector-prevent-vector-leak-during-cpu-offline.patch +platform-x86-intel-tpmi-handle-error-from-tpmi_process_info.patch +platform-x86-intel-uncore-freq-don-t-present-root-domain-on-error.patch diff --git a/queue-6.9/x86-efistub-omit-physical-kaslr-when-memory-reservations-exist.patch b/queue-6.9/x86-efistub-omit-physical-kaslr-when-memory-reservations-exist.patch new file mode 100644 index 00000000000..7dafff96bb6 --- /dev/null +++ b/queue-6.9/x86-efistub-omit-physical-kaslr-when-memory-reservations-exist.patch @@ -0,0 +1,107 @@ +From 15aa8fb852f995dd234a57f12dfb989044968bb6 Mon Sep 17 00:00:00 2001 +From: Ard Biesheuvel +Date: Thu, 16 May 2024 11:05:42 +0200 +Subject: x86/efistub: Omit physical KASLR when memory reservations exist + +From: Ard Biesheuvel + +commit 15aa8fb852f995dd234a57f12dfb989044968bb6 upstream. + +The legacy decompressor has elaborate logic to ensure that the +randomized physical placement of the decompressed kernel image does not +conflict with any memory reservations, including ones specified on the +command line using mem=, memmap=, efi_fake_mem= or hugepages=, which are +taken into account by the kernel proper at a later stage. + +When booting in EFI mode, it is the firmware's job to ensure that the +chosen range does not conflict with any memory reservations that it +knows about, and this is trivially achieved by using the firmware's +memory allocation APIs. + +That leaves reservations specified on the command line, though, which +the firmware knows nothing about, as these regions have no other special +significance to the platform. Since commit + + a1b87d54f4e4 ("x86/efistub: Avoid legacy decompressor when doing EFI boot") + +these reservations are not taken into account when randomizing the +physical placement, which may result in conflicts where the memory +cannot be reserved by the kernel proper because its own executable image +resides there. 
+ +To avoid having to duplicate or reuse the existing complicated logic, +disable physical KASLR entirely when such overrides are specified. These +are mostly diagnostic tools or niche features, and physical KASLR (as +opposed to virtual KASLR, which is much more important as it affects the +memory addresses observed by code executing in the kernel) is something +we can live without. + +Closes: https://lkml.kernel.org/r/FA5F6719-8824-4B04-803E-82990E65E627%40akamai.com +Reported-by: Ben Chaney +Fixes: a1b87d54f4e4 ("x86/efistub: Avoid legacy decompressor when doing EFI boot") +Cc: # v6.1+ +Reviewed-by: Kees Cook +Signed-off-by: Ard Biesheuvel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/firmware/efi/libstub/x86-stub.c | 28 ++++++++++++++++++++++++++-- + 1 file changed, 26 insertions(+), 2 deletions(-) + +--- a/drivers/firmware/efi/libstub/x86-stub.c ++++ b/drivers/firmware/efi/libstub/x86-stub.c +@@ -776,6 +776,26 @@ static void error(char *str) + efi_warn("Decompression failed: %s\n", str); + } + ++static const char *cmdline_memmap_override; ++ ++static efi_status_t parse_options(const char *cmdline) ++{ ++ static const char opts[][14] = { ++ "mem=", "memmap=", "efi_fake_mem=", "hugepages=" ++ }; ++ ++ for (int i = 0; i < ARRAY_SIZE(opts); i++) { ++ const char *p = strstr(cmdline, opts[i]); ++ ++ if (p == cmdline || (p > cmdline && isspace(p[-1]))) { ++ cmdline_memmap_override = opts[i]; ++ break; ++ } ++ } ++ ++ return efi_parse_options(cmdline); ++} ++ + static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry) + { + unsigned long virt_addr = LOAD_PHYSICAL_ADDR; +@@ -807,6 +827,10 @@ static efi_status_t efi_decompress_kerne + !memcmp(efistub_fw_vendor(), ami, sizeof(ami))) { + efi_debug("AMI firmware v2.0 or older detected - disabling physical KASLR\n"); + seed[0] = 0; ++ } else if (cmdline_memmap_override) { ++ efi_info("%s detected on the kernel command line - disabling physical KASLR\n", ++ cmdline_memmap_override); ++ seed[0] = 0; + } + + 
boot_params_ptr->hdr.loadflags |= KASLR_FLAG; +@@ -883,7 +907,7 @@ void __noreturn efi_stub_entry(efi_handl + } + + #ifdef CONFIG_CMDLINE_BOOL +- status = efi_parse_options(CONFIG_CMDLINE); ++ status = parse_options(CONFIG_CMDLINE); + if (status != EFI_SUCCESS) { + efi_err("Failed to parse options\n"); + goto fail; +@@ -892,7 +916,7 @@ void __noreturn efi_stub_entry(efi_handl + if (!IS_ENABLED(CONFIG_CMDLINE_OVERRIDE)) { + unsigned long cmdline_paddr = ((u64)hdr->cmd_line_ptr | + ((u64)boot_params->ext_cmd_line_ptr << 32)); +- status = efi_parse_options((char *)cmdline_paddr); ++ status = parse_options((char *)cmdline_paddr); + if (status != EFI_SUCCESS) { + efi_err("Failed to parse options\n"); + goto fail; diff --git a/queue-6.9/x86-pci-skip-early-e820-check-for-ecam-region.patch b/queue-6.9/x86-pci-skip-early-e820-check-for-ecam-region.patch new file mode 100644 index 00000000000..a2f70380296 --- /dev/null +++ b/queue-6.9/x86-pci-skip-early-e820-check-for-ecam-region.patch @@ -0,0 +1,131 @@ +From 199f968f1484a14024d0d467211ffc2faf193eb4 Mon Sep 17 00:00:00 2001 +From: Bjorn Helgaas +Date: Wed, 17 Apr 2024 15:40:12 -0500 +Subject: x86/pci: Skip early E820 check for ECAM region + +From: Bjorn Helgaas + +commit 199f968f1484a14024d0d467211ffc2faf193eb4 upstream. + +Arul, Mateusz, Imcarneiro91, and Aman reported a regression caused by +07eab0901ede ("efi/x86: Remove EfiMemoryMappedIO from E820 map"). On the +Lenovo Legion 9i laptop, that commit removes the ECAM area from E820, which +means the early E820 validation fails, which means we don't enable ECAM in +the "early MCFG" path. + +The static MCFG table describes ECAM without depending on the ACPI +interpreter. Many Legion 9i ACPI methods rely on that, so they fail when +PCI config access isn't available, resulting in the embedded controller, +PS/2, audio, trackpad, and battery devices not being detected. 
The _OSC +method also fails, so Linux can't take control of the PCIe hotplug, PME, +and AER features: + + # pci_mmcfg_early_init() + + PCI: ECAM [mem 0xc0000000-0xce0fffff] (base 0xc0000000) for domain 0000 [bus 00-e0] + PCI: not using ECAM ([mem 0xc0000000-0xce0fffff] not reserved) + + ACPI Error: AE_ERROR, Returned by Handler for [PCI_Config] (20230628/evregion-300) + ACPI: Interpreter enabled + ACPI: Ignoring error and continuing table load + ACPI BIOS Error (bug): Could not resolve symbol [\_SB.PC00.RP01._SB.PC00], AE_NOT_FOUND (20230628/dswload2-162) + ACPI Error: AE_NOT_FOUND, During name lookup/catalog (20230628/psobject-220) + ACPI: Skipping parse of AML opcode: OpcodeName unavailable (0x0010) + ACPI BIOS Error (bug): Could not resolve symbol [\_SB.PC00.RP01._SB.PC00], AE_NOT_FOUND (20230628/dswload2-162) + ACPI Error: AE_NOT_FOUND, During name lookup/catalog (20230628/psobject-220) + ... + ACPI Error: Aborting method \_SB.PC00._OSC due to previous error (AE_NOT_FOUND) (20230628/psparse-529) + acpi PNP0A08:00: _OSC: platform retains control of PCIe features (AE_NOT_FOUND) + + # pci_mmcfg_late_init() + + PCI: ECAM [mem 0xc0000000-0xce0fffff] (base 0xc0000000) for domain 0000 [bus 00-e0] + PCI: [Firmware Info]: ECAM [mem 0xc0000000-0xce0fffff] not reserved in ACPI motherboard resources + PCI: ECAM [mem 0xc0000000-0xce0fffff] is EfiMemoryMappedIO; assuming valid + PCI: ECAM [mem 0xc0000000-0xce0fffff] reserved to work around lack of ACPI motherboard _CRS + +Per PCI Firmware r3.3, sec 4.1.2, ECAM space must be reserved by a PNP0C02 +resource, but there's no requirement to mention it in E820, so we shouldn't +look at E820 to validate the ECAM space described by MCFG. + +In 2006, 946f2ee5c731 ("[PATCH] i386/x86-64: Check that MCFG points to an +e820 reserved area") added a sanity check of E820 to work around buggy MCFG +tables, but that over-aggressive validation causes failures like this one. 
+ +Keep the E820 validation check for machines older than 2016, an arbitrary +ten years after 946f2ee5c731, so machines that depend on it don't break. + +Skip the early E820 check for 2016 and newer BIOSes since there's no +requirement to describe ECAM in E820. + +Link: https://lore.kernel.org/r/20240417204012.215030-2-helgaas@kernel.org +Fixes: 07eab0901ede ("efi/x86: Remove EfiMemoryMappedIO from E820 map") +Reported-by: Mateusz Kaduk +Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218444 +Signed-off-by: Bjorn Helgaas +Tested-by: Mateusz Kaduk +Reviewed-by: Andy Shevchenko +Reviewed-by: Hans de Goede +Reviewed-by: Kuppuswamy Sathyanarayanan +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/pci/mmconfig-shared.c | 40 +++++++++++++++++++++++++++++----------- + 1 file changed, 29 insertions(+), 11 deletions(-) + +--- a/arch/x86/pci/mmconfig-shared.c ++++ b/arch/x86/pci/mmconfig-shared.c +@@ -518,7 +518,34 @@ static bool __ref pci_mmcfg_reserved(str + { + struct resource *conflict; + +- if (!early && !acpi_disabled) { ++ if (early) { ++ ++ /* ++ * Don't try to do this check unless configuration type 1 ++ * is available. How about type 2? ++ */ ++ ++ /* ++ * 946f2ee5c731 ("Check that MCFG points to an e820 ++ * reserved area") added this E820 check in 2006 to work ++ * around BIOS defects. ++ * ++ * Per PCI Firmware r3.3, sec 4.1.2, ECAM space must be ++ * reserved by a PNP0C02 resource, but it need not be ++ * mentioned in E820. Before the ACPI interpreter is ++ * available, we can't check for PNP0C02 resources, so ++ * there's no reliable way to verify the region in this ++ * early check. Keep it only for the old machines that ++ * motivated 946f2ee5c731. 
++ */ ++ if (dmi_get_bios_year() < 2016 && raw_pci_ops) ++ return is_mmconf_reserved(e820__mapped_all, cfg, dev, ++ "E820 entry"); ++ ++ return true; ++ } ++ ++ if (!acpi_disabled) { + if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, + "ACPI motherboard resource")) + return true; +@@ -551,16 +578,7 @@ static bool __ref pci_mmcfg_reserved(str + * For MCFG information constructed from hotpluggable host bridge's + * _CBA method, just assume it's reserved. + */ +- if (pci_mmcfg_running_state) +- return true; +- +- /* Don't try to do this check unless configuration +- type 1 is available. how about type 2 ?*/ +- if (raw_pci_ops) +- return is_mmconf_reserved(e820__mapped_all, cfg, dev, +- "E820 entry"); +- +- return false; ++ return pci_mmcfg_running_state; + } + + static void __init pci_mmcfg_reject_broken(int early) diff --git a/queue-6.9/x86-topology-handle-bogus-acpi-tables-correctly.patch b/queue-6.9/x86-topology-handle-bogus-acpi-tables-correctly.patch new file mode 100644 index 00000000000..9265875cb64 --- /dev/null +++ b/queue-6.9/x86-topology-handle-bogus-acpi-tables-correctly.patch @@ -0,0 +1,137 @@ +From 9d22c96316ac59ed38e80920c698fed38717b91b Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Fri, 17 May 2024 16:40:36 +0200 +Subject: x86/topology: Handle bogus ACPI tables correctly + +From: Thomas Gleixner + +commit 9d22c96316ac59ed38e80920c698fed38717b91b upstream. + +The ACPI specification clearly states how the processors should be +enumerated in the MADT: + + "To ensure that the boot processor is supported post initialization, + two guidelines should be followed. The first is that OSPM should + initialize processors in the order that they appear in the MADT. The + second is that platform firmware should list the boot processor as the + first processor entry in the MADT. + ... + Failure of OSPM implementations and platform firmware to abide by + these guidelines can result in both unpredictable and non optimal + platform operation." 
+ +The kernel relies on that ordering to detect the real BSP on crash kernels +which is important to avoid sending an INIT IPI to it as that would cause a +full machine reset. + +On a Dell XPS 16 9640 the BIOS ignores this rule and enumerates the CPUs in +the wrong order. As a consequence the kernel falsely detects a crash kernel +and disables the corresponding CPU. + +Prevent this by checking the IA32_APICBASE MSR for the BSP bit on the boot +CPU. If that bit is set, then the MADT based BSP detection can be safely +ignored. If the kernel detects a mismatch between the BSP bit and the first +enumerated MADT entry then emit a firmware bug message. + +This obviously also has to be taken into account when the boot APIC ID and +the first enumerated APIC ID match. If the boot CPU does not have the BSP +bit set in the APICBASE MSR then there is no way for the boot CPU to +determine which of the CPUs is the real BSP. Sending an INIT to the real +BSP would reset the machine so the only sane way to deal with that is to +limit the number of CPUs to one and emit a corresponding warning message. 
+ +Fixes: 5c5682b9f87a ("x86/cpu: Detect real BSP on crash kernels") +Reported-by: Carsten Tolkmit +Signed-off-by: Thomas Gleixner +Tested-by: Carsten Tolkmit +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/87le48jycb.ffs@tglx +Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218837 +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/cpu/topology.c | 55 +++++++++++++++++++++++++++++++--- + 1 file changed, 51 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c +index d17c9b71eb4a..621a151ccf7d 100644 +--- a/arch/x86/kernel/cpu/topology.c ++++ b/arch/x86/kernel/cpu/topology.c +@@ -128,6 +128,9 @@ static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id) + + static __init bool check_for_real_bsp(u32 apic_id) + { ++ bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6; ++ u64 msr; ++ + /* + * There is no real good way to detect whether this a kdump() + * kernel, but except on the Voyager SMP monstrosity which is not +@@ -144,17 +147,61 @@ static __init bool check_for_real_bsp(u32 apic_id) + if (topo_info.real_bsp_apic_id != BAD_APICID) + return false; + +- if (apic_id == topo_info.boot_cpu_apic_id) { +- topo_info.real_bsp_apic_id = apic_id; +- return false; ++ /* ++ * Check whether the enumeration order is broken by evaluating the ++ * BSP bit in the APICBASE MSR. If the CPU does not have the ++ * APICBASE MSR then the BSP detection is not possible and the ++ * kernel must rely on the firmware enumeration order. ++ */ ++ if (has_apic_base) { ++ rdmsrl(MSR_IA32_APICBASE, msr); ++ is_bsp = !!(msr & MSR_IA32_APICBASE_BSP); + } + +- pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x > %x\n", ++ if (apic_id == topo_info.boot_cpu_apic_id) { ++ /* ++ * If the boot CPU has the APIC BSP bit set then the ++ * firmware enumeration is agreeing. If the CPU does not ++ * have the APICBASE MSR then the only choice is to trust ++ * the enumeration order. 
++ */ ++ if (is_bsp || !has_apic_base) { ++ topo_info.real_bsp_apic_id = apic_id; ++ return false; ++ } ++ /* ++ * If the boot APIC is enumerated first, but the APICBASE ++ * MSR does not have the BSP bit set, then there is no way ++ * to discover the real BSP here. Assume a crash kernel and ++ * limit the number of CPUs to 1 as an INIT to the real BSP ++ * would reset the machine. ++ */ ++ pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id); ++ pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n"); ++ set_nr_cpu_ids(1); ++ goto fwbug; ++ } ++ ++ pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n", + topo_info.boot_cpu_apic_id, apic_id); ++ ++ if (is_bsp) { ++ /* ++ * The boot CPU has the APIC BSP bit set. Use it and complain ++ * about the broken firmware enumeration. ++ */ ++ topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id; ++ goto fwbug; ++ } ++ + pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n"); + + topo_info.real_bsp_apic_id = apic_id; + return true; ++ ++fwbug: ++ pr_warn(FW_BUG "APIC enumeration order not specification compliant\n"); ++ return false; + } + + static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level, +-- +2.45.2 + diff --git a/queue-6.9/x86-topology-intel-unlock-cpuid-before-evaluating-anything.patch b/queue-6.9/x86-topology-intel-unlock-cpuid-before-evaluating-anything.patch new file mode 100644 index 00000000000..19a9a92a59b --- /dev/null +++ b/queue-6.9/x86-topology-intel-unlock-cpuid-before-evaluating-anything.patch @@ -0,0 +1,109 @@ +From 0c2f6d04619ec2b53ad4b0b591eafc9389786e86 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 30 May 2024 17:29:18 +0200 +Subject: x86/topology/intel: Unlock CPUID before evaluating anything + +From: Thomas Gleixner + +commit 0c2f6d04619ec2b53ad4b0b591eafc9389786e86 upstream. + +Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. 
If +this bit is set by the BIOS then CPUID evaluation including topology +enumeration does not work correctly as the evaluation code does not try +to analyze any leaf greater than two. + +This went unnoticed before because the original topology code just +repeated evaluation several times and managed to overwrite the initial +limited information with the correct one later. The new evaluation code +does it once and therefore ends up with the limited and wrong +information. + +Cure this by unlocking CPUID right before evaluating anything which +depends on the maximum CPUID leaf being greater than two instead of +rereading stuff after unlock. + +Fixes: 22d63660c35e ("x86/cpu: Use common topology code for Intel") +Reported-by: Peter Schneider +Signed-off-by: Thomas Gleixner +Signed-off-by: Borislav Petkov (AMD) +Tested-by: Peter Schneider +Cc: +Link: https://lore.kernel.org/r/fd3f73dc-a86f-4bcf-9c60-43556a21eb42@googlemail.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/cpu/common.c | 3 ++- + arch/x86/kernel/cpu/cpu.h | 2 ++ + arch/x86/kernel/cpu/intel.c | 25 ++++++++++++++++--------- + 3 files changed, 20 insertions(+), 10 deletions(-) + +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1589,6 +1589,7 @@ static void __init early_identify_cpu(st + if (have_cpuid_p()) { + cpu_detect(c); + get_cpu_vendor(c); ++ intel_unlock_cpuid_leafs(c); + get_cpu_cap(c); + setup_force_cpu_cap(X86_FEATURE_CPUID); + get_cpu_address_sizes(c); +@@ -1748,7 +1749,7 @@ static void generic_identify(struct cpui + cpu_detect(c); + + get_cpu_vendor(c); +- ++ intel_unlock_cpuid_leafs(c); + get_cpu_cap(c); + + get_cpu_address_sizes(c); +--- a/arch/x86/kernel/cpu/cpu.h ++++ b/arch/x86/kernel/cpu/cpu.h +@@ -61,9 +61,11 @@ extern __ro_after_init enum tsx_ctrl_sta + + extern void __init tsx_init(void); + void tsx_ap_init(void); ++void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c); + #else + static inline void tsx_init(void) { } + static inline void 
tsx_ap_init(void) { } ++static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { } + #endif /* CONFIG_CPU_SUP_INTEL */ + + extern void init_spectral_chicken(struct cpuinfo_x86 *c); +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -268,19 +268,26 @@ detect_keyid_bits: + c->x86_phys_bits -= keyid_bits; + } + ++void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) ++{ ++ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ++ return; ++ ++ if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd)) ++ return; ++ ++ /* ++ * The BIOS can have limited CPUID to leaf 2, which breaks feature ++ * enumeration. Unlock it and update the maximum leaf info. ++ */ ++ if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) ++ c->cpuid_level = cpuid_eax(0); ++} ++ + static void early_init_intel(struct cpuinfo_x86 *c) + { + u64 misc_enable; + +- /* Unmask CPUID levels if masked: */ +- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { +- if (msr_clear_bit(MSR_IA32_MISC_ENABLE, +- MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { +- c->cpuid_level = cpuid_eax(0); +- get_cpu_cap(c); +- } +- } +- + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);