From c520a302f3457efdf6965a290e6041600ab2d6c5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 4 Mar 2024 08:38:40 +0100 Subject: [PATCH] 6.7-stable patches added patches: fprobe-fix-to-allocate-entry_data_size-buffer-with-rethook-instances.patch fs-aio-make-io_cancel-generate-completions-again.patch mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch x86-cpu-allow-reducing-x86_phys_bits-during-early_identify_cpu.patch x86-cpu-intel-detect-tme-keyid-bits-before-setting-mtrr-mask-registers.patch x86-e820-don-t-reserve-setup_rng_seed-in-e820.patch --- ...a_size-buffer-with-rethook-instances.patch | 66 +++++ ...io_cancel-generate-completions-again.patch | 85 ++++++ ...le-fix-bug_on-with-pud-advanced-test.patch | 81 ++++++ ...akeup_kswapd-with-a-wrong-zone-index.patch | 93 +++++++ queue-6.7/series | 7 + ..._phys_bits-during-early_identify_cpu.patch | 55 ++++ ...s-before-setting-mtrr-mask-registers.patch | 243 ++++++++++++++++++ ...don-t-reserve-setup_rng_seed-in-e820.patch | 51 ++++ 8 files changed, 681 insertions(+) create mode 100644 queue-6.7/fprobe-fix-to-allocate-entry_data_size-buffer-with-rethook-instances.patch create mode 100644 queue-6.7/fs-aio-make-io_cancel-generate-completions-again.patch create mode 100644 queue-6.7/mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch create mode 100644 queue-6.7/mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch create mode 100644 queue-6.7/x86-cpu-allow-reducing-x86_phys_bits-during-early_identify_cpu.patch create mode 100644 queue-6.7/x86-cpu-intel-detect-tme-keyid-bits-before-setting-mtrr-mask-registers.patch create mode 100644 queue-6.7/x86-e820-don-t-reserve-setup_rng_seed-in-e820.patch diff --git a/queue-6.7/fprobe-fix-to-allocate-entry_data_size-buffer-with-rethook-instances.patch b/queue-6.7/fprobe-fix-to-allocate-entry_data_size-buffer-with-rethook-instances.patch new file mode 100644 index 00000000000..4ffdf2a76eb --- /dev/null +++ b/queue-6.7/fprobe-fix-to-allocate-entry_data_size-buffer-with-rethook-instances.patch @@ -0,0 +1,66 @@ +From 6572786006fa96ad2c35bb31757f1f861298093b Mon Sep 17 00:00:00 2001 +From: "Masami Hiramatsu (Google)" +Date: Fri, 1 Mar 2024 09:18:24 +0900 +Subject: fprobe: Fix to allocate entry_data_size buffer with rethook instances + +From: Masami Hiramatsu (Google) + +commit 6572786006fa96ad2c35bb31757f1f861298093b upstream. + +Fix to allocate fprobe::entry_data_size buffer with rethook instances. +If fprobe doesn't allocate entry_data_size buffer for each rethook instance, +fprobe entry handler can cause a buffer overrun when storing entry data in +entry handler. + +Link: https://lore.kernel.org/all/170920576727.107552.638161246679734051.stgit@devnote2/ + +Reported-by: Jiri Olsa +Closes: https://lore.kernel.org/all/Zd9eBn2FTQzYyg7L@krava/ +Fixes: 4bbd93455659 ("kprobes: kretprobe scalability improvement") +Cc: stable@vger.kernel.org +Tested-by: Jiri Olsa +Signed-off-by: Masami Hiramatsu (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/fprobe.c | 14 ++++++-------- + 1 file changed, 6 insertions(+), 8 deletions(-) + +diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c +index 6cd2a4e3afb8..9ff018245840 100644 +--- a/kernel/trace/fprobe.c ++++ b/kernel/trace/fprobe.c +@@ -189,9 +189,6 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) + { + int size; + +- if (num <= 0) +- return -EINVAL; +- + if (!fp->exit_handler) { + fp->rethook = NULL; + return 0; +@@ -199,15 +196,16 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) + + /* Initialize rethook if needed */ + if (fp->nr_maxactive) +- size = fp->nr_maxactive; ++ num = fp->nr_maxactive; + else +- size = num * num_possible_cpus() * 2; +- if (size <= 0) ++ num *= num_possible_cpus() * 2; ++ if (num <= 0) + return -EINVAL; + ++ size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size; ++ + /* Initialize rethook */ +- fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, +- sizeof(struct fprobe_rethook_node), size); ++ fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num); + if (IS_ERR(fp->rethook)) + return PTR_ERR(fp->rethook); + +-- +2.44.0 + diff --git a/queue-6.7/fs-aio-make-io_cancel-generate-completions-again.patch b/queue-6.7/fs-aio-make-io_cancel-generate-completions-again.patch new file mode 100644 index 00000000000..1969e81c03b --- /dev/null +++ b/queue-6.7/fs-aio-make-io_cancel-generate-completions-again.patch @@ -0,0 +1,85 @@ +From 54cbc058d86beca3515c994039b5c0f0a34f53dd Mon Sep 17 00:00:00 2001 +From: Bart Van Assche +Date: Thu, 15 Feb 2024 12:47:39 -0800 +Subject: fs/aio: Make io_cancel() generate completions again + +From: Bart Van Assche + +commit 54cbc058d86beca3515c994039b5c0f0a34f53dd upstream. + +The following patch accidentally removed the code for delivering +completions for cancelled reads and writes to user space: "[PATCH 04/33] +aio: remove retry-based AIO" +(https://lore.kernel.org/all/1363883754-27966-5-git-send-email-koverstreet@google.com/) +>From that patch: + +- if (kiocbIsCancelled(iocb)) { +- ret = -EINTR; +- aio_complete(iocb, ret, 0); +- /* must not access the iocb after this */ +- goto out; +- } + +This leads to a leak in user space of a struct iocb. Hence this patch +that restores the code that reports to user space that a read or write +has been cancelled successfully. + +Fixes: 41003a7bcfed ("aio: remove retry-based AIO") +Cc: Christoph Hellwig +Cc: Avi Kivity +Cc: Sandeep Dhavale +Cc: Jens Axboe +Cc: Greg Kroah-Hartman +Cc: Kent Overstreet +Cc: stable@vger.kernel.org +Signed-off-by: Bart Van Assche +Link: https://lore.kernel.org/r/20240215204739.2677806-3-bvanassche@acm.org +Signed-off-by: Christian Brauner +Signed-off-by: Greg Kroah-Hartman +--- + fs/aio.c | 27 +++++++++++---------------- + 1 file changed, 11 insertions(+), 16 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -2119,14 +2119,11 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat + #endif + + /* sys_io_cancel: +- * Attempts to cancel an iocb previously passed to io_submit. If +- * the operation is successfully cancelled, the resulting event is +- * copied into the memory pointed to by result without being placed +- * into the completion queue and 0 is returned. May fail with +- * -EFAULT if any of the data structures pointed to are invalid. +- * May fail with -EINVAL if aio_context specified by ctx_id is +- * invalid. May fail with -EAGAIN if the iocb specified was not +- * cancelled. Will fail with -ENOSYS if not implemented. ++ * Attempts to cancel an iocb previously passed to io_submit(). If the ++ * operation is successfully cancelled 0 is returned. May fail with ++ * -EFAULT if any of the data structures pointed to are invalid. May ++ * fail with -EINVAL if aio_context specified by ctx_id is invalid. Will ++ * fail with -ENOSYS if not implemented. + */ + SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, + struct io_event __user *, result) +@@ -2157,14 +2154,12 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t + } + spin_unlock_irq(&ctx->ctx_lock); + +- if (!ret) { +- /* +- * The result argument is no longer used - the io_event is +- * always delivered via the ring buffer. -EINPROGRESS indicates +- * cancellation is progress: +- */ +- ret = -EINPROGRESS; +- } ++ /* ++ * The result argument is no longer used - the io_event is always ++ * delivered via the ring buffer. ++ */ ++ if (ret == 0 && kiocb->rw.ki_flags & IOCB_AIO_RW) ++ aio_complete_rw(&kiocb->rw, -EINTR); + + percpu_ref_put(&ctx->users); + diff --git a/queue-6.7/mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch b/queue-6.7/mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch new file mode 100644 index 00000000000..9ef63d95430 --- /dev/null +++ b/queue-6.7/mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch @@ -0,0 +1,81 @@ +From 720da1e593b85a550593b415bf1d79a053133451 Mon Sep 17 00:00:00 2001 +From: "Aneesh Kumar K.V (IBM)" +Date: Mon, 29 Jan 2024 11:30:22 +0530 +Subject: mm/debug_vm_pgtable: fix BUG_ON with pud advanced test + +From: Aneesh Kumar K.V (IBM) + +commit 720da1e593b85a550593b415bf1d79a053133451 upstream. + +Architectures like powerpc add debug checks to ensure we find only devmap +PUD pte entries. These debug checks are only done with CONFIG_DEBUG_VM. +This patch marks the ptes used for PUD advanced test devmap pte entries so +that we don't hit on debug checks on architecture like ppc64 as below. + +WARNING: CPU: 2 PID: 1 at arch/powerpc/mm/book3s64/radix_pgtable.c:1382 radix__pud_hugepage_update+0x38/0x138 +.... +NIP [c0000000000a7004] radix__pud_hugepage_update+0x38/0x138 +LR [c0000000000a77a8] radix__pudp_huge_get_and_clear+0x28/0x60 +Call Trace: +[c000000004a2f950] [c000000004a2f9a0] 0xc000000004a2f9a0 (unreliable) +[c000000004a2f980] [000d34c100000000] 0xd34c100000000 +[c000000004a2f9a0] [c00000000206ba98] pud_advanced_tests+0x118/0x334 +[c000000004a2fa40] [c00000000206db34] debug_vm_pgtable+0xcbc/0x1c48 +[c000000004a2fc10] [c00000000000fd28] do_one_initcall+0x60/0x388 + +Also + + kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:202! + .... + + NIP [c000000000096510] pudp_huge_get_and_clear_full+0x98/0x174 + LR [c00000000206bb34] pud_advanced_tests+0x1b4/0x334 + Call Trace: + [c000000004a2f950] [000d34c100000000] 0xd34c100000000 (unreliable) + [c000000004a2f9a0] [c00000000206bb34] pud_advanced_tests+0x1b4/0x334 + [c000000004a2fa40] [c00000000206db34] debug_vm_pgtable+0xcbc/0x1c48 + [c000000004a2fc10] [c00000000000fd28] do_one_initcall+0x60/0x388 + +Link: https://lkml.kernel.org/r/20240129060022.68044-1-aneesh.kumar@kernel.org +Fixes: 27af67f35631 ("powerpc/book3s64/mm: enable transparent pud hugepage") +Signed-off-by: Aneesh Kumar K.V (IBM) +Cc: Anshuman Khandual +Cc: Michael Ellerman +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/debug_vm_pgtable.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/mm/debug_vm_pgtable.c ++++ b/mm/debug_vm_pgtable.c +@@ -362,6 +362,12 @@ static void __init pud_advanced_tests(st + vaddr &= HPAGE_PUD_MASK; + + pud = pfn_pud(args->pud_pfn, args->page_prot); ++ /* ++ * Some architectures have debug checks to make sure ++ * huge pud mapping are only found with devmap entries ++ * For now test with only devmap entries. ++ */ ++ pud = pud_mkdevmap(pud); + set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); + pudp_set_wrprotect(args->mm, vaddr, args->pudp); +@@ -374,6 +380,7 @@ static void __init pud_advanced_tests(st + WARN_ON(!pud_none(pud)); + #endif /* __PAGETABLE_PMD_FOLDED */ + pud = pfn_pud(args->pud_pfn, args->page_prot); ++ pud = pud_mkdevmap(pud); + pud = pud_wrprotect(pud); + pud = pud_mkclean(pud); + set_pud_at(args->mm, vaddr, args->pudp, pud); +@@ -391,6 +398,7 @@ static void __init pud_advanced_tests(st + #endif /* __PAGETABLE_PMD_FOLDED */ + + pud = pfn_pud(args->pud_pfn, args->page_prot); ++ pud = pud_mkdevmap(pud); + pud = pud_mkyoung(pud); + set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); diff --git a/queue-6.7/mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch b/queue-6.7/mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch new file mode 100644 index 00000000000..d67767ac04b --- /dev/null +++ b/queue-6.7/mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch @@ -0,0 +1,93 @@ +From 2774f256e7c0219e2b0a0894af1c76bdabc4f974 Mon Sep 17 00:00:00 2001 +From: Byungchul Park +Date: Fri, 16 Feb 2024 20:15:02 +0900 +Subject: mm/vmscan: fix a bug calling wakeup_kswapd() with a wrong zone index + +From: Byungchul Park + +commit 2774f256e7c0219e2b0a0894af1c76bdabc4f974 upstream. + +With numa balancing on, when a numa system is running where a numa node +doesn't have its local memory so it has no managed zones, the following +oops has been observed. It's because wakeup_kswapd() is called with a +wrong zone index, -1. Fixed it by checking the index before calling +wakeup_kswapd(). + +> BUG: unable to handle page fault for address: 00000000000033f3 +> #PF: supervisor read access in kernel mode +> #PF: error_code(0x0000) - not-present page +> PGD 0 P4D 0 +> Oops: 0000 [#1] PREEMPT SMP NOPTI +> CPU: 2 PID: 895 Comm: masim Not tainted 6.6.0-dirty #255 +> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS +> rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 +> RIP: 0010:wakeup_kswapd (./linux/mm/vmscan.c:7812) +> Code: (omitted) +> RSP: 0000:ffffc90004257d58 EFLAGS: 00010286 +> RAX: ffffffffffffffff RBX: ffff88883fff0480 RCX: 0000000000000003 +> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88883fff0480 +> RBP: ffffffffffffffff R08: ff0003ffffffffff R09: ffffffffffffffff +> R10: ffff888106c95540 R11: 0000000055555554 R12: 0000000000000003 +> R13: 0000000000000000 R14: 0000000000000000 R15: ffff88883fff0940 +> FS: 00007fc4b8124740(0000) GS:ffff888827c00000(0000) knlGS:0000000000000000 +> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +> CR2: 00000000000033f3 CR3: 000000026cc08004 CR4: 0000000000770ee0 +> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +> PKRU: 55555554 +> Call Trace: +> +> ? __die +> ? page_fault_oops +> ? __pte_offset_map_lock +> ? exc_page_fault +> ? asm_exc_page_fault +> ? wakeup_kswapd +> migrate_misplaced_page +> __handle_mm_fault +> handle_mm_fault +> do_user_addr_fault +> exc_page_fault +> asm_exc_page_fault +> RIP: 0033:0x55b897ba0808 +> Code: (omitted) +> RSP: 002b:00007ffeefa821a0 EFLAGS: 00010287 +> RAX: 000055b89983acd0 RBX: 00007ffeefa823f8 RCX: 000055b89983acd0 +> RDX: 00007fc2f8122010 RSI: 0000000000020000 RDI: 000055b89983acd0 +> RBP: 00007ffeefa821a0 R08: 0000000000000037 R09: 0000000000000075 +> R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000 +> R13: 00007ffeefa82410 R14: 000055b897ba5dd8 R15: 00007fc4b8340000 +> + +Link: https://lkml.kernel.org/r/20240216111502.79759-1-byungchul@sk.com +Signed-off-by: Byungchul Park +Reported-by: Hyeongtak Ji +Fixes: c574bbe917036 ("NUMA balancing: optimize page placement for memory tiering system") +Reviewed-by: Oscar Salvador +Cc: Baolin Wang +Cc: "Huang, Ying" +Cc: Johannes Weiner +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/migrate.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -2517,6 +2517,14 @@ static int numamigrate_isolate_folio(pg_ + if (managed_zone(pgdat->node_zones + z)) + break; + } ++ ++ /* ++ * If there are no managed zones, it should not proceed ++ * further. ++ */ ++ if (z < 0) ++ return 0; ++ + wakeup_kswapd(pgdat->node_zones + z, 0, + folio_order(folio), ZONE_MOVABLE); + return 0; diff --git a/queue-6.7/series b/queue-6.7/series index cadb7f42ca5..eb8535b83f2 100644 --- a/queue-6.7/series +++ b/queue-6.7/series @@ -113,3 +113,10 @@ iommufd-fix-protection-fault-in-iommufd_test_syz_conv_iova.patch efivarfs-request-at-most-512-bytes-for-variable-names.patch pmdomain-arm-fix-null-dereference-on-scmi_perf_domain-removal.patch pmdomain-qcom-rpmhpd-fix-enabled_corner-aggregation.patch +fs-aio-make-io_cancel-generate-completions-again.patch +fprobe-fix-to-allocate-entry_data_size-buffer-with-rethook-instances.patch +mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch +mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch +x86-e820-don-t-reserve-setup_rng_seed-in-e820.patch +x86-cpu-allow-reducing-x86_phys_bits-during-early_identify_cpu.patch +x86-cpu-intel-detect-tme-keyid-bits-before-setting-mtrr-mask-registers.patch diff --git a/queue-6.7/x86-cpu-allow-reducing-x86_phys_bits-during-early_identify_cpu.patch b/queue-6.7/x86-cpu-allow-reducing-x86_phys_bits-during-early_identify_cpu.patch new file mode 100644 index 00000000000..38bbb727358 --- /dev/null +++ b/queue-6.7/x86-cpu-allow-reducing-x86_phys_bits-during-early_identify_cpu.patch @@ -0,0 +1,55 @@ +From 9a458198eba98b7207669a166e64d04b04cb651b Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 1 Feb 2024 00:09:01 +0100 +Subject: x86/cpu: Allow reducing x86_phys_bits during early_identify_cpu() + +From: Paolo Bonzini + +commit 9a458198eba98b7207669a166e64d04b04cb651b upstream. + +In commit fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct +value straight away, instead of a two-phase approach"), the initialization +of c->x86_phys_bits was moved after this_cpu->c_early_init(c). This is +incorrect because early_init_amd() expected to be able to reduce the +value according to the contents of CPUID leaf 0x8000001f. + +Fortunately, the bug was negated by init_amd()'s call to early_init_amd(), +which does reduce x86_phys_bits in the end. However, this is very +late in the boot process and, most notably, the wrong value is used for +x86_phys_bits when setting up MTRRs. + +To fix this, call get_cpu_address_sizes() as soon as X86_FEATURE_CPUID is +set/cleared, and c->extended_cpuid_level is retrieved. + +Fixes: fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach") +Signed-off-by: Paolo Bonzini +Signed-off-by: Dave Hansen +Cc:stable@vger.kernel.org +Link: https://lore.kernel.org/all/20240131230902.1867092-2-pbonzini%40redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/cpu/common.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1596,6 +1596,7 @@ static void __init early_identify_cpu(st + get_cpu_vendor(c); + get_cpu_cap(c); + setup_force_cpu_cap(X86_FEATURE_CPUID); ++ get_cpu_address_sizes(c); + cpu_parse_early_param(); + + if (this_cpu->c_early_init) +@@ -1608,10 +1609,9 @@ static void __init early_identify_cpu(st + this_cpu->c_bsp_init(c); + } else { + setup_clear_cpu_cap(X86_FEATURE_CPUID); ++ get_cpu_address_sizes(c); + } + +- get_cpu_address_sizes(c); +- + setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + cpu_set_bug_bits(c); diff --git a/queue-6.7/x86-cpu-intel-detect-tme-keyid-bits-before-setting-mtrr-mask-registers.patch b/queue-6.7/x86-cpu-intel-detect-tme-keyid-bits-before-setting-mtrr-mask-registers.patch new file mode 100644 index 00000000000..c0bfa713708 --- /dev/null +++ b/queue-6.7/x86-cpu-intel-detect-tme-keyid-bits-before-setting-mtrr-mask-registers.patch @@ -0,0 +1,243 @@ +From 6890cb1ace350b4386c8aee1343dc3b3ddd214da Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 1 Feb 2024 00:09:02 +0100 +Subject: x86/cpu/intel: Detect TME keyid bits before setting MTRR mask registers + +From: Paolo Bonzini + +commit 6890cb1ace350b4386c8aee1343dc3b3ddd214da upstream. + +MKTME repurposes the high bit of physical address to key id for encryption +key and, even though MAXPHYADDR in CPUID[0x80000008] remains the same, +the valid bits in the MTRR mask register are based on the reduced number +of physical address bits. + +detect_tme() in arch/x86/kernel/cpu/intel.c detects TME and subtracts +it from the total usable physical bits, but it is called too late. +Move the call to early_init_intel() so that it is called in setup_arch(), +before MTRRs are setup. + +This fixes boot on TDX-enabled systems, which until now only worked with +"disable_mtrr_cleanup". Without the patch, the values written to the +MTRRs mask registers were 52-bit wide (e.g. 0x000fffff_80000800) and +the writes failed; with the patch, the values are 46-bit wide, which +matches the reduced MAXPHYADDR that is shown in /proc/cpuinfo. + +Reported-by: Zixi Chen +Signed-off-by: Paolo Bonzini +Signed-off-by: Dave Hansen +Cc:stable@vger.kernel.org +Link: https://lore.kernel.org/all/20240131230902.1867092-3-pbonzini%40redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/cpu/intel.c | 178 ++++++++++++++++++++++---------------------- + 1 file changed, 91 insertions(+), 87 deletions(-) + +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -184,6 +184,90 @@ static bool bad_spectre_microcode(struct + return false; + } + ++#define MSR_IA32_TME_ACTIVATE 0x982 ++ ++/* Helpers to access TME_ACTIVATE MSR */ ++#define TME_ACTIVATE_LOCKED(x) (x & 0x1) ++#define TME_ACTIVATE_ENABLED(x) (x & 0x2) ++ ++#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ ++#define TME_ACTIVATE_POLICY_AES_XTS_128 0 ++ ++#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ ++ ++#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ ++#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 ++ ++/* Values for mktme_status (SW only construct) */ ++#define MKTME_ENABLED 0 ++#define MKTME_DISABLED 1 ++#define MKTME_UNINITIALIZED 2 ++static int mktme_status = MKTME_UNINITIALIZED; ++ ++static void detect_tme_early(struct cpuinfo_x86 *c) ++{ ++ u64 tme_activate, tme_policy, tme_crypto_algs; ++ int keyid_bits = 0, nr_keyids = 0; ++ static u64 tme_activate_cpu0 = 0; ++ ++ rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); ++ ++ if (mktme_status != MKTME_UNINITIALIZED) { ++ if (tme_activate != tme_activate_cpu0) { ++ /* Broken BIOS? */ ++ pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); ++ pr_err_once("x86/tme: MKTME is not usable\n"); ++ mktme_status = MKTME_DISABLED; ++ ++ /* Proceed. We may need to exclude bits from x86_phys_bits. */ ++ } ++ } else { ++ tme_activate_cpu0 = tme_activate; ++ } ++ ++ if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { ++ pr_info_once("x86/tme: not enabled by BIOS\n"); ++ mktme_status = MKTME_DISABLED; ++ return; ++ } ++ ++ if (mktme_status != MKTME_UNINITIALIZED) ++ goto detect_keyid_bits; ++ ++ pr_info("x86/tme: enabled by BIOS\n"); ++ ++ tme_policy = TME_ACTIVATE_POLICY(tme_activate); ++ if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) ++ pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); ++ ++ tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); ++ if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { ++ pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", ++ tme_crypto_algs); ++ mktme_status = MKTME_DISABLED; ++ } ++detect_keyid_bits: ++ keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); ++ nr_keyids = (1UL << keyid_bits) - 1; ++ if (nr_keyids) { ++ pr_info_once("x86/mktme: enabled by BIOS\n"); ++ pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); ++ } else { ++ pr_info_once("x86/mktme: disabled by BIOS\n"); ++ } ++ ++ if (mktme_status == MKTME_UNINITIALIZED) { ++ /* MKTME is usable */ ++ mktme_status = MKTME_ENABLED; ++ } ++ ++ /* ++ * KeyID bits effectively lower the number of physical address ++ * bits. Update cpuinfo_x86::x86_phys_bits accordingly. ++ */ ++ c->x86_phys_bits -= keyid_bits; ++} ++ + static void early_init_intel(struct cpuinfo_x86 *c) + { + u64 misc_enable; +@@ -322,6 +406,13 @@ static void early_init_intel(struct cpui + */ + if (detect_extended_topology_early(c) < 0) + detect_ht_early(c); ++ ++ /* ++ * Adjust the number of physical bits early because it affects the ++ * valid bits of the MTRR mask registers. ++ */ ++ if (cpu_has(c, X86_FEATURE_TME)) ++ detect_tme_early(c); + } + + static void bsp_init_intel(struct cpuinfo_x86 *c) +@@ -482,90 +573,6 @@ static void srat_detect_node(struct cpui + #endif + } + +-#define MSR_IA32_TME_ACTIVATE 0x982 +- +-/* Helpers to access TME_ACTIVATE MSR */ +-#define TME_ACTIVATE_LOCKED(x) (x & 0x1) +-#define TME_ACTIVATE_ENABLED(x) (x & 0x2) +- +-#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ +-#define TME_ACTIVATE_POLICY_AES_XTS_128 0 +- +-#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ +- +-#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ +-#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 +- +-/* Values for mktme_status (SW only construct) */ +-#define MKTME_ENABLED 0 +-#define MKTME_DISABLED 1 +-#define MKTME_UNINITIALIZED 2 +-static int mktme_status = MKTME_UNINITIALIZED; +- +-static void detect_tme(struct cpuinfo_x86 *c) +-{ +- u64 tme_activate, tme_policy, tme_crypto_algs; +- int keyid_bits = 0, nr_keyids = 0; +- static u64 tme_activate_cpu0 = 0; +- +- rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); +- +- if (mktme_status != MKTME_UNINITIALIZED) { +- if (tme_activate != tme_activate_cpu0) { +- /* Broken BIOS? */ +- pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); +- pr_err_once("x86/tme: MKTME is not usable\n"); +- mktme_status = MKTME_DISABLED; +- +- /* Proceed. We may need to exclude bits from x86_phys_bits. */ +- } +- } else { +- tme_activate_cpu0 = tme_activate; +- } +- +- if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { +- pr_info_once("x86/tme: not enabled by BIOS\n"); +- mktme_status = MKTME_DISABLED; +- return; +- } +- +- if (mktme_status != MKTME_UNINITIALIZED) +- goto detect_keyid_bits; +- +- pr_info("x86/tme: enabled by BIOS\n"); +- +- tme_policy = TME_ACTIVATE_POLICY(tme_activate); +- if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) +- pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); +- +- tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); +- if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { +- pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", +- tme_crypto_algs); +- mktme_status = MKTME_DISABLED; +- } +-detect_keyid_bits: +- keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); +- nr_keyids = (1UL << keyid_bits) - 1; +- if (nr_keyids) { +- pr_info_once("x86/mktme: enabled by BIOS\n"); +- pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); +- } else { +- pr_info_once("x86/mktme: disabled by BIOS\n"); +- } +- +- if (mktme_status == MKTME_UNINITIALIZED) { +- /* MKTME is usable */ +- mktme_status = MKTME_ENABLED; +- } +- +- /* +- * KeyID bits effectively lower the number of physical address +- * bits. Update cpuinfo_x86::x86_phys_bits accordingly. +- */ +- c->x86_phys_bits -= keyid_bits; +-} +- + static void init_cpuid_fault(struct cpuinfo_x86 *c) + { + u64 msr; +@@ -702,9 +709,6 @@ static void init_intel(struct cpuinfo_x8 + + init_ia32_feat_ctl(c); + +- if (cpu_has(c, X86_FEATURE_TME)) +- detect_tme(c); +- + init_intel_misc_features(c); + + split_lock_init(); diff --git a/queue-6.7/x86-e820-don-t-reserve-setup_rng_seed-in-e820.patch b/queue-6.7/x86-e820-don-t-reserve-setup_rng_seed-in-e820.patch new file mode 100644 index 00000000000..1864d19809c --- /dev/null +++ b/queue-6.7/x86-e820-don-t-reserve-setup_rng_seed-in-e820.patch @@ -0,0 +1,51 @@ +From 7fd817c906503b6813ea3b41f5fdf4192449a707 Mon Sep 17 00:00:00 2001 +From: Jiri Bohac +Date: Wed, 31 Jan 2024 01:04:28 +0100 +Subject: x86/e820: Don't reserve SETUP_RNG_SEED in e820 + +From: Jiri Bohac + +commit 7fd817c906503b6813ea3b41f5fdf4192449a707 upstream. + +SETUP_RNG_SEED in setup_data is supplied by kexec and should +not be reserved in the e820 map. + +Doing so reserves 16 bytes of RAM when booting with kexec. +(16 bytes because data->len is zeroed by parse_setup_data so only +sizeof(setup_data) is reserved.) + +When kexec is used repeatedly, each boot adds two entries in the +kexec-provided e820 map as the 16-byte range splits a larger +range of usable memory. Eventually all of the 128 available entries +get used up. The next split will result in losing usable memory +as the new entries cannot be added to the e820 map. + +Fixes: 68b8e9713c8e ("x86/setup: Use rng seeds from setup_data") +Signed-off-by: Jiri Bohac +Signed-off-by: Borislav Petkov (AMD) +Signed-off-by: Dave Hansen +Cc: +Link: https://lore.kernel.org/r/ZbmOjKnARGiaYBd5@dwarf.suse.cz +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/e820.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +--- a/arch/x86/kernel/e820.c ++++ b/arch/x86/kernel/e820.c +@@ -1017,10 +1017,12 @@ void __init e820__reserve_setup_data(voi + e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + /* +- * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need +- * to be reserved. ++ * SETUP_EFI, SETUP_IMA and SETUP_RNG_SEED are supplied by ++ * kexec and do not need to be reserved. + */ +- if (data->type != SETUP_EFI && data->type != SETUP_IMA) ++ if (data->type != SETUP_EFI && ++ data->type != SETUP_IMA && ++ data->type != SETUP_RNG_SEED) + e820__range_update_kexec(pa_data, + sizeof(*data) + data->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); -- 2.47.3