6.7-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Mon, 4 Mar 2024 07:38:40 +0000 (08:38 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Mon, 4 Mar 2024 07:38:40 +0000 (08:38 +0100)
added patches:
fprobe-fix-to-allocate-entry_data_size-buffer-with-rethook-instances.patch
fs-aio-make-io_cancel-generate-completions-again.patch
mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch
mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch
x86-cpu-allow-reducing-x86_phys_bits-during-early_identify_cpu.patch
x86-cpu-intel-detect-tme-keyid-bits-before-setting-mtrr-mask-registers.patch
x86-e820-don-t-reserve-setup_rng_seed-in-e820.patch

queue-6.7/fprobe-fix-to-allocate-entry_data_size-buffer-with-rethook-instances.patch [new file with mode: 0644]
queue-6.7/fs-aio-make-io_cancel-generate-completions-again.patch [new file with mode: 0644]
queue-6.7/mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch [new file with mode: 0644]
queue-6.7/mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch [new file with mode: 0644]
queue-6.7/series
queue-6.7/x86-cpu-allow-reducing-x86_phys_bits-during-early_identify_cpu.patch [new file with mode: 0644]
queue-6.7/x86-cpu-intel-detect-tme-keyid-bits-before-setting-mtrr-mask-registers.patch [new file with mode: 0644]
queue-6.7/x86-e820-don-t-reserve-setup_rng_seed-in-e820.patch [new file with mode: 0644]

diff --git a/queue-6.7/fprobe-fix-to-allocate-entry_data_size-buffer-with-rethook-instances.patch b/queue-6.7/fprobe-fix-to-allocate-entry_data_size-buffer-with-rethook-instances.patch
new file mode 100644
index 0000000..4ffdf2a
--- /dev/null
@@ -0,0 +1,66 @@
+From 6572786006fa96ad2c35bb31757f1f861298093b Mon Sep 17 00:00:00 2001
+From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
+Date: Fri, 1 Mar 2024 09:18:24 +0900
+Subject: fprobe: Fix to allocate entry_data_size buffer with rethook instances
+
+From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
+
+commit 6572786006fa96ad2c35bb31757f1f861298093b upstream.
+
+Allocate the fprobe::entry_data_size buffer together with the rethook
+instances. If fprobe does not allocate an entry_data_size buffer for each
+rethook instance, the fprobe entry handler can cause a buffer overrun when
+it stores entry data.
+
+Link: https://lore.kernel.org/all/170920576727.107552.638161246679734051.stgit@devnote2/
+
+Reported-by: Jiri Olsa <olsajiri@gmail.com>
+Closes: https://lore.kernel.org/all/Zd9eBn2FTQzYyg7L@krava/
+Fixes: 4bbd93455659 ("kprobes: kretprobe scalability improvement")
+Cc: stable@vger.kernel.org
+Tested-by: Jiri Olsa <olsajiri@gmail.com>
+Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/fprobe.c | 14 ++++++--------
+ 1 file changed, 6 insertions(+), 8 deletions(-)
+
+diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
+index 6cd2a4e3afb8..9ff018245840 100644
+--- a/kernel/trace/fprobe.c
++++ b/kernel/trace/fprobe.c
+@@ -189,9 +189,6 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
+ {
+       int size;
+-      if (num <= 0)
+-              return -EINVAL;
+-
+       if (!fp->exit_handler) {
+               fp->rethook = NULL;
+               return 0;
+@@ -199,15 +196,16 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
+       /* Initialize rethook if needed */
+       if (fp->nr_maxactive)
+-              size = fp->nr_maxactive;
++              num = fp->nr_maxactive;
+       else
+-              size = num * num_possible_cpus() * 2;
+-      if (size <= 0)
++              num *= num_possible_cpus() * 2;
++      if (num <= 0)
+               return -EINVAL;
++      size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size;
++
+       /* Initialize rethook */
+-      fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler,
+-                              sizeof(struct fprobe_rethook_node), size);
++      fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num);
+       if (IS_ERR(fp->rethook))
+               return PTR_ERR(fp->rethook);
+-- 
+2.44.0
+
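
The sizing bug above is easiest to see outside the kernel. Below is a minimal, standalone C sketch of the allocation rule the fix restores: each pre-allocated rethook instance must be sized as header plus entry_data_size, not header alone. The struct and function names are illustrative stand-ins, not the kernel's fprobe/rethook API.

#include <stdlib.h>
#include <string.h>

struct node_header {                    /* stand-in for fprobe_rethook_node */
        void *ret_addr;
};

static void *alloc_instances(size_t num, size_t entry_data_size)
{
        /* The fix: size each instance as node header + per-entry data. */
        size_t size = sizeof(struct node_header) + entry_data_size;

        return calloc(num, size);
}

int main(void)
{
        size_t entry_data_size = 32;
        char *pool = alloc_instances(8, entry_data_size);

        if (!pool)
                return 1;
        /*
         * The entry handler stores entry data right after the header.
         * With the pre-fix sizing (header only), this write would run
         * past the first instance and corrupt its neighbour.
         */
        memset(pool + sizeof(struct node_header), 0xa5, entry_data_size);

        free(pool);
        return 0;
}
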
diff --git a/queue-6.7/fs-aio-make-io_cancel-generate-completions-again.patch b/queue-6.7/fs-aio-make-io_cancel-generate-completions-again.patch
new file mode 100644
index 0000000..1969e81
--- /dev/null
@@ -0,0 +1,85 @@
+From 54cbc058d86beca3515c994039b5c0f0a34f53dd Mon Sep 17 00:00:00 2001
+From: Bart Van Assche <bvanassche@acm.org>
+Date: Thu, 15 Feb 2024 12:47:39 -0800
+Subject: fs/aio: Make io_cancel() generate completions again
+
+From: Bart Van Assche <bvanassche@acm.org>
+
+commit 54cbc058d86beca3515c994039b5c0f0a34f53dd upstream.
+
+The following patch accidentally removed the code for delivering
+completions for cancelled reads and writes to user space: "[PATCH 04/33]
+aio: remove retry-based AIO"
+(https://lore.kernel.org/all/1363883754-27966-5-git-send-email-koverstreet@google.com/)
+From that patch:
+
+-      if (kiocbIsCancelled(iocb)) {
+-              ret = -EINTR;
+-              aio_complete(iocb, ret, 0);
+-              /* must not access the iocb after this */
+-              goto out;
+-      }
+
+This leads to a leak in user space of a struct iocb. Hence this patch
+that restores the code that reports to user space that a read or write
+has been cancelled successfully.
+
+Fixes: 41003a7bcfed ("aio: remove retry-based AIO")
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Avi Kivity <avi@scylladb.com>
+Cc: Sandeep Dhavale <dhavale@google.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Kent Overstreet <kent.overstreet@linux.dev>
+Cc: stable@vger.kernel.org
+Signed-off-by: Bart Van Assche <bvanassche@acm.org>
+Link: https://lore.kernel.org/r/20240215204739.2677806-3-bvanassche@acm.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/aio.c |   27 +++++++++++----------------
+ 1 file changed, 11 insertions(+), 16 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -2119,14 +2119,11 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat
+ #endif
+ /* sys_io_cancel:
+- *    Attempts to cancel an iocb previously passed to io_submit.  If
+- *    the operation is successfully cancelled, the resulting event is
+- *    copied into the memory pointed to by result without being placed
+- *    into the completion queue and 0 is returned.  May fail with
+- *    -EFAULT if any of the data structures pointed to are invalid.
+- *    May fail with -EINVAL if aio_context specified by ctx_id is
+- *    invalid.  May fail with -EAGAIN if the iocb specified was not
+- *    cancelled.  Will fail with -ENOSYS if not implemented.
++ *    Attempts to cancel an iocb previously passed to io_submit(). If the
++ *    operation is successfully cancelled 0 is returned. May fail with
++ *    -EFAULT if any of the data structures pointed to are invalid. May
++ *    fail with -EINVAL if aio_context specified by ctx_id is invalid. Will
++ *    fail with -ENOSYS if not implemented.
+  */
+ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
+               struct io_event __user *, result)
+@@ -2157,14 +2154,12 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
+       }
+       spin_unlock_irq(&ctx->ctx_lock);
+-      if (!ret) {
+-              /*
+-               * The result argument is no longer used - the io_event is
+-               * always delivered via the ring buffer. -EINPROGRESS indicates
+-               * cancellation is progress:
+-               */
+-              ret = -EINPROGRESS;
+-      }
++      /*
++       * The result argument is no longer used - the io_event is always
++       * delivered via the ring buffer.
++       */
++      if (ret == 0 && kiocb->rw.ki_flags & IOCB_AIO_RW)
++              aio_complete_rw(&kiocb->rw, -EINTR);
+       percpu_ref_put(&ctx->users);
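
For context, a hedged userspace sketch of the behaviour this change restores follows. It assumes cancellation succeeds, which only happens for operations whose driver implements cancellation (e.g. the USB gadget code); for an ordinary file, io_cancel() will typically fail and the branch below is skipped. When it does succeed, the completion with res == -EINTR now arrives through the ring via io_getevents(), so the iocb is no longer leaked.

#include <fcntl.h>
#include <linux/aio_abi.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        aio_context_t ctx = 0;
        struct iocb cb;
        struct iocb *cbs[1] = { &cb };
        struct io_event ev;
        char buf[512];
        int fd = open("/etc/hostname", O_RDONLY);   /* illustration only */

        syscall(SYS_io_setup, 8, &ctx);

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_lio_opcode = IOCB_CMD_PREAD;
        cb.aio_buf = (unsigned long)buf;
        cb.aio_nbytes = sizeof(buf);
        syscall(SYS_io_submit, ctx, 1, cbs);

        if (syscall(SYS_io_cancel, ctx, &cb, &ev) == 0) {
                /* The event is delivered via the ring, not via 'ev'. */
                syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL);
                printf("cancelled, res = %lld\n", (long long)ev.res);
        }

        syscall(SYS_io_destroy, ctx);
        return 0;
}
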
diff --git a/queue-6.7/mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch b/queue-6.7/mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch
new file mode 100644
index 0000000..9ef63d9
--- /dev/null
@@ -0,0 +1,81 @@
+From 720da1e593b85a550593b415bf1d79a053133451 Mon Sep 17 00:00:00 2001
+From: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
+Date: Mon, 29 Jan 2024 11:30:22 +0530
+Subject: mm/debug_vm_pgtable: fix BUG_ON with pud advanced test
+
+From: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
+
+commit 720da1e593b85a550593b415bf1d79a053133451 upstream.
+
+Architectures like powerpc add debug checks to ensure we find only devmap
+PUD pte entries.  These debug checks are only done with CONFIG_DEBUG_VM.
+This patch marks the ptes used for the PUD advanced test as devmap pte
+entries so that we don't hit these debug checks on architectures like
+ppc64, as below.
+
+WARNING: CPU: 2 PID: 1 at arch/powerpc/mm/book3s64/radix_pgtable.c:1382 radix__pud_hugepage_update+0x38/0x138
+....
+NIP [c0000000000a7004] radix__pud_hugepage_update+0x38/0x138
+LR [c0000000000a77a8] radix__pudp_huge_get_and_clear+0x28/0x60
+Call Trace:
+[c000000004a2f950] [c000000004a2f9a0] 0xc000000004a2f9a0 (unreliable)
+[c000000004a2f980] [000d34c100000000] 0xd34c100000000
+[c000000004a2f9a0] [c00000000206ba98] pud_advanced_tests+0x118/0x334
+[c000000004a2fa40] [c00000000206db34] debug_vm_pgtable+0xcbc/0x1c48
+[c000000004a2fc10] [c00000000000fd28] do_one_initcall+0x60/0x388
+
+Also
+
+ kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:202!
+ ....
+
+ NIP [c000000000096510] pudp_huge_get_and_clear_full+0x98/0x174
+ LR [c00000000206bb34] pud_advanced_tests+0x1b4/0x334
+ Call Trace:
+ [c000000004a2f950] [000d34c100000000] 0xd34c100000000 (unreliable)
+ [c000000004a2f9a0] [c00000000206bb34] pud_advanced_tests+0x1b4/0x334
+ [c000000004a2fa40] [c00000000206db34] debug_vm_pgtable+0xcbc/0x1c48
+ [c000000004a2fc10] [c00000000000fd28] do_one_initcall+0x60/0x388
+
+Link: https://lkml.kernel.org/r/20240129060022.68044-1-aneesh.kumar@kernel.org
+Fixes: 27af67f35631 ("powerpc/book3s64/mm: enable transparent pud hugepage")
+Signed-off-by: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Michael Ellerman <mpe@ellerman.id.au>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/debug_vm_pgtable.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/mm/debug_vm_pgtable.c
++++ b/mm/debug_vm_pgtable.c
+@@ -362,6 +362,12 @@ static void __init pud_advanced_tests(st
+       vaddr &= HPAGE_PUD_MASK;
+       pud = pfn_pud(args->pud_pfn, args->page_prot);
++      /*
++       * Some architectures have debug checks to make sure
++       * huge pud mappings are only found with devmap entries.
++       * For now test with only devmap entries.
++       */
++      pud = pud_mkdevmap(pud);
+       set_pud_at(args->mm, vaddr, args->pudp, pud);
+       flush_dcache_page(page);
+       pudp_set_wrprotect(args->mm, vaddr, args->pudp);
+@@ -374,6 +380,7 @@ static void __init pud_advanced_tests(st
+       WARN_ON(!pud_none(pud));
+ #endif /* __PAGETABLE_PMD_FOLDED */
+       pud = pfn_pud(args->pud_pfn, args->page_prot);
++      pud = pud_mkdevmap(pud);
+       pud = pud_wrprotect(pud);
+       pud = pud_mkclean(pud);
+       set_pud_at(args->mm, vaddr, args->pudp, pud);
+@@ -391,6 +398,7 @@ static void __init pud_advanced_tests(st
+ #endif /* __PAGETABLE_PMD_FOLDED */
+       pud = pfn_pud(args->pud_pfn, args->page_prot);
++      pud = pud_mkdevmap(pud);
+       pud = pud_mkyoung(pud);
+       set_pud_at(args->mm, vaddr, args->pudp, pud);
+       flush_dcache_page(page);
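
The invariant behind the fix can be modelled in a few lines of ordinary C. The sketch below uses stand-in types and names, not the kernel's page-table API: the arch-level update helper asserts the devmap bit on huge PUD entries, so the test must set that bit before installing the entry.

#include <assert.h>
#include <stdint.h>

#define PUD_DEVMAP (UINT64_C(1) << 0)   /* illustrative bit position */

typedef struct { uint64_t val; } pud_t;

static pud_t pud_mkdevmap(pud_t pud)    /* models pud_mkdevmap() */
{
        pud.val |= PUD_DEVMAP;
        return pud;
}

static void pud_hugepage_update(pud_t pud)
{
        /* Models the ppc64 CONFIG_DEBUG_VM check that fired. */
        assert(pud.val & PUD_DEVMAP);
}

int main(void)
{
        pud_t pud = { 0 };

        pud = pud_mkdevmap(pud);        /* the line the patch adds */
        pud_hugepage_update(pud);       /* no longer trips the check */
        return 0;
}
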
diff --git a/queue-6.7/mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch b/queue-6.7/mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch
new file mode 100644
index 0000000..d67767a
--- /dev/null
@@ -0,0 +1,93 @@
+From 2774f256e7c0219e2b0a0894af1c76bdabc4f974 Mon Sep 17 00:00:00 2001
+From: Byungchul Park <byungchul@sk.com>
+Date: Fri, 16 Feb 2024 20:15:02 +0900
+Subject: mm/vmscan: fix a bug calling wakeup_kswapd() with a wrong zone index
+
+From: Byungchul Park <byungchul@sk.com>
+
+commit 2774f256e7c0219e2b0a0894af1c76bdabc4f974 upstream.
+
+With NUMA balancing on, the following oops has been observed on a NUMA
+system where a node has no local memory and therefore no managed zones.
+It happens because wakeup_kswapd() is called with a wrong zone index, -1.
+Fix it by checking the index before calling wakeup_kswapd().
+
+> BUG: unable to handle page fault for address: 00000000000033f3
+> #PF: supervisor read access in kernel mode
+> #PF: error_code(0x0000) - not-present page
+> PGD 0 P4D 0
+> Oops: 0000 [#1] PREEMPT SMP NOPTI
+> CPU: 2 PID: 895 Comm: masim Not tainted 6.6.0-dirty #255
+> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
+>    rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
+> RIP: 0010:wakeup_kswapd (./linux/mm/vmscan.c:7812)
+> Code: (omitted)
+> RSP: 0000:ffffc90004257d58 EFLAGS: 00010286
+> RAX: ffffffffffffffff RBX: ffff88883fff0480 RCX: 0000000000000003
+> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88883fff0480
+> RBP: ffffffffffffffff R08: ff0003ffffffffff R09: ffffffffffffffff
+> R10: ffff888106c95540 R11: 0000000055555554 R12: 0000000000000003
+> R13: 0000000000000000 R14: 0000000000000000 R15: ffff88883fff0940
+> FS:  00007fc4b8124740(0000) GS:ffff888827c00000(0000) knlGS:0000000000000000
+> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+> CR2: 00000000000033f3 CR3: 000000026cc08004 CR4: 0000000000770ee0
+> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+> PKRU: 55555554
+> Call Trace:
+>  <TASK>
+> ? __die
+> ? page_fault_oops
+> ? __pte_offset_map_lock
+> ? exc_page_fault
+> ? asm_exc_page_fault
+> ? wakeup_kswapd
+> migrate_misplaced_page
+> __handle_mm_fault
+> handle_mm_fault
+> do_user_addr_fault
+> exc_page_fault
+> asm_exc_page_fault
+> RIP: 0033:0x55b897ba0808
+> Code: (omitted)
+> RSP: 002b:00007ffeefa821a0 EFLAGS: 00010287
+> RAX: 000055b89983acd0 RBX: 00007ffeefa823f8 RCX: 000055b89983acd0
+> RDX: 00007fc2f8122010 RSI: 0000000000020000 RDI: 000055b89983acd0
+> RBP: 00007ffeefa821a0 R08: 0000000000000037 R09: 0000000000000075
+> R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000
+> R13: 00007ffeefa82410 R14: 000055b897ba5dd8 R15: 00007fc4b8340000
+>  </TASK>
+
+Link: https://lkml.kernel.org/r/20240216111502.79759-1-byungchul@sk.com
+Signed-off-by: Byungchul Park <byungchul@sk.com>
+Reported-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
+Fixes: c574bbe917036 ("NUMA balancing: optimize page placement for memory tiering system")
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/migrate.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -2517,6 +2517,14 @@ static int numamigrate_isolate_folio(pg_
+                       if (managed_zone(pgdat->node_zones + z))
+                               break;
+               }
++
++              /*
++               * If there are no managed zones, it should not proceed
++               * further.
++               */
++              if (z < 0)
++                      return 0;
++
+               wakeup_kswapd(pgdat->node_zones + z, 0,
+                             folio_order(folio), ZONE_MOVABLE);
+               return 0;
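
A standalone sketch of the fixed control flow, with simplified stand-in types rather than the kernel's pglist_data/zone definitions: the zone scan runs from the highest index downward, and on a memoryless node it falls through with z == -1, which must be caught before indexing node_zones.

#include <stdio.h>

#define MAX_NR_ZONES 4

struct zone { unsigned long managed_pages; };
struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; };

static int managed_zone(const struct zone *z)
{
        return z->managed_pages != 0;
}

static int maybe_wakeup_kswapd(struct pglist_data *pgdat)
{
        int z;

        for (z = MAX_NR_ZONES - 1; z >= 0; z--) {
                if (managed_zone(&pgdat->node_zones[z]))
                        break;
        }
        /*
         * The fix: a memoryless node has no managed zones, so z is -1
         * here and indexing node_zones[z] would be out of bounds.
         */
        if (z < 0)
                return 0;

        printf("wakeup_kswapd(zone %d)\n", z);
        return 0;
}

int main(void)
{
        struct pglist_data memoryless = { 0 };

        maybe_wakeup_kswapd(&memoryless);   /* now safely does nothing */
        return 0;
}
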
diff --git a/queue-6.7/series b/queue-6.7/series
index cadb7f42ca573d2b3ad6388a6ec51e59cdf8351c..eb8535b83f2bde93e971270ba5ee92daed83c63c 100644
--- a/queue-6.7/series
@@ -113,3 +113,10 @@ iommufd-fix-protection-fault-in-iommufd_test_syz_conv_iova.patch
 efivarfs-request-at-most-512-bytes-for-variable-names.patch
 pmdomain-arm-fix-null-dereference-on-scmi_perf_domain-removal.patch
 pmdomain-qcom-rpmhpd-fix-enabled_corner-aggregation.patch
+fs-aio-make-io_cancel-generate-completions-again.patch
+fprobe-fix-to-allocate-entry_data_size-buffer-with-rethook-instances.patch
+mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch
+mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch
+x86-e820-don-t-reserve-setup_rng_seed-in-e820.patch
+x86-cpu-allow-reducing-x86_phys_bits-during-early_identify_cpu.patch
+x86-cpu-intel-detect-tme-keyid-bits-before-setting-mtrr-mask-registers.patch
diff --git a/queue-6.7/x86-cpu-allow-reducing-x86_phys_bits-during-early_identify_cpu.patch b/queue-6.7/x86-cpu-allow-reducing-x86_phys_bits-during-early_identify_cpu.patch
new file mode 100644
index 0000000..38bbb72
--- /dev/null
@@ -0,0 +1,55 @@
+From 9a458198eba98b7207669a166e64d04b04cb651b Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Thu, 1 Feb 2024 00:09:01 +0100
+Subject: x86/cpu: Allow reducing x86_phys_bits during early_identify_cpu()
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 9a458198eba98b7207669a166e64d04b04cb651b upstream.
+
+In commit fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct
+value straight away, instead of a two-phase approach"), the initialization
+of c->x86_phys_bits was moved after this_cpu->c_early_init(c).  This is
+incorrect because early_init_amd() expected to be able to reduce the
+value according to the contents of CPUID leaf 0x8000001f.
+
+Fortunately, the bug was negated by init_amd()'s call to early_init_amd(),
+which does reduce x86_phys_bits in the end.  However, this is very
+late in the boot process and, most notably, the wrong value is used for
+x86_phys_bits when setting up MTRRs.
+
+To fix this, call get_cpu_address_sizes() as soon as X86_FEATURE_CPUID is
+set/cleared, and c->extended_cpuid_level is retrieved.
+
+Fixes: fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach")
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/all/20240131230902.1867092-2-pbonzini%40redhat.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/cpu/common.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1596,6 +1596,7 @@ static void __init early_identify_cpu(st
+               get_cpu_vendor(c);
+               get_cpu_cap(c);
+               setup_force_cpu_cap(X86_FEATURE_CPUID);
++              get_cpu_address_sizes(c);
+               cpu_parse_early_param();
+               if (this_cpu->c_early_init)
+@@ -1608,10 +1609,9 @@ static void __init early_identify_cpu(st
+                       this_cpu->c_bsp_init(c);
+       } else {
+               setup_clear_cpu_cap(X86_FEATURE_CPUID);
++              get_cpu_address_sizes(c);
+       }
+-      get_cpu_address_sizes(c);
+-
+       setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+       cpu_set_bug_bits(c);
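
The ordering constraint is easy to model. In the standalone sketch below (illustrative names, not the kernel's real init path), the default address size must be read from CPUID before the vendor early-init hook runs, so that the hook can shrink x86_phys_bits and later consumers such as MTRR setup see the reduced value.

#include <stdio.h>

struct cpuinfo { int x86_phys_bits; };

static void get_cpu_address_sizes(struct cpuinfo *c)
{
        c->x86_phys_bits = 52;          /* as read from CPUID 0x80000008 */
}

static void vendor_early_init(struct cpuinfo *c)
{
        c->x86_phys_bits -= 6;          /* e.g. memory-encryption key bits */
}

int main(void)
{
        struct cpuinfo c;

        get_cpu_address_sizes(&c);      /* fixed: runs first ... */
        vendor_early_init(&c);          /* ... so the hook can reduce it */

        printf("MTRR masks sized for %d bits\n", c.x86_phys_bits);
        return 0;
}
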
diff --git a/queue-6.7/x86-cpu-intel-detect-tme-keyid-bits-before-setting-mtrr-mask-registers.patch b/queue-6.7/x86-cpu-intel-detect-tme-keyid-bits-before-setting-mtrr-mask-registers.patch
new file mode 100644
index 0000000..c0bfa71
--- /dev/null
@@ -0,0 +1,243 @@
+From 6890cb1ace350b4386c8aee1343dc3b3ddd214da Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Thu, 1 Feb 2024 00:09:02 +0100
+Subject: x86/cpu/intel: Detect TME keyid bits before setting MTRR mask registers
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 6890cb1ace350b4386c8aee1343dc3b3ddd214da upstream.
+
+MKTME repurposes the high bits of the physical address as a key ID for the
+encryption key and, even though MAXPHYADDR in CPUID[0x80000008] remains the
+same, the valid bits in the MTRR mask registers are based on the reduced
+number of physical address bits.
+
+detect_tme() in arch/x86/kernel/cpu/intel.c detects TME and subtracts
+the KeyID bits from the total usable physical bits, but it is called too
+late.  Move the call to early_init_intel() so that it is called in
+setup_arch(), before the MTRRs are set up.
+
+This fixes boot on TDX-enabled systems, which until now only worked with
+"disable_mtrr_cleanup".  Without the patch, the values written to the
+MTRRs mask registers were 52-bit wide (e.g. 0x000fffff_80000800) and
+the writes failed; with the patch, the values are 46-bit wide, which
+matches the reduced MAXPHYADDR that is shown in /proc/cpuinfo.
+
+Reported-by: Zixi Chen <zixchen@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/all/20240131230902.1867092-3-pbonzini%40redhat.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/cpu/intel.c |  178 ++++++++++++++++++++++----------------------
+ 1 file changed, 91 insertions(+), 87 deletions(-)
+
+--- a/arch/x86/kernel/cpu/intel.c
++++ b/arch/x86/kernel/cpu/intel.c
+@@ -184,6 +184,90 @@ static bool bad_spectre_microcode(struct
+       return false;
+ }
++#define MSR_IA32_TME_ACTIVATE         0x982
++
++/* Helpers to access TME_ACTIVATE MSR */
++#define TME_ACTIVATE_LOCKED(x)                (x & 0x1)
++#define TME_ACTIVATE_ENABLED(x)               (x & 0x2)
++
++#define TME_ACTIVATE_POLICY(x)                ((x >> 4) & 0xf)        /* Bits 7:4 */
++#define TME_ACTIVATE_POLICY_AES_XTS_128       0
++
++#define TME_ACTIVATE_KEYID_BITS(x)    ((x >> 32) & 0xf)       /* Bits 35:32 */
++
++#define TME_ACTIVATE_CRYPTO_ALGS(x)   ((x >> 48) & 0xffff)    /* Bits 63:48 */
++#define TME_ACTIVATE_CRYPTO_AES_XTS_128       1
++
++/* Values for mktme_status (SW only construct) */
++#define MKTME_ENABLED                 0
++#define MKTME_DISABLED                        1
++#define MKTME_UNINITIALIZED           2
++static int mktme_status = MKTME_UNINITIALIZED;
++
++static void detect_tme_early(struct cpuinfo_x86 *c)
++{
++      u64 tme_activate, tme_policy, tme_crypto_algs;
++      int keyid_bits = 0, nr_keyids = 0;
++      static u64 tme_activate_cpu0 = 0;
++
++      rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
++
++      if (mktme_status != MKTME_UNINITIALIZED) {
++              if (tme_activate != tme_activate_cpu0) {
++                      /* Broken BIOS? */
++                      pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
++                      pr_err_once("x86/tme: MKTME is not usable\n");
++                      mktme_status = MKTME_DISABLED;
++
++                      /* Proceed. We may need to exclude bits from x86_phys_bits. */
++              }
++      } else {
++              tme_activate_cpu0 = tme_activate;
++      }
++
++      if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
++              pr_info_once("x86/tme: not enabled by BIOS\n");
++              mktme_status = MKTME_DISABLED;
++              return;
++      }
++
++      if (mktme_status != MKTME_UNINITIALIZED)
++              goto detect_keyid_bits;
++
++      pr_info("x86/tme: enabled by BIOS\n");
++
++      tme_policy = TME_ACTIVATE_POLICY(tme_activate);
++      if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
++              pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
++
++      tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
++      if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
++              pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
++                              tme_crypto_algs);
++              mktme_status = MKTME_DISABLED;
++      }
++detect_keyid_bits:
++      keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
++      nr_keyids = (1UL << keyid_bits) - 1;
++      if (nr_keyids) {
++              pr_info_once("x86/mktme: enabled by BIOS\n");
++              pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
++      } else {
++              pr_info_once("x86/mktme: disabled by BIOS\n");
++      }
++
++      if (mktme_status == MKTME_UNINITIALIZED) {
++              /* MKTME is usable */
++              mktme_status = MKTME_ENABLED;
++      }
++
++      /*
++       * KeyID bits effectively lower the number of physical address
++       * bits.  Update cpuinfo_x86::x86_phys_bits accordingly.
++       */
++      c->x86_phys_bits -= keyid_bits;
++}
++
+ static void early_init_intel(struct cpuinfo_x86 *c)
+ {
+       u64 misc_enable;
+@@ -322,6 +406,13 @@ static void early_init_intel(struct cpui
+        */
+       if (detect_extended_topology_early(c) < 0)
+               detect_ht_early(c);
++
++      /*
++       * Adjust the number of physical bits early because it affects the
++       * valid bits of the MTRR mask registers.
++       */
++      if (cpu_has(c, X86_FEATURE_TME))
++              detect_tme_early(c);
+ }
+ static void bsp_init_intel(struct cpuinfo_x86 *c)
+@@ -482,90 +573,6 @@ static void srat_detect_node(struct cpui
+ #endif
+ }
+-#define MSR_IA32_TME_ACTIVATE         0x982
+-
+-/* Helpers to access TME_ACTIVATE MSR */
+-#define TME_ACTIVATE_LOCKED(x)                (x & 0x1)
+-#define TME_ACTIVATE_ENABLED(x)               (x & 0x2)
+-
+-#define TME_ACTIVATE_POLICY(x)                ((x >> 4) & 0xf)        /* Bits 7:4 */
+-#define TME_ACTIVATE_POLICY_AES_XTS_128       0
+-
+-#define TME_ACTIVATE_KEYID_BITS(x)    ((x >> 32) & 0xf)       /* Bits 35:32 */
+-
+-#define TME_ACTIVATE_CRYPTO_ALGS(x)   ((x >> 48) & 0xffff)    /* Bits 63:48 */
+-#define TME_ACTIVATE_CRYPTO_AES_XTS_128       1
+-
+-/* Values for mktme_status (SW only construct) */
+-#define MKTME_ENABLED                 0
+-#define MKTME_DISABLED                        1
+-#define MKTME_UNINITIALIZED           2
+-static int mktme_status = MKTME_UNINITIALIZED;
+-
+-static void detect_tme(struct cpuinfo_x86 *c)
+-{
+-      u64 tme_activate, tme_policy, tme_crypto_algs;
+-      int keyid_bits = 0, nr_keyids = 0;
+-      static u64 tme_activate_cpu0 = 0;
+-
+-      rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
+-
+-      if (mktme_status != MKTME_UNINITIALIZED) {
+-              if (tme_activate != tme_activate_cpu0) {
+-                      /* Broken BIOS? */
+-                      pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
+-                      pr_err_once("x86/tme: MKTME is not usable\n");
+-                      mktme_status = MKTME_DISABLED;
+-
+-                      /* Proceed. We may need to exclude bits from x86_phys_bits. */
+-              }
+-      } else {
+-              tme_activate_cpu0 = tme_activate;
+-      }
+-
+-      if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
+-              pr_info_once("x86/tme: not enabled by BIOS\n");
+-              mktme_status = MKTME_DISABLED;
+-              return;
+-      }
+-
+-      if (mktme_status != MKTME_UNINITIALIZED)
+-              goto detect_keyid_bits;
+-
+-      pr_info("x86/tme: enabled by BIOS\n");
+-
+-      tme_policy = TME_ACTIVATE_POLICY(tme_activate);
+-      if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
+-              pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
+-
+-      tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
+-      if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
+-              pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
+-                              tme_crypto_algs);
+-              mktme_status = MKTME_DISABLED;
+-      }
+-detect_keyid_bits:
+-      keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+-      nr_keyids = (1UL << keyid_bits) - 1;
+-      if (nr_keyids) {
+-              pr_info_once("x86/mktme: enabled by BIOS\n");
+-              pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
+-      } else {
+-              pr_info_once("x86/mktme: disabled by BIOS\n");
+-      }
+-
+-      if (mktme_status == MKTME_UNINITIALIZED) {
+-              /* MKTME is usable */
+-              mktme_status = MKTME_ENABLED;
+-      }
+-
+-      /*
+-       * KeyID bits effectively lower the number of physical address
+-       * bits.  Update cpuinfo_x86::x86_phys_bits accordingly.
+-       */
+-      c->x86_phys_bits -= keyid_bits;
+-}
+-
+ static void init_cpuid_fault(struct cpuinfo_x86 *c)
+ {
+       u64 msr;
+@@ -702,9 +709,6 @@ static void init_intel(struct cpuinfo_x8
+       init_ia32_feat_ctl(c);
+-      if (cpu_has(c, X86_FEATURE_TME))
+-              detect_tme(c);
+-
+       init_intel_misc_features(c);
+       split_lock_init();
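
To make the arithmetic concrete, here is a standalone decode of an example MSR_IA32_TME_ACTIVATE value using the same field macros as the patch; the MSR value itself is made up for illustration. With 6 KeyID bits, a 52-bit MAXPHYADDR shrinks to the 46 bits quoted in the commit message.

#include <stdio.h>
#include <stdint.h>

#define TME_ACTIVATE_LOCKED(x)       ((x) & 0x1)
#define TME_ACTIVATE_ENABLED(x)      ((x) & 0x2)
#define TME_ACTIVATE_KEYID_BITS(x)   (((x) >> 32) & 0xf)   /* bits 35:32 */

int main(void)
{
        /* Example only: locked, enabled, 6 KeyID bits. */
        uint64_t tme_activate = 0x0000000600000003ULL;
        int phys_bits = 52;

        if (TME_ACTIVATE_LOCKED(tme_activate) &&
            TME_ACTIVATE_ENABLED(tme_activate))
                phys_bits -= (int)TME_ACTIVATE_KEYID_BITS(tme_activate);

        printf("effective x86_phys_bits = %d\n", phys_bits);   /* 46 */
        return 0;
}
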
diff --git a/queue-6.7/x86-e820-don-t-reserve-setup_rng_seed-in-e820.patch b/queue-6.7/x86-e820-don-t-reserve-setup_rng_seed-in-e820.patch
new file mode 100644
index 0000000..1864d19
--- /dev/null
@@ -0,0 +1,51 @@
+From 7fd817c906503b6813ea3b41f5fdf4192449a707 Mon Sep 17 00:00:00 2001
+From: Jiri Bohac <jbohac@suse.cz>
+Date: Wed, 31 Jan 2024 01:04:28 +0100
+Subject: x86/e820: Don't reserve SETUP_RNG_SEED in e820
+
+From: Jiri Bohac <jbohac@suse.cz>
+
+commit 7fd817c906503b6813ea3b41f5fdf4192449a707 upstream.
+
+SETUP_RNG_SEED in setup_data is supplied by kexec and should
+not be reserved in the e820 map.
+
+Doing so reserves 16 bytes of RAM when booting with kexec.
+(16 bytes because data->len is zeroed by parse_setup_data so only
+sizeof(setup_data) is reserved.)
+
+When kexec is used repeatedly, each boot adds two entries in the
+kexec-provided e820 map as the 16-byte range splits a larger
+range of usable memory. Eventually all of the 128 available entries
+get used up. The next split will result in losing usable memory
+as the new entries cannot be added to the e820 map.
+
+Fixes: 68b8e9713c8e ("x86/setup: Use rng seeds from setup_data")
+Signed-off-by: Jiri Bohac <jbohac@suse.cz>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: <stable@kernel.org>
+Link: https://lore.kernel.org/r/ZbmOjKnARGiaYBd5@dwarf.suse.cz
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/e820.c |    8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/kernel/e820.c
++++ b/arch/x86/kernel/e820.c
+@@ -1017,10 +1017,12 @@ void __init e820__reserve_setup_data(voi
+               e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+               /*
+-               * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need
+-               * to be reserved.
++               * SETUP_EFI, SETUP_IMA and SETUP_RNG_SEED are supplied by
++               * kexec and do not need to be reserved.
+                */
+-              if (data->type != SETUP_EFI && data->type != SETUP_IMA)
++              if (data->type != SETUP_EFI &&
++                  data->type != SETUP_IMA &&
++                  data->type != SETUP_RNG_SEED)
+                       e820__range_update_kexec(pa_data,
+                                                sizeof(*data) + data->len,
+                                                E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
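
The filter logic the patch extends can be sketched standalone. The type values below are illustrative stand-ins rather than the exact setup_data ABI constants; the point is only that kexec-supplied types, now including SETUP_RNG_SEED, are skipped when re-reserving setup_data in the kexec e820 map.

#include <stdio.h>

/* Illustrative stand-ins for the setup_data type constants. */
enum setup_type {
        SETUP_E820_EXT  = 1,
        SETUP_EFI       = 4,
        SETUP_IMA       = 8,
        SETUP_RNG_SEED  = 9,
};

static int needs_kexec_reservation(enum setup_type type)
{
        /* The fix adds SETUP_RNG_SEED to the existing exclusions. */
        return type != SETUP_EFI &&
               type != SETUP_IMA &&
               type != SETUP_RNG_SEED;
}

int main(void)
{
        printf("SETUP_RNG_SEED reserved: %d\n",
               needs_kexec_reservation(SETUP_RNG_SEED));   /* 0: skipped */
        printf("SETUP_E820_EXT reserved: %d\n",
               needs_kexec_reservation(SETUP_E820_EXT));   /* 1: reserved */
        return 0;
}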