Fixes for 6.0
author     Sasha Levin <sashal@kernel.org>
           Sat, 19 Nov 2022 17:24:33 +0000 (12:24 -0500)
committer  Sasha Levin <sashal@kernel.org>
           Sat, 19 Nov 2022 17:24:33 +0000 (12:24 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-6.0/arm64-fix-rodata-full-again.patch [new file with mode: 0644]
queue-6.0/arm64-mm-fold-check-for-kfence-into-can_set_direct_m.patch [new file with mode: 0644]
queue-6.0/hugetlb-rename-remove_huge_page-to-hugetlb_delete_fr.patch [new file with mode: 0644]
queue-6.0/hugetlbfs-don-t-delete-error-page-from-pagecache.patch [new file with mode: 0644]
queue-6.0/kvm-svm-do-not-allocate-struct-svm_cpu_data-dynamica.patch [new file with mode: 0644]
queue-6.0/kvm-svm-move-msr_ia32_spec_ctrl-save-restore-to-asse.patch [new file with mode: 0644]
queue-6.0/kvm-svm-remove-dead-field-from-struct-svm_cpu_data.patch [new file with mode: 0644]
queue-6.0/kvm-svm-restore-host-save-area-from-assembly.patch [new file with mode: 0644]
queue-6.0/series

diff --git a/queue-6.0/arm64-fix-rodata-full-again.patch b/queue-6.0/arm64-fix-rodata-full-again.patch
new file mode 100644 (file)
index 0000000..20a2b5d
--- /dev/null
@@ -0,0 +1,61 @@
+From f7c7c8e06c1329617dab170683bfc2069acf4c8a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 3 Nov 2022 18:00:15 +0100
+Subject: arm64: fix rodata=full again
+
+From: Ard Biesheuvel <ardb@kernel.org>
+
+[ Upstream commit 2081b3bd0c11757725dcab9ba5d38e1bddb03459 ]
+
+Commit 2e8cff0a0eee87b2 ("arm64: fix rodata=full") addressed a couple of
+issues with the rodata= kernel command line option, which is not a
+simple boolean on arm64, and inadvertently got broken due to changes in
+the generic bool handling.
+
+Unfortunately, the resulting code never clears the rodata_full boolean
+variable if it defaults to true and rodata=on or rodata=off is passed,
+as the generic code is not aware of the existence of this variable.
+
+Given the way this code is plumbed together, clearing rodata_full when
+returning false from arch_parse_debug_rodata() may result in
+inconsistencies if the generic code decides that it cannot parse the
+right hand side, so the best way to deal with this is to only take
+rodata_full into account if rodata_enabled is also true.
+
+Fixes: 2e8cff0a0eee ("arm64: fix rodata=full")
+Cc: <stable@vger.kernel.org> # 6.0.x
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Acked-by: Will Deacon <will@kernel.org>
+Link: https://lore.kernel.org/r/20221103170015.4124426-1-ardb@kernel.org
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/arm64/mm/pageattr.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
+index d107c3d434e2..5922178d7a06 100644
+--- a/arch/arm64/mm/pageattr.c
++++ b/arch/arm64/mm/pageattr.c
+@@ -26,7 +26,7 @@ bool can_set_direct_map(void)
+        * mapped at page granularity, so that it is possible to
+        * protect/unprotect single pages.
+        */
+-      return rodata_full || debug_pagealloc_enabled() ||
++      return (rodata_enabled && rodata_full) || debug_pagealloc_enabled() ||
+               IS_ENABLED(CONFIG_KFENCE);
+ }
+@@ -102,7 +102,8 @@ static int change_memory_common(unsigned long addr, int numpages,
+        * If we are manipulating read-only permissions, apply the same
+        * change to the linear mapping of the pages that back this VM area.
+        */
+-      if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
++      if (rodata_enabled &&
++          rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
+                           pgprot_val(clear_mask) == PTE_RDONLY)) {
+               for (i = 0; i < area->nr_pages; i++) {
+                       __change_memory_common((u64)page_address(area->pages[i]),
+-- 
+2.35.1
+
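The interaction fixed above can be condensed into a small stand-alone model: rodata_full defaults to true under CONFIG_RODATA_FULL_DEFAULT_ENABLED and nothing ever clears it for rodata=on or rodata=off, so can_set_direct_map() has to gate it on rodata_enabled. The C sketch below is a simplified user-space model of that logic; parse_rodata() is a hypothetical stand-in for the generic command-line handling, and the debug_pagealloc/KFENCE terms are omitted.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool rodata_enabled = true;      /* generic flag, managed by the core parser */
static bool rodata_full    = true;      /* arm64 default when RODATA_FULL_DEFAULT_ENABLED=y */

static void parse_rodata(const char *arg)
{
        /* hypothetical stand-in for the generic rodata= handling */
        if (!strcmp(arg, "off"))
                rodata_enabled = false;
        else if (!strcmp(arg, "on"))
                rodata_enabled = true;
        else if (!strcmp(arg, "full"))
                rodata_enabled = rodata_full = true;    /* arch hook */
        /* note that nothing here ever clears rodata_full -- the bug above */
}

static bool can_set_direct_map(void)
{
        /*
         * The old code returned plain rodata_full, which stayed true for
         * rodata=off; the fix only honours it when rodata is enabled.
         * (debug_pagealloc and KFENCE terms omitted in this model.)
         */
        return rodata_enabled && rodata_full;
}

int main(void)
{
        const char *args[] = { "full", "on", "off" };

        for (unsigned int i = 0; i < sizeof(args) / sizeof(args[0]); i++) {
                rodata_enabled = true;
                rodata_full = true;
                parse_rodata(args[i]);
                printf("rodata=%-4s -> page-granular linear map: %s\n",
                       args[i], can_set_direct_map() ? "yes" : "no");
        }
        return 0;
}
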
diff --git a/queue-6.0/arm64-mm-fold-check-for-kfence-into-can_set_direct_m.patch b/queue-6.0/arm64-mm-fold-check-for-kfence-into-can_set_direct_m.patch
new file mode 100644 (file)
index 0000000..3921ad6
--- /dev/null
@@ -0,0 +1,81 @@
+From 829c8c4d6a38d2e384016170c505d8ecef5815a0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 21 Sep 2022 10:48:41 +0300
+Subject: arm64/mm: fold check for KFENCE into can_set_direct_map()
+
+From: Mike Rapoport <rppt@linux.ibm.com>
+
+[ Upstream commit b9dd04a20f81333e4b99662f1bbaf7c9e3a1e137 ]
+
+KFENCE requires linear map to be mapped at page granularity, so that it
+is possible to protect/unprotect single pages, just like with
+rodata_full and DEBUG_PAGEALLOC.
+
+Instead of repeating
+
+       can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE)
+
+make can_set_direct_map() handle the KFENCE case.
+
+This also prevents potential false positives in kernel_page_present()
+that may return true for non-present page if CONFIG_KFENCE is enabled.
+
+Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
+Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
+Link: https://lore.kernel.org/r/20220921074841.382615-1-rppt@kernel.org
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Stable-dep-of: 2081b3bd0c11 ("arm64: fix rodata=full again")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/arm64/mm/mmu.c      | 8 ++------
+ arch/arm64/mm/pageattr.c | 8 +++++++-
+ 2 files changed, 9 insertions(+), 7 deletions(-)
+
+diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
+index eb489302c28a..e8de94dd5a60 100644
+--- a/arch/arm64/mm/mmu.c
++++ b/arch/arm64/mm/mmu.c
+@@ -539,7 +539,7 @@ static void __init map_mem(pgd_t *pgdp)
+        */
+       BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
+-      if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
++      if (can_set_direct_map())
+               flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
+       /*
+@@ -1551,11 +1551,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
+       VM_BUG_ON(!mhp_range_allowed(start, size, true));
+-      /*
+-       * KFENCE requires linear map to be mapped at page granularity, so that
+-       * it is possible to protect/unprotect single pages in the KFENCE pool.
+-       */
+-      if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
++      if (can_set_direct_map())
+               flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
+       __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
+diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
+index 64e985eaa52d..d107c3d434e2 100644
+--- a/arch/arm64/mm/pageattr.c
++++ b/arch/arm64/mm/pageattr.c
+@@ -21,7 +21,13 @@ bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED
+ bool can_set_direct_map(void)
+ {
+-      return rodata_full || debug_pagealloc_enabled();
++      /*
++       * rodata_full, DEBUG_PAGEALLOC and KFENCE require linear map to be
++       * mapped at page granularity, so that it is possible to
++       * protect/unprotect single pages.
++       */
++      return rodata_full || debug_pagealloc_enabled() ||
++              IS_ENABLED(CONFIG_KFENCE);
+ }
+ static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
+-- 
+2.35.1
+
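One way to see the kernel_page_present() false positive mentioned in the commit message: that helper short-circuits to "present" whenever can_set_direct_map() is false, on the assumption that the linear map uses block mappings and is never unmapped. The user-space sketch below models that decision before and after the fold; page_table_says_present() is a hypothetical stand-in for the real page-table walk, and the config knobs are hard-coded assumptions.

#include <stdbool.h>
#include <stdio.h>

#define CONFIG_KFENCE_ENABLED 1         /* assumed: KFENCE=y, rodata=on, no DEBUG_PAGEALLOC */

static bool rodata_full;                /* false: rodata=on rather than "full" */
static bool debug_pagealloc_enabled(void) { return false; }

/* before the patch: KFENCE only handled at the mapping call sites */
static bool can_set_direct_map_old(void)
{
        return rodata_full || debug_pagealloc_enabled();
}

/* after the patch: KFENCE folded into the helper */
static bool can_set_direct_map_new(void)
{
        return rodata_full || debug_pagealloc_enabled() || CONFIG_KFENCE_ENABLED;
}

/* hypothetical stand-in for walking the linear-map page tables */
static bool page_table_says_present(void)
{
        return false;                   /* e.g. a page KFENCE has unmapped */
}

static bool kernel_page_present(bool (*can_set_direct_map)(void))
{
        /* block-mapped linear map: assume present without walking */
        if (!can_set_direct_map())
                return true;
        return page_table_says_present();
}

int main(void)
{
        printf("old helper: present=%d (false positive for a KFENCE-unmapped page)\n",
               kernel_page_present(can_set_direct_map_old));
        printf("new helper: present=%d\n",
               kernel_page_present(can_set_direct_map_new));
        return 0;
}
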
diff --git a/queue-6.0/hugetlb-rename-remove_huge_page-to-hugetlb_delete_fr.patch b/queue-6.0/hugetlb-rename-remove_huge_page-to-hugetlb_delete_fr.patch
new file mode 100644 (file)
index 0000000..5f8ea6c
--- /dev/null
@@ -0,0 +1,155 @@
+From dc228ca9ad0cd11124b1e535d9c108affb272223 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 14 Sep 2022 15:18:04 -0700
+Subject: hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache
+
+From: Mike Kravetz <mike.kravetz@oracle.com>
+
+[ Upstream commit 7e1813d48dd30e6c6f235f6661d1bc108fcab528 ]
+
+remove_huge_page removes a hugetlb page from the page cache.  Change to
+hugetlb_delete_from_page_cache as it is a more descriptive name.
+huge_add_to_page_cache is global in scope, but only deals with hugetlb
+pages.  For consistency and clarity, rename to hugetlb_add_to_page_cache.
+
+Link: https://lkml.kernel.org/r/20220914221810.95771-4-mike.kravetz@oracle.com
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: James Houghton <jthoughton@google.com>
+Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Mina Almasry <almasrymina@google.com>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
+Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
+Cc: Sven Schnelle <svens@linux.ibm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 8625147cafaa ("hugetlbfs: don't delete error page from pagecache")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/hugetlbfs/inode.c    | 21 ++++++++++-----------
+ include/linux/hugetlb.h |  2 +-
+ mm/hugetlb.c            |  8 ++++----
+ 3 files changed, 15 insertions(+), 16 deletions(-)
+
+diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
+index f7a5b5124d8a..b6406e7ab64b 100644
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -364,7 +364,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
+       return -EINVAL;
+ }
+-static void remove_huge_page(struct page *page)
++static void hugetlb_delete_from_page_cache(struct page *page)
+ {
+       ClearPageDirty(page);
+       ClearPageUptodate(page);
+@@ -487,15 +487,14 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
+                       folio_lock(folio);
+                       /*
+                        * We must free the huge page and remove from page
+-                       * cache (remove_huge_page) BEFORE removing the
+-                       * region/reserve map (hugetlb_unreserve_pages).  In
+-                       * rare out of memory conditions, removal of the
+-                       * region/reserve map could fail. Correspondingly,
+-                       * the subpool and global reserve usage count can need
+-                       * to be adjusted.
++                       * cache BEFORE removing the region/reserve map
++                       * (hugetlb_unreserve_pages).  In rare out of memory
++                       * conditions, removal of the region/reserve map could
++                       * fail. Correspondingly, the subpool and global
++                       * reserve usage count can need to be adjusted.
+                        */
+                       VM_BUG_ON(HPageRestoreReserve(&folio->page));
+-                      remove_huge_page(&folio->page);
++                      hugetlb_delete_from_page_cache(&folio->page);
+                       freed++;
+                       if (!truncate_op) {
+                               if (unlikely(hugetlb_unreserve_pages(inode,
+@@ -737,7 +736,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+               }
+               clear_huge_page(page, addr, pages_per_huge_page(h));
+               __SetPageUptodate(page);
+-              error = huge_add_to_page_cache(page, mapping, index);
++              error = hugetlb_add_to_page_cache(page, mapping, index);
+               if (unlikely(error)) {
+                       restore_reserve_on_error(h, &pseudo_vma, addr, page);
+                       put_page(page);
+@@ -749,7 +748,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+               SetHPageMigratable(page);
+               /*
+-               * unlock_page because locked by huge_add_to_page_cache()
++               * unlock_page because locked by hugetlb_add_to_page_cache()
+                * put_page() due to reference from alloc_huge_page()
+                */
+               unlock_page(page);
+@@ -994,7 +993,7 @@ static int hugetlbfs_error_remove_page(struct address_space *mapping,
+       struct inode *inode = mapping->host;
+       pgoff_t index = page->index;
+-      remove_huge_page(page);
++      hugetlb_delete_from_page_cache(page);
+       if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
+               hugetlb_fix_reserve_counts(inode);
+diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
+index 67c88b82fc32..53db3648207a 100644
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -665,7 +665,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+                               nodemask_t *nmask, gfp_t gfp_mask);
+ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+                               unsigned long address);
+-int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
++int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
+                       pgoff_t idx);
+ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+                               unsigned long address, struct page *page);
+diff --git a/mm/hugetlb.c b/mm/hugetlb.c
+index ecc197d24efb..5e414c90f82f 100644
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -5445,7 +5445,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
+       return page != NULL;
+ }
+-int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
++int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
+                          pgoff_t idx)
+ {
+       struct folio *folio = page_folio(page);
+@@ -5583,7 +5583,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
+               new_page = true;
+               if (vma->vm_flags & VM_MAYSHARE) {
+-                      int err = huge_add_to_page_cache(page, mapping, idx);
++                      int err = hugetlb_add_to_page_cache(page, mapping, idx);
+                       if (err) {
+                               put_page(page);
+                               if (err == -EEXIST)
+@@ -6008,11 +6008,11 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+               /*
+                * Serialization between remove_inode_hugepages() and
+-               * huge_add_to_page_cache() below happens through the
++               * hugetlb_add_to_page_cache() below happens through the
+                * hugetlb_fault_mutex_table that here must be hold by
+                * the caller.
+                */
+-              ret = huge_add_to_page_cache(page, mapping, idx);
++              ret = hugetlb_add_to_page_cache(page, mapping, idx);
+               if (ret)
+                       goto out_release_nounlock;
+               page_in_pagecache = true;
+-- 
+2.35.1
+
diff --git a/queue-6.0/hugetlbfs-don-t-delete-error-page-from-pagecache.patch b/queue-6.0/hugetlbfs-don-t-delete-error-page-from-pagecache.patch
new file mode 100644 (file)
index 0000000..92bfd92
--- /dev/null
@@ -0,0 +1,123 @@
+From 111ecae8867badca78f721b3633ab2d59c94ed01 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 18 Oct 2022 20:01:25 +0000
+Subject: hugetlbfs: don't delete error page from pagecache
+
+From: James Houghton <jthoughton@google.com>
+
+[ Upstream commit 8625147cafaa9ba74713d682f5185eb62cb2aedb ]
+
+This change is very similar to the change that was made for shmem [1], and
+it solves the same problem but for HugeTLBFS instead.
+
+Currently, when poison is found in a HugeTLB page, the page is removed
+from the page cache.  That means that attempting to map or read that
+hugepage in the future will result in a new hugepage being allocated
+instead of notifying the user that the page was poisoned.  As [1] states,
+this is effectively memory corruption.
+
+The fix is to leave the page in the page cache.  If the user attempts to
+use a poisoned HugeTLB page with a syscall, the syscall will fail with
+EIO, the same error code that shmem uses.  For attempts to map the page,
+the thread will get a BUS_MCEERR_AR SIGBUS.
+
+[1]: commit a76054266661 ("mm: shmem: don't truncate page if memory failure happens")
+
+Link: https://lkml.kernel.org/r/20221018200125.848471-1-jthoughton@google.com
+Signed-off-by: James Houghton <jthoughton@google.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Tested-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Reviewed-by: Yang Shi <shy828301@gmail.com>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: James Houghton <jthoughton@google.com>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/hugetlbfs/inode.c | 13 ++++++-------
+ mm/hugetlb.c         |  4 ++++
+ mm/memory-failure.c  |  5 ++++-
+ 3 files changed, 14 insertions(+), 8 deletions(-)
+
+diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
+index b6406e7ab64b..fbcfa6bfee80 100644
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
+               } else {
+                       unlock_page(page);
++                      if (PageHWPoison(page)) {
++                              put_page(page);
++                              retval = -EIO;
++                              break;
++                      }
++
+                       /*
+                        * We have the page, copy it to user space buffer.
+                        */
+@@ -990,13 +996,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
+ static int hugetlbfs_error_remove_page(struct address_space *mapping,
+                               struct page *page)
+ {
+-      struct inode *inode = mapping->host;
+-      pgoff_t index = page->index;
+-
+-      hugetlb_delete_from_page_cache(page);
+-      if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
+-              hugetlb_fix_reserve_counts(inode);
+-
+       return 0;
+ }
+diff --git a/mm/hugetlb.c b/mm/hugetlb.c
+index 5e414c90f82f..dbb558e71e9e 100644
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -6021,6 +6021,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+       ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
+       spin_lock(ptl);
++      ret = -EIO;
++      if (PageHWPoison(page))
++              goto out_release_unlock;
++
+       /*
+        * Recheck the i_size after holding PT lock to make sure not
+        * to leave any page mapped (as page_mapped()) beyond the end
+diff --git a/mm/memory-failure.c b/mm/memory-failure.c
+index e7ac570dda75..4d302f6b02fc 100644
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1079,6 +1079,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
+       int res;
+       struct page *hpage = compound_head(p);
+       struct address_space *mapping;
++      bool extra_pins = false;
+       if (!PageHuge(hpage))
+               return MF_DELAYED;
+@@ -1086,6 +1087,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
+       mapping = page_mapping(hpage);
+       if (mapping) {
+               res = truncate_error_page(hpage, page_to_pfn(p), mapping);
++              /* The page is kept in page cache. */
++              extra_pins = true;
+               unlock_page(hpage);
+       } else {
+               unlock_page(hpage);
+@@ -1103,7 +1106,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
+               }
+       }
+-      if (has_extra_refcount(ps, p, false))
++      if (has_extra_refcount(ps, p, extra_pins))
+               res = MF_FAILED;
+       return res;
+-- 
+2.35.1
+
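From user space, the visible effect of keeping the poisoned page in the page cache is that read() on the affected range now fails with EIO instead of quietly returning data from a freshly allocated hugepage (and a mapping access raises SIGBUS). A minimal consumer might check for it as in the sketch below; the /dev/hugepages path and file name are assumptions, since hugetlbfs can be mounted anywhere.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/dev/hugepages/example";    /* assumed hugetlbfs file */
        char buf[4096];
        ssize_t n;
        int fd = open(path, O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }

        n = read(fd, buf, sizeof(buf));
        if (n < 0) {
                if (errno == EIO)
                        /* hugepage is HWPoison: surface the error, do not retry */
                        fprintf(stderr, "%s: hugepage is poisoned (EIO)\n", path);
                else
                        perror("read");
                close(fd);
                return 1;
        }

        printf("read %zd bytes\n", n);
        close(fd);
        return 0;
}
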
diff --git a/queue-6.0/kvm-svm-do-not-allocate-struct-svm_cpu_data-dynamica.patch b/queue-6.0/kvm-svm-do-not-allocate-struct-svm_cpu_data-dynamica.patch
new file mode 100644 (file)
index 0000000..7cd4335
--- /dev/null
@@ -0,0 +1,194 @@
+From a0a3bfa11ebc70ae398450478ae02e91622e420c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 9 Nov 2022 09:07:55 -0500
+Subject: KVM: SVM: do not allocate struct svm_cpu_data dynamically
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+[ Upstream commit 73412dfeea724e6bd775ba64d21157ff322eac9a ]
+
+The svm_data percpu variable is a pointer, but it is allocated via
+svm_hardware_setup() when KVM is loaded.  Unlike hardware_enable()
+this means that it is never NULL for the whole lifetime of KVM, and
+static allocation does not waste any memory compared to the status quo.
+It is also more efficient and more easily handled from assembly code,
+so do it and don't look back.
+
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Stable-dep-of: e287bd005ad9 ("KVM: SVM: restore host save area from assembly")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/sev.c |  4 ++--
+ arch/x86/kvm/svm/svm.c | 41 +++++++++++++++--------------------------
+ arch/x86/kvm/svm/svm.h |  2 +-
+ 3 files changed, 18 insertions(+), 29 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
+index c9c9bd453a97..efaaef2b7ae1 100644
+--- a/arch/x86/kvm/svm/sev.c
++++ b/arch/x86/kvm/svm/sev.c
+@@ -196,7 +196,7 @@ static void sev_asid_free(struct kvm_sev_info *sev)
+       __set_bit(sev->asid, sev_reclaim_asid_bitmap);
+       for_each_possible_cpu(cpu) {
+-              sd = per_cpu(svm_data, cpu);
++              sd = per_cpu_ptr(&svm_data, cpu);
+               sd->sev_vmcbs[sev->asid] = NULL;
+       }
+@@ -2600,7 +2600,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
+ void pre_sev_run(struct vcpu_svm *svm, int cpu)
+ {
+-      struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
++      struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+       int asid = sev_get_asid(svm->vcpu.kvm);
+       /* Assign the asid allocated with this SEV guest */
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index ecf4d8233e49..6b2f332f5d54 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -245,7 +245,7 @@ struct kvm_ldttss_desc {
+       u32 zero1;
+ } __attribute__((packed));
+-DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
++DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
+ /*
+  * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
+@@ -583,12 +583,7 @@ static int svm_hardware_enable(void)
+               pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
+               return -EINVAL;
+       }
+-      sd = per_cpu(svm_data, me);
+-      if (!sd) {
+-              pr_err("%s: svm_data is NULL on %d\n", __func__, me);
+-              return -EINVAL;
+-      }
+-
++      sd = per_cpu_ptr(&svm_data, me);
+       sd->asid_generation = 1;
+       sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
+       sd->next_asid = sd->max_asid + 1;
+@@ -648,41 +643,35 @@ static int svm_hardware_enable(void)
+ static void svm_cpu_uninit(int cpu)
+ {
+-      struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
++      struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+-      if (!sd)
++      if (!sd->save_area)
+               return;
+-      per_cpu(svm_data, cpu) = NULL;
+       kfree(sd->sev_vmcbs);
+       __free_page(sd->save_area);
+-      kfree(sd);
++      sd->save_area = NULL;
+ }
+ static int svm_cpu_init(int cpu)
+ {
+-      struct svm_cpu_data *sd;
++      struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+       int ret = -ENOMEM;
+-      sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
+-      if (!sd)
+-              return ret;
++      memset(sd, 0, sizeof(struct svm_cpu_data));
+       sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       if (!sd->save_area)
+-              goto free_cpu_data;
++              return ret;
+       ret = sev_cpu_init(sd);
+       if (ret)
+               goto free_save_area;
+-      per_cpu(svm_data, cpu) = sd;
+-
+       return 0;
+ free_save_area:
+       __free_page(sd->save_area);
+-free_cpu_data:
+-      kfree(sd);
++      sd->save_area = NULL;
+       return ret;
+ }
+@@ -1426,7 +1415,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb)
+       int i;
+       for_each_online_cpu(i)
+-              cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
++              cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
+ }
+ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
+@@ -1451,7 +1440,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
+ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+-      struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
++      struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+       if (sev_es_guest(vcpu->kvm))
+               sev_es_unmap_ghcb(svm);
+@@ -1488,7 +1477,7 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
+ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+-      struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
++      struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+       if (sd->current_vmcb != svm->vmcb) {
+               sd->current_vmcb = svm->vmcb;
+@@ -3443,7 +3432,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+ static void reload_tss(struct kvm_vcpu *vcpu)
+ {
+-      struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
++      struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+       sd->tss_desc->type = 9; /* available 32/64-bit TSS */
+       load_TR_desc();
+@@ -3451,7 +3440,7 @@ static void reload_tss(struct kvm_vcpu *vcpu)
+ static void pre_svm_run(struct kvm_vcpu *vcpu)
+ {
+-      struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
++      struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+       struct vcpu_svm *svm = to_svm(vcpu);
+       /*
+@@ -3920,7 +3909,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+       if (sev_es_guest(vcpu->kvm)) {
+               __svm_sev_es_vcpu_run(svm);
+       } else {
+-              struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
++              struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+               __svm_vcpu_run(svm);
+               vmload(__sme_page_pa(sd->save_area));
+diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
+index 8a8894d948a0..f1483209e186 100644
+--- a/arch/x86/kvm/svm/svm.h
++++ b/arch/x86/kvm/svm/svm.h
+@@ -294,7 +294,7 @@ struct svm_cpu_data {
+       struct vmcb **sev_vmcbs;
+ };
+-DECLARE_PER_CPU(struct svm_cpu_data *, svm_data);
++DECLARE_PER_CPU(struct svm_cpu_data, svm_data);
+ void recalc_intercepts(struct vcpu_svm *svm);
+-- 
+2.35.1
+
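The heart of the change is replacing a per-CPU pointer that had to be kzalloc()'d when KVM loads (and NULL-checked everywhere) with per-CPU storage that exists for the whole lifetime of the kernel. The sketch below models the difference in user space with a plain array standing in for the per-CPU area; NR_CPUS and the single field are simplifications, not the real struct.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct svm_cpu_data {
        unsigned long save_area_pa;     /* simplified: one field only */
};

/* old scheme: per-CPU pointers, allocated at init time, possibly NULL */
static struct svm_cpu_data *svm_data_ptr[NR_CPUS];

/* new scheme: per-CPU storage embedded in the image, never NULL */
static struct svm_cpu_data svm_data[NR_CPUS];

static bool old_cpu_init(int cpu)
{
        svm_data_ptr[cpu] = calloc(1, sizeof(struct svm_cpu_data));
        return svm_data_ptr[cpu] != NULL;       /* every caller must handle failure */
}

static void new_cpu_init(int cpu)
{
        svm_data[cpu].save_area_pa = 0;         /* nothing to allocate, nothing to check */
}

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!old_cpu_init(cpu))
                        fprintf(stderr, "old scheme: cpu %d init failed\n", cpu);
                new_cpu_init(cpu);
        }

        /* old scheme: NULL check on every access */
        if (svm_data_ptr[0])
                svm_data_ptr[0]->save_area_pa = 0x1000;

        /* new scheme: direct access at a fixed, link-time-known location */
        svm_data[0].save_area_pa = 0x1000;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                free(svm_data_ptr[cpu]);
        return 0;
}
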
diff --git a/queue-6.0/kvm-svm-move-msr_ia32_spec_ctrl-save-restore-to-asse.patch b/queue-6.0/kvm-svm-move-msr_ia32_spec_ctrl-save-restore-to-asse.patch
new file mode 100644 (file)
index 0000000..2b71f62
--- /dev/null
@@ -0,0 +1,385 @@
+From 580156a1db7717586424b4c61efdf5480ce82804 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 30 Sep 2022 14:24:40 -0400
+Subject: KVM: SVM: move MSR_IA32_SPEC_CTRL save/restore to assembly
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+[ Upstream commit 9f2febf3f04daebdaaa5a43cfa20e3844905c0f9 ]
+
+Restoration of the host IA32_SPEC_CTRL value is probably too late
+with respect to the return thunk training sequence.
+
+With respect to the user/kernel boundary, AMD says, "If software chooses
+to toggle STIBP (e.g., set STIBP on kernel entry, and clear it on kernel
+exit), software should set STIBP to 1 before executing the return thunk
+training sequence." I assume the same requirements apply to the guest/host
+boundary. The return thunk training sequence is in vmenter.S, quite close
+to the VM-exit. On hosts without V_SPEC_CTRL, however, the host's
+IA32_SPEC_CTRL value is not restored until much later.
+
+To avoid this, move the restoration of host SPEC_CTRL to assembly and,
+for consistency, move the restoration of the guest SPEC_CTRL as well.
+This is not particularly difficult, apart from some care to cover both
+32- and 64-bit, and to share code between SEV-ES and normal vmentry.
+
+Cc: stable@vger.kernel.org
+Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
+Suggested-by: Jim Mattson <jmattson@google.com>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/cpu/bugs.c     |  13 +---
+ arch/x86/kvm/kvm-asm-offsets.c |   1 +
+ arch/x86/kvm/svm/svm.c         |  37 ++++------
+ arch/x86/kvm/svm/svm.h         |   4 +-
+ arch/x86/kvm/svm/vmenter.S     | 119 ++++++++++++++++++++++++++++++++-
+ 5 files changed, 136 insertions(+), 38 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index da7c361f47e0..6ec0b7ce7453 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -196,22 +196,15 @@ void __init check_bugs(void)
+ }
+ /*
+- * NOTE: This function is *only* called for SVM.  VMX spec_ctrl handling is
+- * done in vmenter.S.
++ * NOTE: This function is *only* called for SVM, since Intel uses
++ * MSR_IA32_SPEC_CTRL for SSBD.
+  */
+ void
+ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
+ {
+-      u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
++      u64 guestval, hostval;
+       struct thread_info *ti = current_thread_info();
+-      if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
+-              if (hostval != guestval) {
+-                      msrval = setguest ? guestval : hostval;
+-                      wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
+-              }
+-      }
+-
+       /*
+        * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
+        * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
+diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
+index 1b805cd24d66..24a710d37323 100644
+--- a/arch/x86/kvm/kvm-asm-offsets.c
++++ b/arch/x86/kvm/kvm-asm-offsets.c
+@@ -16,6 +16,7 @@ static void __used common(void)
+               BLANK();
+               OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
+               OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
++              OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
+               OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
+               OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
+               OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa);
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index c14fabd662f6..e80756ab141b 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -722,6 +722,15 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+       u32 offset;
+       u32 *msrpm;
++      /*
++       * For non-nested case:
++       * If the L01 MSR bitmap does not intercept the MSR, then we need to
++       * save it.
++       *
++       * For nested case:
++       * If the L02 MSR bitmap does not intercept the MSR, then we need to
++       * save it.
++       */
+       msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
+                                     to_svm(vcpu)->msrpm;
+@@ -3902,16 +3911,16 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+       return EXIT_FASTPATH_NONE;
+ }
+-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
++static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       guest_state_enter_irqoff();
+       if (sev_es_guest(vcpu->kvm))
+-              __svm_sev_es_vcpu_run(svm);
++              __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
+       else
+-              __svm_vcpu_run(svm);
++              __svm_vcpu_run(svm, spec_ctrl_intercepted);
+       guest_state_exit_irqoff();
+ }
+@@ -3919,6 +3928,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
++      bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+       trace_kvm_entry(vcpu);
+@@ -3977,26 +3987,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
+-      svm_vcpu_enter_exit(vcpu);
+-
+-      /*
+-       * We do not use IBRS in the kernel. If this vCPU has used the
+-       * SPEC_CTRL MSR it may have left it on; save the value and
+-       * turn it off. This is much more efficient than blindly adding
+-       * it to the atomic save/restore list. Especially as the former
+-       * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+-       *
+-       * For non-nested case:
+-       * If the L01 MSR bitmap does not intercept the MSR, then we need to
+-       * save it.
+-       *
+-       * For nested case:
+-       * If the L02 MSR bitmap does not intercept the MSR, then we need to
+-       * save it.
+-       */
+-      if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
+-          unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+-              svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
++      svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
+       if (!sev_es_guest(vcpu->kvm))
+               reload_tss(vcpu);
+diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
+index 8744f3b1d217..ea3049b978ea 100644
+--- a/arch/x86/kvm/svm/svm.h
++++ b/arch/x86/kvm/svm/svm.h
+@@ -683,7 +683,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
+ /* vmenter.S */
+-void __svm_sev_es_vcpu_run(struct vcpu_svm *svm);
+-void __svm_vcpu_run(struct vcpu_svm *svm);
++void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
++void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+ #endif
+diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
+index 57440acfc73e..34367dc203f2 100644
+--- a/arch/x86/kvm/svm/vmenter.S
++++ b/arch/x86/kvm/svm/vmenter.S
+@@ -32,9 +32,69 @@
+ .section .noinstr.text, "ax"
++.macro RESTORE_GUEST_SPEC_CTRL
++      /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
++      ALTERNATIVE_2 "", \
++              "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
++              "", X86_FEATURE_V_SPEC_CTRL
++801:
++.endm
++.macro RESTORE_GUEST_SPEC_CTRL_BODY
++800:
++      /*
++       * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
++       * host's, write the MSR.  This is kept out-of-line so that the common
++       * case does not have to jump.
++       *
++       * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
++       * there must not be any returns or indirect branches between this code
++       * and vmentry.
++       */
++      movl SVM_spec_ctrl(%_ASM_DI), %eax
++      cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
++      je 801b
++      mov $MSR_IA32_SPEC_CTRL, %ecx
++      xor %edx, %edx
++      wrmsr
++      jmp 801b
++.endm
++
++.macro RESTORE_HOST_SPEC_CTRL
++      /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
++      ALTERNATIVE_2 "", \
++              "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
++              "", X86_FEATURE_V_SPEC_CTRL
++901:
++.endm
++.macro RESTORE_HOST_SPEC_CTRL_BODY
++900:
++      /* Same for after vmexit.  */
++      mov $MSR_IA32_SPEC_CTRL, %ecx
++
++      /*
++       * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
++       * if it was not intercepted during guest execution.
++       */
++      cmpb $0, (%_ASM_SP)
++      jnz 998f
++      rdmsr
++      movl %eax, SVM_spec_ctrl(%_ASM_DI)
++998:
++
++      /* Now restore the host value of the MSR if different from the guest's.  */
++      movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
++      cmp SVM_spec_ctrl(%_ASM_DI), %eax
++      je 901b
++      xor %edx, %edx
++      wrmsr
++      jmp 901b
++.endm
++
++
+ /**
+  * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
+  * @svm:      struct vcpu_svm *
++ * @spec_ctrl_intercepted: bool
+  */
+ SYM_FUNC_START(__svm_vcpu_run)
+       push %_ASM_BP
+@@ -54,17 +114,26 @@ SYM_FUNC_START(__svm_vcpu_run)
+        * order compared to when they are needed.
+        */
++      /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL.  */
++      push %_ASM_ARG2
++
+       /* Needed to restore access to percpu variables.  */
+       __ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa)
+-      /* Save @svm. */
++      /* Finally save @svm. */
+       push %_ASM_ARG1
+ .ifnc _ASM_ARG1, _ASM_DI
+-      /* Move @svm to RDI. */
++      /*
++       * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
++       * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
++       */
+       mov %_ASM_ARG1, %_ASM_DI
+ .endif
++      /* Clobbers RAX, RCX, RDX.  */
++      RESTORE_GUEST_SPEC_CTRL
++
+       /*
+        * Use a single vmcb (vmcb01 because it's always valid) for
+        * context switching guest state via VMLOAD/VMSAVE, that way
+@@ -142,6 +211,9 @@ SYM_FUNC_START(__svm_vcpu_run)
+       FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+ #endif
++      /* Clobbers RAX, RCX, RDX.  */
++      RESTORE_HOST_SPEC_CTRL
++
+       /*
+        * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
+        * untrained as soon as we exit the VM and are back to the
+@@ -177,6 +249,9 @@ SYM_FUNC_START(__svm_vcpu_run)
+       xor %r15d, %r15d
+ #endif
++      /* "Pop" @spec_ctrl_intercepted.  */
++      pop %_ASM_BX
++
+       pop %_ASM_BX
+ #ifdef CONFIG_X86_64
+@@ -191,6 +266,9 @@ SYM_FUNC_START(__svm_vcpu_run)
+       pop %_ASM_BP
+       RET
++      RESTORE_GUEST_SPEC_CTRL_BODY
++      RESTORE_HOST_SPEC_CTRL_BODY
++
+ 10:   cmpb $0, kvm_rebooting
+       jne 2b
+       ud2
+@@ -214,6 +292,7 @@ SYM_FUNC_END(__svm_vcpu_run)
+ /**
+  * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
+  * @svm:      struct vcpu_svm *
++ * @spec_ctrl_intercepted: bool
+  */
+ SYM_FUNC_START(__svm_sev_es_vcpu_run)
+       push %_ASM_BP
+@@ -228,8 +307,30 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
+ #endif
+       push %_ASM_BX
++      /*
++       * Save variables needed after vmexit on the stack, in inverse
++       * order compared to when they are needed.
++       */
++
++      /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL.  */
++      push %_ASM_ARG2
++
++      /* Save @svm. */
++      push %_ASM_ARG1
++
++.ifnc _ASM_ARG1, _ASM_DI
++      /*
++       * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
++       * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
++       */
++      mov %_ASM_ARG1, %_ASM_DI
++.endif
++
++      /* Clobbers RAX, RCX, RDX.  */
++      RESTORE_GUEST_SPEC_CTRL
++
+       /* Get svm->current_vmcb->pa into RAX. */
+-      mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX
++      mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
+       mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
+       /* Enter guest mode */
+@@ -239,11 +340,17 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
+ 2:    cli
++      /* Pop @svm to RDI, guest registers have been saved already. */
++      pop %_ASM_DI
++
+ #ifdef CONFIG_RETPOLINE
+       /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+       FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+ #endif
++      /* Clobbers RAX, RCX, RDX.  */
++      RESTORE_HOST_SPEC_CTRL
++
+       /*
+        * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
+        * untrained as soon as we exit the VM and are back to the
+@@ -253,6 +360,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
+        */
+       UNTRAIN_RET
++      /* "Pop" @spec_ctrl_intercepted.  */
++      pop %_ASM_BX
++
+       pop %_ASM_BX
+ #ifdef CONFIG_X86_64
+@@ -267,6 +377,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
+       pop %_ASM_BP
+       RET
++      RESTORE_GUEST_SPEC_CTRL_BODY
++      RESTORE_HOST_SPEC_CTRL_BODY
++
+ 3:    cmpb $0, kvm_rebooting
+       jne 2b
+       ud2
+-- 
+2.35.1
+
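Stripped of the ALTERNATIVE_2 dispatch and register constraints, the policy that RESTORE_GUEST_SPEC_CTRL and RESTORE_HOST_SPEC_CTRL implement is small: write the guest value just before VMRUN if it differs from the host's, and right after VMEXIT (before any RET or indirect branch) read back what the guest wrote when the MSR is not intercepted, then put the host value back if needed. The C sketch below only models that ordering; rdmsr_spec_ctrl()/wrmsr_spec_ctrl() are hypothetical stand-ins, and the whole path is skipped on V_SPEC_CTRL hardware.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t msr_spec_ctrl;                  /* stands in for the hardware MSR */
static uint64_t x86_spec_ctrl_current;          /* host value (per-CPU in the kernel) */

static uint64_t rdmsr_spec_ctrl(void)
{
        return msr_spec_ctrl;
}

static void wrmsr_spec_ctrl(uint64_t val)
{
        msr_spec_ctrl = val;
        printf("wrmsr SPEC_CTRL = 0x%llx\n", (unsigned long long)val);
}

struct vcpu {
        uint64_t spec_ctrl;                     /* guest view of the MSR */
};

static void restore_guest_spec_ctrl(struct vcpu *v)
{
        /* just before VMRUN: only touch the MSR if the values differ */
        if (v->spec_ctrl != x86_spec_ctrl_current)
                wrmsr_spec_ctrl(v->spec_ctrl);
}

static void restore_host_spec_ctrl(struct vcpu *v, bool spec_ctrl_intercepted)
{
        /* just after VMEXIT, before any RET or indirect branch */
        if (!spec_ctrl_intercepted)
                v->spec_ctrl = rdmsr_spec_ctrl();       /* pick up what the guest wrote */
        if (v->spec_ctrl != x86_spec_ctrl_current)
                wrmsr_spec_ctrl(x86_spec_ctrl_current);
}

int main(void)
{
        struct vcpu v = { .spec_ctrl = 0x2 };   /* e.g. guest set STIBP */

        x86_spec_ctrl_current = 0x0;
        restore_guest_spec_ctrl(&v);            /* vmentry path */
        /* ... guest runs and may rewrite the MSR if it is not intercepted ... */
        restore_host_spec_ctrl(&v, false);      /* vmexit path */
        return 0;
}
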
diff --git a/queue-6.0/kvm-svm-remove-dead-field-from-struct-svm_cpu_data.patch b/queue-6.0/kvm-svm-remove-dead-field-from-struct-svm_cpu_data.patch
new file mode 100644 (file)
index 0000000..a01512d
--- /dev/null
@@ -0,0 +1,50 @@
+From 3e66c3581a7ce63d69b5f1b5f05845badaf2b3dc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 9 Nov 2022 08:54:20 -0500
+Subject: KVM: SVM: remove dead field from struct svm_cpu_data
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+[ Upstream commit 181d0fb0bb023e8996b1cf7970e3708d72442b0b ]
+
+The "cpu" field of struct svm_cpu_data has been write-only since commit
+4b656b120249 ("KVM: SVM: force new asid on vcpu migration", 2009-08-05).
+Remove it.
+
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Stable-dep-of: e287bd005ad9 ("KVM: SVM: restore host save area from assembly")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/svm/svm.c | 1 -
+ arch/x86/kvm/svm/svm.h | 2 --
+ 2 files changed, 3 deletions(-)
+
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 454746641a48..ecf4d8233e49 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -667,7 +667,6 @@ static int svm_cpu_init(int cpu)
+       sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
+       if (!sd)
+               return ret;
+-      sd->cpu = cpu;
+       sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       if (!sd->save_area)
+               goto free_cpu_data;
+diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
+index 7ff1879e73c5..8a8894d948a0 100644
+--- a/arch/x86/kvm/svm/svm.h
++++ b/arch/x86/kvm/svm/svm.h
+@@ -281,8 +281,6 @@ struct vcpu_svm {
+ };
+ struct svm_cpu_data {
+-      int cpu;
+-
+       u64 asid_generation;
+       u32 max_asid;
+       u32 next_asid;
+-- 
+2.35.1
+
diff --git a/queue-6.0/kvm-svm-restore-host-save-area-from-assembly.patch b/queue-6.0/kvm-svm-restore-host-save-area-from-assembly.patch
new file mode 100644 (file)
index 0000000..1da3b8c
--- /dev/null
@@ -0,0 +1,175 @@
+From 82432f01ad47e55ca786f40c6adc09eb533c93e4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 7 Nov 2022 03:49:59 -0500
+Subject: KVM: SVM: restore host save area from assembly
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+[ Upstream commit e287bd005ad9d85dd6271dd795d3ecfb6bca46ad ]
+
+Allow access to the percpu area via the GS segment base, which is
+needed in order to access the saved host spec_ctrl value.  In linux-next
+FILL_RETURN_BUFFER also needs to access percpu data.
+
+For simplicity, the physical address of the save area is added to struct
+svm_cpu_data.
+
+Cc: stable@vger.kernel.org
+Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
+Reported-by: Nathan Chancellor <nathan@kernel.org>
+Analyzed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Tested-by: Nathan Chancellor <nathan@kernel.org>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/kvm-asm-offsets.c |  1 +
+ arch/x86/kvm/svm/svm.c         | 14 ++++++--------
+ arch/x86/kvm/svm/svm.h         |  2 ++
+ arch/x86/kvm/svm/svm_ops.h     |  5 -----
+ arch/x86/kvm/svm/vmenter.S     | 17 +++++++++++++++++
+ 5 files changed, 26 insertions(+), 13 deletions(-)
+
+diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
+index f83e88b85bf2..1b805cd24d66 100644
+--- a/arch/x86/kvm/kvm-asm-offsets.c
++++ b/arch/x86/kvm/kvm-asm-offsets.c
+@@ -18,6 +18,7 @@ static void __used common(void)
+               OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+               OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
+               OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
++              OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa);
+       }
+       if (IS_ENABLED(CONFIG_KVM_INTEL)) {
+diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
+index 6b2f332f5d54..c14fabd662f6 100644
+--- a/arch/x86/kvm/svm/svm.c
++++ b/arch/x86/kvm/svm/svm.c
+@@ -594,7 +594,7 @@ static int svm_hardware_enable(void)
+       wrmsrl(MSR_EFER, efer | EFER_SVME);
+-      wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
++      wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
+       if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+               /*
+@@ -650,6 +650,7 @@ static void svm_cpu_uninit(int cpu)
+       kfree(sd->sev_vmcbs);
+       __free_page(sd->save_area);
++      sd->save_area_pa = 0;
+       sd->save_area = NULL;
+ }
+@@ -667,6 +668,7 @@ static int svm_cpu_init(int cpu)
+       if (ret)
+               goto free_save_area;
++      sd->save_area_pa = __sme_page_pa(sd->save_area);
+       return 0;
+ free_save_area:
+@@ -1452,7 +1454,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+        * Save additional host state that will be restored on VMEXIT (sev-es)
+        * or subsequent vmload of host save area.
+        */
+-      vmsave(__sme_page_pa(sd->save_area));
++      vmsave(sd->save_area_pa);
+       if (sev_es_guest(vcpu->kvm)) {
+               struct sev_es_save_area *hostsa;
+               hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
+@@ -3906,14 +3908,10 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+       guest_state_enter_irqoff();
+-      if (sev_es_guest(vcpu->kvm)) {
++      if (sev_es_guest(vcpu->kvm))
+               __svm_sev_es_vcpu_run(svm);
+-      } else {
+-              struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+-
++      else
+               __svm_vcpu_run(svm);
+-              vmload(__sme_page_pa(sd->save_area));
+-      }
+       guest_state_exit_irqoff();
+ }
+diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
+index f1483209e186..8744f3b1d217 100644
+--- a/arch/x86/kvm/svm/svm.h
++++ b/arch/x86/kvm/svm/svm.h
+@@ -288,6 +288,8 @@ struct svm_cpu_data {
+       struct kvm_ldttss_desc *tss_desc;
+       struct page *save_area;
++      unsigned long save_area_pa;
++
+       struct vmcb *current_vmcb;
+       /* index = sev_asid, value = vmcb pointer */
+diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
+index 9430d6437c9f..36c8af87a707 100644
+--- a/arch/x86/kvm/svm/svm_ops.h
++++ b/arch/x86/kvm/svm/svm_ops.h
+@@ -61,9 +61,4 @@ static __always_inline void vmsave(unsigned long pa)
+       svm_asm1(vmsave, "a" (pa), "memory");
+ }
+-static __always_inline void vmload(unsigned long pa)
+-{
+-      svm_asm1(vmload, "a" (pa), "memory");
+-}
+-
+ #endif /* __KVM_X86_SVM_OPS_H */
+diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
+index 5bc2ed7d79c0..57440acfc73e 100644
+--- a/arch/x86/kvm/svm/vmenter.S
++++ b/arch/x86/kvm/svm/vmenter.S
+@@ -49,6 +49,14 @@ SYM_FUNC_START(__svm_vcpu_run)
+ #endif
+       push %_ASM_BX
++      /*
++       * Save variables needed after vmexit on the stack, in inverse
++       * order compared to when they are needed.
++       */
++
++      /* Needed to restore access to percpu variables.  */
++      __ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa)
++
+       /* Save @svm. */
+       push %_ASM_ARG1
+@@ -124,6 +132,11 @@ SYM_FUNC_START(__svm_vcpu_run)
+ 5:    vmsave %_ASM_AX
+ 6:
++      /* Restores GSBASE among other things, allowing access to percpu data.  */
++      pop %_ASM_AX
++7:    vmload %_ASM_AX
++8:
++
+ #ifdef CONFIG_RETPOLINE
+       /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+       FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+@@ -187,10 +200,14 @@ SYM_FUNC_START(__svm_vcpu_run)
+ 50:   cmpb $0, kvm_rebooting
+       jne 6b
+       ud2
++70:   cmpb $0, kvm_rebooting
++      jne 8b
++      ud2
+       _ASM_EXTABLE(1b, 10b)
+       _ASM_EXTABLE(3b, 30b)
+       _ASM_EXTABLE(5b, 50b)
++      _ASM_EXTABLE(7b, 70b)
+ SYM_FUNC_END(__svm_vcpu_run)
+-- 
+2.35.1
+
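The new PER_CPU_VAR(svm_data + SD_save_area_pa) access works because kvm-asm-offsets.c turns the field position into a plain constant, which in the end is just offsetof(). The stand-alone sketch below shows that mechanism with a simplified copy of struct svm_cpu_data; the real layout has different field types, so the printed value is only illustrative.

#include <stddef.h>
#include <stdio.h>

/* simplified copy of the struct, not the real kernel layout */
struct svm_cpu_data {
        unsigned long asid_generation;
        unsigned int max_asid;
        unsigned int next_asid;
        unsigned int min_asid;
        void *tss_desc;
        void *save_area;
        unsigned long save_area_pa;     /* field added by the patch above */
        void *current_vmcb;
        void **sev_vmcbs;
};

int main(void)
{
        /*
         * kvm-asm-offsets.c emits this number as SD_save_area_pa so that
         * vmenter.S can address svm_data + SD_save_area_pa relative to the
         * per-CPU base.
         */
        printf("SD_save_area_pa = %zu\n",
               offsetof(struct svm_cpu_data, save_area_pa));
        return 0;
}
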
diff --git a/queue-6.0/series b/queue-6.0/series
index 8a9e91467a2c649cd12ce1cb94797cbddf82b23a..126a04d979870c27692ec7e2c2523514f1c7089d 100644 (file)
--- a/queue-6.0/series
@@ -60,3 +60,11 @@ x86-cpu-add-several-intel-server-cpu-model-numbers.patch
 tools-testing-cxl-fix-some-error-exits.patch
 cifs-always-iterate-smb-sessions-using-primary-chann.patch
 asoc-codecs-jz4725b-fix-spelling-mistake-sourc-sourc.patch
+arm64-mm-fold-check-for-kfence-into-can_set_direct_m.patch
+arm64-fix-rodata-full-again.patch
+hugetlb-rename-remove_huge_page-to-hugetlb_delete_fr.patch
+hugetlbfs-don-t-delete-error-page-from-pagecache.patch
+kvm-svm-remove-dead-field-from-struct-svm_cpu_data.patch
+kvm-svm-do-not-allocate-struct-svm_cpu_data-dynamica.patch
+kvm-svm-restore-host-save-area-from-assembly.patch
+kvm-svm-move-msr_ia32_spec_ctrl-save-restore-to-asse.patch