git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.18-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 3 Jun 2022 15:51:13 +0000 (17:51 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 3 Jun 2022 15:51:13 +0000 (17:51 +0200)
added patches:
drm-i915-fix-wstringop-overflow-warning-in-call-to-intel_read_wm_latency.patch
exfat-check-if-cluster-num-is-valid.patch
exfat-fix-referencing-wrong-parent-directory-information-after-renaming.patch
kvm-ppc-book3s-hv-fix-incorrect-null-check-on-list-iterator.patch
kvm-svm-use-kzalloc-for-sev-ioctl-interfaces-to-prevent-kernel-data-leak.patch
kvm-x86-avoid-calling-x86-emulator-without-a-decoded-instruction.patch
kvm-x86-avoid-loading-a-vcpu-after-.vm_destroy-was-called.patch
kvm-x86-drop-warns-that-assert-a-triple-fault-never-escapes-from-l2.patch
kvm-x86-fix-the-intel_pt-pmi-handling-wrongly-considered-from-guest.patch
kvm-x86-fix-typo-in-__try_cmpxchg_user-causing-non-atomicness.patch
kvm-x86-mmu-don-t-rebuild-page-when-the-page-is-synced-and-no-tlb-flushing-is-required.patch
kvm-x86-use-__try_cmpxchg_user-to-emulate-atomic-accesses.patch
kvm-x86-use-__try_cmpxchg_user-to-update-guest-pte-a-d-bits.patch
net-ipa-compute-proper-aggregation-limit.patch
netfilter-conntrack-re-fetch-conntrack-after-insertion.patch
netfilter-nf_tables-double-hook-unregistration-in-netns-path.patch
netfilter-nf_tables-hold-mutex-on-netns-pre_exit-path.patch
netfilter-nf_tables-sanitize-nft_set_desc_concat_parse.patch
netfilter-nft_limit-clone-packet-limits-cost-value.patch
x86-fpu-kvm-set-the-base-guest-fpu-uabi-size-to-sizeof-struct-kvm_xsave.patch
x86-kvm-alloc-dummy-async-pf-token-outside-of-raw-spinlock.patch
x86-kvm-use-correct-gfp-flags-for-preemption-disabled.patch
x86-uaccess-implement-macros-for-cmpxchg-on-user-addresses.patch

24 files changed:
queue-5.18/drm-i915-fix-wstringop-overflow-warning-in-call-to-intel_read_wm_latency.patch [new file with mode: 0644]
queue-5.18/exfat-check-if-cluster-num-is-valid.patch [new file with mode: 0644]
queue-5.18/exfat-fix-referencing-wrong-parent-directory-information-after-renaming.patch [new file with mode: 0644]
queue-5.18/kvm-ppc-book3s-hv-fix-incorrect-null-check-on-list-iterator.patch [new file with mode: 0644]
queue-5.18/kvm-svm-use-kzalloc-for-sev-ioctl-interfaces-to-prevent-kernel-data-leak.patch [new file with mode: 0644]
queue-5.18/kvm-x86-avoid-calling-x86-emulator-without-a-decoded-instruction.patch [new file with mode: 0644]
queue-5.18/kvm-x86-avoid-loading-a-vcpu-after-.vm_destroy-was-called.patch [new file with mode: 0644]
queue-5.18/kvm-x86-drop-warns-that-assert-a-triple-fault-never-escapes-from-l2.patch [new file with mode: 0644]
queue-5.18/kvm-x86-fix-the-intel_pt-pmi-handling-wrongly-considered-from-guest.patch [new file with mode: 0644]
queue-5.18/kvm-x86-fix-typo-in-__try_cmpxchg_user-causing-non-atomicness.patch [new file with mode: 0644]
queue-5.18/kvm-x86-mmu-don-t-rebuild-page-when-the-page-is-synced-and-no-tlb-flushing-is-required.patch [new file with mode: 0644]
queue-5.18/kvm-x86-use-__try_cmpxchg_user-to-emulate-atomic-accesses.patch [new file with mode: 0644]
queue-5.18/kvm-x86-use-__try_cmpxchg_user-to-update-guest-pte-a-d-bits.patch [new file with mode: 0644]
queue-5.18/net-ipa-compute-proper-aggregation-limit.patch [new file with mode: 0644]
queue-5.18/netfilter-conntrack-re-fetch-conntrack-after-insertion.patch [new file with mode: 0644]
queue-5.18/netfilter-nf_tables-double-hook-unregistration-in-netns-path.patch [new file with mode: 0644]
queue-5.18/netfilter-nf_tables-hold-mutex-on-netns-pre_exit-path.patch [new file with mode: 0644]
queue-5.18/netfilter-nf_tables-sanitize-nft_set_desc_concat_parse.patch [new file with mode: 0644]
queue-5.18/netfilter-nft_limit-clone-packet-limits-cost-value.patch [new file with mode: 0644]
queue-5.18/series
queue-5.18/x86-fpu-kvm-set-the-base-guest-fpu-uabi-size-to-sizeof-struct-kvm_xsave.patch [new file with mode: 0644]
queue-5.18/x86-kvm-alloc-dummy-async-pf-token-outside-of-raw-spinlock.patch [new file with mode: 0644]
queue-5.18/x86-kvm-use-correct-gfp-flags-for-preemption-disabled.patch [new file with mode: 0644]
queue-5.18/x86-uaccess-implement-macros-for-cmpxchg-on-user-addresses.patch [new file with mode: 0644]

diff --git a/queue-5.18/drm-i915-fix-wstringop-overflow-warning-in-call-to-intel_read_wm_latency.patch b/queue-5.18/drm-i915-fix-wstringop-overflow-warning-in-call-to-intel_read_wm_latency.patch
new file mode 100644 (file)
index 0000000..ad98107
--- /dev/null
@@ -0,0 +1,57 @@
+From 336feb502a715909a8136eb6a62a83d7268a353b Mon Sep 17 00:00:00 2001
+From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
+Date: Wed, 27 Apr 2022 17:47:14 -0500
+Subject: drm/i915: Fix -Wstringop-overflow warning in call to intel_read_wm_latency()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Gustavo A. R. Silva <gustavoars@kernel.org>
+
+commit 336feb502a715909a8136eb6a62a83d7268a353b upstream.
+
+Fix the following -Wstringop-overflow warnings when building with GCC-11:
+
+drivers/gpu/drm/i915/intel_pm.c:3106:9: warning: ‘intel_read_wm_latency’ accessing 16 bytes in a region of size 10 [-Wstringop-overflow=]
+ 3106 |         intel_read_wm_latency(dev_priv, dev_priv->wm.pri_latency);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+drivers/gpu/drm/i915/intel_pm.c:3106:9: note: referencing argument 2 of type ‘u16 *’ {aka ‘short unsigned int *’}
+drivers/gpu/drm/i915/intel_pm.c:2861:13: note: in a call to function ‘intel_read_wm_latency’
+ 2861 | static void intel_read_wm_latency(struct drm_i915_private *dev_priv,
+      |             ^~~~~~~~~~~~~~~~~~~~~
+
+by removing the over-specified array size from the argument declarations.
+
+It seems that this code is actually safe because the size of the
+array depends on the hardware generation, and the function checks
+for that.
+
+Notice that wm can be an array of 5 elements:
+drivers/gpu/drm/i915/intel_pm.c:3109:   intel_read_wm_latency(dev_priv, dev_priv->wm.pri_latency);
+
+or an array of 8 elements:
+drivers/gpu/drm/i915/intel_pm.c:3131:   intel_read_wm_latency(dev_priv, dev_priv->wm.skl_latency);
+
+and the compiler legitimately complains about that.
+
+This helps with the ongoing efforts to globally enable
+-Wstringop-overflow.
+
+Link: https://github.com/KSPP/linux/issues/181
+Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/intel_pm.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/i915/intel_pm.c
++++ b/drivers/gpu/drm/i915/intel_pm.c
+@@ -2859,7 +2859,7 @@ static void ilk_compute_wm_level(const s
+ }
+ static void intel_read_wm_latency(struct drm_i915_private *dev_priv,
+-                                u16 wm[8])
++                                u16 wm[])
+ {
+       struct intel_uncore *uncore = &dev_priv->uncore;
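
For illustration, a minimal stand-alone C sketch (not the i915 driver code) of the warning class addressed above: with GCC 11 and -Wstringop-overflow, a parameter declared as "u16 wm[8]" tells the compiler the function may access eight elements, so passing a five-element latency array triggers the "accessing 16 bytes in a region of size 10" diagnostic; declaring the parameter as an unsized array, as the patch does, drops that over-specified claim.

#include <stdio.h>

/* Declaring the parameter as "unsigned short wm[8]" instead would make
 * GCC 11 warn at the five-element call site below under
 * -Wstringop-overflow; the unsized form carries no access-size claim. */
static void read_latency(unsigned short wm[], int count)
{
        for (int i = 0; i < count; i++)
                wm[i] = (unsigned short)(i * 10);
}

int main(void)
{
        unsigned short pri_latency[5] = { 0 };  /* 5 watermark levels (pre-GEN9) */
        unsigned short skl_latency[8] = { 0 };  /* 8 watermark levels (GEN9+) */

        read_latency(pri_latency, 5);
        read_latency(skl_latency, 8);
        printf("%u %u\n", pri_latency[4], skl_latency[7]);
        return 0;
}
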
diff --git a/queue-5.18/exfat-check-if-cluster-num-is-valid.patch b/queue-5.18/exfat-check-if-cluster-num-is-valid.patch
new file mode 100644 (file)
index 0000000..2c7f2e3
--- /dev/null
@@ -0,0 +1,104 @@
+From 64ba4b15e5c045f8b746c6da5fc9be9a6b00b61d Mon Sep 17 00:00:00 2001
+From: Tadeusz Struk <tadeusz.struk@linaro.org>
+Date: Tue, 17 May 2022 08:13:08 +0900
+Subject: exfat: check if cluster num is valid
+
+From: Tadeusz Struk <tadeusz.struk@linaro.org>
+
+commit 64ba4b15e5c045f8b746c6da5fc9be9a6b00b61d upstream.
+
+Syzbot reported slab-out-of-bounds read in exfat_clear_bitmap.
+This was triggered by a reproducer calling truncate with size 0,
+which causes the following trace:
+
+BUG: KASAN: slab-out-of-bounds in exfat_clear_bitmap+0x147/0x490 fs/exfat/balloc.c:174
+Read of size 8 at addr ffff888115aa9508 by task syz-executor251/365
+
+Call Trace:
+ __dump_stack lib/dump_stack.c:77 [inline]
+ dump_stack_lvl+0x1e2/0x24b lib/dump_stack.c:118
+ print_address_description+0x81/0x3c0 mm/kasan/report.c:233
+ __kasan_report mm/kasan/report.c:419 [inline]
+ kasan_report+0x1a4/0x1f0 mm/kasan/report.c:436
+ __asan_report_load8_noabort+0x14/0x20 mm/kasan/report_generic.c:309
+ exfat_clear_bitmap+0x147/0x490 fs/exfat/balloc.c:174
+ exfat_free_cluster+0x25a/0x4a0 fs/exfat/fatent.c:181
+ __exfat_truncate+0x99e/0xe00 fs/exfat/file.c:217
+ exfat_truncate+0x11b/0x4f0 fs/exfat/file.c:243
+ exfat_setattr+0xa03/0xd40 fs/exfat/file.c:339
+ notify_change+0xb76/0xe10 fs/attr.c:336
+ do_truncate+0x1ea/0x2d0 fs/open.c:65
+
+Move the is_valid_cluster() helper from fatent.c to a common
+header to make it reusable in other *.c files, and use is_valid_cluster()
+to validate that the cluster number is within the valid range in
+exfat_clear_bitmap() and exfat_set_bitmap().
+
+Link: https://syzkaller.appspot.com/bug?id=50381fc73821ecae743b8cf24b4c9a04776f767c
+Reported-by: syzbot+a4087e40b9c13aad7892@syzkaller.appspotmail.com
+Fixes: 1e49a94cf707 ("exfat: add bitmap operations")
+Cc: stable@vger.kernel.org # v5.7+
+Signed-off-by: Tadeusz Struk <tadeusz.struk@linaro.org>
+Reviewed-by: Sungjong Seo <sj1557.seo@samsung.com>
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/exfat/balloc.c   |    8 ++++++--
+ fs/exfat/exfat_fs.h |    6 ++++++
+ fs/exfat/fatent.c   |    6 ------
+ 3 files changed, 12 insertions(+), 8 deletions(-)
+
+--- a/fs/exfat/balloc.c
++++ b/fs/exfat/balloc.c
+@@ -148,7 +148,9 @@ int exfat_set_bitmap(struct inode *inode
+       struct super_block *sb = inode->i_sb;
+       struct exfat_sb_info *sbi = EXFAT_SB(sb);
+-      WARN_ON(clu < EXFAT_FIRST_CLUSTER);
++      if (!is_valid_cluster(sbi, clu))
++              return -EINVAL;
++
+       ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
+       i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
+       b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
+@@ -166,7 +168,9 @@ void exfat_clear_bitmap(struct inode *in
+       struct exfat_sb_info *sbi = EXFAT_SB(sb);
+       struct exfat_mount_options *opts = &sbi->options;
+-      WARN_ON(clu < EXFAT_FIRST_CLUSTER);
++      if (!is_valid_cluster(sbi, clu))
++              return;
++
+       ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
+       i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
+       b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
+--- a/fs/exfat/exfat_fs.h
++++ b/fs/exfat/exfat_fs.h
+@@ -381,6 +381,12 @@ static inline int exfat_sector_to_cluste
+               EXFAT_RESERVED_CLUSTERS;
+ }
++static inline bool is_valid_cluster(struct exfat_sb_info *sbi,
++              unsigned int clus)
++{
++      return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters;
++}
++
+ /* super.c */
+ int exfat_set_volume_dirty(struct super_block *sb);
+ int exfat_clear_volume_dirty(struct super_block *sb);
+--- a/fs/exfat/fatent.c
++++ b/fs/exfat/fatent.c
+@@ -81,12 +81,6 @@ int exfat_ent_set(struct super_block *sb
+       return 0;
+ }
+-static inline bool is_valid_cluster(struct exfat_sb_info *sbi,
+-              unsigned int clus)
+-{
+-      return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters;
+-}
+-
+ int exfat_ent_get(struct super_block *sb, unsigned int loc,
+               unsigned int *content)
+ {
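
As a general illustration of the pattern applied above (a stand-alone sketch, not the exfat code): an assertion such as WARN_ON() only reports an out-of-range value, while an explicit range check rejects it before it is turned into a bitmap offset, so corrupted on-disk cluster numbers fail gracefully instead of reading out of bounds. The constants below are illustrative only; the real bounds come from the superblock.

#include <stdbool.h>
#include <stdio.h>

#define FIRST_CLUSTER 2
#define NUM_CLUSTERS  128

static unsigned char bitmap[NUM_CLUSTERS / 8];

static bool is_valid_cluster(unsigned int clu)
{
        return clu >= FIRST_CLUSTER && clu < NUM_CLUSTERS;
}

/* Clearing a bit only after validating the cluster number mirrors the
 * checks added to exfat_set_bitmap()/exfat_clear_bitmap(). */
static int clear_cluster_bit(unsigned int clu)
{
        if (!is_valid_cluster(clu))
                return -1;              /* reject corrupted cluster numbers */

        bitmap[clu / 8] &= ~(1u << (clu % 8));
        return 0;
}

int main(void)
{
        printf("clear 10:  %d\n", clear_cluster_bit(10));   /* ok */
        printf("clear 0:   %d\n", clear_cluster_bit(0));    /* rejected */
        printf("clear 500: %d\n", clear_cluster_bit(500));  /* rejected */
        return 0;
}
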
diff --git a/queue-5.18/exfat-fix-referencing-wrong-parent-directory-information-after-renaming.patch b/queue-5.18/exfat-fix-referencing-wrong-parent-directory-information-after-renaming.patch
new file mode 100644 (file)
index 0000000..58cf7d8
--- /dev/null
@@ -0,0 +1,98 @@
+From d8dad2588addd1d861ce19e7df3b702330f0c7e3 Mon Sep 17 00:00:00 2001
+From: Yuezhang Mo <Yuezhang.Mo@sony.com>
+Date: Mon, 4 Apr 2022 11:58:06 +0900
+Subject: exfat: fix referencing wrong parent directory information after renaming
+
+From: Yuezhang Mo <Yuezhang.Mo@sony.com>
+
+commit d8dad2588addd1d861ce19e7df3b702330f0c7e3 upstream.
+
+During renaming, the parent directory information may be
+updated, but the file/directory still references the
+old parent directory information.
+
+This bug will cause 2 problems.
+
+(1) The renamed file can not be written.
+
+    [10768.175172] exFAT-fs (sda1): error, failed to bmap (inode : 7afd50e4 iblock : 0, err : -5)
+    [10768.184285] exFAT-fs (sda1): Filesystem has been set read-only
+    ash: write error: Input/output error
+
+(2) Some dentries of the renamed file/directory are not set
+    to deleted after removing the file/directory.
+
+exfat_update_parent_info() is a workaround for the wrong parent
+directory information being used after renaming. Now that the bug is
+fixed, this workaround is no longer needed, so remove it.
+
+Fixes: 5f2aa075070c ("exfat: add inode operations")
+Cc: stable@vger.kernel.org # v5.7+
+Signed-off-by: Yuezhang Mo <Yuezhang.Mo@sony.com>
+Reviewed-by: Andy Wu <Andy.Wu@sony.com>
+Reviewed-by: Aoyama Wataru <wataru.aoyama@sony.com>
+Reviewed-by: Daniel Palmer <daniel.palmer@sony.com>
+Reviewed-by: Sungjong Seo <sj1557.seo@samsung.com>
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/exfat/namei.c |   27 +--------------------------
+ 1 file changed, 1 insertion(+), 26 deletions(-)
+
+--- a/fs/exfat/namei.c
++++ b/fs/exfat/namei.c
+@@ -1080,6 +1080,7 @@ static int exfat_rename_file(struct inod
+               exfat_remove_entries(inode, p_dir, oldentry, 0,
+                       num_old_entries);
++              ei->dir = *p_dir;
+               ei->entry = newentry;
+       } else {
+               if (exfat_get_entry_type(epold) == TYPE_FILE) {
+@@ -1167,28 +1168,6 @@ static int exfat_move_file(struct inode
+       return 0;
+ }
+-static void exfat_update_parent_info(struct exfat_inode_info *ei,
+-              struct inode *parent_inode)
+-{
+-      struct exfat_sb_info *sbi = EXFAT_SB(parent_inode->i_sb);
+-      struct exfat_inode_info *parent_ei = EXFAT_I(parent_inode);
+-      loff_t parent_isize = i_size_read(parent_inode);
+-
+-      /*
+-       * the problem that struct exfat_inode_info caches wrong parent info.
+-       *
+-       * because of flag-mismatch of ei->dir,
+-       * there is abnormal traversing cluster chain.
+-       */
+-      if (unlikely(parent_ei->flags != ei->dir.flags ||
+-                   parent_isize != EXFAT_CLU_TO_B(ei->dir.size, sbi) ||
+-                   parent_ei->start_clu != ei->dir.dir)) {
+-              exfat_chain_set(&ei->dir, parent_ei->start_clu,
+-                      EXFAT_B_TO_CLU_ROUND_UP(parent_isize, sbi),
+-                      parent_ei->flags);
+-      }
+-}
+-
+ /* rename or move a old file into a new file */
+ static int __exfat_rename(struct inode *old_parent_inode,
+               struct exfat_inode_info *ei, struct inode *new_parent_inode,
+@@ -1219,8 +1198,6 @@ static int __exfat_rename(struct inode *
+               return -ENOENT;
+       }
+-      exfat_update_parent_info(ei, old_parent_inode);
+-
+       exfat_chain_dup(&olddir, &ei->dir);
+       dentry = ei->entry;
+@@ -1241,8 +1218,6 @@ static int __exfat_rename(struct inode *
+                       goto out;
+               }
+-              exfat_update_parent_info(new_ei, new_parent_inode);
+-
+               p_dir = &(new_ei->dir);
+               new_entry = new_ei->entry;
+               ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh);
diff --git a/queue-5.18/kvm-ppc-book3s-hv-fix-incorrect-null-check-on-list-iterator.patch b/queue-5.18/kvm-ppc-book3s-hv-fix-incorrect-null-check-on-list-iterator.patch
new file mode 100644 (file)
index 0000000..1ebc7e7
--- /dev/null
@@ -0,0 +1,51 @@
+From 300981abddcb13f8f06ad58f52358b53a8096775 Mon Sep 17 00:00:00 2001
+From: Xiaomeng Tong <xiam0nd.tong@gmail.com>
+Date: Thu, 14 Apr 2022 14:21:03 +0800
+Subject: KVM: PPC: Book3S HV: fix incorrect NULL check on list iterator
+
+From: Xiaomeng Tong <xiam0nd.tong@gmail.com>
+
+commit 300981abddcb13f8f06ad58f52358b53a8096775 upstream.
+
+The bug is here:
+       if (!p)
+                return ret;
+
+The list iterator value 'p' will *always* be set and non-NULL by
+list_for_each_entry(), so it is incorrect to assume that the iterator
+value will be NULL if the list is empty or no element is found.
+
+To fix the bug, use a new variable 'iter' as the list iterator and keep
+the old variable 'p' as a dedicated pointer to the found element.
+
+Fixes: dfaa973ae960 ("KVM: PPC: Book3S HV: In H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs")
+Cc: stable@vger.kernel.org # v5.9+
+Signed-off-by: Xiaomeng Tong <xiam0nd.tong@gmail.com>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20220414062103.8153-1-xiam0nd.tong@gmail.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/kvm/book3s_hv_uvmem.c |    8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
++++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
+@@ -361,13 +361,15 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsi
+ static bool kvmppc_next_nontransitioned_gfn(const struct kvm_memory_slot *memslot,
+               struct kvm *kvm, unsigned long *gfn)
+ {
+-      struct kvmppc_uvmem_slot *p;
++      struct kvmppc_uvmem_slot *p = NULL, *iter;
+       bool ret = false;
+       unsigned long i;
+-      list_for_each_entry(p, &kvm->arch.uvmem_pfns, list)
+-              if (*gfn >= p->base_pfn && *gfn < p->base_pfn + p->nr_pfns)
++      list_for_each_entry(iter, &kvm->arch.uvmem_pfns, list)
++              if (*gfn >= iter->base_pfn && *gfn < iter->base_pfn + iter->nr_pfns) {
++                      p = iter;
+                       break;
++              }
+       if (!p)
+               return ret;
+       /*
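
A stand-alone sketch (not the Book3S HV code) of the pattern the fix adopts: because list_for_each_entry() leaves its cursor pointing into the list head rather than at NULL when nothing matches, "found or not" has to be tracked in a separate pointer, which is what the patch does with 'iter' and 'p'. The simple singly-linked list below only illustrates that found-pointer idiom.

#include <stdio.h>
#include <stddef.h>

struct range {
        unsigned long base, len;
        struct range *next;
};

static struct range *find_range(struct range *head, unsigned long val)
{
        struct range *found = NULL;

        for (struct range *iter = head; iter; iter = iter->next) {
                if (val >= iter->base && val < iter->base + iter->len) {
                        found = iter;   /* remember the match explicitly */
                        break;
                }
        }
        return found;                   /* NULL only when nothing matched */
}

int main(void)
{
        struct range b = { 100, 50, NULL };
        struct range a = { 0, 10, &b };

        printf("%p\n", (void *)find_range(&a, 120));  /* -> &b */
        printf("%p\n", (void *)find_range(&a, 60));   /* -> NULL */
        return 0;
}
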
diff --git a/queue-5.18/kvm-svm-use-kzalloc-for-sev-ioctl-interfaces-to-prevent-kernel-data-leak.patch b/queue-5.18/kvm-svm-use-kzalloc-for-sev-ioctl-interfaces-to-prevent-kernel-data-leak.patch
new file mode 100644 (file)
index 0000000..3d2ccf2
--- /dev/null
@@ -0,0 +1,88 @@
+From d22d2474e3953996f03528b84b7f52cc26a39403 Mon Sep 17 00:00:00 2001
+From: Ashish Kalra <ashish.kalra@amd.com>
+Date: Mon, 16 May 2022 15:43:10 +0000
+Subject: KVM: SVM: Use kzalloc for sev ioctl interfaces to prevent kernel data leak
+
+From: Ashish Kalra <ashish.kalra@amd.com>
+
+commit d22d2474e3953996f03528b84b7f52cc26a39403 upstream.
+
+For some SEV ioctl interfaces, the length parameter that is passed may be
+less than or equal to SEV_FW_BLOB_MAX_SIZE, but larger than the data
+that the PSP firmware returns. In this case, kmalloc will allocate memory
+that is the size of the input rather than the size of the data.
+Since the PSP firmware doesn't fully overwrite the allocated buffer, these
+SEV ioctl interfaces may return uninitialized kernel slab memory.
+
+Reported-by: Andy Nguyen <theflow@google.com>
+Suggested-by: David Rientjes <rientjes@google.com>
+Suggested-by: Peter Gonda <pgonda@google.com>
+Cc: kvm@vger.kernel.org
+Cc: stable@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Fixes: eaf78265a4ab3 ("KVM: SVM: Move SEV code to separate file")
+Fixes: 2c07ded06427d ("KVM: SVM: add support for SEV attestation command")
+Fixes: 4cfdd47d6d95a ("KVM: SVM: Add KVM_SEV SEND_START command")
+Fixes: d3d1af85e2c75 ("KVM: SVM: Add KVM_SEND_UPDATE_DATA command")
+Fixes: eba04b20e4861 ("KVM: x86: Account a variety of miscellaneous allocations")
+Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
+Reviewed-by: Peter Gonda <pgonda@google.com>
+Message-Id: <20220516154310.3685678-1-Ashish.Kalra@amd.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/sev.c |   12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kvm/svm/sev.c
++++ b/arch/x86/kvm/svm/sev.c
+@@ -688,7 +688,7 @@ static int sev_launch_measure(struct kvm
+               if (params.len > SEV_FW_BLOB_MAX_SIZE)
+                       return -EINVAL;
+-              blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
++              blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
+               if (!blob)
+                       return -ENOMEM;
+@@ -808,7 +808,7 @@ static int __sev_dbg_decrypt_user(struct
+       if (!IS_ALIGNED(dst_paddr, 16) ||
+           !IS_ALIGNED(paddr,     16) ||
+           !IS_ALIGNED(size,      16)) {
+-              tpage = (void *)alloc_page(GFP_KERNEL);
++              tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO);
+               if (!tpage)
+                       return -ENOMEM;
+@@ -1094,7 +1094,7 @@ static int sev_get_attestation_report(st
+               if (params.len > SEV_FW_BLOB_MAX_SIZE)
+                       return -EINVAL;
+-              blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
++              blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
+               if (!blob)
+                       return -ENOMEM;
+@@ -1176,7 +1176,7 @@ static int sev_send_start(struct kvm *kv
+               return -EINVAL;
+       /* allocate the memory to hold the session data blob */
+-      session_data = kmalloc(params.session_len, GFP_KERNEL_ACCOUNT);
++      session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+       if (!session_data)
+               return -ENOMEM;
+@@ -1300,11 +1300,11 @@ static int sev_send_update_data(struct k
+       /* allocate memory for header and transport buffer */
+       ret = -ENOMEM;
+-      hdr = kmalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
++      hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+       if (!hdr)
+               goto e_unpin;
+-      trans_data = kmalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
++      trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+       if (!trans_data)
+               goto e_free_hdr;
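
A user-space sketch of the leak class fixed above (not the SEV code itself): the buffer is sized by a caller-supplied length, but the producer fills only part of it, so handing the whole buffer back exposes whatever the allocator left behind unless the allocation is zeroed up front. calloc() below plays the role kzalloc() plays in the patch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for the PSP firmware: writes less than the requested length. */
static size_t produce_data(unsigned char *buf, size_t len)
{
        size_t filled = len / 2;

        memset(buf, 0xAB, filled);
        return filled;
}

int main(void)
{
        size_t user_len = 64;           /* caller-chosen, <= the allowed max */

        /* Zeroed allocation: bytes beyond 'filled' read back as zero
         * instead of stale heap contents. */
        unsigned char *blob = calloc(1, user_len);
        if (!blob)
                return 1;

        size_t filled = produce_data(blob, user_len);

        printf("filled %zu of %zu bytes; byte after fill = %02x\n",
               filled, user_len, blob[filled]);
        free(blob);
        return 0;
}
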
diff --git a/queue-5.18/kvm-x86-avoid-calling-x86-emulator-without-a-decoded-instruction.patch b/queue-5.18/kvm-x86-avoid-calling-x86-emulator-without-a-decoded-instruction.patch
new file mode 100644 (file)
index 0000000..4072ec0
--- /dev/null
@@ -0,0 +1,107 @@
+From fee060cd52d69c114b62d1a2948ea9648b5131f9 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Fri, 11 Mar 2022 03:27:41 +0000
+Subject: KVM: x86: avoid calling x86 emulator without a decoded instruction
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit fee060cd52d69c114b62d1a2948ea9648b5131f9 upstream.
+
+Whenever x86_decode_emulated_instruction() detects a breakpoint, it
+returns the value that kvm_vcpu_check_breakpoint() writes into its
+pass-by-reference second argument.  Unfortunately this is completely
+bogus because the expected outcome of x86_decode_emulated_instruction
+is an EMULATION_* value.
+
+Then, if kvm_vcpu_check_breakpoint() does "*r = 0" (corresponding to
+a KVM_EXIT_DEBUG userspace exit), it is misunderstood as EMULATION_OK
+and x86_emulate_instruction() is called without having decoded the
+instruction.  This causes various havoc from running with a stale
+emulation context.
+
+The fix is to move the call to kvm_vcpu_check_breakpoint() where it was
+before commit 4aa2691dcbd3 ("KVM: x86: Factor out x86 instruction
+emulation with decoding") introduced x86_decode_emulated_instruction().
+The other caller of the function does not need breakpoint checks,
+because it is invoked as part of a vmexit and the processor has already
+checked those before executing the instruction that #GP'd.
+
+This fixes CVE-2022-1852.
+
+Reported-by: Qiuhao Li <qiuhao@sysec.org>
+Reported-by: Gaoning Pan <pgn@zju.edu.cn>
+Reported-by: Yongkang Jia <kangel@zju.edu.cn>
+Fixes: 4aa2691dcbd3 ("KVM: x86: Factor out x86 instruction emulation with decoding")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20220311032801.3467418-2-seanjc@google.com>
+[Rewrote commit message according to Qiuhao's report, since a patch
+ already existed to fix the bug. - Paolo]
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c |   31 +++++++++++++++++++------------
+ 1 file changed, 19 insertions(+), 12 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -8244,7 +8244,7 @@ int kvm_skip_emulated_instruction(struct
+ }
+ EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
+-static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
++static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
+ {
+       if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
+           (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
+@@ -8313,25 +8313,23 @@ static bool is_vmware_backdoor_opcode(st
+ }
+ /*
+- * Decode to be emulated instruction. Return EMULATION_OK if success.
++ * Decode an instruction for emulation.  The caller is responsible for handling
++ * code breakpoints.  Note, manually detecting code breakpoints is unnecessary
++ * (and wrong) when emulating on an intercepted fault-like exception[*], as
++ * code breakpoints have higher priority and thus have already been done by
++ * hardware.
++ *
++ * [*] Except #MC, which is higher priority, but KVM should never emulate in
++ *     response to a machine check.
+  */
+ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+                                   void *insn, int insn_len)
+ {
+-      int r = EMULATION_OK;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
++      int r;
+       init_emulate_ctxt(vcpu);
+-      /*
+-       * We will reenter on the same instruction since we do not set
+-       * complete_userspace_io. This does not handle watchpoints yet,
+-       * those would be handled in the emulate_ops.
+-       */
+-      if (!(emulation_type & EMULTYPE_SKIP) &&
+-          kvm_vcpu_check_breakpoint(vcpu, &r))
+-              return r;
+-
+       r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
+       trace_kvm_emulate_insn_start(vcpu);
+@@ -8364,6 +8362,15 @@ int x86_emulate_instruction(struct kvm_v
+       if (!(emulation_type & EMULTYPE_NO_DECODE)) {
+               kvm_clear_exception_queue(vcpu);
++              /*
++               * Return immediately if RIP hits a code breakpoint, such #DBs
++               * are fault-like and are higher priority than any faults on
++               * the code fetch itself.
++               */
++              if (!(emulation_type & EMULTYPE_SKIP) &&
++                  kvm_vcpu_check_code_breakpoint(vcpu, &r))
++                      return r;
++
+               r = x86_decode_emulated_instruction(vcpu, emulation_type,
+                                                   insn, insn_len);
+               if (r != EMULATION_OK)  {
diff --git a/queue-5.18/kvm-x86-avoid-loading-a-vcpu-after-.vm_destroy-was-called.patch b/queue-5.18/kvm-x86-avoid-loading-a-vcpu-after-.vm_destroy-was-called.patch
new file mode 100644 (file)
index 0000000..78c801f
--- /dev/null
@@ -0,0 +1,63 @@
+From 6fcee03df6a1a3101a77344be37bb85c6142d56c Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Tue, 22 Mar 2022 19:24:42 +0200
+Subject: KVM: x86: avoid loading a vCPU after .vm_destroy was called
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 6fcee03df6a1a3101a77344be37bb85c6142d56c upstream.
+
+This can cause various unexpected issues, since the VM is partially
+destroyed at that point.
+
+For example when AVIC is enabled, this causes avic_vcpu_load to
+access physical id page entry which is already freed by .vm_destroy.
+
+Fixes: 8221c1370056 ("svm: Manage vcpu load/unload when enable AVIC")
+Cc: stable@vger.kernel.org
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20220322172449.235575-2-mlevitsk@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c |   10 +++-------
+ 1 file changed, 3 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -11747,20 +11747,15 @@ static void kvm_unload_vcpu_mmu(struct k
+       vcpu_put(vcpu);
+ }
+-static void kvm_free_vcpus(struct kvm *kvm)
++static void kvm_unload_vcpu_mmus(struct kvm *kvm)
+ {
+       unsigned long i;
+       struct kvm_vcpu *vcpu;
+-      /*
+-       * Unpin any mmu pages first.
+-       */
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               kvm_clear_async_pf_completion_queue(vcpu);
+               kvm_unload_vcpu_mmu(vcpu);
+       }
+-
+-      kvm_destroy_vcpus(kvm);
+ }
+ void kvm_arch_sync_events(struct kvm *kvm)
+@@ -11866,11 +11861,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm
+               __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+               mutex_unlock(&kvm->slots_lock);
+       }
++      kvm_unload_vcpu_mmus(kvm);
+       static_call_cond(kvm_x86_vm_destroy)(kvm);
+       kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+       kvm_pic_destroy(kvm);
+       kvm_ioapic_destroy(kvm);
+-      kvm_free_vcpus(kvm);
++      kvm_destroy_vcpus(kvm);
+       kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+       kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
+       kvm_mmu_uninit_vm(kvm);
diff --git a/queue-5.18/kvm-x86-drop-warns-that-assert-a-triple-fault-never-escapes-from-l2.patch b/queue-5.18/kvm-x86-drop-warns-that-assert-a-triple-fault-never-escapes-from-l2.patch
new file mode 100644 (file)
index 0000000..c9de6ee
--- /dev/null
@@ -0,0 +1,83 @@
+From 45846661d10422ce9e22da21f8277540b29eca22 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 7 Apr 2022 00:23:13 +0000
+Subject: KVM: x86: Drop WARNs that assert a triple fault never "escapes" from L2
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 45846661d10422ce9e22da21f8277540b29eca22 upstream.
+
+Remove WARNs that sanity check that KVM never lets a triple fault for L2
+escape and incorrectly end up in L1.  In normal operation, the sanity
+check is perfectly valid, but it incorrectly assumes that it's impossible
+for userspace to induce KVM_REQ_TRIPLE_FAULT without bouncing through
+KVM_RUN (which guarantees kvm_check_nested_state() will see and handle
+the triple fault).
+
+The WARN can currently be triggered if userspace injects a machine check
+while L2 is active and CR4.MCE=0.  And a future fix to allow save/restore
+of KVM_REQ_TRIPLE_FAULT, e.g. so that a synthesized triple fault isn't
+lost on migration, will make it trivially easy for userspace to trigger
+the WARN.
+
+Clearing KVM_REQ_TRIPLE_FAULT when forcibly leaving guest mode is
+tempting, but wrong, especially if/when the request is saved/restored,
+e.g. if userspace restores events (including a triple fault) and then
+restores nested state (which may forcibly leave guest mode).  Ignoring
+the fact that KVM doesn't currently provide the necessary APIs, it's
+userspace's responsibility to manage pending events during save/restore.
+
+  ------------[ cut here ]------------
+  WARNING: CPU: 7 PID: 1399 at arch/x86/kvm/vmx/nested.c:4522 nested_vmx_vmexit+0x7fe/0xd90 [kvm_intel]
+  Modules linked in: kvm_intel kvm irqbypass
+  CPU: 7 PID: 1399 Comm: state_test Not tainted 5.17.0-rc3+ #808
+  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
+  RIP: 0010:nested_vmx_vmexit+0x7fe/0xd90 [kvm_intel]
+  Call Trace:
+   <TASK>
+   vmx_leave_nested+0x30/0x40 [kvm_intel]
+   vmx_set_nested_state+0xca/0x3e0 [kvm_intel]
+   kvm_arch_vcpu_ioctl+0xf49/0x13e0 [kvm]
+   kvm_vcpu_ioctl+0x4b9/0x660 [kvm]
+   __x64_sys_ioctl+0x83/0xb0
+   do_syscall_64+0x3b/0xc0
+   entry_SYSCALL_64_after_hwframe+0x44/0xae
+   </TASK>
+  ---[ end trace 0000000000000000 ]---
+
+Fixes: cb6a32c2b877 ("KVM: x86: Handle triple fault in L2 without killing L1")
+Cc: stable@vger.kernel.org
+Cc: Chenyi Qiang <chenyi.qiang@intel.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20220407002315.78092-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/svm/nested.c |    3 ---
+ arch/x86/kvm/vmx/nested.c |    3 ---
+ 2 files changed, 6 deletions(-)
+
+--- a/arch/x86/kvm/svm/nested.c
++++ b/arch/x86/kvm/svm/nested.c
+@@ -819,9 +819,6 @@ int nested_svm_vmexit(struct vcpu_svm *s
+       struct kvm_host_map map;
+       int rc;
+-      /* Triple faults in L2 should never escape. */
+-      WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+-
+       rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+       if (rc) {
+               if (rc == -EINVAL)
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -4518,9 +4518,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *
+       /* trying to cancel vmlaunch/vmresume is a bug */
+       WARN_ON_ONCE(vmx->nested.nested_run_pending);
+-      /* Similarly, triple faults in L2 should never escape. */
+-      WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+-
+       if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+               /*
+                * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
diff --git a/queue-5.18/kvm-x86-fix-the-intel_pt-pmi-handling-wrongly-considered-from-guest.patch b/queue-5.18/kvm-x86-fix-the-intel_pt-pmi-handling-wrongly-considered-from-guest.patch
new file mode 100644 (file)
index 0000000..bb7da2e
--- /dev/null
@@ -0,0 +1,40 @@
+From ffd1925a596ce68bed7d81c61cb64bc35f788a9d Mon Sep 17 00:00:00 2001
+From: Yanfei Xu <yanfei.xu@intel.com>
+Date: Mon, 23 May 2022 22:08:21 +0800
+Subject: KVM: x86: Fix the intel_pt PMI handling wrongly considered from guest
+
+From: Yanfei Xu <yanfei.xu@intel.com>
+
+commit ffd1925a596ce68bed7d81c61cb64bc35f788a9d upstream.
+
+When kernel handles the vm-exit caused by external interrupts and NMI,
+it always sets kvm_intr_type to tell if it's dealing an IRQ or NMI. For
+the PMI scenario, it could be IRQ or NMI.
+
+However, intel_pt PMIs are only generated for HARDWARE perf events, and
+HARDWARE events are always configured to generate NMIs.  Use
+kvm_handling_nmi_from_guest() to precisely identify if the intel_pt PMI
+came from the guest; this avoids false positives if an intel_pt PMI/NMI
+arrives while the host is handling an unrelated IRQ VM-Exit.
+
+Fixes: db215756ae59 ("KVM: x86: More precisely identify NMI from guest when handling PMI")
+Signed-off-by: Yanfei Xu <yanfei.xu@intel.com>
+Message-Id: <20220523140821.1345605-1-yanfei.xu@intel.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/vmx/vmx.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7856,7 +7856,7 @@ static unsigned int vmx_handle_intel_pt_
+       struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+       /* '0' on failure so that the !PT case can use a RET0 static call. */
+-      if (!kvm_arch_pmi_in_guest(vcpu))
++      if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
+               return 0;
+       kvm_make_request(KVM_REQ_PMI, vcpu);
diff --git a/queue-5.18/kvm-x86-fix-typo-in-__try_cmpxchg_user-causing-non-atomicness.patch b/queue-5.18/kvm-x86-fix-typo-in-__try_cmpxchg_user-causing-non-atomicness.patch
new file mode 100644 (file)
index 0000000..9f91282
--- /dev/null
@@ -0,0 +1,35 @@
+From 33fbe6befa622c082f7d417896832856814bdde0 Mon Sep 17 00:00:00 2001
+From: Maxim Levitsky <mlevitsk@redhat.com>
+Date: Thu, 12 May 2022 13:14:20 +0300
+Subject: KVM: x86: fix typo in __try_cmpxchg_user causing non-atomicness
+
+From: Maxim Levitsky <mlevitsk@redhat.com>
+
+commit 33fbe6befa622c082f7d417896832856814bdde0 upstream.
+
+This shows up as a TDP MMU leak when running nested.  The non-working cmpxchg
+on L0 makes L1 install two different shadow pages under the same spte, and one
+of them is leaked.
+
+Fixes: 1c2361f667f36 ("KVM: x86: Use __try_cmpxchg_user() to emulate atomic accesses")
+Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
+Message-Id: <20220512101420.306759-1-mlevitsk@redhat.com>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -7268,7 +7268,7 @@ static int emulator_cmpxchg_emulated(str
+               goto emul_write;
+       hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
+-      if (kvm_is_error_hva(addr))
++      if (kvm_is_error_hva(hva))
+               goto emul_write;
+       hva += offset_in_page(gpa);
diff --git a/queue-5.18/kvm-x86-mmu-don-t-rebuild-page-when-the-page-is-synced-and-no-tlb-flushing-is-required.patch b/queue-5.18/kvm-x86-mmu-don-t-rebuild-page-when-the-page-is-synced-and-no-tlb-flushing-is-required.patch
new file mode 100644 (file)
index 0000000..c283c04
--- /dev/null
@@ -0,0 +1,89 @@
+From 8d5678a76689acbf91245a3791fe853ab773090f Mon Sep 17 00:00:00 2001
+From: Hou Wenlong <houwenlong.hwl@antgroup.com>
+Date: Tue, 15 Mar 2022 17:35:13 +0800
+Subject: KVM: x86/mmu: Don't rebuild page when the page is synced and no tlb flushing is required
+
+From: Hou Wenlong <houwenlong.hwl@antgroup.com>
+
+commit 8d5678a76689acbf91245a3791fe853ab773090f upstream.
+
+Before commit c3e5e415bc1e6 ("KVM: X86: Change kvm_sync_page()
+to return true when remote flush is needed"), the return value
+of kvm_sync_page() indicated whether the page was synced, and
+kvm_mmu_get_page() would rebuild the page when the sync failed.
+But now kvm_sync_page() returns false when the page is
+synced and no TLB flushing is required, which leads to
+rebuilding the page in kvm_mmu_get_page(). So return the
+value of mmu->sync_page() directly and check it in
+kvm_mmu_get_page(). If the sync fails, the page is zapped
+and the invalid_list is not empty, so setting flush to true
+is acceptable in mmu_sync_children().
+
+Cc: stable@vger.kernel.org
+Fixes: c3e5e415bc1e6 ("KVM: X86: Change kvm_sync_page() to return true when remote flush is needed")
+Signed-off-by: Hou Wenlong <houwenlong.hwl@antgroup.com>
+Acked-by: Lai Jiangshan <jiangshanlai@gmail.com>
+Message-Id: <0dabeeb789f57b0d793f85d073893063e692032d.1647336064.git.houwenlong.hwl@antgroup.com>
+[mmu_sync_children should not flush if the page is zapped. - Paolo]
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/mmu.c |   18 +++++++++---------
+ 1 file changed, 9 insertions(+), 9 deletions(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -1843,17 +1843,14 @@ static void kvm_mmu_commit_zap_page(stru
+         &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])     \
+               if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
+-static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
++static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+                        struct list_head *invalid_list)
+ {
+       int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
+-      if (ret < 0) {
++      if (ret < 0)
+               kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+-              return false;
+-      }
+-
+-      return !!ret;
++      return ret;
+ }
+ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
+@@ -1975,7 +1972,7 @@ static int mmu_sync_children(struct kvm_
+               for_each_sp(pages, sp, parents, i) {
+                       kvm_unlink_unsync_page(vcpu->kvm, sp);
+-                      flush |= kvm_sync_page(vcpu, sp, &invalid_list);
++                      flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
+                       mmu_pages_clear_parents(&parents);
+               }
+               if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
+@@ -2016,6 +2013,7 @@ static struct kvm_mmu_page *kvm_mmu_get_
+       struct hlist_head *sp_list;
+       unsigned quadrant;
+       struct kvm_mmu_page *sp;
++      int ret;
+       int collisions = 0;
+       LIST_HEAD(invalid_list);
+@@ -2068,11 +2066,13 @@ static struct kvm_mmu_page *kvm_mmu_get_
+                        * If the sync fails, the page is zapped.  If so, break
+                        * in order to rebuild it.
+                        */
+-                      if (!kvm_sync_page(vcpu, sp, &invalid_list))
++                      ret = kvm_sync_page(vcpu, sp, &invalid_list);
++                      if (ret < 0)
+                               break;
+                       WARN_ON(!list_empty(&invalid_list));
+-                      kvm_flush_remote_tlbs(vcpu->kvm);
++                      if (ret > 0)
++                              kvm_flush_remote_tlbs(vcpu->kvm);
+               }
+               __clear_sp_write_flooding_count(sp);
diff --git a/queue-5.18/kvm-x86-use-__try_cmpxchg_user-to-emulate-atomic-accesses.patch b/queue-5.18/kvm-x86-use-__try_cmpxchg_user-to-emulate-atomic-accesses.patch
new file mode 100644 (file)
index 0000000..d09a7f1
--- /dev/null
@@ -0,0 +1,103 @@
+From 1c2361f667f3648855ceae25f1332c18413fdb9f Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 2 Feb 2022 00:49:44 +0000
+Subject: KVM: x86: Use __try_cmpxchg_user() to emulate atomic accesses
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 1c2361f667f3648855ceae25f1332c18413fdb9f upstream.
+
+Use the recently introduced __try_cmpxchg_user() to emulate atomic guest
+accesses via the associated userspace address instead of mapping the
+backing pfn into kernel address space.  Using kvm_vcpu_map() is unsafe as
+it does not coordinate with KVM's mmu_notifier to ensure the hva=>pfn
+translation isn't changed/unmapped in the memremap() path, i.e. when
+there's no struct page and thus no elevated refcount.
+
+Fixes: 42e35f8072c3 ("KVM/X86: Use kvm_vcpu_map in emulator_cmpxchg_emulated")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20220202004945.2540433-5-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c |   35 ++++++++++++++---------------------
+ 1 file changed, 14 insertions(+), 21 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -7229,15 +7229,8 @@ static int emulator_write_emulated(struc
+                                  exception, &write_emultor);
+ }
+-#define CMPXCHG_TYPE(t, ptr, old, new) \
+-      (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
+-
+-#ifdef CONFIG_X86_64
+-#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
+-#else
+-#  define CMPXCHG64(ptr, old, new) \
+-      (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
+-#endif
++#define emulator_try_cmpxchg_user(t, ptr, old, new) \
++      (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
+ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
+                                    unsigned long addr,
+@@ -7246,12 +7239,11 @@ static int emulator_cmpxchg_emulated(str
+                                    unsigned int bytes,
+                                    struct x86_exception *exception)
+ {
+-      struct kvm_host_map map;
+       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+       u64 page_line_mask;
++      unsigned long hva;
+       gpa_t gpa;
+-      char *kaddr;
+-      bool exchanged;
++      int r;
+       /* guests cmpxchg8b have to be emulated atomically */
+       if (bytes > 8 || (bytes & (bytes - 1)))
+@@ -7275,31 +7267,32 @@ static int emulator_cmpxchg_emulated(str
+       if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
+               goto emul_write;
+-      if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
++      hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
++      if (kvm_is_error_hva(addr))
+               goto emul_write;
+-      kaddr = map.hva + offset_in_page(gpa);
++      hva += offset_in_page(gpa);
+       switch (bytes) {
+       case 1:
+-              exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
++              r = emulator_try_cmpxchg_user(u8, hva, old, new);
+               break;
+       case 2:
+-              exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
++              r = emulator_try_cmpxchg_user(u16, hva, old, new);
+               break;
+       case 4:
+-              exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
++              r = emulator_try_cmpxchg_user(u32, hva, old, new);
+               break;
+       case 8:
+-              exchanged = CMPXCHG64(kaddr, old, new);
++              r = emulator_try_cmpxchg_user(u64, hva, old, new);
+               break;
+       default:
+               BUG();
+       }
+-      kvm_vcpu_unmap(vcpu, &map, true);
+-
+-      if (!exchanged)
++      if (r < 0)
++              goto emul_write;
++      if (r)
+               return X86EMUL_CMPXCHG_FAILED;
+       kvm_page_track_write(vcpu, gpa, new, bytes);
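
A stand-alone sketch (not KVM code, with made-up return values) of the width-dispatched compare-and-exchange the patch switches to: one generic helper is instantiated per operand size and reports whether the compare matched, so the emulator can fail the cmpxchg cleanly without mapping the guest page anywhere else. GCC/Clang's __atomic_compare_exchange_n builtin stands in for __try_cmpxchg_user() here.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define try_cmpxchg(ptr, oldp, new) \
        __atomic_compare_exchange_n((ptr), (oldp), (new), false, \
                                    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

/* 0: exchanged, 1: compare failed, -1: unsupported operand size. */
static int emulate_cmpxchg(void *addr, void *old, void *new, unsigned int bytes)
{
        switch (bytes) {
        case 1:
                return try_cmpxchg((uint8_t *)addr, (uint8_t *)old,
                                   *(uint8_t *)new) ? 0 : 1;
        case 2:
                return try_cmpxchg((uint16_t *)addr, (uint16_t *)old,
                                   *(uint16_t *)new) ? 0 : 1;
        case 4:
                return try_cmpxchg((uint32_t *)addr, (uint32_t *)old,
                                   *(uint32_t *)new) ? 0 : 1;
        case 8:
                return try_cmpxchg((uint64_t *)addr, (uint64_t *)old,
                                   *(uint64_t *)new) ? 0 : 1;
        default:
                return -1;
        }
}

int main(void)
{
        uint64_t mem = 5, old = 5, new = 9;

        printf("ret=%d mem=%llu\n", emulate_cmpxchg(&mem, &old, &new, 8),
               (unsigned long long)mem);        /* ret=0, mem=9 */
        old = 7;
        printf("ret=%d mem=%llu\n", emulate_cmpxchg(&mem, &old, &new, 8),
               (unsigned long long)mem);        /* ret=1, compare failed */
        return 0;
}
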
diff --git a/queue-5.18/kvm-x86-use-__try_cmpxchg_user-to-update-guest-pte-a-d-bits.patch b/queue-5.18/kvm-x86-use-__try_cmpxchg_user-to-update-guest-pte-a-d-bits.patch
new file mode 100644 (file)
index 0000000..5eaf367
--- /dev/null
@@ -0,0 +1,84 @@
+From f122dfe4476890d60b8c679128cd2259ec96a24c Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 2 Feb 2022 00:49:43 +0000
+Subject: KVM: x86: Use __try_cmpxchg_user() to update guest PTE A/D bits
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit f122dfe4476890d60b8c679128cd2259ec96a24c upstream.
+
+Use the recently introduced __try_cmpxchg_user() to update guest PTE A/D
+bits instead of mapping the PTE into kernel address space.  The VM_PFNMAP
+path is broken as it assumes that vm_pgoff is the base pfn of the mapped
+VMA range, which is conceptually wrong as vm_pgoff is the offset relative
+to the file and has nothing to do with the pfn.  The horrific hack worked
+for the original use case (backing guest memory with /dev/mem), but leads
+to accessing "random" pfns for pretty much any other VM_PFNMAP case.
+
+Fixes: bd53cb35a3e9 ("X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs")
+Debugged-by: Tadeusz Struk <tadeusz.struk@linaro.org>
+Tested-by: Tadeusz Struk <tadeusz.struk@linaro.org>
+Reported-by: syzbot+6cde2282daa792c49ab8@syzkaller.appspotmail.com
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20220202004945.2540433-4-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/mmu/paging_tmpl.h |   38 +-------------------------------------
+ 1 file changed, 1 insertion(+), 37 deletions(-)
+
+--- a/arch/x86/kvm/mmu/paging_tmpl.h
++++ b/arch/x86/kvm/mmu/paging_tmpl.h
+@@ -144,42 +144,6 @@ static bool FNAME(is_rsvd_bits_set)(stru
+              FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
+ }
+-static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+-                             pt_element_t __user *ptep_user, unsigned index,
+-                             pt_element_t orig_pte, pt_element_t new_pte)
+-{
+-      signed char r;
+-
+-      if (!user_access_begin(ptep_user, sizeof(pt_element_t)))
+-              return -EFAULT;
+-
+-#ifdef CMPXCHG
+-      asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n"
+-                   "setnz %b[r]\n"
+-                   "2:"
+-                   _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r])
+-                   : [ptr] "+m" (*ptep_user),
+-                     [old] "+a" (orig_pte),
+-                     [r] "=q" (r)
+-                   : [new] "r" (new_pte)
+-                   : "memory");
+-#else
+-      asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n"
+-                   "setnz %b[r]\n"
+-                   "2:"
+-                   _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r])
+-                   : [ptr] "+m" (*ptep_user),
+-                     [old] "+A" (orig_pte),
+-                     [r] "=q" (r)
+-                   : [new_lo] "b" ((u32)new_pte),
+-                     [new_hi] "c" ((u32)(new_pte >> 32))
+-                   : "memory");
+-#endif
+-
+-      user_access_end();
+-      return r;
+-}
+-
+ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu_page *sp, u64 *spte,
+                                 u64 gpte)
+@@ -278,7 +242,7 @@ static int FNAME(update_accessed_dirty_b
+               if (unlikely(!walker->pte_writable[level - 1]))
+                       continue;
+-              ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
++              ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
+               if (ret)
+                       return ret;
diff --git a/queue-5.18/net-ipa-compute-proper-aggregation-limit.patch b/queue-5.18/net-ipa-compute-proper-aggregation-limit.patch
new file mode 100644 (file)
index 0000000..ac8a7b2
--- /dev/null
@@ -0,0 +1,63 @@
+From c5794097b269f15961ed78f7f27b50e51766dec9 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@linaro.org>
+Date: Thu, 21 Apr 2022 13:53:33 -0500
+Subject: net: ipa: compute proper aggregation limit
+
+From: Alex Elder <elder@linaro.org>
+
+commit c5794097b269f15961ed78f7f27b50e51766dec9 upstream.
+
+The aggregation byte limit for an endpoint is currently computed
+based on the endpoint's receive buffer size.
+
+However, some bytes at the front of each receive buffer are reserved
+on the assumption that--as with SKBs--it might be useful to insert
+data (such as headers) before what lands in the buffer.
+
+The aggregation byte limit currently doesn't take into account that
+reserved space, and as a result, aggregation could require space
+past that which is available in the buffer.
+
+Fix this by reducing the size used to compute the aggregation byte
+limit by the NET_SKB_PAD offset reserved for each receive buffer.
+
+Signed-off-by: Alex Elder <elder@linaro.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ipa/ipa_endpoint.c |    9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/ipa/ipa_endpoint.c
++++ b/drivers/net/ipa/ipa_endpoint.c
+@@ -130,9 +130,10 @@ static bool ipa_endpoint_data_valid_one(
+                */
+               if (data->endpoint.config.aggregation) {
+                       limit += SZ_1K * aggr_byte_limit_max(ipa->version);
+-                      if (buffer_size > limit) {
++                      if (buffer_size - NET_SKB_PAD > limit) {
+                               dev_err(dev, "RX buffer size too large for aggregated RX endpoint %u (%u > %u)\n",
+-                                      data->endpoint_id, buffer_size, limit);
++                                      data->endpoint_id,
++                                      buffer_size - NET_SKB_PAD, limit);
+                               return false;
+                       }
+@@ -739,6 +740,7 @@ static void ipa_endpoint_init_aggr(struc
+       if (endpoint->data->aggregation) {
+               if (!endpoint->toward_ipa) {
+                       const struct ipa_endpoint_rx_data *rx_data;
++                      u32 buffer_size;
+                       bool close_eof;
+                       u32 limit;
+@@ -746,7 +748,8 @@ static void ipa_endpoint_init_aggr(struc
+                       val |= u32_encode_bits(IPA_ENABLE_AGGR, AGGR_EN_FMASK);
+                       val |= u32_encode_bits(IPA_GENERIC, AGGR_TYPE_FMASK);
+-                      limit = ipa_aggr_size_kb(rx_data->buffer_size);
++                      buffer_size = rx_data->buffer_size;
++                      limit = ipa_aggr_size_kb(buffer_size - NET_SKB_PAD);
+                       val |= aggr_byte_limit_encoded(version, limit);
+                       limit = IPA_AGGR_TIME_LIMIT;
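
A trivial stand-alone sketch of the corrected arithmetic (not the IPA driver code; the headroom value below is an illustrative assumption): the aggregation byte limit has to be derived from the part of the receive buffer that can actually hold data, i.e. the buffer size minus the headroom reserved at the front (NET_SKB_PAD in the kernel), rather than from the raw buffer size.

#include <stdio.h>

#define HEADROOM 64     /* stands in for NET_SKB_PAD; value is illustrative */

/* Usable bytes available for aggregated data in one receive buffer. */
static unsigned int aggr_byte_limit(unsigned int buffer_size)
{
        return buffer_size - HEADROOM;
}

int main(void)
{
        unsigned int buffer_size = 8192;

        printf("buffer %u bytes -> limit computed from %u usable bytes\n",
               buffer_size, aggr_byte_limit(buffer_size));
        return 0;
}
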
diff --git a/queue-5.18/netfilter-conntrack-re-fetch-conntrack-after-insertion.patch b/queue-5.18/netfilter-conntrack-re-fetch-conntrack-after-insertion.patch
new file mode 100644 (file)
index 0000000..b634dd7
--- /dev/null
@@ -0,0 +1,43 @@
+From 56b14ecec97f39118bf85c9ac2438c5a949509ed Mon Sep 17 00:00:00 2001
+From: Florian Westphal <fw@strlen.de>
+Date: Fri, 20 May 2022 00:02:04 +0200
+Subject: netfilter: conntrack: re-fetch conntrack after insertion
+
+From: Florian Westphal <fw@strlen.de>
+
+commit 56b14ecec97f39118bf85c9ac2438c5a949509ed upstream.
+
+In case the conntrack is clashing, insertion can free skb->_nfct and
+set skb->_nfct to the already-confirmed entry.
+
+This wasn't found before because the conntrack entry and the extension
+space used to be freed after an RCU grace period, plus the race needs
+events enabled to trigger.
+
+Reported-by: <syzbot+793a590957d9c1b96620@syzkaller.appspotmail.com>
+Fixes: 71d8c47fc653 ("netfilter: conntrack: introduce clash resolution on insertion race")
+Fixes: 2ad9d7747c10 ("netfilter: conntrack: free extension area immediately")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/netfilter/nf_conntrack_core.h |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/include/net/netfilter/nf_conntrack_core.h
++++ b/include/net/netfilter/nf_conntrack_core.h
+@@ -58,8 +58,13 @@ static inline int nf_conntrack_confirm(s
+       int ret = NF_ACCEPT;
+       if (ct) {
+-              if (!nf_ct_is_confirmed(ct))
++              if (!nf_ct_is_confirmed(ct)) {
+                       ret = __nf_conntrack_confirm(skb);
++
++                      if (ret == NF_ACCEPT)
++                              ct = (struct nf_conn *)skb_nfct(skb);
++              }
++
+               if (likely(ret == NF_ACCEPT))
+                       nf_ct_deliver_cached_events(ct);
+       }
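
A user-space sketch (not netfilter code) of the stale-pointer pattern fixed above: a call that "confirms" an object may free it and replace it with an already-confirmed one, so the caller must re-read the pointer from the owning structure before using it again.

#include <stdio.h>
#include <stdlib.h>

struct entry {
        int id;
};

struct packet {
        struct entry *ct;               /* plays the role of skb->_nfct */
};

static struct entry existing = { 1 };   /* the entry that won the clash */

static int confirm(struct packet *pkt)
{
        /* Clash resolution: drop the unconfirmed entry and point the
         * packet at the entry that was inserted first. */
        free(pkt->ct);
        pkt->ct = &existing;
        return 0;
}

int main(void)
{
        struct packet pkt = { malloc(sizeof(struct entry)) };

        if (!pkt.ct)
                return 1;
        pkt.ct->id = 2;

        struct entry *ct = pkt.ct;

        if (confirm(&pkt) == 0)
                ct = pkt.ct;            /* re-fetch: the old pointer is gone */

        printf("delivering events for entry %d\n", ct->id);
        return 0;
}
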
diff --git a/queue-5.18/netfilter-nf_tables-double-hook-unregistration-in-netns-path.patch b/queue-5.18/netfilter-nf_tables-double-hook-unregistration-in-netns-path.patch
new file mode 100644 (file)
index 0000000..66e38d9
--- /dev/null
@@ -0,0 +1,137 @@
+From f9a43007d3f7ba76d5e7f9421094f00f2ef202f8 Mon Sep 17 00:00:00 2001
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+Date: Mon, 30 May 2022 18:24:06 +0200
+Subject: netfilter: nf_tables: double hook unregistration in netns path
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+commit f9a43007d3f7ba76d5e7f9421094f00f2ef202f8 upstream.
+
+__nft_release_hooks() is called from the netns pre_exit path, which
+unregisters the hooks; then the NETDEV_UNREGISTER event is triggered,
+which unregisters the hooks again.
+
+[  565.221461] WARNING: CPU: 18 PID: 193 at net/netfilter/core.c:495 __nf_unregister_net_hook+0x247/0x270
+[...]
+[  565.246890] CPU: 18 PID: 193 Comm: kworker/u64:1 Tainted: G            E     5.18.0-rc7+ #27
+[  565.253682] Workqueue: netns cleanup_net
+[  565.257059] RIP: 0010:__nf_unregister_net_hook+0x247/0x270
+[...]
+[  565.297120] Call Trace:
+[  565.300900]  <TASK>
+[  565.304683]  nf_tables_flowtable_event+0x16a/0x220 [nf_tables]
+[  565.308518]  raw_notifier_call_chain+0x63/0x80
+[  565.312386]  unregister_netdevice_many+0x54f/0xb50
+
+Unregister and destroy the netdev hooks from netns pre_exit via kfree_rcu
+so the NETDEV_UNREGISTER path sees unregistered hooks.
+
+Fixes: 767d1216bff8 ("netfilter: nftables: fix possible UAF over chains from packet path in netns")
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netfilter/nf_tables_api.c |   54 +++++++++++++++++++++++++++++++-----------
+ 1 file changed, 41 insertions(+), 13 deletions(-)
+
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -222,12 +222,18 @@ err_register:
+ }
+ static void nft_netdev_unregister_hooks(struct net *net,
+-                                      struct list_head *hook_list)
++                                      struct list_head *hook_list,
++                                      bool release_netdev)
+ {
+-      struct nft_hook *hook;
++      struct nft_hook *hook, *next;
+-      list_for_each_entry(hook, hook_list, list)
++      list_for_each_entry_safe(hook, next, hook_list, list) {
+               nf_unregister_net_hook(net, &hook->ops);
++              if (release_netdev) {
++                      list_del(&hook->list);
++                      kfree_rcu(hook, rcu);
++              }
++      }
+ }
+ static int nf_tables_register_hook(struct net *net,
+@@ -253,9 +259,10 @@ static int nf_tables_register_hook(struc
+       return nf_register_net_hook(net, &basechain->ops);
+ }
+-static void nf_tables_unregister_hook(struct net *net,
+-                                    const struct nft_table *table,
+-                                    struct nft_chain *chain)
++static void __nf_tables_unregister_hook(struct net *net,
++                                      const struct nft_table *table,
++                                      struct nft_chain *chain,
++                                      bool release_netdev)
+ {
+       struct nft_base_chain *basechain;
+       const struct nf_hook_ops *ops;
+@@ -270,11 +277,19 @@ static void nf_tables_unregister_hook(st
+               return basechain->type->ops_unregister(net, ops);
+       if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
+-              nft_netdev_unregister_hooks(net, &basechain->hook_list);
++              nft_netdev_unregister_hooks(net, &basechain->hook_list,
++                                          release_netdev);
+       else
+               nf_unregister_net_hook(net, &basechain->ops);
+ }
++static void nf_tables_unregister_hook(struct net *net,
++                                    const struct nft_table *table,
++                                    struct nft_chain *chain)
++{
++      return __nf_tables_unregister_hook(net, table, chain, false);
++}
++
+ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
+ {
+       struct nftables_pernet *nft_net = nft_pernet(net);
+@@ -7301,13 +7316,25 @@ static void nft_unregister_flowtable_hoo
+                                   FLOW_BLOCK_UNBIND);
+ }
+-static void nft_unregister_flowtable_net_hooks(struct net *net,
+-                                             struct list_head *hook_list)
++static void __nft_unregister_flowtable_net_hooks(struct net *net,
++                                               struct list_head *hook_list,
++                                               bool release_netdev)
+ {
+-      struct nft_hook *hook;
++      struct nft_hook *hook, *next;
+-      list_for_each_entry(hook, hook_list, list)
++      list_for_each_entry_safe(hook, next, hook_list, list) {
+               nf_unregister_net_hook(net, &hook->ops);
++              if (release_netdev) {
++                      list_del(&hook->list);
++                      kfree_rcu(hook);
++              }
++      }
++}
++
++static void nft_unregister_flowtable_net_hooks(struct net *net,
++                                             struct list_head *hook_list)
++{
++      __nft_unregister_flowtable_net_hooks(net, hook_list, false);
+ }
+ static int nft_register_flowtable_net_hooks(struct net *net,
+@@ -9751,9 +9778,10 @@ static void __nft_release_hook(struct ne
+       struct nft_chain *chain;
+       list_for_each_entry(chain, &table->chains, list)
+-              nf_tables_unregister_hook(net, table, chain);
++              __nf_tables_unregister_hook(net, table, chain, true);
+       list_for_each_entry(flowtable, &table->flowtables, list)
+-              nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list);
++              __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list,
++                                                   true);
+ }
+ static void __nft_release_hooks(struct net *net)
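
When release_netdev is true, the walker above both unregisters and frees entries, which is why it switches to the _safe iteration variant. A standalone C sketch of that iterate-and-maybe-free pattern (a simplified singly linked list; the kernel additionally defers the free with kfree_rcu()):

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>

    struct hook {
            int id;
            struct hook *next;
    };

    static void unregister(struct hook *h)
    {
            printf("unregister hook %d\n", h->id);
    }

    /* Save 'next' before touching the current entry so the entry can be
       unlinked and freed while walking the list. */
    static void unregister_hooks(struct hook **head, bool release)
    {
            struct hook *h = *head, *next;

            while (h) {
                    next = h->next;
                    unregister(h);
                    if (release)
                            free(h);        /* kernel would use kfree_rcu() */
                    h = next;
            }
            if (release)
                    *head = NULL;
    }

    int main(void)
    {
            struct hook *head = NULL;

            for (int i = 3; i > 0; i--) {
                    struct hook *h = calloc(1, sizeof(*h));

                    if (!h)
                            return 1;
                    h->id = i;
                    h->next = head;
                    head = h;
            }
            unregister_hooks(&head, true);
            return 0;
    }
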
diff --git a/queue-5.18/netfilter-nf_tables-hold-mutex-on-netns-pre_exit-path.patch b/queue-5.18/netfilter-nf_tables-hold-mutex-on-netns-pre_exit-path.patch
new file mode 100644 (file)
index 0000000..2efdf9c
--- /dev/null
@@ -0,0 +1,32 @@
+From 3923b1e4406680d57da7e873da77b1683035d83f Mon Sep 17 00:00:00 2001
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+Date: Mon, 30 May 2022 18:24:05 +0200
+Subject: netfilter: nf_tables: hold mutex on netns pre_exit path
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+commit 3923b1e4406680d57da7e873da77b1683035d83f upstream.
+
+cleanup_net() runs in a workqueue while walking over the lists, so grab
+the commit mutex around the hook release.
+
+Fixes: 767d1216bff8 ("netfilter: nftables: fix possible UAF over chains from packet path in netns")
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netfilter/nf_tables_api.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -9892,7 +9892,11 @@ static int __net_init nf_tables_init_net
+ static void __net_exit nf_tables_pre_exit_net(struct net *net)
+ {
++      struct nftables_pernet *nft_net = nft_pernet(net);
++
++      mutex_lock(&nft_net->commit_mutex);
+       __nft_release_hooks(net);
++      mutex_unlock(&nft_net->commit_mutex);
+ }
+ static void __net_exit nf_tables_exit_net(struct net *net)
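
The shape of the fix is simply: take the same mutex the transaction path takes before walking its lists during teardown. A userspace sketch with a pthread mutex standing in for commit_mutex (names and structure are assumptions):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t commit_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int hooks = 3;

    /* Teardown walks state that concurrent transactions also touch, so it
       must run under the same lock they use. */
    static void pre_exit(void)
    {
            pthread_mutex_lock(&commit_mutex);
            while (hooks > 0)
                    printf("releasing hook %d\n", hooks--);
            pthread_mutex_unlock(&commit_mutex);
    }

    int main(void)
    {
            pre_exit();
            return 0;
    }
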
diff --git a/queue-5.18/netfilter-nf_tables-sanitize-nft_set_desc_concat_parse.patch b/queue-5.18/netfilter-nf_tables-sanitize-nft_set_desc_concat_parse.patch
new file mode 100644 (file)
index 0000000..fbc7471
--- /dev/null
@@ -0,0 +1,74 @@
+From fecf31ee395b0295f2d7260aa29946b7605f7c85 Mon Sep 17 00:00:00 2001
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+Date: Fri, 27 May 2022 09:56:18 +0200
+Subject: netfilter: nf_tables: sanitize nft_set_desc_concat_parse()
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+commit fecf31ee395b0295f2d7260aa29946b7605f7c85 upstream.
+
+Add several sanity checks for nft_set_desc_concat_parse():
+
+- validate that desc->field_count does not exceed the size of the
+  desc->field_len array.
+- a field length cannot be larger than what desc->field_len can hold
+  (i.e. U8_MAX).
+- the total length of the concatenation cannot be larger than the
+  register array.
+
+Joint work with Florian Westphal.
+
+Fixes: f3a2181e16f1 ("netfilter: nf_tables: Support for sets with multiple ranged fields")
+Reported-by: <zhangziming.zzm@antgroup.com>
+Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netfilter/nf_tables_api.c |   17 +++++++++++++----
+ 1 file changed, 13 insertions(+), 4 deletions(-)
+
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -4246,6 +4246,9 @@ static int nft_set_desc_concat_parse(con
+       u32 len;
+       int err;
++      if (desc->field_count >= ARRAY_SIZE(desc->field_len))
++              return -E2BIG;
++
+       err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr,
+                                         nft_concat_policy, NULL);
+       if (err < 0)
+@@ -4255,9 +4258,8 @@ static int nft_set_desc_concat_parse(con
+               return -EINVAL;
+       len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN]));
+-
+-      if (len * BITS_PER_BYTE / 32 > NFT_REG32_COUNT)
+-              return -E2BIG;
++      if (!len || len > U8_MAX)
++              return -EINVAL;
+       desc->field_len[desc->field_count++] = len;
+@@ -4268,7 +4270,8 @@ static int nft_set_desc_concat(struct nf
+                              const struct nlattr *nla)
+ {
+       struct nlattr *attr;
+-      int rem, err;
++      u32 num_regs = 0;
++      int rem, err, i;
+       nla_for_each_nested(attr, nla, rem) {
+               if (nla_type(attr) != NFTA_LIST_ELEM)
+@@ -4279,6 +4282,12 @@ static int nft_set_desc_concat(struct nf
+                       return err;
+       }
++      for (i = 0; i < desc->field_count; i++)
++              num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32));
++
++      if (num_regs > NFT_REG32_COUNT)
++              return -E2BIG;
++
+       return 0;
+ }
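
A standalone sketch of the accounting the new checks perform (the limits below are stand-ins, not the kernel's actual values): every field must be non-empty and fit in a u8, and the field lengths, rounded up to 32-bit registers, must fit the register array as a whole.

    #include <stdint.h>
    #include <stdio.h>

    #define FIELD_MAX       16      /* stand-in for ARRAY_SIZE(desc->field_len) */
    #define REG32_COUNT     16      /* stand-in for NFT_REG32_COUNT */
    #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

    static int check_concat(const uint32_t *field_len, unsigned int field_count)
    {
            uint32_t num_regs = 0;

            if (field_count > FIELD_MAX)
                    return -1;              /* too many fields */

            for (unsigned int i = 0; i < field_count; i++) {
                    if (!field_len[i] || field_len[i] > UINT8_MAX)
                            return -1;      /* zero-length or oversized field */
                    num_regs += DIV_ROUND_UP(field_len[i], (uint32_t)sizeof(uint32_t));
            }

            return num_regs > REG32_COUNT ? -1 : 0;
    }

    int main(void)
    {
            uint32_t ok[]  = { 4, 16, 2 };  /* 1 + 4 + 1 = 6 registers */
            uint32_t bad[] = { 255, 255 };  /* 64 + 64 registers */

            printf("ok:  %d\n", check_concat(ok, 3));
            printf("bad: %d\n", check_concat(bad, 2));
            return 0;
    }
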
diff --git a/queue-5.18/netfilter-nft_limit-clone-packet-limits-cost-value.patch b/queue-5.18/netfilter-nft_limit-clone-packet-limits-cost-value.patch
new file mode 100644 (file)
index 0000000..9905e5c
--- /dev/null
@@ -0,0 +1,31 @@
+From 558254b0b602b8605d7246a10cfeb584b1fcabfc Mon Sep 17 00:00:00 2001
+From: Phil Sutter <phil@nwl.cc>
+Date: Tue, 24 May 2022 14:50:01 +0200
+Subject: netfilter: nft_limit: Clone packet limits' cost value
+
+From: Phil Sutter <phil@nwl.cc>
+
+commit 558254b0b602b8605d7246a10cfeb584b1fcabfc upstream.
+
+When cloning a packet-based limit expression, copy the cost value as
+well. Otherwise the cloned limit is no longer functional.
+
+Fixes: 3b9e2ea6c11bf ("netfilter: nft_limit: move stateful fields out of expression data")
+Signed-off-by: Phil Sutter <phil@nwl.cc>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netfilter/nft_limit.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/netfilter/nft_limit.c
++++ b/net/netfilter/nft_limit.c
+@@ -213,6 +213,8 @@ static int nft_limit_pkts_clone(struct n
+       struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst);
+       struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src);
++      priv_dst->cost = priv_src->cost;
++
+       return nft_limit_clone(&priv_dst->limit, &priv_src->limit);
+ }
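
A tiny sketch of the failure mode the one-liner fixes (structure and names are assumptions): a clone helper that copies the shared limit parameters but forgets a precomputed per-instance field leaves the copy unable to do its job.

    #include <stdio.h>

    struct limit { unsigned long rate, burst; };
    struct limit_pkts { struct limit limit; unsigned long cost; };

    /* 'cost' is derived at init time; a clone that skips it leaves the new
       instance with cost == 0, so the copied limit stops limiting. */
    static void clone_pkts(struct limit_pkts *dst, const struct limit_pkts *src)
    {
            dst->limit = src->limit;
            dst->cost  = src->cost;     /* the analogue of the added line */
    }

    int main(void)
    {
            struct limit_pkts src = { .limit = { 10, 5 }, .cost = 128 };
            struct limit_pkts dst = { 0 };

            clone_pkts(&dst, &src);
            printf("cloned cost = %lu\n", dst.cost);
            return 0;
    }
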
index 1a7523db53f3f889f78b27057871b92299f04718..098415fa2e8c35e7d62f96e9686a6485f916ccc3 100644 (file)
@@ -3,3 +3,26 @@ i2c-ismt-prevent-memory-corruption-in-ismt_access.patch
 assoc_array-fix-bug_on-during-garbage-collect.patch
 pipe-make-poll_usage-boolean-and-annotate-its-access.patch
 pipe-fix-missing-lock-in-pipe_resize_ring.patch
+net-ipa-compute-proper-aggregation-limit.patch
+drm-i915-fix-wstringop-overflow-warning-in-call-to-intel_read_wm_latency.patch
+exfat-check-if-cluster-num-is-valid.patch
+exfat-fix-referencing-wrong-parent-directory-information-after-renaming.patch
+netfilter-nft_limit-clone-packet-limits-cost-value.patch
+netfilter-nf_tables-sanitize-nft_set_desc_concat_parse.patch
+netfilter-nf_tables-hold-mutex-on-netns-pre_exit-path.patch
+netfilter-nf_tables-double-hook-unregistration-in-netns-path.patch
+netfilter-conntrack-re-fetch-conntrack-after-insertion.patch
+kvm-ppc-book3s-hv-fix-incorrect-null-check-on-list-iterator.patch
+x86-fpu-kvm-set-the-base-guest-fpu-uabi-size-to-sizeof-struct-kvm_xsave.patch
+x86-kvm-alloc-dummy-async-pf-token-outside-of-raw-spinlock.patch
+x86-kvm-use-correct-gfp-flags-for-preemption-disabled.patch
+x86-uaccess-implement-macros-for-cmpxchg-on-user-addresses.patch
+kvm-x86-use-__try_cmpxchg_user-to-update-guest-pte-a-d-bits.patch
+kvm-x86-use-__try_cmpxchg_user-to-emulate-atomic-accesses.patch
+kvm-x86-fix-typo-in-__try_cmpxchg_user-causing-non-atomicness.patch
+kvm-x86-avoid-calling-x86-emulator-without-a-decoded-instruction.patch
+kvm-x86-avoid-loading-a-vcpu-after-.vm_destroy-was-called.patch
+kvm-x86-fix-the-intel_pt-pmi-handling-wrongly-considered-from-guest.patch
+kvm-x86-drop-warns-that-assert-a-triple-fault-never-escapes-from-l2.patch
+kvm-x86-mmu-don-t-rebuild-page-when-the-page-is-synced-and-no-tlb-flushing-is-required.patch
+kvm-svm-use-kzalloc-for-sev-ioctl-interfaces-to-prevent-kernel-data-leak.patch
diff --git a/queue-5.18/x86-fpu-kvm-set-the-base-guest-fpu-uabi-size-to-sizeof-struct-kvm_xsave.patch b/queue-5.18/x86-fpu-kvm-set-the-base-guest-fpu-uabi-size-to-sizeof-struct-kvm_xsave.patch
new file mode 100644 (file)
index 0000000..f71d464
--- /dev/null
@@ -0,0 +1,113 @@
+From d187ba5312307d51818beafaad87d28a7d939adf Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 4 May 2022 00:12:19 +0000
+Subject: x86/fpu: KVM: Set the base guest FPU uABI size to sizeof(struct kvm_xsave)
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit d187ba5312307d51818beafaad87d28a7d939adf upstream.
+
+Set the starting uABI size of KVM's guest FPU to 'struct kvm_xsave',
+i.e. to KVM's historical uABI size.  When saving FPU state for userspace,
+KVM (well, now the FPU) sets the FP+SSE bits in the XSAVE header even if
+the host doesn't support XSAVE.  Setting the XSAVE header allows the VM
+to be migrated to a host that does support XSAVE without the new host
+having to handle FPU state that may or may not be compatible with XSAVE.
+
+Setting the uABI size to the host's default size results in out-of-bounds
+writes (setting the FP+SSE bits) and data corruption (that is thankfully
+caught by KASAN) when running on hosts without XSAVE, e.g. on Core2 CPUs.
+
+WARN if the default size is larger than KVM's historical uABI size; all
+features that can push the FPU size beyond the historical size must be
+opt-in.
+
+  ==================================================================
+  BUG: KASAN: slab-out-of-bounds in fpu_copy_uabi_to_guest_fpstate+0x86/0x130
+  Read of size 8 at addr ffff888011e33a00 by task qemu-build/681
+  CPU: 1 PID: 681 Comm: qemu-build Not tainted 5.18.0-rc5-KASAN-amd64 #1
+  Hardware name:  /DG35EC, BIOS ECG3510M.86A.0118.2010.0113.1426 01/13/2010
+  Call Trace:
+   <TASK>
+   dump_stack_lvl+0x34/0x45
+   print_report.cold+0x45/0x575
+   kasan_report+0x9b/0xd0
+   fpu_copy_uabi_to_guest_fpstate+0x86/0x130
+   kvm_arch_vcpu_ioctl+0x72a/0x1c50 [kvm]
+   kvm_vcpu_ioctl+0x47f/0x7b0 [kvm]
+   __x64_sys_ioctl+0x5de/0xc90
+   do_syscall_64+0x31/0x50
+   entry_SYSCALL_64_after_hwframe+0x44/0xae
+   </TASK>
+  Allocated by task 0:
+  (stack is not available)
+  The buggy address belongs to the object at ffff888011e33800
+   which belongs to the cache kmalloc-512 of size 512
+  The buggy address is located 0 bytes to the right of
+   512-byte region [ffff888011e33800, ffff888011e33a00)
+  The buggy address belongs to the physical page:
+  page:0000000089cd4adb refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11e30
+  head:0000000089cd4adb order:2 compound_mapcount:0 compound_pincount:0
+  flags: 0x4000000000010200(slab|head|zone=1)
+  raw: 4000000000010200 dead000000000100 dead000000000122 ffff888001041c80
+  raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000
+  page dumped because: kasan: bad access detected
+  Memory state around the buggy address:
+   ffff888011e33900: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+   ffff888011e33980: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+  >ffff888011e33a00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+                     ^
+   ffff888011e33a80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+   ffff888011e33b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+  ==================================================================
+  Disabling lock debugging due to kernel taint
+
+Fixes: be50b2065dfa ("kvm: x86: Add support for getting/setting expanded xstate buffer")
+Fixes: c60427dd50ba ("x86/fpu: Add uabi_size to guest_fpu")
+Reported-by: Zdenek Kaspar <zkaspar82@gmail.com>
+Cc: Maciej S. Szmigiero <mail@maciej.szmigiero.name>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Cc: kvm@vger.kernel.org
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Tested-by: Zdenek Kaspar <zkaspar82@gmail.com>
+Message-Id: <20220504001219.983513-1-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/fpu/core.c |   17 ++++++++++++++++-
+ 1 file changed, 16 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/fpu/core.c
++++ b/arch/x86/kernel/fpu/core.c
+@@ -14,6 +14,8 @@
+ #include <asm/traps.h>
+ #include <asm/irq_regs.h>
++#include <uapi/asm/kvm.h>
++
+ #include <linux/hardirq.h>
+ #include <linux/pkeys.h>
+ #include <linux/vmalloc.h>
+@@ -232,7 +234,20 @@ bool fpu_alloc_guest_fpstate(struct fpu_
+       gfpu->fpstate           = fpstate;
+       gfpu->xfeatures         = fpu_user_cfg.default_features;
+       gfpu->perm              = fpu_user_cfg.default_features;
+-      gfpu->uabi_size         = fpu_user_cfg.default_size;
++
++      /*
++       * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
++       * to userspace, even when XSAVE is unsupported, so that restoring FPU
++       * state on a different CPU that does support XSAVE can cleanly load
++       * the incoming state using its natural XSAVE.  In other words, KVM's
++       * uABI size may be larger than this host's default size.  Conversely,
++       * the default size should never be larger than KVM's base uABI size;
++       * all features that can expand the uABI size must be opt-in.
++       */
++      gfpu->uabi_size         = sizeof(struct kvm_xsave);
++      if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size))
++              gfpu->uabi_size = fpu_user_cfg.default_size;
++
+       fpu_init_guest_permissions(gfpu);
+       return true;
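
A standalone sketch of the sizing rule the hunk above encodes (the struct layout and numbers are illustrative assumptions): start from the fixed uABI size, and only warn and fall back if the host default ever grows past it.

    #include <stdio.h>

    /* stand-in for the fixed-size uAPI struct kvm_xsave (4 KiB region) */
    struct xsave_uabi { unsigned char region[4096]; };

    static unsigned int guest_uabi_size(unsigned int host_default_size)
    {
            unsigned int size = sizeof(struct xsave_uabi);

            /* Features that grow the buffer must be opt-in, so the default
               should never exceed the base uABI size; warn if it does. */
            if (host_default_size > size) {
                    fprintf(stderr, "WARN: default %u > uABI %u\n",
                            host_default_size, size);
                    size = host_default_size;
            }
            return size;
    }

    int main(void)
    {
            printf("no-XSAVE host: %u\n", guest_uabi_size(576));
            printf("misconfigured: %u\n", guest_uabi_size(8192));
            return 0;
    }
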
diff --git a/queue-5.18/x86-kvm-alloc-dummy-async-pf-token-outside-of-raw-spinlock.patch b/queue-5.18/x86-kvm-alloc-dummy-async-pf-token-outside-of-raw-spinlock.patch
new file mode 100644 (file)
index 0000000..44d39c4
--- /dev/null
@@ -0,0 +1,91 @@
+From 0547758a6de3cc71a0cfdd031a3621a30db6a68b Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Thu, 19 May 2022 07:57:11 -0700
+Subject: x86/kvm: Alloc dummy async #PF token outside of raw spinlock
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 0547758a6de3cc71a0cfdd031a3621a30db6a68b upstream.
+
+Drop the raw spinlock in kvm_async_pf_task_wake() before allocating the
+dummy async #PF token, as the allocator is preemptible on PREEMPT_RT
+kernels and must not be called from truly atomic contexts.
+
+Opportunistically document why it's ok to loop on allocation failure,
+i.e. why the function won't get stuck in an infinite loop.
+
+Reported-by: Yajun Deng <yajun.deng@linux.dev>
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/kvm.c |   41 +++++++++++++++++++++++++++--------------
+ 1 file changed, 27 insertions(+), 14 deletions(-)
+
+--- a/arch/x86/kernel/kvm.c
++++ b/arch/x86/kernel/kvm.c
+@@ -191,7 +191,7 @@ void kvm_async_pf_task_wake(u32 token)
+ {
+       u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+       struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+-      struct kvm_task_sleep_node *n;
++      struct kvm_task_sleep_node *n, *dummy = NULL;
+       if (token == ~0) {
+               apf_task_wake_all();
+@@ -203,28 +203,41 @@ again:
+       n = _find_apf_task(b, token);
+       if (!n) {
+               /*
+-               * async PF was not yet handled.
+-               * Add dummy entry for the token.
++               * Async #PF not yet handled, add a dummy entry for the token.
++               * Allocating the token must be down outside of the raw lock
++               * as the allocator is preemptible on PREEMPT_RT kernels.
+                */
+-              n = kzalloc(sizeof(*n), GFP_ATOMIC);
+-              if (!n) {
++              if (!dummy) {
++                      raw_spin_unlock(&b->lock);
++                      dummy = kzalloc(sizeof(*dummy), GFP_KERNEL);
++
+                       /*
+-                       * Allocation failed! Busy wait while other cpu
+-                       * handles async PF.
++                       * Continue looping on allocation failure, eventually
++                       * the async #PF will be handled and allocating a new
++                       * node will be unnecessary.
++                       */
++                      if (!dummy)
++                              cpu_relax();
++
++                      /*
++                       * Recheck for async #PF completion before enqueueing
++                       * the dummy token to avoid duplicate list entries.
+                        */
+-                      raw_spin_unlock(&b->lock);
+-                      cpu_relax();
+                       goto again;
+               }
+-              n->token = token;
+-              n->cpu = smp_processor_id();
+-              init_swait_queue_head(&n->wq);
+-              hlist_add_head(&n->link, &b->list);
++              dummy->token = token;
++              dummy->cpu = smp_processor_id();
++              init_swait_queue_head(&dummy->wq);
++              hlist_add_head(&dummy->link, &b->list);
++              dummy = NULL;
+       } else {
+               apf_task_wake_one(n);
+       }
+       raw_spin_unlock(&b->lock);
+-      return;
++
++      /* A dummy token might be allocated and ultimately not used.  */
++      if (dummy)
++              kfree(dummy);
+ }
+ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
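
The control flow above is the classic drop-the-lock, allocate, retake-and-recheck pattern. A userspace analogue with a pthread spinlock (illustrative only; the kernel version additionally deals with raw locks and PREEMPT_RT):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_spinlock_t lock;
    static void *entry;     /* appears once the other side finishes the work */

    /* Never allocate while holding the spinlock: drop it, allocate, retake
       it and re-check, looping until the entry shows up or the allocation
       can be enqueued. */
    static void wake(void)
    {
            void *dummy = NULL;

    again:
            pthread_spin_lock(&lock);
            if (!entry) {
                    if (!dummy) {
                            pthread_spin_unlock(&lock);
                            dummy = malloc(64);
                            /* on failure just retry; eventually the other
                               side completes and 'entry' appears */
                            goto again;
                    }
                    entry = dummy;      /* enqueue the dummy entry */
                    dummy = NULL;
            }
            pthread_spin_unlock(&lock);

            free(dummy);                /* may have gone unused */
    }

    int main(void)
    {
            pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
            wake();
            printf("entry %s\n", entry ? "enqueued" : "missing");
            free(entry);
            pthread_spin_destroy(&lock);
            return 0;
    }
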
diff --git a/queue-5.18/x86-kvm-use-correct-gfp-flags-for-preemption-disabled.patch b/queue-5.18/x86-kvm-use-correct-gfp-flags-for-preemption-disabled.patch
new file mode 100644 (file)
index 0000000..8ace75d
--- /dev/null
@@ -0,0 +1,81 @@
+From baec4f5a018fe2d708fc1022330dba04b38b5fe3 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Tue, 24 May 2022 09:43:31 -0400
+Subject: x86, kvm: use correct GFP flags for preemption disabled
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit baec4f5a018fe2d708fc1022330dba04b38b5fe3 upstream.
+
+Commit ddd7ed842627 ("x86/kvm: Alloc dummy async #PF token outside of
+raw spinlock") leads to the following Smatch static checker warning:
+
+       arch/x86/kernel/kvm.c:212 kvm_async_pf_task_wake()
+       warn: sleeping in atomic context
+
+arch/x86/kernel/kvm.c
+    202         raw_spin_lock(&b->lock);
+    203         n = _find_apf_task(b, token);
+    204         if (!n) {
+    205                 /*
+    206                  * Async #PF not yet handled, add a dummy entry for the token.
+    207                  * Allocating the token must be down outside of the raw lock
+    208                  * as the allocator is preemptible on PREEMPT_RT kernels.
+    209                  */
+    210                 if (!dummy) {
+    211                         raw_spin_unlock(&b->lock);
+--> 212                         dummy = kzalloc(sizeof(*dummy), GFP_KERNEL);
+                                                                ^^^^^^^^^^
+Smatch thinks the caller has preempt disabled.  The `smdb.py preempt
+kvm_async_pf_task_wake` output call tree is:
+
+sysvec_kvm_asyncpf_interrupt() <- disables preempt
+-> __sysvec_kvm_asyncpf_interrupt()
+   -> kvm_async_pf_task_wake()
+
+The caller is this:
+
+arch/x86/kernel/kvm.c
+   290        DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
+   291        {
+   292                struct pt_regs *old_regs = set_irq_regs(regs);
+   293                u32 token;
+   294
+   295                ack_APIC_irq();
+   296
+   297                inc_irq_stat(irq_hv_callback_count);
+   298
+   299                if (__this_cpu_read(apf_reason.enabled)) {
+   300                        token = __this_cpu_read(apf_reason.token);
+   301                        kvm_async_pf_task_wake(token);
+   302                        __this_cpu_write(apf_reason.token, 0);
+   303                        wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
+   304                }
+   305
+   306                set_irq_regs(old_regs);
+   307        }
+
+DEFINE_IDTENTRY_SYSVEC() is a wrapper that calls this function from
+call_on_irqstack_cond(), and it is inside call_on_irqstack_cond() that
+preemption is disabled (unless it is already disabled).  The
+irq_enter/exit_rcu() functions disable/enable preemption.
+
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/kvm.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/kvm.c
++++ b/arch/x86/kernel/kvm.c
+@@ -209,7 +209,7 @@ again:
+                */
+               if (!dummy) {
+                       raw_spin_unlock(&b->lock);
+-                      dummy = kzalloc(sizeof(*dummy), GFP_KERNEL);
++                      dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);
+                       /*
+                        * Continue looping on allocation failure, eventually
diff --git a/queue-5.18/x86-uaccess-implement-macros-for-cmpxchg-on-user-addresses.patch b/queue-5.18/x86-uaccess-implement-macros-for-cmpxchg-on-user-addresses.patch
new file mode 100644 (file)
index 0000000..15394e4
--- /dev/null
@@ -0,0 +1,191 @@
+From 989b5db215a2f22f89d730b607b071d964780f10 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 2 Feb 2022 00:49:42 +0000
+Subject: x86/uaccess: Implement macros for CMPXCHG on user addresses
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 989b5db215a2f22f89d730b607b071d964780f10 upstream.
+
+Add support for CMPXCHG loops on userspace addresses.  Provide both an
+"unsafe" version for tight loops that do their own uaccess begin/end, and
+a "safe" version for use cases where the CMPXCHG is not buried in a loop,
+e.g. KVM will resume the guest instead of looping when emulation of a
+guest atomic access fails the CMPXCHG.
+
+Provide 8-byte versions for 32-bit kernels so that KVM can do CMPXCHG on
+guest PAE PTEs, which are accessed via userspace addresses.
+
+Guard the asm_volatile_goto() variation with CC_HAS_ASM_GOTO_TIED_OUTPUT,
+as the "+m" constraint fails on some compilers that otherwise support
+CC_HAS_ASM_GOTO_OUTPUT.
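+
+(An illustrative aside, not from the original changelog: the semantics
+mirror the kernel's try_cmpxchg(), i.e. on failure the old value is
+refreshed with the current contents so a loop can recompute the new
+value without an extra load.  A userspace sketch using the compiler
+builtin, since the helpers added here only apply to user addresses
+inside the kernel:)
+
+  #include <stdbool.h>
+  #include <stdint.h>
+  #include <stdio.h>
+
+  static void add_to_word(uint64_t *ptr, uint64_t delta)
+  {
+          uint64_t old = *ptr, new;
+
+          do {
+                  new = old + delta;
+                  /* like unsafe_try_cmpxchg_user(): true on success,
+                     false on failure with 'old' updated in place */
+          } while (!__atomic_compare_exchange_n(ptr, &old, new, false,
+                                                __ATOMIC_SEQ_CST,
+                                                __ATOMIC_SEQ_CST));
+  }
+
+  int main(void)
+  {
+          uint64_t word = 40;
+
+          add_to_word(&word, 2);
+          printf("%llu\n", (unsigned long long)word);
+          return 0;
+  }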
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Co-developed-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20220202004945.2540433-3-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/uaccess.h |  142 +++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 142 insertions(+)
+
+--- a/arch/x86/include/asm/uaccess.h
++++ b/arch/x86/include/asm/uaccess.h
+@@ -382,6 +382,103 @@ do {                                                                     \
+ #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
++#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
++#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
++      bool success;                                                   \
++      __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);              \
++      __typeof__(*(_ptr)) __old = *_old;                              \
++      __typeof__(*(_ptr)) __new = (_new);                             \
++      asm_volatile_goto("\n"                                          \
++                   "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
++                   _ASM_EXTABLE_UA(1b, %l[label])                     \
++                   : CC_OUT(z) (success),                             \
++                     [ptr] "+m" (*_ptr),                              \
++                     [old] "+a" (__old)                               \
++                   : [new] ltype (__new)                              \
++                   : "memory"                                         \
++                   : label);                                          \
++      if (unlikely(!success))                                         \
++              *_old = __old;                                          \
++      likely(success);                                        })
++
++#ifdef CONFIG_X86_32
++#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)    ({      \
++      bool success;                                                   \
++      __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);              \
++      __typeof__(*(_ptr)) __old = *_old;                              \
++      __typeof__(*(_ptr)) __new = (_new);                             \
++      asm_volatile_goto("\n"                                          \
++                   "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"             \
++                   _ASM_EXTABLE_UA(1b, %l[label])                     \
++                   : CC_OUT(z) (success),                             \
++                     "+A" (__old),                                    \
++                     [ptr] "+m" (*_ptr)                               \
++                   : "b" ((u32)__new),                                \
++                     "c" ((u32)((u64)__new >> 32))                    \
++                   : "memory"                                         \
++                   : label);                                          \
++      if (unlikely(!success))                                         \
++              *_old = __old;                                          \
++      likely(success);                                        })
++#endif // CONFIG_X86_32
++#else  // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
++#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
++      int __err = 0;                                                  \
++      bool success;                                                   \
++      __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);              \
++      __typeof__(*(_ptr)) __old = *_old;                              \
++      __typeof__(*(_ptr)) __new = (_new);                             \
++      asm volatile("\n"                                               \
++                   "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
++                   CC_SET(z)                                          \
++                   "2:\n"                                             \
++                   _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG,  \
++                                         %[errout])                   \
++                   : CC_OUT(z) (success),                             \
++                     [errout] "+r" (__err),                           \
++                     [ptr] "+m" (*_ptr),                              \
++                     [old] "+a" (__old)                               \
++                   : [new] ltype (__new)                              \
++                   : "memory", "cc");                                 \
++      if (unlikely(__err))                                            \
++              goto label;                                             \
++      if (unlikely(!success))                                         \
++              *_old = __old;                                          \
++      likely(success);                                        })
++
++#ifdef CONFIG_X86_32
++/*
++ * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error.
++ * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
++ * hardcoded by CMPXCHG8B, leaving only ESI and EDI.  If the compiler uses
++ * both ESI and EDI for the memory operand, compilation will fail if the error
++ * is an input+output as there will be no register available for input.
++ */
++#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)    ({      \
++      int __result;                                                   \
++      __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);              \
++      __typeof__(*(_ptr)) __old = *_old;                              \
++      __typeof__(*(_ptr)) __new = (_new);                             \
++      asm volatile("\n"                                               \
++                   "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"             \
++                   "mov $0, %%ecx\n\t"                                \
++                   "setz %%cl\n"                                      \
++                   "2:\n"                                             \
++                   _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \
++                   : [result]"=c" (__result),                         \
++                     "+A" (__old),                                    \
++                     [ptr] "+m" (*_ptr)                               \
++                   : "b" ((u32)__new),                                \
++                     "c" ((u32)((u64)__new >> 32))                    \
++                   : "memory", "cc");                                 \
++      if (unlikely(__result < 0))                                     \
++              goto label;                                             \
++      if (unlikely(!__result))                                        \
++              *_old = __old;                                          \
++      likely(__result);                                       })
++#endif // CONFIG_X86_32
++#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
++
+ /* FIXME: this hack is definitely wrong -AK */
+ struct __large_struct { unsigned long buf[100]; };
+ #define __m(x) (*(struct __large_struct __user *)(x))
+@@ -474,6 +571,51 @@ do {                                                                              \
+ } while (0)
+ #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
++extern void __try_cmpxchg_user_wrong_size(void);
++
++#ifndef CONFIG_X86_32
++#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label)          \
++      __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
++#endif
++
++/*
++ * Force the pointer to u<size> to match the size expected by the asm helper.
++ * clang/LLVM compiles all cases and only discards the unused paths after
++ * processing errors, which breaks i386 if the pointer is an 8-byte value.
++ */
++#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({                        \
++      bool __ret;                                                             \
++      __chk_user_ptr(_ptr);                                                   \
++      switch (sizeof(*(_ptr))) {                                              \
++      case 1: __ret = __try_cmpxchg_user_asm("b", "q",                        \
++                                             (__force u8 *)(_ptr), (_oldp),   \
++                                             (_nval), _label);                \
++              break;                                                          \
++      case 2: __ret = __try_cmpxchg_user_asm("w", "r",                        \
++                                             (__force u16 *)(_ptr), (_oldp),  \
++                                             (_nval), _label);                \
++              break;                                                          \
++      case 4: __ret = __try_cmpxchg_user_asm("l", "r",                        \
++                                             (__force u32 *)(_ptr), (_oldp),  \
++                                             (_nval), _label);                \
++              break;                                                          \
++      case 8: __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
++                                               (_nval), _label);              \
++              break;                                                          \
++      default: __try_cmpxchg_user_wrong_size();                               \
++      }                                                                       \
++      __ret;                                          })
++
++/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */
++#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label)        ({              \
++      int __ret = -EFAULT;                                            \
++      __uaccess_begin_nospec();                                       \
++      __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label);   \
++_label:                                                                       \
++      __uaccess_end();                                                \
++      __ret;                                                          \
++                                                      })
++
+ /*
+  * We want the unsafe accessors to always be inlined and use
+  * the error labels - thus the macro games.