]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.6-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 15 May 2024 07:41:13 +0000 (09:41 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 15 May 2024 07:41:13 +0000 (09:41 +0200)
added patches:
btrfs-do-not-wait-for-short-bulk-allocation.patch
dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch
dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch
keys-fix-overwrite-of-key-expiration-on-instantiation.patch
md-fix-kmemleak-of-rdev-serial.patch
mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch
revert-selftests-bpf-add-netkit-to-tc_redirect-selftest.patch
vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch

queue-6.6/btrfs-do-not-wait-for-short-bulk-allocation.patch [new file with mode: 0644]
queue-6.6/dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch [new file with mode: 0644]
queue-6.6/dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch [new file with mode: 0644]
queue-6.6/keys-fix-overwrite-of-key-expiration-on-instantiation.patch [new file with mode: 0644]
queue-6.6/md-fix-kmemleak-of-rdev-serial.patch [new file with mode: 0644]
queue-6.6/mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch [new file with mode: 0644]
queue-6.6/revert-selftests-bpf-add-netkit-to-tc_redirect-selftest.patch [new file with mode: 0644]
queue-6.6/series
queue-6.6/vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch [new file with mode: 0644]

diff --git a/queue-6.6/btrfs-do-not-wait-for-short-bulk-allocation.patch b/queue-6.6/btrfs-do-not-wait-for-short-bulk-allocation.patch
new file mode 100644 (file)
index 0000000..ecb6b20
--- /dev/null
@@ -0,0 +1,89 @@
+From 1db7959aacd905e6487d0478ac01d89f86eb1e51 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 26 Mar 2024 09:16:46 +1030
+Subject: btrfs: do not wait for short bulk allocation
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 1db7959aacd905e6487d0478ac01d89f86eb1e51 upstream.
+
+[BUG]
+There is a recent report that when memory pressure is high (including
+cached pages), btrfs can spend most of its time on memory allocation in
+btrfs_alloc_page_array() for compressed read/write.
+
+[CAUSE]
+For btrfs_alloc_page_array() we always go alloc_pages_bulk_array(), and
+even if the bulk allocation failed (fell back to single page
+allocation) we still retry but with extra memalloc_retry_wait().
+
+If the bulk alloc only returned one page at a time, we would spend a lot of
+time on the retry wait.
+
+The behavior was introduced in commit 395cb57e8560 ("btrfs: wait between
+incomplete batch memory allocations").
+
+[FIX]
+Although the commit mentioned that other filesystems do the wait, it's
+not the case at least nowadays.
+
+All the mainlined filesystems only call memalloc_retry_wait() if they
+failed to allocate any page (not only for bulk allocation).
+If there is any progress, they won't call memalloc_retry_wait() at all.
+
+For example, xfs_buf_alloc_pages() would only call memalloc_retry_wait()
+if there is no allocation progress at all, and the call is not for
+metadata readahead.
+
+So I don't believe we should call memalloc_retry_wait() unconditionally
+for short allocation.
+
+Call memalloc_retry_wait() if it fails to allocate any page for tree
+block allocation (which goes with __GFP_NOFAIL and may not need the
+special handling anyway), and reduce the latency for
+btrfs_alloc_page_array().
+
+Reported-by: Julian Taylor <julian.taylor@1und1.de>
+Tested-by: Julian Taylor <julian.taylor@1und1.de>
+Link: https://lore.kernel.org/all/8966c095-cbe7-4d22-9784-a647d1bf27c3@1und1.de/
+Fixes: 395cb57e8560 ("btrfs: wait between incomplete batch memory allocations")
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   14 ++------------
+ 1 file changed, 2 insertions(+), 12 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -686,24 +686,14 @@ int btrfs_alloc_page_array(unsigned int
+               unsigned int last = allocated;
+               allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+-
+-              if (allocated == nr_pages)
+-                      return 0;
+-
+-              /*
+-               * During this iteration, no page could be allocated, even
+-               * though alloc_pages_bulk_array() falls back to alloc_page()
+-               * if  it could not bulk-allocate. So we must be out of memory.
+-               */
+-              if (allocated == last) {
++              if (unlikely(allocated == last)) {
++                      /* No progress, fail and do cleanup. */
+                       for (int i = 0; i < allocated; i++) {
+                               __free_page(page_array[i]);
+                               page_array[i] = NULL;
+                       }
+                       return -ENOMEM;
+               }
+-
+-              memalloc_retry_wait(GFP_NOFS);
+       }
+       return 0;
+ }
diff --git a/queue-6.6/dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch b/queue-6.6/dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch
new file mode 100644 (file)
index 0000000..2277b4f
--- /dev/null
@@ -0,0 +1,97 @@
+From e11452eb071b2a8e6ba52892b2e270bbdaa6640d Mon Sep 17 00:00:00 2001
+From: Arjan van de Ven <arjan@linux.intel.com>
+Date: Wed, 24 Apr 2024 14:43:22 +0000
+Subject: dmaengine: idxd: add a new security check to deal with a hardware erratum
+
+From: Arjan van de Ven <arjan@linux.intel.com>
+
+commit e11452eb071b2a8e6ba52892b2e270bbdaa6640d upstream.
+
+On Sapphire Rapids and related platforms, the DSA and IAA devices have an
+erratum that causes direct access (for example, by using the ENQCMD or
+MOVDIR64 instructions) from untrusted applications to be a security problem.
+
+To solve this, add a flag to the PCI device enumeration and device structures
+to indicate the presence/absence of this security exposure. In the mmap()
+method of the device, this flag is then used to enforce that the user
+has the CAP_SYS_RAWIO capability.
+
+In a future patch, a write() based method will be added that allows untrusted
+applications to submit work to the accelerator, where the kernel can do
+sanity checking on the user input to ensure secure operation of the accelerator.
+
+Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/idxd/cdev.c |   12 ++++++++++++
+ drivers/dma/idxd/idxd.h |    3 +++
+ drivers/dma/idxd/init.c |    4 ++++
+ 3 files changed, 19 insertions(+)
+
+--- a/drivers/dma/idxd/cdev.c
++++ b/drivers/dma/idxd/cdev.c
+@@ -400,6 +400,18 @@ static int idxd_cdev_mmap(struct file *f
+       int rc;
+       dev_dbg(&pdev->dev, "%s called\n", __func__);
++
++      /*
++       * Due to an erratum in some of the devices supported by the driver,
++       * direct user submission to the device can be unsafe.
++       * (See the INTEL-SA-01084 security advisory)
++       *
++       * For the devices that exhibit this behavior, require that the user
++       * has CAP_SYS_RAWIO capabilities.
++       */
++      if (!idxd->user_submission_safe && !capable(CAP_SYS_RAWIO))
++              return -EPERM;
++
+       rc = check_vma(wq, vma, __func__);
+       if (rc < 0)
+               return rc;
+--- a/drivers/dma/idxd/idxd.h
++++ b/drivers/dma/idxd/idxd.h
+@@ -275,6 +275,7 @@ struct idxd_driver_data {
+       int evl_cr_off;
+       int cr_status_off;
+       int cr_result_off;
++      bool user_submission_safe;
+ };
+ struct idxd_evl {
+@@ -360,6 +361,8 @@ struct idxd_device {
+       struct dentry *dbgfs_dir;
+       struct dentry *dbgfs_evl_file;
++
++      bool user_submission_safe;
+ };
+ static inline unsigned int evl_ent_size(struct idxd_device *idxd)
+--- a/drivers/dma/idxd/init.c
++++ b/drivers/dma/idxd/init.c
+@@ -47,6 +47,7 @@ static struct idxd_driver_data idxd_driv
+               .align = 32,
+               .dev_type = &dsa_device_type,
+               .evl_cr_off = offsetof(struct dsa_evl_entry, cr),
++              .user_submission_safe = false, /* See INTEL-SA-01084 security advisory */
+               .cr_status_off = offsetof(struct dsa_completion_record, status),
+               .cr_result_off = offsetof(struct dsa_completion_record, result),
+       },
+@@ -57,6 +58,7 @@ static struct idxd_driver_data idxd_driv
+               .align = 64,
+               .dev_type = &iax_device_type,
+               .evl_cr_off = offsetof(struct iax_evl_entry, cr),
++              .user_submission_safe = false, /* See INTEL-SA-01084 security advisory */
+               .cr_status_off = offsetof(struct iax_completion_record, status),
+               .cr_result_off = offsetof(struct iax_completion_record, error_code),
+       },
+@@ -767,6 +769,8 @@ static int idxd_pci_probe(struct pci_dev
+       dev_info(&pdev->dev, "Intel(R) Accelerator Device (v%x)\n",
+                idxd->hw.version);
++      idxd->user_submission_safe = data->user_submission_safe;
++
+       return 0;
+  err_dev_register:
diff --git a/queue-6.6/dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch b/queue-6.6/dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch
new file mode 100644 (file)
index 0000000..fdcc3f9
--- /dev/null
@@ -0,0 +1,164 @@
+From 6827738dc684a87ad54ebba3ae7f3d7c977698eb Mon Sep 17 00:00:00 2001
+From: Nikhil Rao <nikhil.rao@intel.com>
+Date: Wed, 24 Apr 2024 15:16:12 +0000
+Subject: dmaengine: idxd: add a write() method for applications to submit work
+
+From: Nikhil Rao <nikhil.rao@intel.com>
+
+commit 6827738dc684a87ad54ebba3ae7f3d7c977698eb upstream.
+
+After the patch to restrict the use of mmap() to CAP_SYS_RAWIO for
+the currently existing devices, most applications can no longer make
+use of the accelerators as in production "you don't run things as root".
+
+To keep the DSA and IAA accelerators usable, hook up a write() method
+so that applications can still submit work. In the write method,
+sufficient input validation is performed to avoid the security issue
+that required the mmap CAP_SYS_RAWIO check.
+
+One complication is that the DSA device allows for indirect ("batched")
+descriptors. There is no reasonable way to do the input validation
+on these indirect descriptors so the write() method will not allow these
+to be submitted to the hardware on affected hardware, and the sysfs
+enumeration of support for the opcode is also removed.
+
+Early performance data shows that the performance delta for most common
+cases is within the noise.
+
+Signed-off-by: Nikhil Rao <nikhil.rao@intel.com>
+Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/idxd/cdev.c  |   65 +++++++++++++++++++++++++++++++++++++++++++++++
+ drivers/dma/idxd/sysfs.c |   27 ++++++++++++++++++-
+ 2 files changed, 90 insertions(+), 2 deletions(-)
+
+--- a/drivers/dma/idxd/cdev.c
++++ b/drivers/dma/idxd/cdev.c
+@@ -426,6 +426,70 @@ static int idxd_cdev_mmap(struct file *f
+                       vma->vm_page_prot);
+ }
++static int idxd_submit_user_descriptor(struct idxd_user_context *ctx,
++                                     struct dsa_hw_desc __user *udesc)
++{
++      struct idxd_wq *wq = ctx->wq;
++      struct idxd_dev *idxd_dev = &wq->idxd->idxd_dev;
++      const uint64_t comp_addr_align = is_dsa_dev(idxd_dev) ? 0x20 : 0x40;
++      void __iomem *portal = idxd_wq_portal_addr(wq);
++      struct dsa_hw_desc descriptor __aligned(64);
++      int rc;
++
++      rc = copy_from_user(&descriptor, udesc, sizeof(descriptor));
++      if (rc)
++              return -EFAULT;
++
++      /*
++       * DSA devices are capable of indirect ("batch") command submission.
++       * On devices where direct user submissions are not safe, we cannot
++       * allow this since there is no good way for us to verify these
++       * indirect commands.
++       */
++      if (is_dsa_dev(idxd_dev) && descriptor.opcode == DSA_OPCODE_BATCH &&
++              !wq->idxd->user_submission_safe)
++              return -EINVAL;
++      /*
++       * As per the programming specification, the completion address must be
++       * aligned to 32 or 64 bytes. If this is violated the hardware
++       * engine can get very confused (security issue).
++       */
++      if (!IS_ALIGNED(descriptor.completion_addr, comp_addr_align))
++              return -EINVAL;
++
++      if (wq_dedicated(wq))
++              iosubmit_cmds512(portal, &descriptor, 1);
++      else {
++              descriptor.priv = 0;
++              descriptor.pasid = ctx->pasid;
++              rc = idxd_enqcmds(wq, portal, &descriptor);
++              if (rc < 0)
++                      return rc;
++      }
++
++      return 0;
++}
++
++static ssize_t idxd_cdev_write(struct file *filp, const char __user *buf, size_t len,
++                             loff_t *unused)
++{
++      struct dsa_hw_desc __user *udesc = (struct dsa_hw_desc __user *)buf;
++      struct idxd_user_context *ctx = filp->private_data;
++      ssize_t written = 0;
++      int i;
++
++      for (i = 0; i < len/sizeof(struct dsa_hw_desc); i++) {
++              int rc = idxd_submit_user_descriptor(ctx, udesc + i);
++
++              if (rc)
++                      return written ? written : rc;
++
++              written += sizeof(struct dsa_hw_desc);
++      }
++
++      return written;
++}
++
+ static __poll_t idxd_cdev_poll(struct file *filp,
+                              struct poll_table_struct *wait)
+ {
+@@ -448,6 +512,7 @@ static const struct file_operations idxd
+       .open = idxd_cdev_open,
+       .release = idxd_cdev_release,
+       .mmap = idxd_cdev_mmap,
++      .write = idxd_cdev_write,
+       .poll = idxd_cdev_poll,
+ };
+--- a/drivers/dma/idxd/sysfs.c
++++ b/drivers/dma/idxd/sysfs.c
+@@ -1197,12 +1197,35 @@ static ssize_t wq_enqcmds_retries_store(
+ static struct device_attribute dev_attr_wq_enqcmds_retries =
+               __ATTR(enqcmds_retries, 0644, wq_enqcmds_retries_show, wq_enqcmds_retries_store);
++static ssize_t op_cap_show_common(struct device *dev, char *buf, unsigned long *opcap_bmap)
++{
++      ssize_t pos;
++      int i;
++
++      pos = 0;
++      for (i = IDXD_MAX_OPCAP_BITS/64 - 1; i >= 0; i--) {
++              unsigned long val = opcap_bmap[i];
++
++              /* On systems where direct user submissions are not safe, we need to clear out
++               * the BATCH capability from the capability mask in sysfs since we cannot support
++               * that command on such systems.
++               */
++              if (i == DSA_OPCODE_BATCH/64 && !confdev_to_idxd(dev)->user_submission_safe)
++                      clear_bit(DSA_OPCODE_BATCH % 64, &val);
++
++              pos += sysfs_emit_at(buf, pos, "%*pb", 64, &val);
++              pos += sysfs_emit_at(buf, pos, "%c", i == 0 ? '\n' : ',');
++      }
++
++      return pos;
++}
++
+ static ssize_t wq_op_config_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+ {
+       struct idxd_wq *wq = confdev_to_wq(dev);
+-      return sysfs_emit(buf, "%*pb\n", IDXD_MAX_OPCAP_BITS, wq->opcap_bmap);
++      return op_cap_show_common(dev, buf, wq->opcap_bmap);
+ }
+ static int idxd_verify_supported_opcap(struct idxd_device *idxd, unsigned long *opmask)
+@@ -1421,7 +1444,7 @@ static ssize_t op_cap_show(struct device
+ {
+       struct idxd_device *idxd = confdev_to_idxd(dev);
+-      return sysfs_emit(buf, "%*pb\n", IDXD_MAX_OPCAP_BITS, idxd->opcap_bmap);
++      return op_cap_show_common(dev, buf, idxd->opcap_bmap);
+ }
+ static DEVICE_ATTR_RO(op_cap);
diff --git a/queue-6.6/keys-fix-overwrite-of-key-expiration-on-instantiation.patch b/queue-6.6/keys-fix-overwrite-of-key-expiration-on-instantiation.patch
new file mode 100644 (file)
index 0000000..35b558d
--- /dev/null
@@ -0,0 +1,44 @@
+From 9da27fb65a14c18efd4473e2e82b76b53ba60252 Mon Sep 17 00:00:00 2001
+From: Silvio Gissi <sifonsec@amazon.com>
+Date: Fri, 15 Mar 2024 15:05:39 -0400
+Subject: keys: Fix overwrite of key expiration on instantiation
+
+From: Silvio Gissi <sifonsec@amazon.com>
+
+commit 9da27fb65a14c18efd4473e2e82b76b53ba60252 upstream.
+
+The expiry time of a key is unconditionally overwritten during
+instantiation, which by default makes the key permanent. This causes a problem
+for DNS resolution as the expiration set by user-space is overwritten to
+TIME64_MAX, disabling further DNS updates. Fix this by restoring the
+condition that key_set_expiry is only called when the pre-parser sets a
+specific expiry.
+
+Fixes: 39299bdd2546 ("keys, dns: Allow key types (eg. DNS) to be reclaimed immediately on expiry")
+Signed-off-by: Silvio Gissi <sifonsec@amazon.com>
+cc: David Howells <dhowells@redhat.com>
+cc: Hazem Mohamed Abuelfotoh <abuehaze@amazon.com>
+cc: linux-afs@lists.infradead.org
+cc: linux-cifs@vger.kernel.org
+cc: keyrings@vger.kernel.org
+cc: netdev@vger.kernel.org
+cc: stable@vger.kernel.org
+Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ security/keys/key.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/security/keys/key.c
++++ b/security/keys/key.c
+@@ -464,7 +464,8 @@ static int __key_instantiate_and_link(st
+                       if (authkey)
+                               key_invalidate(authkey);
+-                      key_set_expiry(key, prep->expiry);
++                      if (prep->expiry != TIME64_MAX)
++                              key_set_expiry(key, prep->expiry);
+               }
+       }
diff --git a/queue-6.6/md-fix-kmemleak-of-rdev-serial.patch b/queue-6.6/md-fix-kmemleak-of-rdev-serial.patch
new file mode 100644 (file)
index 0000000..e37ed8c
--- /dev/null
@@ -0,0 +1,55 @@
+From 6cf350658736681b9d6b0b6e58c5c76b235bb4c4 Mon Sep 17 00:00:00 2001
+From: Li Nan <linan122@huawei.com>
+Date: Thu, 8 Feb 2024 16:55:56 +0800
+Subject: md: fix kmemleak of rdev->serial
+
+From: Li Nan <linan122@huawei.com>
+
+commit 6cf350658736681b9d6b0b6e58c5c76b235bb4c4 upstream.
+
+If kobject_add() fails in bind_rdev_to_array(), the already allocated
+'rdev->serial' will not be freed, and a kmemleak occurs.
+
+unreferenced object 0xffff88815a350000 (size 49152):
+  comm "mdadm", pid 789, jiffies 4294716910
+  hex dump (first 32 bytes):
+    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+  backtrace (crc f773277a):
+    [<0000000058b0a453>] kmemleak_alloc+0x61/0xe0
+    [<00000000366adf14>] __kmalloc_large_node+0x15e/0x270
+    [<000000002e82961b>] __kmalloc_node.cold+0x11/0x7f
+    [<00000000f206d60a>] kvmalloc_node+0x74/0x150
+    [<0000000034bf3363>] rdev_init_serial+0x67/0x170
+    [<0000000010e08fe9>] mddev_create_serial_pool+0x62/0x220
+    [<00000000c3837bf0>] bind_rdev_to_array+0x2af/0x630
+    [<0000000073c28560>] md_add_new_disk+0x400/0x9f0
+    [<00000000770e30ff>] md_ioctl+0x15bf/0x1c10
+    [<000000006cfab718>] blkdev_ioctl+0x191/0x3f0
+    [<0000000085086a11>] vfs_ioctl+0x22/0x60
+    [<0000000018b656fe>] __x64_sys_ioctl+0xba/0xe0
+    [<00000000e54e675e>] do_syscall_64+0x71/0x150
+    [<000000008b0ad622>] entry_SYSCALL_64_after_hwframe+0x6c/0x74
+
+Fixes: 963c555e75b0 ("md: introduce mddev_create/destroy_wb_pool for the change of member device")
+Signed-off-by: Li Nan <linan122@huawei.com>
+Signed-off-by: Song Liu <song@kernel.org>
+Link: https://lore.kernel.org/r/20240208085556.2412922-1-linan666@huaweicloud.com
+[ mddev_destroy_serial_pool third parameter was removed in mainline,
+  where there is no need to suspend within this function anymore. ]
+Signed-off-by: Jeremy Bongio <jbongio@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/md.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/md/md.c
++++ b/drivers/md/md.c
+@@ -2485,6 +2485,7 @@ static int bind_rdev_to_array(struct md_
+  fail:
+       pr_warn("md: failed to register dev-%s for %s\n",
+               b, mdname(mddev));
++      mddev_destroy_serial_pool(mddev, rdev, false);
+       return err;
+ }
diff --git a/queue-6.6/mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch b/queue-6.6/mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch
new file mode 100644 (file)
index 0000000..7807b88
--- /dev/null
@@ -0,0 +1,148 @@
+From 52ccdde16b6540abe43b6f8d8e1e1ec90b0983af Mon Sep 17 00:00:00 2001
+From: Miaohe Lin <linmiaohe@huawei.com>
+Date: Fri, 19 Apr 2024 16:58:19 +0800
+Subject: mm/hugetlb: fix DEBUG_LOCKS_WARN_ON(1) when dissolve_free_hugetlb_folio()
+
+From: Miaohe Lin <linmiaohe@huawei.com>
+
+commit 52ccdde16b6540abe43b6f8d8e1e1ec90b0983af upstream.
+
+When I did memory failure tests recently, the warning below occurred:
+
+DEBUG_LOCKS_WARN_ON(1)
+WARNING: CPU: 8 PID: 1011 at kernel/locking/lockdep.c:232 __lock_acquire+0xccb/0x1ca0
+Modules linked in: mce_inject hwpoison_inject
+CPU: 8 PID: 1011 Comm: bash Kdump: loaded Not tainted 6.9.0-rc3-next-20240410-00012-gdb69f219f4be #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
+RIP: 0010:__lock_acquire+0xccb/0x1ca0
+RSP: 0018:ffffa7a1c7fe3bd0 EFLAGS: 00000082
+RAX: 0000000000000000 RBX: eb851eb853975fcf RCX: ffffa1ce5fc1c9c8
+RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffffa1ce5fc1c9c0
+RBP: ffffa1c6865d3280 R08: ffffffffb0f570a8 R09: 0000000000009ffb
+R10: 0000000000000286 R11: ffffffffb0f2ad50 R12: ffffa1c6865d3d10
+R13: ffffa1c6865d3c70 R14: 0000000000000000 R15: 0000000000000004
+FS:  00007ff9f32aa740(0000) GS:ffffa1ce5fc00000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007ff9f3134ba0 CR3: 00000008484e4000 CR4: 00000000000006f0
+Call Trace:
+ <TASK>
+ lock_acquire+0xbe/0x2d0
+ _raw_spin_lock_irqsave+0x3a/0x60
+ hugepage_subpool_put_pages.part.0+0xe/0xc0
+ free_huge_folio+0x253/0x3f0
+ dissolve_free_huge_page+0x147/0x210
+ __page_handle_poison+0x9/0x70
+ memory_failure+0x4e6/0x8c0
+ hard_offline_page_store+0x55/0xa0
+ kernfs_fop_write_iter+0x12c/0x1d0
+ vfs_write+0x380/0x540
+ ksys_write+0x64/0xe0
+ do_syscall_64+0xbc/0x1d0
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+RIP: 0033:0x7ff9f3114887
+RSP: 002b:00007ffecbacb458 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
+RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007ff9f3114887
+RDX: 000000000000000c RSI: 0000564494164e10 RDI: 0000000000000001
+RBP: 0000564494164e10 R08: 00007ff9f31d1460 R09: 000000007fffffff
+R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c
+R13: 00007ff9f321b780 R14: 00007ff9f3217600 R15: 00007ff9f3216a00
+ </TASK>
+Kernel panic - not syncing: kernel: panic_on_warn set ...
+CPU: 8 PID: 1011 Comm: bash Kdump: loaded Not tainted 6.9.0-rc3-next-20240410-00012-gdb69f219f4be #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
+Call Trace:
+ <TASK>
+ panic+0x326/0x350
+ check_panic_on_warn+0x4f/0x50
+ __warn+0x98/0x190
+ report_bug+0x18e/0x1a0
+ handle_bug+0x3d/0x70
+ exc_invalid_op+0x18/0x70
+ asm_exc_invalid_op+0x1a/0x20
+RIP: 0010:__lock_acquire+0xccb/0x1ca0
+RSP: 0018:ffffa7a1c7fe3bd0 EFLAGS: 00000082
+RAX: 0000000000000000 RBX: eb851eb853975fcf RCX: ffffa1ce5fc1c9c8
+RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffffa1ce5fc1c9c0
+RBP: ffffa1c6865d3280 R08: ffffffffb0f570a8 R09: 0000000000009ffb
+R10: 0000000000000286 R11: ffffffffb0f2ad50 R12: ffffa1c6865d3d10
+R13: ffffa1c6865d3c70 R14: 0000000000000000 R15: 0000000000000004
+ lock_acquire+0xbe/0x2d0
+ _raw_spin_lock_irqsave+0x3a/0x60
+ hugepage_subpool_put_pages.part.0+0xe/0xc0
+ free_huge_folio+0x253/0x3f0
+ dissolve_free_huge_page+0x147/0x210
+ __page_handle_poison+0x9/0x70
+ memory_failure+0x4e6/0x8c0
+ hard_offline_page_store+0x55/0xa0
+ kernfs_fop_write_iter+0x12c/0x1d0
+ vfs_write+0x380/0x540
+ ksys_write+0x64/0xe0
+ do_syscall_64+0xbc/0x1d0
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+RIP: 0033:0x7ff9f3114887
+RSP: 002b:00007ffecbacb458 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
+RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007ff9f3114887
+RDX: 000000000000000c RSI: 0000564494164e10 RDI: 0000000000000001
+RBP: 0000564494164e10 R08: 00007ff9f31d1460 R09: 000000007fffffff
+R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c
+R13: 00007ff9f321b780 R14: 00007ff9f3217600 R15: 00007ff9f3216a00
+ </TASK>
+
+After git bisecting and digging into the code, I believe the root cause is
+that _deferred_list field of folio is unioned with _hugetlb_subpool field.
+In __update_and_free_hugetlb_folio(), folio->_deferred_list is
+initialized leading to corrupted folio->_hugetlb_subpool when folio is
+hugetlb.  Later free_huge_folio() will use _hugetlb_subpool and above
+warning happens.
+
+But it is assumed that the hugetlb flag must have been cleared when calling
+folio_put() in update_and_free_hugetlb_folio().  This assumption is broken
+due to the race below:
+
+CPU1                                   CPU2
+dissolve_free_huge_page                        update_and_free_pages_bulk
+ update_and_free_hugetlb_folio          hugetlb_vmemmap_restore_folios
+                                         folio_clear_hugetlb_vmemmap_optimized
+  clear_flag = folio_test_hugetlb_vmemmap_optimized
+  if (clear_flag) <-- False, it's already cleared.
+   __folio_clear_hugetlb(folio) <-- Hugetlb is not cleared.
+  folio_put
+   free_huge_folio <-- free_the_page is expected.
+                                        list_for_each_entry()
+                                         __folio_clear_hugetlb <-- Too late.
+
+Fix this issue by checking whether folio is hugetlb directly instead of
+checking clear_flag to close the race window.
+
+Link: https://lkml.kernel.org/r/20240419085819.1901645-1-linmiaohe@huawei.com
+Fixes: 32c877191e02 ("hugetlb: do not clear hugetlb dtor until allocating vmemmap")
+Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c |    4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -1747,8 +1747,6 @@ static void add_hugetlb_folio(struct hst
+ static void __update_and_free_hugetlb_folio(struct hstate *h,
+                                               struct folio *folio)
+ {
+-      bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
+-
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+               return;
+@@ -1782,7 +1780,7 @@ static void __update_and_free_hugetlb_fo
+        * If vmemmap pages were allocated above, then we need to clear the
+        * hugetlb destructor under the hugetlb lock.
+        */
+-      if (clear_dtor) {
++      if (folio_test_hugetlb(folio)) {
+               spin_lock_irq(&hugetlb_lock);
+               __clear_hugetlb_destructor(h, folio);
+               spin_unlock_irq(&hugetlb_lock);
diff --git a/queue-6.6/revert-selftests-bpf-add-netkit-to-tc_redirect-selftest.patch b/queue-6.6/revert-selftests-bpf-add-netkit-to-tc_redirect-selftest.patch
new file mode 100644 (file)
index 0000000..cc2c8c4
--- /dev/null
@@ -0,0 +1,121 @@
+From xiaopeitux@foxmail.com  Wed May 15 09:24:46 2024
+From: xiaopeitux@foxmail.com
+Date: Wed,  1 May 2024 09:31:45 +0800
+Subject: Revert "selftests/bpf: Add netkit to tc_redirect selftest"
+To: gregkh@linuxfoundation.org, geliang@kernel.org
+Cc: xiaopeitux@foxmail.com, Pei Xiao <xiaopei01@kylinos.cn>, stable@vger.kernel.org
+Message-ID: <tencent_06FC391857FB08476E2DAA0048302FDE1307@qq.com>
+
+From: Pei Xiao <xiaopei01@kylinos.cn>
+
+This reverts commit 1ccc54df579701a2b6ec10bd2448ea3b65043c1a which is
+upstream commit adfeae2d243d9e5b83d094af481d189156b11779
+
+This commit depends on bpf netkit series which isn't on linux-6.6.y
+branch yet. So it needs to be reverted. Otherwise, a build error
+"netlink_helpers.h: No such file or directory" occurs.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Pei Xiao <xiaopei01@kylinos.cn>
+Reported-by: Geliang Tang <geliang@kernel.org>
+Tested-by: Geliang Tang <geliang@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/prog_tests/tc_redirect.c |   52 -------------------
+ 1 file changed, 52 deletions(-)
+
+--- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
++++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
+@@ -24,7 +24,6 @@
+ #include "test_progs.h"
+ #include "network_helpers.h"
+-#include "netlink_helpers.h"
+ #include "test_tc_neigh_fib.skel.h"
+ #include "test_tc_neigh.skel.h"
+ #include "test_tc_peer.skel.h"
+@@ -113,7 +112,6 @@ static void netns_setup_namespaces_nofai
+ enum dev_mode {
+       MODE_VETH,
+-      MODE_NETKIT,
+ };
+ struct netns_setup_result {
+@@ -144,52 +142,11 @@ static int get_ifaddr(const char *name,
+       return 0;
+ }
+-static int create_netkit(int mode, char *prim, char *peer)
+-{
+-      struct rtattr *linkinfo, *data, *peer_info;
+-      struct rtnl_handle rth = { .fd = -1 };
+-      const char *type = "netkit";
+-      struct {
+-              struct nlmsghdr n;
+-              struct ifinfomsg i;
+-              char buf[1024];
+-      } req = {};
+-      int err;
+-
+-      err = rtnl_open(&rth, 0);
+-      if (!ASSERT_OK(err, "open_rtnetlink"))
+-              return err;
+-
+-      memset(&req, 0, sizeof(req));
+-      req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+-      req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+-      req.n.nlmsg_type = RTM_NEWLINK;
+-      req.i.ifi_family = AF_UNSPEC;
+-
+-      addattr_l(&req.n, sizeof(req), IFLA_IFNAME, prim, strlen(prim));
+-      linkinfo = addattr_nest(&req.n, sizeof(req), IFLA_LINKINFO);
+-      addattr_l(&req.n, sizeof(req), IFLA_INFO_KIND, type, strlen(type));
+-      data = addattr_nest(&req.n, sizeof(req), IFLA_INFO_DATA);
+-      addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode);
+-      peer_info = addattr_nest(&req.n, sizeof(req), IFLA_NETKIT_PEER_INFO);
+-      req.n.nlmsg_len += sizeof(struct ifinfomsg);
+-      addattr_l(&req.n, sizeof(req), IFLA_IFNAME, peer, strlen(peer));
+-      addattr_nest_end(&req.n, peer_info);
+-      addattr_nest_end(&req.n, data);
+-      addattr_nest_end(&req.n, linkinfo);
+-
+-      err = rtnl_talk(&rth, &req.n, NULL);
+-      ASSERT_OK(err, "talk_rtnetlink");
+-      rtnl_close(&rth);
+-      return err;
+-}
+-
+ static int netns_setup_links_and_routes(struct netns_setup_result *result)
+ {
+       struct nstoken *nstoken = NULL;
+       char src_fwd_addr[IFADDR_STR_LEN+1] = {};
+       char src_addr[IFADDR_STR_LEN + 1] = {};
+-      int err;
+       if (result->dev_mode == MODE_VETH) {
+               SYS(fail, "ip link add src type veth peer name src_fwd");
+@@ -197,13 +154,6 @@ static int netns_setup_links_and_routes(
+               SYS(fail, "ip link set dst_fwd address " MAC_DST_FWD);
+               SYS(fail, "ip link set dst address " MAC_DST);
+-      } else if (result->dev_mode == MODE_NETKIT) {
+-              err = create_netkit(NETKIT_L3, "src", "src_fwd");
+-              if (!ASSERT_OK(err, "create_ifindex_src"))
+-                      goto fail;
+-              err = create_netkit(NETKIT_L3, "dst", "dst_fwd");
+-              if (!ASSERT_OK(err, "create_ifindex_dst"))
+-                      goto fail;
+       }
+       if (get_ifaddr("src_fwd", src_fwd_addr))
+@@ -1266,9 +1216,7 @@ static void *test_tc_redirect_run_tests(
+       netns_setup_namespaces_nofail("delete");
+       RUN_TEST(tc_redirect_peer, MODE_VETH);
+-      RUN_TEST(tc_redirect_peer, MODE_NETKIT);
+       RUN_TEST(tc_redirect_peer_l3, MODE_VETH);
+-      RUN_TEST(tc_redirect_peer_l3, MODE_NETKIT);
+       RUN_TEST(tc_redirect_neigh, MODE_VETH);
+       RUN_TEST(tc_redirect_neigh_fib, MODE_VETH);
+       RUN_TEST(tc_redirect_dtime, MODE_VETH);
index 50f480fb4d1929913c6a6fc4da46c22c7090c092..809fa58733cc592f2e2e5f68f2c39e2f9fa13a7c 100644 (file)
@@ -299,3 +299,11 @@ bluetooth-qca-generalise-device-address-check.patch
 bluetooth-qca-fix-info-leak-when-fetching-board-id.patch
 bluetooth-qca-fix-info-leak-when-fetching-fw-build-id.patch
 bluetooth-qca-fix-firmware-check-error-path.patch
+vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch
+dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch
+dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch
+keys-fix-overwrite-of-key-expiration-on-instantiation.patch
+btrfs-do-not-wait-for-short-bulk-allocation.patch
+mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch
+revert-selftests-bpf-add-netkit-to-tc_redirect-selftest.patch
+md-fix-kmemleak-of-rdev-serial.patch
diff --git a/queue-6.6/vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch b/queue-6.6/vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch
new file mode 100644 (file)
index 0000000..8468efb
--- /dev/null
@@ -0,0 +1,60 @@
+From 95feb3160eef0caa6018e175a5560b816aee8e79 Mon Sep 17 00:00:00 2001
+From: Arjan van de Ven <arjan@linux.intel.com>
+Date: Thu, 21 Mar 2024 19:44:07 +0000
+Subject: VFIO: Add the SPR_DSA and SPR_IAX devices to the denylist
+
+From: Arjan van de Ven <arjan@linux.intel.com>
+
+commit 95feb3160eef0caa6018e175a5560b816aee8e79 upstream.
+
+Due to an erratum with the SPR_DSA and SPR_IAX devices, it is not secure to assign
+these devices to virtual machines. Add the PCI IDs of these devices to the VFIO
+denylist to ensure that this is handled appropriately by the VFIO subsystem.
+
+The SPR_DSA and SPR_IAX devices are on-SOC devices for the Sapphire Rapids
+(and related) family of products that perform data movement and compression.
+
+Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/idxd/registers.h |    3 ---
+ drivers/vfio/pci/vfio_pci.c  |    2 ++
+ include/linux/pci_ids.h      |    2 ++
+ 3 files changed, 4 insertions(+), 3 deletions(-)
+
+--- a/drivers/dma/idxd/registers.h
++++ b/drivers/dma/idxd/registers.h
+@@ -6,9 +6,6 @@
+ #include <uapi/linux/idxd.h>
+ /* PCI Config */
+-#define PCI_DEVICE_ID_INTEL_DSA_SPR0  0x0b25
+-#define PCI_DEVICE_ID_INTEL_IAX_SPR0  0x0cfe
+-
+ #define DEVICE_VERSION_1              0x100
+ #define DEVICE_VERSION_2              0x200
+--- a/drivers/vfio/pci/vfio_pci.c
++++ b/drivers/vfio/pci/vfio_pci.c
+@@ -71,6 +71,8 @@ static bool vfio_pci_dev_in_denylist(str
+               case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
+               case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
+               case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
++              case PCI_DEVICE_ID_INTEL_DSA_SPR0:
++              case PCI_DEVICE_ID_INTEL_IAX_SPR0:
+                       return true;
+               default:
+                       return false;
+--- a/include/linux/pci_ids.h
++++ b/include/linux/pci_ids.h
+@@ -2683,8 +2683,10 @@
+ #define PCI_DEVICE_ID_INTEL_I960      0x0960
+ #define PCI_DEVICE_ID_INTEL_I960RM    0x0962
+ #define PCI_DEVICE_ID_INTEL_HDA_HSW_0 0x0a0c
++#define PCI_DEVICE_ID_INTEL_DSA_SPR0  0x0b25
+ #define PCI_DEVICE_ID_INTEL_HDA_HSW_2 0x0c0c
+ #define PCI_DEVICE_ID_INTEL_CENTERTON_ILB     0x0c60
++#define PCI_DEVICE_ID_INTEL_IAX_SPR0  0x0cfe
+ #define PCI_DEVICE_ID_INTEL_HDA_HSW_3 0x0d0c
+ #define PCI_DEVICE_ID_INTEL_HDA_BYT   0x0f04
+ #define PCI_DEVICE_ID_INTEL_SST_BYT   0x0f28