6.1-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Wed, 15 May 2024 07:41:00 +0000 (09:41 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Wed, 15 May 2024 07:41:00 +0000 (09:41 +0200)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 15 May 2024 07:41:00 +0000 (09:41 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 15 May 2024 07:41:00 +0000 (09:41 +0200)
diff --git a/queue-6.1/btrfs-do-not-wait-for-short-bulk-allocation.patch b/queue-6.1/btrfs-do-not-wait-for-short-bulk-allocation.patch

new file mode 100644 (file)

index 0000000..eb209a1
--- /dev/null
+++ b/queue-6.1/btrfs-do-not-wait-for-short-bulk-allocation.patch
@@ -0,0 +1,89 @@
+From 1db7959aacd905e6487d0478ac01d89f86eb1e51 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 26 Mar 2024 09:16:46 +1030
+Subject: btrfs: do not wait for short bulk allocation
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 1db7959aacd905e6487d0478ac01d89f86eb1e51 upstream.
+
+[BUG]
+There is a recent report that when memory pressure is high (including
+cached pages), btrfs can spend most of its time on memory allocation in
+btrfs_alloc_page_array() for compressed read/write.
+
+[CAUSE]
+For btrfs_alloc_page_array() we always go through alloc_pages_bulk_array(), and
+even if the bulk allocation failed (fell back to single page
+allocation) we still retry but with extra memalloc_retry_wait().
+
+If the bulk alloc only returned one page at a time, we would spend a lot of
+time on the retry wait.
+
+The behavior was introduced in commit 395cb57e8560 ("btrfs: wait between
+incomplete batch memory allocations").
+
+[FIX]
+Although the commit mentioned that other filesystems do the wait, it's
+not the case at least nowadays.
+
+All the mainlined filesystems only call memalloc_retry_wait() if they
+failed to allocate any page (not only for bulk allocation).
+If there is any progress, they won't call memalloc_retry_wait() at all.
+
+For example, xfs_buf_alloc_pages() would only call memalloc_retry_wait()
+if there is no allocation progress at all, and the call is not for
+metadata readahead.
+
+So I don't believe we should call memalloc_retry_wait() unconditionally
+for short allocation.
+
+Call memalloc_retry_wait() if it fails to allocate any page for tree
+block allocation (which goes with __GFP_NOFAIL and may not need the
+special handling anyway), and reduce the latency for
+btrfs_alloc_page_array().
+
+Reported-by: Julian Taylor <julian.taylor@1und1.de>
+Tested-by: Julian Taylor <julian.taylor@1und1.de>
+Link: https://lore.kernel.org/all/8966c095-cbe7-4d22-9784-a647d1bf27c3@1und1.de/
+Fixes: 395cb57e8560 ("btrfs: wait between incomplete batch memory allocations")
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_io.c |   19 +++++++------------
+ 1 file changed, 7 insertions(+), 12 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -1324,19 +1324,14 @@ int btrfs_alloc_page_array(unsigned int
+               unsigned int last = allocated;
+ 
+               allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+-
+-              if (allocated == nr_pages)
+-                      return 0;
+-
+-              /*
+-               * During this iteration, no page could be allocated, even
+-               * though alloc_pages_bulk_array() falls back to alloc_page()
+-               * if  it could not bulk-allocate. So we must be out of memory.
+-               */
+-              if (allocated == last)
++              if (unlikely(allocated == last)) {
++                      /* No progress, fail and do cleanup. */
++                      for (int i = 0; i < allocated; i++) {
++                              __free_page(page_array[i]);
++                              page_array[i] = NULL;
++                      }
+                       return -ENOMEM;
+-
+-              memalloc_retry_wait(GFP_NOFS);
++              }
+       }
+       return 0;
+ }
diff --git a/queue-6.1/dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch b/queue-6.1/dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch

new file mode 100644 (file)

index 0000000..d0529a3
--- /dev/null
+++ b/queue-6.1/dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch
@@ -0,0 +1,97 @@
+From e11452eb071b2a8e6ba52892b2e270bbdaa6640d Mon Sep 17 00:00:00 2001
+From: Arjan van de Ven <arjan@linux.intel.com>
+Date: Wed, 24 Apr 2024 14:43:22 +0000
+Subject: dmaengine: idxd: add a new security check to deal with a hardware erratum
+
+From: Arjan van de Ven <arjan@linux.intel.com>
+
+commit e11452eb071b2a8e6ba52892b2e270bbdaa6640d upstream.
+
+On Sapphire Rapids and related platforms, the DSA and IAA devices have an
+erratum that causes direct access (for example, by using the ENQCMD or
+MOVDIR64 instructions) from untrusted applications to be a security problem.
+
+To solve this, add a flag to the PCI device enumeration and device structures
+to indicate the presence/absence of this security exposure. In the mmap()
+method of the device, this flag is then used to enforce that the user
+has the CAP_SYS_RAWIO capability.
+
+In a future patch, a write() based method will be added that allows untrusted
+applications to submit work to the accelerator, where the kernel can do
+sanity checking on the user input to ensure secure operation of the accelerator.
+
+Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/idxd/cdev.c |   12 ++++++++++++
+ drivers/dma/idxd/idxd.h |    3 +++
+ drivers/dma/idxd/init.c |    4 ++++
+ 3 files changed, 19 insertions(+)
+
+--- a/drivers/dma/idxd/cdev.c
++++ b/drivers/dma/idxd/cdev.c
+@@ -198,6 +198,18 @@ static int idxd_cdev_mmap(struct file *f
+       int rc;
+ 
+       dev_dbg(&pdev->dev, "%s called\n", __func__);
++
++      /*
++       * Due to an erratum in some of the devices supported by the driver,
++       * direct user submission to the device can be unsafe.
++       * (See the INTEL-SA-01084 security advisory)
++       *
++       * For the devices that exhibit this behavior, require that the user
++       * has CAP_SYS_RAWIO capabilities.
++       */
++      if (!idxd->user_submission_safe && !capable(CAP_SYS_RAWIO))
++              return -EPERM;
++
+       rc = check_vma(wq, vma, __func__);
+       if (rc < 0)
+               return rc;
+--- a/drivers/dma/idxd/idxd.h
++++ b/drivers/dma/idxd/idxd.h
+@@ -258,6 +258,7 @@ struct idxd_driver_data {
+       struct device_type *dev_type;
+       int compl_size;
+       int align;
++      bool user_submission_safe;
+ };
+ 
+ struct idxd_device {
+@@ -316,6 +317,8 @@ struct idxd_device {
+       struct idxd_pmu *idxd_pmu;
+ 
+       unsigned long *opcap_bmap;
++
++      bool user_submission_safe;
+ };
+ 
+ /* IDXD software descriptor */
+--- a/drivers/dma/idxd/init.c
++++ b/drivers/dma/idxd/init.c
+@@ -48,6 +48,7 @@ static struct idxd_driver_data idxd_driv
+               .compl_size = sizeof(struct dsa_completion_record),
+               .align = 32,
+               .dev_type = &dsa_device_type,
++              .user_submission_safe = false, /* See INTEL-SA-01084 security advisory */
+       },
+       [IDXD_TYPE_IAX] = {
+               .name_prefix = "iax",
+@@ -55,6 +56,7 @@ static struct idxd_driver_data idxd_driv
+               .compl_size = sizeof(struct iax_completion_record),
+               .align = 64,
+               .dev_type = &iax_device_type,
++              .user_submission_safe = false, /* See INTEL-SA-01084 security advisory */
+       },
+ };
+ 
+@@ -663,6 +665,8 @@ static int idxd_pci_probe(struct pci_dev
+       dev_info(&pdev->dev, "Intel(R) Accelerator Device (v%x)\n",
+                idxd->hw.version);
+ 
++      idxd->user_submission_safe = data->user_submission_safe;
++
+       return 0;
+ 
+  err_dev_register:
diff --git a/queue-6.1/dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch b/queue-6.1/dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch

new file mode 100644 (file)

index 0000000..4bde7cb
--- /dev/null
+++ b/queue-6.1/dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch
@@ -0,0 +1,164 @@
+From 6827738dc684a87ad54ebba3ae7f3d7c977698eb Mon Sep 17 00:00:00 2001
+From: Nikhil Rao <nikhil.rao@intel.com>
+Date: Wed, 24 Apr 2024 15:16:12 +0000
+Subject: dmaengine: idxd: add a write() method for applications to submit work
+
+From: Nikhil Rao <nikhil.rao@intel.com>
+
+commit 6827738dc684a87ad54ebba3ae7f3d7c977698eb upstream.
+
+After the patch to restrict the use of mmap() to CAP_SYS_RAWIO for
+the currently existing devices, most applications can no longer make
+use of the accelerators as in production "you don't run things as root".
+
+To keep the DSA and IAA accelerators usable, hook up a write() method
+so that applications can still submit work. In the write method,
+sufficient input validation is performed to avoid the security issue
+that required the mmap CAP_SYS_RAWIO check.
+
+One complication is that the DSA device allows for indirect ("batched")
+descriptors. There is no reasonable way to do the input validation
+on these indirect descriptors so the write() method will not allow these
+to be submitted to the hardware on affected hardware, and the sysfs
+enumeration of support for the opcode is also removed.
+
+Early performance data shows that the performance delta for most common
+cases is within the noise.
+
+Signed-off-by: Nikhil Rao <nikhil.rao@intel.com>
+Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/idxd/cdev.c  |   65 +++++++++++++++++++++++++++++++++++++++++++++++
+ drivers/dma/idxd/sysfs.c |   27 ++++++++++++++++++-
+ 2 files changed, 90 insertions(+), 2 deletions(-)
+
+--- a/drivers/dma/idxd/cdev.c
++++ b/drivers/dma/idxd/cdev.c
+@@ -224,6 +224,70 @@ static int idxd_cdev_mmap(struct file *f
+                       vma->vm_page_prot);
+ }
+ 
++static int idxd_submit_user_descriptor(struct idxd_user_context *ctx,
++                                     struct dsa_hw_desc __user *udesc)
++{
++      struct idxd_wq *wq = ctx->wq;
++      struct idxd_dev *idxd_dev = &wq->idxd->idxd_dev;
++      const uint64_t comp_addr_align = is_dsa_dev(idxd_dev) ? 0x20 : 0x40;
++      void __iomem *portal = idxd_wq_portal_addr(wq);
++      struct dsa_hw_desc descriptor __aligned(64);
++      int rc;
++
++      rc = copy_from_user(&descriptor, udesc, sizeof(descriptor));
++      if (rc)
++              return -EFAULT;
++
++      /*
++       * DSA devices are capable of indirect ("batch") command submission.
++       * On devices where direct user submissions are not safe, we cannot
++       * allow this since there is no good way for us to verify these
++       * indirect commands.
++       */
++      if (is_dsa_dev(idxd_dev) && descriptor.opcode == DSA_OPCODE_BATCH &&
++              !wq->idxd->user_submission_safe)
++              return -EINVAL;
++      /*
++       * As per the programming specification, the completion address must be
++       * aligned to 32 or 64 bytes. If this is violated the hardware
++       * engine can get very confused (security issue).
++       */
++      if (!IS_ALIGNED(descriptor.completion_addr, comp_addr_align))
++              return -EINVAL;
++
++      if (wq_dedicated(wq))
++              iosubmit_cmds512(portal, &descriptor, 1);
++      else {
++              descriptor.priv = 0;
++              descriptor.pasid = ctx->pasid;
++              rc = idxd_enqcmds(wq, portal, &descriptor);
++              if (rc < 0)
++                      return rc;
++      }
++
++      return 0;
++}
++
++static ssize_t idxd_cdev_write(struct file *filp, const char __user *buf, size_t len,
++                             loff_t *unused)
++{
++      struct dsa_hw_desc __user *udesc = (struct dsa_hw_desc __user *)buf;
++      struct idxd_user_context *ctx = filp->private_data;
++      ssize_t written = 0;
++      int i;
++
++      for (i = 0; i < len/sizeof(struct dsa_hw_desc); i++) {
++              int rc = idxd_submit_user_descriptor(ctx, udesc + i);
++
++              if (rc)
++                      return written ? written : rc;
++
++              written += sizeof(struct dsa_hw_desc);
++      }
++
++      return written;
++}
++
+ static __poll_t idxd_cdev_poll(struct file *filp,
+                              struct poll_table_struct *wait)
+ {
+@@ -246,6 +310,7 @@ static const struct file_operations idxd
+       .open = idxd_cdev_open,
+       .release = idxd_cdev_release,
+       .mmap = idxd_cdev_mmap,
++      .write = idxd_cdev_write,
+       .poll = idxd_cdev_poll,
+ };
+ 
+--- a/drivers/dma/idxd/sysfs.c
++++ b/drivers/dma/idxd/sysfs.c
+@@ -1162,12 +1162,35 @@ static ssize_t wq_enqcmds_retries_store(
+ static struct device_attribute dev_attr_wq_enqcmds_retries =
+               __ATTR(enqcmds_retries, 0644, wq_enqcmds_retries_show, wq_enqcmds_retries_store);
+ 
++static ssize_t op_cap_show_common(struct device *dev, char *buf, unsigned long *opcap_bmap)
++{
++      ssize_t pos;
++      int i;
++
++      pos = 0;
++      for (i = IDXD_MAX_OPCAP_BITS/64 - 1; i >= 0; i--) {
++              unsigned long val = opcap_bmap[i];
++
++              /* On systems where direct user submissions are not safe, we need to clear out
++               * the BATCH capability from the capability mask in sysfs since we cannot support
++               * that command on such systems.
++               */
++              if (i == DSA_OPCODE_BATCH/64 && !confdev_to_idxd(dev)->user_submission_safe)
++                      clear_bit(DSA_OPCODE_BATCH % 64, &val);
++
++              pos += sysfs_emit_at(buf, pos, "%*pb", 64, &val);
++              pos += sysfs_emit_at(buf, pos, "%c", i == 0 ? '\n' : ',');
++      }
++
++      return pos;
++}
++
+ static ssize_t wq_op_config_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+ {
+       struct idxd_wq *wq = confdev_to_wq(dev);
+ 
+-      return sysfs_emit(buf, "%*pb\n", IDXD_MAX_OPCAP_BITS, wq->opcap_bmap);
++      return op_cap_show_common(dev, buf, wq->opcap_bmap);
+ }
+ 
+ static int idxd_verify_supported_opcap(struct idxd_device *idxd, unsigned long *opmask)
+@@ -1381,7 +1404,7 @@ static ssize_t op_cap_show(struct device
+ {
+       struct idxd_device *idxd = confdev_to_idxd(dev);
+ 
+-      return sysfs_emit(buf, "%*pb\n", IDXD_MAX_OPCAP_BITS, idxd->opcap_bmap);
++      return op_cap_show_common(dev, buf, idxd->opcap_bmap);
+ }
+ static DEVICE_ATTR_RO(op_cap);
+ 
diff --git a/queue-6.1/keys-fix-overwrite-of-key-expiration-on-instantiation.patch b/queue-6.1/keys-fix-overwrite-of-key-expiration-on-instantiation.patch

new file mode 100644 (file)

index 0000000..35b558d
--- /dev/null
+++ b/queue-6.1/keys-fix-overwrite-of-key-expiration-on-instantiation.patch
@@ -0,0 +1,44 @@
+From 9da27fb65a14c18efd4473e2e82b76b53ba60252 Mon Sep 17 00:00:00 2001
+From: Silvio Gissi <sifonsec@amazon.com>
+Date: Fri, 15 Mar 2024 15:05:39 -0400
+Subject: keys: Fix overwrite of key expiration on instantiation
+
+From: Silvio Gissi <sifonsec@amazon.com>
+
+commit 9da27fb65a14c18efd4473e2e82b76b53ba60252 upstream.
+
+The expiry time of a key is unconditionally overwritten during
+instantiation, which by default makes the key permanent. This causes a problem
+for DNS resolution as the expiration set by user-space is overwritten to
+TIME64_MAX, disabling further DNS updates. Fix this by restoring the
+condition that key_set_expiry is only called when the pre-parser sets a
+specific expiry.
+
+Fixes: 39299bdd2546 ("keys, dns: Allow key types (eg. DNS) to be reclaimed immediately on expiry")
+Signed-off-by: Silvio Gissi <sifonsec@amazon.com>
+cc: David Howells <dhowells@redhat.com>
+cc: Hazem Mohamed Abuelfotoh <abuehaze@amazon.com>
+cc: linux-afs@lists.infradead.org
+cc: linux-cifs@vger.kernel.org
+cc: keyrings@vger.kernel.org
+cc: netdev@vger.kernel.org
+cc: stable@vger.kernel.org
+Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ security/keys/key.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/security/keys/key.c
++++ b/security/keys/key.c
+@@ -464,7 +464,8 @@ static int __key_instantiate_and_link(st
+                       if (authkey)
+                               key_invalidate(authkey);
+ 
+-                      key_set_expiry(key, prep->expiry);
++                      if (prep->expiry != TIME64_MAX)
++                              key_set_expiry(key, prep->expiry);
+               }
+       }
+ 
diff --git a/queue-6.1/md-fix-kmemleak-of-rdev-serial.patch b/queue-6.1/md-fix-kmemleak-of-rdev-serial.patch

new file mode 100644 (file)

index 0000000..013efe9
--- /dev/null
+++ b/queue-6.1/md-fix-kmemleak-of-rdev-serial.patch
@@ -0,0 +1,55 @@
+From 6cf350658736681b9d6b0b6e58c5c76b235bb4c4 Mon Sep 17 00:00:00 2001
+From: Li Nan <linan122@huawei.com>
+Date: Thu, 8 Feb 2024 16:55:56 +0800
+Subject: md: fix kmemleak of rdev->serial
+
+From: Li Nan <linan122@huawei.com>
+
+commit 6cf350658736681b9d6b0b6e58c5c76b235bb4c4 upstream.
+
+If kobject_add() fails in bind_rdev_to_array(), the allocated 'rdev->serial'
+will not be freed, and a memory leak is reported by kmemleak.
+
+unreferenced object 0xffff88815a350000 (size 49152):
+  comm "mdadm", pid 789, jiffies 4294716910
+  hex dump (first 32 bytes):
+    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+  backtrace (crc f773277a):
+    [<0000000058b0a453>] kmemleak_alloc+0x61/0xe0
+    [<00000000366adf14>] __kmalloc_large_node+0x15e/0x270
+    [<000000002e82961b>] __kmalloc_node.cold+0x11/0x7f
+    [<00000000f206d60a>] kvmalloc_node+0x74/0x150
+    [<0000000034bf3363>] rdev_init_serial+0x67/0x170
+    [<0000000010e08fe9>] mddev_create_serial_pool+0x62/0x220
+    [<00000000c3837bf0>] bind_rdev_to_array+0x2af/0x630
+    [<0000000073c28560>] md_add_new_disk+0x400/0x9f0
+    [<00000000770e30ff>] md_ioctl+0x15bf/0x1c10
+    [<000000006cfab718>] blkdev_ioctl+0x191/0x3f0
+    [<0000000085086a11>] vfs_ioctl+0x22/0x60
+    [<0000000018b656fe>] __x64_sys_ioctl+0xba/0xe0
+    [<00000000e54e675e>] do_syscall_64+0x71/0x150
+    [<000000008b0ad622>] entry_SYSCALL_64_after_hwframe+0x6c/0x74
+
+Fixes: 963c555e75b0 ("md: introduce mddev_create/destroy_wb_pool for the change of member device")
+Signed-off-by: Li Nan <linan122@huawei.com>
+Signed-off-by: Song Liu <song@kernel.org>
+Link: https://lore.kernel.org/r/20240208085556.2412922-1-linan666@huaweicloud.com
+[ mddev_destroy_serial_pool third parameter was removed in mainline,
+  where there is no need to suspend within this function anymore. ]
+Signed-off-by: Jeremy Bongio <jbongio@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/md.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/md/md.c
++++ b/drivers/md/md.c
+@@ -2508,6 +2508,7 @@ static int bind_rdev_to_array(struct md_
+  fail:
+       pr_warn("md: failed to register dev-%s for %s\n",
+               b, mdname(mddev));
++      mddev_destroy_serial_pool(mddev, rdev, false);
+       return err;
+ }
+ 
diff --git a/queue-6.1/mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch b/queue-6.1/mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch

new file mode 100644 (file)

index 0000000..dd32bca
--- /dev/null
+++ b/queue-6.1/mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch
@@ -0,0 +1,147 @@
+From 52ccdde16b6540abe43b6f8d8e1e1ec90b0983af Mon Sep 17 00:00:00 2001
+From: Miaohe Lin <linmiaohe@huawei.com>
+Date: Fri, 19 Apr 2024 16:58:19 +0800
+Subject: mm/hugetlb: fix DEBUG_LOCKS_WARN_ON(1) when dissolve_free_hugetlb_folio()
+
+From: Miaohe Lin <linmiaohe@huawei.com>
+
+commit 52ccdde16b6540abe43b6f8d8e1e1ec90b0983af upstream.
+
+When I did memory failure tests recently, the warning below occurred:
+
+DEBUG_LOCKS_WARN_ON(1)
+WARNING: CPU: 8 PID: 1011 at kernel/locking/lockdep.c:232 __lock_acquire+0xccb/0x1ca0
+Modules linked in: mce_inject hwpoison_inject
+CPU: 8 PID: 1011 Comm: bash Kdump: loaded Not tainted 6.9.0-rc3-next-20240410-00012-gdb69f219f4be #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
+RIP: 0010:__lock_acquire+0xccb/0x1ca0
+RSP: 0018:ffffa7a1c7fe3bd0 EFLAGS: 00000082
+RAX: 0000000000000000 RBX: eb851eb853975fcf RCX: ffffa1ce5fc1c9c8
+RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffffa1ce5fc1c9c0
+RBP: ffffa1c6865d3280 R08: ffffffffb0f570a8 R09: 0000000000009ffb
+R10: 0000000000000286 R11: ffffffffb0f2ad50 R12: ffffa1c6865d3d10
+R13: ffffa1c6865d3c70 R14: 0000000000000000 R15: 0000000000000004
+FS:  00007ff9f32aa740(0000) GS:ffffa1ce5fc00000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007ff9f3134ba0 CR3: 00000008484e4000 CR4: 00000000000006f0
+Call Trace:
+ <TASK>
+ lock_acquire+0xbe/0x2d0
+ _raw_spin_lock_irqsave+0x3a/0x60
+ hugepage_subpool_put_pages.part.0+0xe/0xc0
+ free_huge_folio+0x253/0x3f0
+ dissolve_free_huge_page+0x147/0x210
+ __page_handle_poison+0x9/0x70
+ memory_failure+0x4e6/0x8c0
+ hard_offline_page_store+0x55/0xa0
+ kernfs_fop_write_iter+0x12c/0x1d0
+ vfs_write+0x380/0x540
+ ksys_write+0x64/0xe0
+ do_syscall_64+0xbc/0x1d0
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+RIP: 0033:0x7ff9f3114887
+RSP: 002b:00007ffecbacb458 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
+RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007ff9f3114887
+RDX: 000000000000000c RSI: 0000564494164e10 RDI: 0000000000000001
+RBP: 0000564494164e10 R08: 00007ff9f31d1460 R09: 000000007fffffff
+R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c
+R13: 00007ff9f321b780 R14: 00007ff9f3217600 R15: 00007ff9f3216a00
+ </TASK>
+Kernel panic - not syncing: kernel: panic_on_warn set ...
+CPU: 8 PID: 1011 Comm: bash Kdump: loaded Not tainted 6.9.0-rc3-next-20240410-00012-gdb69f219f4be #3
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
+Call Trace:
+ <TASK>
+ panic+0x326/0x350
+ check_panic_on_warn+0x4f/0x50
+ __warn+0x98/0x190
+ report_bug+0x18e/0x1a0
+ handle_bug+0x3d/0x70
+ exc_invalid_op+0x18/0x70
+ asm_exc_invalid_op+0x1a/0x20
+RIP: 0010:__lock_acquire+0xccb/0x1ca0
+RSP: 0018:ffffa7a1c7fe3bd0 EFLAGS: 00000082
+RAX: 0000000000000000 RBX: eb851eb853975fcf RCX: ffffa1ce5fc1c9c8
+RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffffa1ce5fc1c9c0
+RBP: ffffa1c6865d3280 R08: ffffffffb0f570a8 R09: 0000000000009ffb
+R10: 0000000000000286 R11: ffffffffb0f2ad50 R12: ffffa1c6865d3d10
+R13: ffffa1c6865d3c70 R14: 0000000000000000 R15: 0000000000000004
+ lock_acquire+0xbe/0x2d0
+ _raw_spin_lock_irqsave+0x3a/0x60
+ hugepage_subpool_put_pages.part.0+0xe/0xc0
+ free_huge_folio+0x253/0x3f0
+ dissolve_free_huge_page+0x147/0x210
+ __page_handle_poison+0x9/0x70
+ memory_failure+0x4e6/0x8c0
+ hard_offline_page_store+0x55/0xa0
+ kernfs_fop_write_iter+0x12c/0x1d0
+ vfs_write+0x380/0x540
+ ksys_write+0x64/0xe0
+ do_syscall_64+0xbc/0x1d0
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+RIP: 0033:0x7ff9f3114887
+RSP: 002b:00007ffecbacb458 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
+RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007ff9f3114887
+RDX: 000000000000000c RSI: 0000564494164e10 RDI: 0000000000000001
+RBP: 0000564494164e10 R08: 00007ff9f31d1460 R09: 000000007fffffff
+R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c
+R13: 00007ff9f321b780 R14: 00007ff9f3217600 R15: 00007ff9f3216a00
+ </TASK>
+
+After git bisecting and digging into the code, I believe the root cause is
+that _deferred_list field of folio is unioned with _hugetlb_subpool field.
+In __update_and_free_hugetlb_folio(), folio->_deferred_list is
+initialized leading to corrupted folio->_hugetlb_subpool when folio is
+hugetlb.  Later free_huge_folio() will use _hugetlb_subpool and above
+warning happens.
+
+But it is assumed hugetlb flag must have been cleared when calling
+folio_put() in update_and_free_hugetlb_folio().  This assumption is broken
+due to below race:
+
+CPU1                                   CPU2
+dissolve_free_huge_page                        update_and_free_pages_bulk
+ update_and_free_hugetlb_folio          hugetlb_vmemmap_restore_folios
+                                         folio_clear_hugetlb_vmemmap_optimized
+  clear_flag = folio_test_hugetlb_vmemmap_optimized
+  if (clear_flag) <-- False, it's already cleared.
+   __folio_clear_hugetlb(folio) <-- Hugetlb is not cleared.
+  folio_put
+   free_huge_folio <-- free_the_page is expected.
+                                        list_for_each_entry()
+                                         __folio_clear_hugetlb <-- Too late.
+
+Fix this issue by checking whether folio is hugetlb directly instead of
+checking clear_flag to close the race window.
+
+Link: https://lkml.kernel.org/r/20240419085819.1901645-1-linmiaohe@huawei.com
+Fixes: 32c877191e02 ("hugetlb: do not clear hugetlb dtor until allocating vmemmap")
+Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -1762,7 +1762,6 @@ static void __update_and_free_page(struc
+ {
+       int i;
+       struct page *subpage;
+-      bool clear_dtor = HPageVmemmapOptimized(page);
+ 
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+               return;
+@@ -1797,7 +1796,7 @@ static void __update_and_free_page(struc
+        * If vmemmap pages were allocated above, then we need to clear the
+        * hugetlb destructor under the hugetlb lock.
+        */
+-      if (clear_dtor) {
++      if (PageHuge(page)) {
+               spin_lock_irq(&hugetlb_lock);
+               __clear_hugetlb_destructor(h, page);
+               spin_unlock_irq(&hugetlb_lock);
diff --git a/queue-6.1/mm-swapops-update-check-in-is_pfn_swap_entry-for-hwpoison-entries.patch b/queue-6.1/mm-swapops-update-check-in-is_pfn_swap_entry-for-hwpoison-entries.patch

new file mode 100644 (file)

index 0000000..dc511f1
--- /dev/null
+++ b/queue-6.1/mm-swapops-update-check-in-is_pfn_swap_entry-for-hwpoison-entries.patch
@@ -0,0 +1,178 @@
+From 07a57a338adb6ec9e766d6a6790f76527f45ceb5 Mon Sep 17 00:00:00 2001
+From: Oscar Salvador <osalvador@suse.de>
+Date: Sun, 7 Apr 2024 15:05:37 +0200
+Subject: mm,swapops: update check in is_pfn_swap_entry for hwpoison entries
+
+From: Oscar Salvador <osalvador@suse.de>
+
+commit 07a57a338adb6ec9e766d6a6790f76527f45ceb5 upstream.
+
+Tony reported that the Machine check recovery was broken in v6.9-rc1, as
+he was hitting a VM_BUG_ON when injecting uncorrectable memory errors to
+DRAM.
+
+After some more digging and debugging on his side, he realized that this
+went back to v6.1, with the introduction of 'commit 0d206b5d2e0d
+("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry")'.  That
+commit, among other things, introduced swp_offset_pfn(), replacing
+hwpoison_entry_to_pfn() in its favour.
+
+The patch also introduced a VM_BUG_ON() check for is_pfn_swap_entry(), but
+is_pfn_swap_entry() never got updated to cover hwpoison entries, which
+means that we would hit the VM_BUG_ON whenever we would call
+swp_offset_pfn() for such entries on environments with CONFIG_DEBUG_VM
+set.  Fix this by updating the check to cover hwpoison entries as well,
+and update the comment while we are at it.
+
+Link: https://lkml.kernel.org/r/20240407130537.16977-1-osalvador@suse.de
+Fixes: 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry")
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Reported-by: Tony Luck <tony.luck@intel.com>
+Closes: https://lore.kernel.org/all/Zg8kLSl2yAlA3o5D@agluck-desk3/
+Tested-by: Tony Luck <tony.luck@intel.com>
+Reviewed-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Acked-by: Miaohe Lin <linmiaohe@huawei.com>
+Cc: <stable@vger.kernel.org>   [6.1.x]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/swapops.h |  105 ++++++++++++++++++++++++------------------------
+ 1 file changed, 53 insertions(+), 52 deletions(-)
+
+--- a/include/linux/swapops.h
++++ b/include/linux/swapops.h
+@@ -409,6 +409,55 @@ static inline bool is_migration_entry_di
+ }
+ #endif        /* CONFIG_MIGRATION */
+ 
++#ifdef CONFIG_MEMORY_FAILURE
++
++extern atomic_long_t num_poisoned_pages __read_mostly;
++
++/*
++ * Support for hardware poisoned pages
++ */
++static inline swp_entry_t make_hwpoison_entry(struct page *page)
++{
++      BUG_ON(!PageLocked(page));
++      return swp_entry(SWP_HWPOISON, page_to_pfn(page));
++}
++
++static inline int is_hwpoison_entry(swp_entry_t entry)
++{
++      return swp_type(entry) == SWP_HWPOISON;
++}
++
++static inline void num_poisoned_pages_inc(void)
++{
++      atomic_long_inc(&num_poisoned_pages);
++}
++
++static inline void num_poisoned_pages_sub(long i)
++{
++      atomic_long_sub(i, &num_poisoned_pages);
++}
++
++#else  /* CONFIG_MEMORY_FAILURE */
++
++static inline swp_entry_t make_hwpoison_entry(struct page *page)
++{
++      return swp_entry(0, 0);
++}
++
++static inline int is_hwpoison_entry(swp_entry_t swp)
++{
++      return 0;
++}
++
++static inline void num_poisoned_pages_inc(void)
++{
++}
++
++static inline void num_poisoned_pages_sub(long i)
++{
++}
++#endif  /* CONFIG_MEMORY_FAILURE */
++
+ typedef unsigned long pte_marker;
+ 
+ #define  PTE_MARKER_UFFD_WP  BIT(0)
+@@ -503,8 +552,9 @@ static inline struct page *pfn_swap_entr
+ 
+ /*
+  * A pfn swap entry is a special type of swap entry that always has a pfn stored
+- * in the swap offset. They are used to represent unaddressable device memory
+- * and to restrict access to a page undergoing migration.
++ * in the swap offset. They can either be used to represent unaddressable device
++ * memory, to restrict access to a page undergoing migration or to represent a
++ * pfn which has been hwpoisoned and unmapped.
+  */
+ static inline bool is_pfn_swap_entry(swp_entry_t entry)
+ {
+@@ -512,7 +562,7 @@ static inline bool is_pfn_swap_entry(swp
+       BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);
+ 
+       return is_migration_entry(entry) || is_device_private_entry(entry) ||
+-             is_device_exclusive_entry(entry);
++             is_device_exclusive_entry(entry) || is_hwpoison_entry(entry);
+ }
+ 
+ struct page_vma_mapped_walk;
+@@ -581,55 +631,6 @@ static inline int is_pmd_migration_entry
+ }
+ #endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+ 
+-#ifdef CONFIG_MEMORY_FAILURE
+-
+-extern atomic_long_t num_poisoned_pages __read_mostly;
+-
+-/*
+- * Support for hardware poisoned pages
+- */
+-static inline swp_entry_t make_hwpoison_entry(struct page *page)
+-{
+-      BUG_ON(!PageLocked(page));
+-      return swp_entry(SWP_HWPOISON, page_to_pfn(page));
+-}
+-
+-static inline int is_hwpoison_entry(swp_entry_t entry)
+-{
+-      return swp_type(entry) == SWP_HWPOISON;
+-}
+-
+-static inline void num_poisoned_pages_inc(void)
+-{
+-      atomic_long_inc(&num_poisoned_pages);
+-}
+-
+-static inline void num_poisoned_pages_sub(long i)
+-{
+-      atomic_long_sub(i, &num_poisoned_pages);
+-}
+-
+-#else  /* CONFIG_MEMORY_FAILURE */
+-
+-static inline swp_entry_t make_hwpoison_entry(struct page *page)
+-{
+-      return swp_entry(0, 0);
+-}
+-
+-static inline int is_hwpoison_entry(swp_entry_t swp)
+-{
+-      return 0;
+-}
+-
+-static inline void num_poisoned_pages_inc(void)
+-{
+-}
+-
+-static inline void num_poisoned_pages_sub(long i)
+-{
+-}
+-#endif  /* CONFIG_MEMORY_FAILURE */
+-
+ static inline int non_swap_entry(swp_entry_t entry)
+ {
+       return swp_type(entry) >= MAX_SWAPFILES;
diff --git a/queue-6.1/series b/queue-6.1/series

index fddbdd1f3a964d73afd97e642c20972381680014..7eb2e52b9934ac329ef4e020b7b0801075a60c56 100644 (file)
--- a/queue-6.1/series
+++ b/queue-6.1/series
@@ -233,3 +233,11 @@ bluetooth-qca-fix-nvm-configuration-parsing.patch
  bluetooth-qca-fix-info-leak-when-fetching-board-id.patch
  bluetooth-qca-fix-info-leak-when-fetching-fw-build-id.patch
  bluetooth-qca-fix-firmware-check-error-path.patch
+vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch
+dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch
+dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch
+keys-fix-overwrite-of-key-expiration-on-instantiation.patch
+btrfs-do-not-wait-for-short-bulk-allocation.patch
+mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch
+mm-swapops-update-check-in-is_pfn_swap_entry-for-hwpoison-entries.patch
+md-fix-kmemleak-of-rdev-serial.patch
diff --git a/queue-6.1/vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch b/queue-6.1/vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch

new file mode 100644 (file)

index 0000000..534cafe
--- /dev/null
+++ b/queue-6.1/vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch
@@ -0,0 +1,59 @@
+From 95feb3160eef0caa6018e175a5560b816aee8e79 Mon Sep 17 00:00:00 2001
+From: Arjan van de Ven <arjan@linux.intel.com>
+Date: Thu, 21 Mar 2024 19:44:07 +0000
+Subject: VFIO: Add the SPR_DSA and SPR_IAX devices to the denylist
+
+From: Arjan van de Ven <arjan@linux.intel.com>
+
+commit 95feb3160eef0caa6018e175a5560b816aee8e79 upstream.
+
+Due to an erratum with the SPR_DSA and SPR_IAX devices, it is not secure to assign
+these devices to virtual machines. Add the PCI IDs of these devices to the VFIO
+denylist to ensure that this is handled appropriately by the VFIO subsystem.
+
+The SPR_DSA and SPR_IAX devices are on-SOC devices for the Sapphire Rapids
+(and related) family of products that perform data movement and compression.
+
+Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/idxd/registers.h |    3 ---
+ drivers/vfio/pci/vfio_pci.c  |    2 ++
+ include/linux/pci_ids.h      |    2 ++
+ 3 files changed, 4 insertions(+), 3 deletions(-)
+
+--- a/drivers/dma/idxd/registers.h
++++ b/drivers/dma/idxd/registers.h
+@@ -4,9 +4,6 @@
+ #define _IDXD_REGISTERS_H_
+ 
+ /* PCI Config */
+-#define PCI_DEVICE_ID_INTEL_DSA_SPR0  0x0b25
+-#define PCI_DEVICE_ID_INTEL_IAX_SPR0  0x0cfe
+-
+ #define DEVICE_VERSION_1              0x100
+ #define DEVICE_VERSION_2              0x200
+ 
+--- a/drivers/vfio/pci/vfio_pci.c
++++ b/drivers/vfio/pci/vfio_pci.c
+@@ -71,6 +71,8 @@ static bool vfio_pci_dev_in_denylist(str
+               case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
+               case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
+               case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
++              case PCI_DEVICE_ID_INTEL_DSA_SPR0:
++              case PCI_DEVICE_ID_INTEL_IAX_SPR0:
+                       return true;
+               default:
+                       return false;
+--- a/include/linux/pci_ids.h
++++ b/include/linux/pci_ids.h
+@@ -2664,7 +2664,9 @@
+ #define PCI_DEVICE_ID_INTEL_QUARK_X1000_ILB   0x095e
+ #define PCI_DEVICE_ID_INTEL_I960      0x0960
+ #define PCI_DEVICE_ID_INTEL_I960RM    0x0962
++#define PCI_DEVICE_ID_INTEL_DSA_SPR0  0x0b25
+ #define PCI_DEVICE_ID_INTEL_CENTERTON_ILB     0x0c60
++#define PCI_DEVICE_ID_INTEL_IAX_SPR0  0x0cfe
+ #define PCI_DEVICE_ID_INTEL_8257X_SOL 0x1062
+ #define PCI_DEVICE_ID_INTEL_82573E_SOL        0x1085
+ #define PCI_DEVICE_ID_INTEL_82573L_SOL        0x108f
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Wed, 15 May 2024 07:41:00 +0000 (09:41 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Wed, 15 May 2024 07:41:00 +0000 (09:41 +0200)
queue-6.1/btrfs-do-not-wait-for-short-bulk-allocation.patch	[new file with mode: 0644]	patch | blob
queue-6.1/dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch	[new file with mode: 0644]	patch | blob
queue-6.1/dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch	[new file with mode: 0644]	patch | blob
queue-6.1/keys-fix-overwrite-of-key-expiration-on-instantiation.patch	[new file with mode: 0644]	patch | blob
queue-6.1/md-fix-kmemleak-of-rdev-serial.patch	[new file with mode: 0644]	patch | blob
queue-6.1/mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch	[new file with mode: 0644]	patch | blob
queue-6.1/mm-swapops-update-check-in-is_pfn_swap_entry-for-hwpoison-entries.patch	[new file with mode: 0644]	patch | blob
queue-6.1/series		patch | blob | blame | history
queue-6.1/vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch	[new file with mode: 0644]	patch | blob