From cb99b36875b1be9da5ef3e9c5f1f892d7c7721d4 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 15 May 2024 09:41:00 +0200 Subject: [PATCH] 6.1-stable patches added patches: btrfs-do-not-wait-for-short-bulk-allocation.patch dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch keys-fix-overwrite-of-key-expiration-on-instantiation.patch md-fix-kmemleak-of-rdev-serial.patch mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch mm-swapops-update-check-in-is_pfn_swap_entry-for-hwpoison-entries.patch vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch --- ...o-not-wait-for-short-bulk-allocation.patch | 89 +++++++++ ...heck-to-deal-with-a-hardware-erratum.patch | 97 ++++++++++ ...thod-for-applications-to-submit-work.patch | 164 ++++++++++++++++ ...e-of-key-expiration-on-instantiation.patch | 44 +++++ .../md-fix-kmemleak-of-rdev-serial.patch | 55 ++++++ ...n-1-when-dissolve_free_hugetlb_folio.patch | 147 +++++++++++++++ ..._pfn_swap_entry-for-hwpoison-entries.patch | 178 ++++++++++++++++++ queue-6.1/series | 8 + ...-and-spr_iax-devices-to-the-denylist.patch | 59 ++++++ 9 files changed, 841 insertions(+) create mode 100644 queue-6.1/btrfs-do-not-wait-for-short-bulk-allocation.patch create mode 100644 queue-6.1/dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch create mode 100644 queue-6.1/dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch create mode 100644 queue-6.1/keys-fix-overwrite-of-key-expiration-on-instantiation.patch create mode 100644 queue-6.1/md-fix-kmemleak-of-rdev-serial.patch create mode 100644 queue-6.1/mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch create mode 100644 queue-6.1/mm-swapops-update-check-in-is_pfn_swap_entry-for-hwpoison-entries.patch create mode 100644 queue-6.1/vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch diff 
--git a/queue-6.1/btrfs-do-not-wait-for-short-bulk-allocation.patch b/queue-6.1/btrfs-do-not-wait-for-short-bulk-allocation.patch new file mode 100644 index 00000000000..eb209a1d3ff --- /dev/null +++ b/queue-6.1/btrfs-do-not-wait-for-short-bulk-allocation.patch @@ -0,0 +1,89 @@ +From 1db7959aacd905e6487d0478ac01d89f86eb1e51 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 26 Mar 2024 09:16:46 +1030 +Subject: btrfs: do not wait for short bulk allocation + +From: Qu Wenruo + +commit 1db7959aacd905e6487d0478ac01d89f86eb1e51 upstream. + +[BUG] +There is a recent report that when memory pressure is high (including +cached pages), btrfs can spend most of its time on memory allocation in +btrfs_alloc_page_array() for compressed read/write. + +[CAUSE] +For btrfs_alloc_page_array() we always go alloc_pages_bulk_array(), and +even if the bulk allocation failed (fell back to single page +allocation) we still retry but with extra memalloc_retry_wait(). + +If the bulk alloc only returned one page a time, we would spend a lot of +time on the retry wait. + +The behavior was introduced in commit 395cb57e8560 ("btrfs: wait between +incomplete batch memory allocations"). + +[FIX] +Although the commit mentioned that other filesystems do the wait, it's +not the case at least nowadays. + +All the mainlined filesystems only call memalloc_retry_wait() if they +failed to allocate any page (not only for bulk allocation). +If there is any progress, they won't call memalloc_retry_wait() at all. + +For example, xfs_buf_alloc_pages() would only call memalloc_retry_wait() +if there is no allocation progress at all, and the call is not for +metadata readahead. + +So I don't believe we should call memalloc_retry_wait() unconditionally +for short allocation. + +Call memalloc_retry_wait() if it fails to allocate any page for tree +block allocation (which goes with __GFP_NOFAIL and may not need the +special handling anyway), and reduce the latency for +btrfs_alloc_page_array(). 
+ +Reported-by: Julian Taylor +Tested-by: Julian Taylor +Link: https://lore.kernel.org/all/8966c095-cbe7-4d22-9784-a647d1bf27c3@1und1.de/ +Fixes: 395cb57e8560 ("btrfs: wait between incomplete batch memory allocations") +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Sweet Tea Dorminy +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 19 +++++++------------ + 1 file changed, 7 insertions(+), 12 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -1324,19 +1324,14 @@ int btrfs_alloc_page_array(unsigned int + unsigned int last = allocated; + + allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); +- +- if (allocated == nr_pages) +- return 0; +- +- /* +- * During this iteration, no page could be allocated, even +- * though alloc_pages_bulk_array() falls back to alloc_page() +- * if it could not bulk-allocate. So we must be out of memory. +- */ +- if (allocated == last) ++ if (unlikely(allocated == last)) { ++ /* No progress, fail and do cleanup. */ ++ for (int i = 0; i < allocated; i++) { ++ __free_page(page_array[i]); ++ page_array[i] = NULL; ++ } + return -ENOMEM; +- +- memalloc_retry_wait(GFP_NOFS); ++ } + } + return 0; + } diff --git a/queue-6.1/dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch b/queue-6.1/dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch new file mode 100644 index 00000000000..d0529a34b85 --- /dev/null +++ b/queue-6.1/dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch @@ -0,0 +1,97 @@ +From e11452eb071b2a8e6ba52892b2e270bbdaa6640d Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Wed, 24 Apr 2024 14:43:22 +0000 +Subject: dmaengine: idxd: add a new security check to deal with a hardware erratum + +From: Arjan van de Ven + +commit e11452eb071b2a8e6ba52892b2e270bbdaa6640d upstream. 
+ +On Sapphire Rapids and related platforms, the DSA and IAA devices have an +erratum that causes direct access (for example, by using the ENQCMD or +MOVDIR64 instructions) from untrusted applications to be a security problem. + +To solve this, add a flag to the PCI device enumeration and device structures +to indicate the presence/absence of this security exposure. In the mmap() +method of the device, this flag is then used to enforce that the user +has the CAP_SYS_RAWIO capability. + +In a future patch, a write() based method will be added that allows untrusted +applications to submit work to the accelerator, where the kernel can do +sanity checking on the user input to ensure secure operation of the accelerator. + +Signed-off-by: Arjan van de Ven +Signed-off-by: Greg Kroah-Hartman +--- + drivers/dma/idxd/cdev.c | 12 ++++++++++++ + drivers/dma/idxd/idxd.h | 3 +++ + drivers/dma/idxd/init.c | 4 ++++ + 3 files changed, 19 insertions(+) + +--- a/drivers/dma/idxd/cdev.c ++++ b/drivers/dma/idxd/cdev.c +@@ -198,6 +198,18 @@ static int idxd_cdev_mmap(struct file *f + int rc; + + dev_dbg(&pdev->dev, "%s called\n", __func__); ++ ++ /* ++ * Due to an erratum in some of the devices supported by the driver, ++ * direct user submission to the device can be unsafe. ++ * (See the INTEL-SA-01084 security advisory) ++ * ++ * For the devices that exhibit this behavior, require that the user ++ * has CAP_SYS_RAWIO capabilities. 
++ */ ++ if (!idxd->user_submission_safe && !capable(CAP_SYS_RAWIO)) ++ return -EPERM; ++ + rc = check_vma(wq, vma, __func__); + if (rc < 0) + return rc; +--- a/drivers/dma/idxd/idxd.h ++++ b/drivers/dma/idxd/idxd.h +@@ -258,6 +258,7 @@ struct idxd_driver_data { + struct device_type *dev_type; + int compl_size; + int align; ++ bool user_submission_safe; + }; + + struct idxd_device { +@@ -316,6 +317,8 @@ struct idxd_device { + struct idxd_pmu *idxd_pmu; + + unsigned long *opcap_bmap; ++ ++ bool user_submission_safe; + }; + + /* IDXD software descriptor */ +--- a/drivers/dma/idxd/init.c ++++ b/drivers/dma/idxd/init.c +@@ -48,6 +48,7 @@ static struct idxd_driver_data idxd_driv + .compl_size = sizeof(struct dsa_completion_record), + .align = 32, + .dev_type = &dsa_device_type, ++ .user_submission_safe = false, /* See INTEL-SA-01084 security advisory */ + }, + [IDXD_TYPE_IAX] = { + .name_prefix = "iax", +@@ -55,6 +56,7 @@ static struct idxd_driver_data idxd_driv + .compl_size = sizeof(struct iax_completion_record), + .align = 64, + .dev_type = &iax_device_type, ++ .user_submission_safe = false, /* See INTEL-SA-01084 security advisory */ + }, + }; + +@@ -663,6 +665,8 @@ static int idxd_pci_probe(struct pci_dev + dev_info(&pdev->dev, "Intel(R) Accelerator Device (v%x)\n", + idxd->hw.version); + ++ idxd->user_submission_safe = data->user_submission_safe; ++ + return 0; + + err_dev_register: diff --git a/queue-6.1/dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch b/queue-6.1/dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch new file mode 100644 index 00000000000..4bde7cbd4b7 --- /dev/null +++ b/queue-6.1/dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch @@ -0,0 +1,164 @@ +From 6827738dc684a87ad54ebba3ae7f3d7c977698eb Mon Sep 17 00:00:00 2001 +From: Nikhil Rao +Date: Wed, 24 Apr 2024 15:16:12 +0000 +Subject: dmaengine: idxd: add a write() method for applications to submit work + +From: Nikhil Rao + +commit 
6827738dc684a87ad54ebba3ae7f3d7c977698eb upstream. + +After the patch to restrict the use of mmap() to CAP_SYS_RAWIO for +the currently existing devices, most applications can no longer make +use of the accelerators as in production "you don't run things as root". + +To keep the DSA and IAA accelerators usable, hook up a write() method +so that applications can still submit work. In the write method, +sufficient input validation is performed to avoid the security issue +that required the mmap CAP_SYS_RAWIO check. + +One complication is that the DSA device allows for indirect ("batched") +descriptors. There is no reasonable way to do the input validation +on these indirect descriptors so the write() method will not allow these +to be submitted to the hardware on affected hardware, and the sysfs +enumeration of support for the opcode is also removed. + +Early performance data shows that the performance delta for most common +cases is within the noise. + +Signed-off-by: Nikhil Rao +Signed-off-by: Arjan van de Ven +Signed-off-by: Greg Kroah-Hartman +--- + drivers/dma/idxd/cdev.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++ + drivers/dma/idxd/sysfs.c | 27 ++++++++++++++++++- + 2 files changed, 90 insertions(+), 2 deletions(-) + +--- a/drivers/dma/idxd/cdev.c ++++ b/drivers/dma/idxd/cdev.c +@@ -224,6 +224,70 @@ static int idxd_cdev_mmap(struct file *f + vma->vm_page_prot); + } + ++static int idxd_submit_user_descriptor(struct idxd_user_context *ctx, ++ struct dsa_hw_desc __user *udesc) ++{ ++ struct idxd_wq *wq = ctx->wq; ++ struct idxd_dev *idxd_dev = &wq->idxd->idxd_dev; ++ const uint64_t comp_addr_align = is_dsa_dev(idxd_dev) ? 0x20 : 0x40; ++ void __iomem *portal = idxd_wq_portal_addr(wq); ++ struct dsa_hw_desc descriptor __aligned(64); ++ int rc; ++ ++ rc = copy_from_user(&descriptor, udesc, sizeof(descriptor)); ++ if (rc) ++ return -EFAULT; ++ ++ /* ++ * DSA devices are capable of indirect ("batch") command submission. 
++ * On devices where direct user submissions are not safe, we cannot ++ * allow this since there is no good way for us to verify these ++ * indirect commands. ++ */ ++ if (is_dsa_dev(idxd_dev) && descriptor.opcode == DSA_OPCODE_BATCH && ++ !wq->idxd->user_submission_safe) ++ return -EINVAL; ++ /* ++ * As per the programming specification, the completion address must be ++ * aligned to 32 or 64 bytes. If this is violated the hardware ++ * engine can get very confused (security issue). ++ */ ++ if (!IS_ALIGNED(descriptor.completion_addr, comp_addr_align)) ++ return -EINVAL; ++ ++ if (wq_dedicated(wq)) ++ iosubmit_cmds512(portal, &descriptor, 1); ++ else { ++ descriptor.priv = 0; ++ descriptor.pasid = ctx->pasid; ++ rc = idxd_enqcmds(wq, portal, &descriptor); ++ if (rc < 0) ++ return rc; ++ } ++ ++ return 0; ++} ++ ++static ssize_t idxd_cdev_write(struct file *filp, const char __user *buf, size_t len, ++ loff_t *unused) ++{ ++ struct dsa_hw_desc __user *udesc = (struct dsa_hw_desc __user *)buf; ++ struct idxd_user_context *ctx = filp->private_data; ++ ssize_t written = 0; ++ int i; ++ ++ for (i = 0; i < len/sizeof(struct dsa_hw_desc); i++) { ++ int rc = idxd_submit_user_descriptor(ctx, udesc + i); ++ ++ if (rc) ++ return written ? 
written : rc; ++ ++ written += sizeof(struct dsa_hw_desc); ++ } ++ ++ return written; ++} ++ + static __poll_t idxd_cdev_poll(struct file *filp, + struct poll_table_struct *wait) + { +@@ -246,6 +310,7 @@ static const struct file_operations idxd + .open = idxd_cdev_open, + .release = idxd_cdev_release, + .mmap = idxd_cdev_mmap, ++ .write = idxd_cdev_write, + .poll = idxd_cdev_poll, + }; + +--- a/drivers/dma/idxd/sysfs.c ++++ b/drivers/dma/idxd/sysfs.c +@@ -1162,12 +1162,35 @@ static ssize_t wq_enqcmds_retries_store( + static struct device_attribute dev_attr_wq_enqcmds_retries = + __ATTR(enqcmds_retries, 0644, wq_enqcmds_retries_show, wq_enqcmds_retries_store); + ++static ssize_t op_cap_show_common(struct device *dev, char *buf, unsigned long *opcap_bmap) ++{ ++ ssize_t pos; ++ int i; ++ ++ pos = 0; ++ for (i = IDXD_MAX_OPCAP_BITS/64 - 1; i >= 0; i--) { ++ unsigned long val = opcap_bmap[i]; ++ ++ /* On systems where direct user submissions are not safe, we need to clear out ++ * the BATCH capability from the capability mask in sysfs since we cannot support ++ * that command on such systems. ++ */ ++ if (i == DSA_OPCODE_BATCH/64 && !confdev_to_idxd(dev)->user_submission_safe) ++ clear_bit(DSA_OPCODE_BATCH % 64, &val); ++ ++ pos += sysfs_emit_at(buf, pos, "%*pb", 64, &val); ++ pos += sysfs_emit_at(buf, pos, "%c", i == 0 ? 
'\n' : ','); ++ } ++ ++ return pos; ++} ++ + static ssize_t wq_op_config_show(struct device *dev, + struct device_attribute *attr, char *buf) + { + struct idxd_wq *wq = confdev_to_wq(dev); + +- return sysfs_emit(buf, "%*pb\n", IDXD_MAX_OPCAP_BITS, wq->opcap_bmap); ++ return op_cap_show_common(dev, buf, wq->opcap_bmap); + } + + static int idxd_verify_supported_opcap(struct idxd_device *idxd, unsigned long *opmask) +@@ -1381,7 +1404,7 @@ static ssize_t op_cap_show(struct device + { + struct idxd_device *idxd = confdev_to_idxd(dev); + +- return sysfs_emit(buf, "%*pb\n", IDXD_MAX_OPCAP_BITS, idxd->opcap_bmap); ++ return op_cap_show_common(dev, buf, idxd->opcap_bmap); + } + static DEVICE_ATTR_RO(op_cap); + diff --git a/queue-6.1/keys-fix-overwrite-of-key-expiration-on-instantiation.patch b/queue-6.1/keys-fix-overwrite-of-key-expiration-on-instantiation.patch new file mode 100644 index 00000000000..35b558d1646 --- /dev/null +++ b/queue-6.1/keys-fix-overwrite-of-key-expiration-on-instantiation.patch @@ -0,0 +1,44 @@ +From 9da27fb65a14c18efd4473e2e82b76b53ba60252 Mon Sep 17 00:00:00 2001 +From: Silvio Gissi +Date: Fri, 15 Mar 2024 15:05:39 -0400 +Subject: keys: Fix overwrite of key expiration on instantiation + +From: Silvio Gissi + +commit 9da27fb65a14c18efd4473e2e82b76b53ba60252 upstream. + +The expiry time of a key is unconditionally overwritten during +instantiation, defaulting to turn it permanent. This causes a problem +for DNS resolution as the expiration set by user-space is overwritten to +TIME64_MAX, disabling further DNS updates. Fix this by restoring the +condition that key_set_expiry is only called when the pre-parser sets a +specific expiry. + +Fixes: 39299bdd2546 ("keys, dns: Allow key types (eg. 
DNS) to be reclaimed immediately on expiry") +Signed-off-by: Silvio Gissi +cc: David Howells +cc: Hazem Mohamed Abuelfotoh +cc: linux-afs@lists.infradead.org +cc: linux-cifs@vger.kernel.org +cc: keyrings@vger.kernel.org +cc: netdev@vger.kernel.org +cc: stable@vger.kernel.org +Reviewed-by: Jarkko Sakkinen +Signed-off-by: Jarkko Sakkinen +Signed-off-by: Greg Kroah-Hartman +--- + security/keys/key.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/security/keys/key.c ++++ b/security/keys/key.c +@@ -464,7 +464,8 @@ static int __key_instantiate_and_link(st + if (authkey) + key_invalidate(authkey); + +- key_set_expiry(key, prep->expiry); ++ if (prep->expiry != TIME64_MAX) ++ key_set_expiry(key, prep->expiry); + } + } + diff --git a/queue-6.1/md-fix-kmemleak-of-rdev-serial.patch b/queue-6.1/md-fix-kmemleak-of-rdev-serial.patch new file mode 100644 index 00000000000..013efe94f4f --- /dev/null +++ b/queue-6.1/md-fix-kmemleak-of-rdev-serial.patch @@ -0,0 +1,55 @@ +From 6cf350658736681b9d6b0b6e58c5c76b235bb4c4 Mon Sep 17 00:00:00 2001 +From: Li Nan +Date: Thu, 8 Feb 2024 16:55:56 +0800 +Subject: md: fix kmemleak of rdev->serial + +From: Li Nan + +commit 6cf350658736681b9d6b0b6e58c5c76b235bb4c4 upstream. + +If kobject_add() fails in bind_rdev_to_array(), the allocated 'rdev->serial' +will not be freed, and a kmemleak occurs. + +unreferenced object 0xffff88815a350000 (size 49152): + comm "mdadm", pid 789, jiffies 4294716910 + hex dump (first 32 bytes): + 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ backtrace (crc f773277a): + [<0000000058b0a453>] kmemleak_alloc+0x61/0xe0 + [<00000000366adf14>] __kmalloc_large_node+0x15e/0x270 + [<000000002e82961b>] __kmalloc_node.cold+0x11/0x7f + [<00000000f206d60a>] kvmalloc_node+0x74/0x150 + [<0000000034bf3363>] rdev_init_serial+0x67/0x170 + [<0000000010e08fe9>] mddev_create_serial_pool+0x62/0x220 + [<00000000c3837bf0>] bind_rdev_to_array+0x2af/0x630 + [<0000000073c28560>] md_add_new_disk+0x400/0x9f0 + [<00000000770e30ff>] md_ioctl+0x15bf/0x1c10 + [<000000006cfab718>] blkdev_ioctl+0x191/0x3f0 + [<0000000085086a11>] vfs_ioctl+0x22/0x60 + [<0000000018b656fe>] __x64_sys_ioctl+0xba/0xe0 + [<00000000e54e675e>] do_syscall_64+0x71/0x150 + [<000000008b0ad622>] entry_SYSCALL_64_after_hwframe+0x6c/0x74 + +Fixes: 963c555e75b0 ("md: introduce mddev_create/destroy_wb_pool for the change of member device") +Signed-off-by: Li Nan +Signed-off-by: Song Liu +Link: https://lore.kernel.org/r/20240208085556.2412922-1-linan666@huaweicloud.com +[ mddev_destroy_serial_pool third parameter was removed in mainline, + where there is no need to suspend within this function anymore. 
] +Signed-off-by: Jeremy Bongio +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/md.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -2508,6 +2508,7 @@ static int bind_rdev_to_array(struct md_ + fail: + pr_warn("md: failed to register dev-%s for %s\n", + b, mdname(mddev)); ++ mddev_destroy_serial_pool(mddev, rdev, false); + return err; + } + diff --git a/queue-6.1/mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch b/queue-6.1/mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch new file mode 100644 index 00000000000..dd32bca4983 --- /dev/null +++ b/queue-6.1/mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch @@ -0,0 +1,147 @@ +From 52ccdde16b6540abe43b6f8d8e1e1ec90b0983af Mon Sep 17 00:00:00 2001 +From: Miaohe Lin +Date: Fri, 19 Apr 2024 16:58:19 +0800 +Subject: mm/hugetlb: fix DEBUG_LOCKS_WARN_ON(1) when dissolve_free_hugetlb_folio() + +From: Miaohe Lin + +commit 52ccdde16b6540abe43b6f8d8e1e1ec90b0983af upstream. 
+ +When I did memory failure tests recently, below warning occurs: + +DEBUG_LOCKS_WARN_ON(1) +WARNING: CPU: 8 PID: 1011 at kernel/locking/lockdep.c:232 __lock_acquire+0xccb/0x1ca0 +Modules linked in: mce_inject hwpoison_inject +CPU: 8 PID: 1011 Comm: bash Kdump: loaded Not tainted 6.9.0-rc3-next-20240410-00012-gdb69f219f4be #3 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 +RIP: 0010:__lock_acquire+0xccb/0x1ca0 +RSP: 0018:ffffa7a1c7fe3bd0 EFLAGS: 00000082 +RAX: 0000000000000000 RBX: eb851eb853975fcf RCX: ffffa1ce5fc1c9c8 +RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffffa1ce5fc1c9c0 +RBP: ffffa1c6865d3280 R08: ffffffffb0f570a8 R09: 0000000000009ffb +R10: 0000000000000286 R11: ffffffffb0f2ad50 R12: ffffa1c6865d3d10 +R13: ffffa1c6865d3c70 R14: 0000000000000000 R15: 0000000000000004 +FS: 00007ff9f32aa740(0000) GS:ffffa1ce5fc00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007ff9f3134ba0 CR3: 00000008484e4000 CR4: 00000000000006f0 +Call Trace: + + lock_acquire+0xbe/0x2d0 + _raw_spin_lock_irqsave+0x3a/0x60 + hugepage_subpool_put_pages.part.0+0xe/0xc0 + free_huge_folio+0x253/0x3f0 + dissolve_free_huge_page+0x147/0x210 + __page_handle_poison+0x9/0x70 + memory_failure+0x4e6/0x8c0 + hard_offline_page_store+0x55/0xa0 + kernfs_fop_write_iter+0x12c/0x1d0 + vfs_write+0x380/0x540 + ksys_write+0x64/0xe0 + do_syscall_64+0xbc/0x1d0 + entry_SYSCALL_64_after_hwframe+0x77/0x7f +RIP: 0033:0x7ff9f3114887 +RSP: 002b:00007ffecbacb458 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 +RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007ff9f3114887 +RDX: 000000000000000c RSI: 0000564494164e10 RDI: 0000000000000001 +RBP: 0000564494164e10 R08: 00007ff9f31d1460 R09: 000000007fffffff +R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c +R13: 00007ff9f321b780 R14: 00007ff9f3217600 R15: 00007ff9f3216a00 + +Kernel panic - not syncing: kernel: panic_on_warn set ... 
+CPU: 8 PID: 1011 Comm: bash Kdump: loaded Not tainted 6.9.0-rc3-next-20240410-00012-gdb69f219f4be #3 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 +Call Trace: + + panic+0x326/0x350 + check_panic_on_warn+0x4f/0x50 + __warn+0x98/0x190 + report_bug+0x18e/0x1a0 + handle_bug+0x3d/0x70 + exc_invalid_op+0x18/0x70 + asm_exc_invalid_op+0x1a/0x20 +RIP: 0010:__lock_acquire+0xccb/0x1ca0 +RSP: 0018:ffffa7a1c7fe3bd0 EFLAGS: 00000082 +RAX: 0000000000000000 RBX: eb851eb853975fcf RCX: ffffa1ce5fc1c9c8 +RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffffa1ce5fc1c9c0 +RBP: ffffa1c6865d3280 R08: ffffffffb0f570a8 R09: 0000000000009ffb +R10: 0000000000000286 R11: ffffffffb0f2ad50 R12: ffffa1c6865d3d10 +R13: ffffa1c6865d3c70 R14: 0000000000000000 R15: 0000000000000004 + lock_acquire+0xbe/0x2d0 + _raw_spin_lock_irqsave+0x3a/0x60 + hugepage_subpool_put_pages.part.0+0xe/0xc0 + free_huge_folio+0x253/0x3f0 + dissolve_free_huge_page+0x147/0x210 + __page_handle_poison+0x9/0x70 + memory_failure+0x4e6/0x8c0 + hard_offline_page_store+0x55/0xa0 + kernfs_fop_write_iter+0x12c/0x1d0 + vfs_write+0x380/0x540 + ksys_write+0x64/0xe0 + do_syscall_64+0xbc/0x1d0 + entry_SYSCALL_64_after_hwframe+0x77/0x7f +RIP: 0033:0x7ff9f3114887 +RSP: 002b:00007ffecbacb458 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 +RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007ff9f3114887 +RDX: 000000000000000c RSI: 0000564494164e10 RDI: 0000000000000001 +RBP: 0000564494164e10 R08: 00007ff9f31d1460 R09: 000000007fffffff +R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c +R13: 00007ff9f321b780 R14: 00007ff9f3217600 R15: 00007ff9f3216a00 + + +After git bisecting and digging into the code, I believe the root cause is +that _deferred_list field of folio is unioned with _hugetlb_subpool field. +In __update_and_free_hugetlb_folio(), folio->_deferred_list is +initialized leading to corrupted folio->_hugetlb_subpool when folio is +hugetlb. 
Later free_huge_folio() will use _hugetlb_subpool and above +warning happens. + +But it is assumed hugetlb flag must have been cleared when calling +folio_put() in update_and_free_hugetlb_folio(). This assumption is broken +due to below race: + +CPU1 CPU2 +dissolve_free_huge_page update_and_free_pages_bulk + update_and_free_hugetlb_folio hugetlb_vmemmap_restore_folios + folio_clear_hugetlb_vmemmap_optimized + clear_flag = folio_test_hugetlb_vmemmap_optimized + if (clear_flag) <-- False, it's already cleared. + __folio_clear_hugetlb(folio) <-- Hugetlb is not cleared. + folio_put + free_huge_folio <-- free_the_page is expected. + list_for_each_entry() + __folio_clear_hugetlb <-- Too late. + +Fix this issue by checking whether folio is hugetlb directly instead of +checking clear_flag to close the race window. + +Link: https://lkml.kernel.org/r/20240419085819.1901645-1-linmiaohe@huawei.com +Fixes: 32c877191e02 ("hugetlb: do not clear hugetlb dtor until allocating vmemmap") +Signed-off-by: Miaohe Lin +Reviewed-by: Oscar Salvador +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Miaohe Lin +Signed-off-by: Greg Kroah-Hartman +--- + mm/hugetlb.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -1762,7 +1762,6 @@ static void __update_and_free_page(struc + { + int i; + struct page *subpage; +- bool clear_dtor = HPageVmemmapOptimized(page); + + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return; +@@ -1797,7 +1796,7 @@ static void __update_and_free_page(struc + * If vmemmap pages were allocated above, then we need to clear the + * hugetlb destructor under the hugetlb lock. 
+ */ +- if (clear_dtor) { ++ if (PageHuge(page)) { + spin_lock_irq(&hugetlb_lock); + __clear_hugetlb_destructor(h, page); + spin_unlock_irq(&hugetlb_lock); diff --git a/queue-6.1/mm-swapops-update-check-in-is_pfn_swap_entry-for-hwpoison-entries.patch b/queue-6.1/mm-swapops-update-check-in-is_pfn_swap_entry-for-hwpoison-entries.patch new file mode 100644 index 00000000000..dc511f15cf8 --- /dev/null +++ b/queue-6.1/mm-swapops-update-check-in-is_pfn_swap_entry-for-hwpoison-entries.patch @@ -0,0 +1,178 @@ +From 07a57a338adb6ec9e766d6a6790f76527f45ceb5 Mon Sep 17 00:00:00 2001 +From: Oscar Salvador +Date: Sun, 7 Apr 2024 15:05:37 +0200 +Subject: mm,swapops: update check in is_pfn_swap_entry for hwpoison entries + +From: Oscar Salvador + +commit 07a57a338adb6ec9e766d6a6790f76527f45ceb5 upstream. + +Tony reported that the Machine check recovery was broken in v6.9-rc1, as +he was hitting a VM_BUG_ON when injecting uncorrectable memory errors to +DRAM. + +After some more digging and debugging on his side, he realized that this +went back to v6.1, with the introduction of 'commit 0d206b5d2e0d +("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry")'. That +commit, among other things, introduced swp_offset_pfn(), replacing +hwpoison_entry_to_pfn() in its favour. + +The patch also introduced a VM_BUG_ON() check for is_pfn_swap_entry(), but +is_pfn_swap_entry() never got updated to cover hwpoison entries, which +means that we would hit the VM_BUG_ON whenever we would call +swp_offset_pfn() for such entries on environments with CONFIG_DEBUG_VM +set. Fix this by updating the check to cover hwpoison entries as well, +and update the comment while we are at it. 
+ +Link: https://lkml.kernel.org/r/20240407130537.16977-1-osalvador@suse.de +Fixes: 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry") +Signed-off-by: Oscar Salvador +Reported-by: Tony Luck +Closes: https://lore.kernel.org/all/Zg8kLSl2yAlA3o5D@agluck-desk3/ +Tested-by: Tony Luck +Reviewed-by: Peter Xu +Reviewed-by: David Hildenbrand +Acked-by: Miaohe Lin +Cc: [6.1.x] +Signed-off-by: Andrew Morton +Signed-off-by: Miaohe Lin +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/swapops.h | 105 ++++++++++++++++++++++++------------------------ + 1 file changed, 53 insertions(+), 52 deletions(-) + +--- a/include/linux/swapops.h ++++ b/include/linux/swapops.h +@@ -409,6 +409,55 @@ static inline bool is_migration_entry_di + } + #endif /* CONFIG_MIGRATION */ + ++#ifdef CONFIG_MEMORY_FAILURE ++ ++extern atomic_long_t num_poisoned_pages __read_mostly; ++ ++/* ++ * Support for hardware poisoned pages ++ */ ++static inline swp_entry_t make_hwpoison_entry(struct page *page) ++{ ++ BUG_ON(!PageLocked(page)); ++ return swp_entry(SWP_HWPOISON, page_to_pfn(page)); ++} ++ ++static inline int is_hwpoison_entry(swp_entry_t entry) ++{ ++ return swp_type(entry) == SWP_HWPOISON; ++} ++ ++static inline void num_poisoned_pages_inc(void) ++{ ++ atomic_long_inc(&num_poisoned_pages); ++} ++ ++static inline void num_poisoned_pages_sub(long i) ++{ ++ atomic_long_sub(i, &num_poisoned_pages); ++} ++ ++#else /* CONFIG_MEMORY_FAILURE */ ++ ++static inline swp_entry_t make_hwpoison_entry(struct page *page) ++{ ++ return swp_entry(0, 0); ++} ++ ++static inline int is_hwpoison_entry(swp_entry_t swp) ++{ ++ return 0; ++} ++ ++static inline void num_poisoned_pages_inc(void) ++{ ++} ++ ++static inline void num_poisoned_pages_sub(long i) ++{ ++} ++#endif /* CONFIG_MEMORY_FAILURE */ ++ + typedef unsigned long pte_marker; + + #define PTE_MARKER_UFFD_WP BIT(0) +@@ -503,8 +552,9 @@ static inline struct page *pfn_swap_entr + + /* + * A pfn swap entry is a special type of swap entry 
that always has a pfn stored +- * in the swap offset. They are used to represent unaddressable device memory +- * and to restrict access to a page undergoing migration. ++ * in the swap offset. They can either be used to represent unaddressable device ++ * memory, to restrict access to a page undergoing migration or to represent a ++ * pfn which has been hwpoisoned and unmapped. + */ + static inline bool is_pfn_swap_entry(swp_entry_t entry) + { +@@ -512,7 +562,7 @@ static inline bool is_pfn_swap_entry(swp + BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); + + return is_migration_entry(entry) || is_device_private_entry(entry) || +- is_device_exclusive_entry(entry); ++ is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); + } + + struct page_vma_mapped_walk; +@@ -581,55 +631,6 @@ static inline int is_pmd_migration_entry + } + #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +-#ifdef CONFIG_MEMORY_FAILURE +- +-extern atomic_long_t num_poisoned_pages __read_mostly; +- +-/* +- * Support for hardware poisoned pages +- */ +-static inline swp_entry_t make_hwpoison_entry(struct page *page) +-{ +- BUG_ON(!PageLocked(page)); +- return swp_entry(SWP_HWPOISON, page_to_pfn(page)); +-} +- +-static inline int is_hwpoison_entry(swp_entry_t entry) +-{ +- return swp_type(entry) == SWP_HWPOISON; +-} +- +-static inline void num_poisoned_pages_inc(void) +-{ +- atomic_long_inc(&num_poisoned_pages); +-} +- +-static inline void num_poisoned_pages_sub(long i) +-{ +- atomic_long_sub(i, &num_poisoned_pages); +-} +- +-#else /* CONFIG_MEMORY_FAILURE */ +- +-static inline swp_entry_t make_hwpoison_entry(struct page *page) +-{ +- return swp_entry(0, 0); +-} +- +-static inline int is_hwpoison_entry(swp_entry_t swp) +-{ +- return 0; +-} +- +-static inline void num_poisoned_pages_inc(void) +-{ +-} +- +-static inline void num_poisoned_pages_sub(long i) +-{ +-} +-#endif /* CONFIG_MEMORY_FAILURE */ +- + static inline int non_swap_entry(swp_entry_t entry) + { + return swp_type(entry) >= 
MAX_SWAPFILES; diff --git a/queue-6.1/series b/queue-6.1/series index fddbdd1f3a9..7eb2e52b993 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -233,3 +233,11 @@ bluetooth-qca-fix-nvm-configuration-parsing.patch bluetooth-qca-fix-info-leak-when-fetching-board-id.patch bluetooth-qca-fix-info-leak-when-fetching-fw-build-id.patch bluetooth-qca-fix-firmware-check-error-path.patch +vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch +dmaengine-idxd-add-a-new-security-check-to-deal-with-a-hardware-erratum.patch +dmaengine-idxd-add-a-write-method-for-applications-to-submit-work.patch +keys-fix-overwrite-of-key-expiration-on-instantiation.patch +btrfs-do-not-wait-for-short-bulk-allocation.patch +mm-hugetlb-fix-debug_locks_warn_on-1-when-dissolve_free_hugetlb_folio.patch +mm-swapops-update-check-in-is_pfn_swap_entry-for-hwpoison-entries.patch +md-fix-kmemleak-of-rdev-serial.patch diff --git a/queue-6.1/vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch b/queue-6.1/vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch new file mode 100644 index 00000000000..534cafec284 --- /dev/null +++ b/queue-6.1/vfio-add-the-spr_dsa-and-spr_iax-devices-to-the-denylist.patch @@ -0,0 +1,59 @@ +From 95feb3160eef0caa6018e175a5560b816aee8e79 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 21 Mar 2024 19:44:07 +0000 +Subject: VFIO: Add the SPR_DSA and SPR_IAX devices to the denylist + +From: Arjan van de Ven + +commit 95feb3160eef0caa6018e175a5560b816aee8e79 upstream. + +Due to an erratum with the SPR_DSA and SPR_IAX devices, it is not secure to assign +these devices to virtual machines. Add the PCI IDs of these devices to the VFIO +denylist to ensure that this is handled appropriately by the VFIO subsystem. + +The SPR_DSA and SPR_IAX devices are on-SOC devices for the Sapphire Rapids +(and related) family of products that perform data movement and compression. 
+ +Signed-off-by: Arjan van de Ven +Signed-off-by: Greg Kroah-Hartman +--- + drivers/dma/idxd/registers.h | 3 --- + drivers/vfio/pci/vfio_pci.c | 2 ++ + include/linux/pci_ids.h | 2 ++ + 3 files changed, 4 insertions(+), 3 deletions(-) + +--- a/drivers/dma/idxd/registers.h ++++ b/drivers/dma/idxd/registers.h +@@ -4,9 +4,6 @@ + #define _IDXD_REGISTERS_H_ + + /* PCI Config */ +-#define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25 +-#define PCI_DEVICE_ID_INTEL_IAX_SPR0 0x0cfe +- + #define DEVICE_VERSION_1 0x100 + #define DEVICE_VERSION_2 0x200 + +--- a/drivers/vfio/pci/vfio_pci.c ++++ b/drivers/vfio/pci/vfio_pci.c +@@ -71,6 +71,8 @@ static bool vfio_pci_dev_in_denylist(str + case PCI_DEVICE_ID_INTEL_QAT_C62X_VF: + case PCI_DEVICE_ID_INTEL_QAT_DH895XCC: + case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF: ++ case PCI_DEVICE_ID_INTEL_DSA_SPR0: ++ case PCI_DEVICE_ID_INTEL_IAX_SPR0: + return true; + default: + return false; +--- a/include/linux/pci_ids.h ++++ b/include/linux/pci_ids.h +@@ -2664,7 +2664,9 @@ + #define PCI_DEVICE_ID_INTEL_QUARK_X1000_ILB 0x095e + #define PCI_DEVICE_ID_INTEL_I960 0x0960 + #define PCI_DEVICE_ID_INTEL_I960RM 0x0962 ++#define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25 + #define PCI_DEVICE_ID_INTEL_CENTERTON_ILB 0x0c60 ++#define PCI_DEVICE_ID_INTEL_IAX_SPR0 0x0cfe + #define PCI_DEVICE_ID_INTEL_8257X_SOL 0x1062 + #define PCI_DEVICE_ID_INTEL_82573E_SOL 0x1085 + #define PCI_DEVICE_ID_INTEL_82573L_SOL 0x108f -- 2.47.2