From: Greg Kroah-Hartman
Date: Mon, 21 Jun 2021 13:09:22 +0000 (+0200)
Subject: 5.12-stable patches
X-Git-Tag: v5.4.128~21
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8130e859c66a6c0a8df3c47e01e321327bd82246;p=thirdparty%2Fkernel%2Fstable-queue.git

5.12-stable patches

added patches:
	cfg80211-avoid-double-free-of-pmsr-request.patch
	cfg80211-fix-phy80211-symlink-creation.patch
	cfg80211-make-certificate-generation-more-robust.patch
	cfg80211-shut-down-interfaces-on-failed-resume.patch
	crash_core-vmcoreinfo-append-section_size_bits-to-vmcoreinfo.patch
	dmaengine-pl330-fix-wrong-usage-of-spinlock-flags-in-dma_cyclc.patch
	drm-amdgpu-gfx10-enlarge-cp_mec_doorbell_range_upper-to-cover-full-doorbell.patch
	drm-amdgpu-gfx9-fix-the-doorbell-missing-when-in-cgpg-issue.patch
	mac80211-fix-deadlock-in-ap-vlan-handling.patch
	mac80211-fix-null-ptr-deref-for-injected-rate-info.patch
	mac80211-fix-reset-debugfs-locking.patch
	mac80211-minstrel_ht-fix-sample-time-check.patch
	mac80211-move-interface-shutdown-out-of-wiphy-lock.patch
	makefile-lto-pass-warn-stack-size-only-on-lld-13.0.0.patch
	mm-hugetlb-expand-restore_reserve_on_error-functionality.patch
	mm-hwpoison-fix-race-with-hugetlb-page-allocation.patch
	mm-slub-actually-fix-freelist-pointer-vs-redzoning.patch
	mm-slub-clarify-verification-reporting.patch
	mm-slub-fix-redzoning-for-small-allocations.patch
	mm-slub.c-include-swab.h.patch
	mm-swap-fix-pte_same_as_swp-not-removing-uffd-wp-bit-when-compare.patch
	net-bridge-fix-vlan-tunnel-dst-null-pointer-dereference.patch
	net-bridge-fix-vlan-tunnel-dst-refcnt-when-egressing.patch
	net-ll_temac-fix-tx-bd-buffer-overwrite.patch
	net-ll_temac-make-sure-to-free-skb-when-it-is-completely-used.patch
	powerpc-perf-fix-crash-in-perf_instruction_pointer-when-ppmu-is-not-set.patch
	x86-fpu-invalidate-fpu-state-after-a-failed-xrstor-from-a-user-buffer.patch
	x86-fpu-prevent-state-corruption-in-__fpu__restore_sig.patch
	x86-fpu-reset-state-for-all-signal-restore-failures.patch
	x86-ioremap-map-efi-reserved-memory-as-encrypted-for-sev.patch
	x86-mm-avoid-truncating-memblocks-for-sgx-memory.patch
	x86-pkru-write-hardware-init-value-to-pkru-when-xstate-is-init.patch
	x86-process-check-pf_kthread-and-not-current-mm-for-kernel-threads.patch
---

diff --git a/queue-5.12/cfg80211-avoid-double-free-of-pmsr-request.patch b/queue-5.12/cfg80211-avoid-double-free-of-pmsr-request.patch
new file mode 100644
index 00000000000..24e09cf033e
--- /dev/null
+++ b/queue-5.12/cfg80211-avoid-double-free-of-pmsr-request.patch
@@ -0,0 +1,61 @@
+From 0288e5e16a2e18f0b7e61a2b70d9037fc6e4abeb Mon Sep 17 00:00:00 2001
+From: Avraham Stern
+Date: Fri, 18 Jun 2021 13:41:31 +0300
+Subject: cfg80211: avoid double free of PMSR request
+
+From: Avraham Stern
+
+commit 0288e5e16a2e18f0b7e61a2b70d9037fc6e4abeb upstream.
+
+If cfg80211_pmsr_process_abort() moves all the PMSR requests that
+need to be freed into a local list before aborting and freeing them.
+As a result, it is possible that cfg80211_pmsr_complete() will run in
+parallel and free the same PMSR request.
+
+Fix it by freeing the request in cfg80211_pmsr_complete() only if it
+is still in the original pmsr list.
+
+Cc: stable@vger.kernel.org
+Fixes: 9bb7e0f24e7e ("cfg80211: add peer measurement with FTM initiator API")
+Signed-off-by: Avraham Stern
+Signed-off-by: Luca Coelho
+Link: https://lore.kernel.org/r/iwlwifi.20210618133832.1fbef57e269a.I00294bebdb0680b892f8d1d5c871fd9dbe785a5e@changeid
+Signed-off-by: Johannes Berg
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/wireless/pmsr.c | 16 ++++++++++++++--
+ 1 file changed, 14 insertions(+), 2 deletions(-)
+
+--- a/net/wireless/pmsr.c
++++ b/net/wireless/pmsr.c
+@@ -324,6 +324,7 @@ void cfg80211_pmsr_complete(struct wirel
+ gfp_t gfp)
+ {
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
++ struct cfg80211_pmsr_request *tmp, *prev, *to_free = NULL;
+ struct sk_buff *msg;
+ void *hdr;
+
+@@ -354,9 +355,20 @@ free_msg:
+ nlmsg_free(msg);
+ free_request:
+ spin_lock_bh(&wdev->pmsr_lock);
+- list_del(&req->list);
++ /*
++ * cfg80211_pmsr_process_abort() may have already moved this request
++ * to the free list, and will free it later. In this case, don't free
++ * it here.
++ */
++ list_for_each_entry_safe(tmp, prev, &wdev->pmsr_list, list) {
++ if (tmp == req) {
++ list_del(&req->list);
++ to_free = req;
++ break;
++ }
++ }
+ spin_unlock_bh(&wdev->pmsr_lock);
+- kfree(req);
++ kfree(to_free);
+ }
+ EXPORT_SYMBOL_GPL(cfg80211_pmsr_complete);
+
diff --git a/queue-5.12/cfg80211-fix-phy80211-symlink-creation.patch b/queue-5.12/cfg80211-fix-phy80211-symlink-creation.patch
new file mode 100644
index 00000000000..1adad4aa1d0
--- /dev/null
+++ b/queue-5.12/cfg80211-fix-phy80211-symlink-creation.patch
@@ -0,0 +1,55 @@
+From 43076c1e074359f11c85d7d1b85ede1bbb8ee6b9 Mon Sep 17 00:00:00 2001
+From: Johannes Berg
+Date: Tue, 8 Jun 2021 11:32:28 +0200
+Subject: cfg80211: fix phy80211 symlink creation
+
+From: Johannes Berg
+
+commit 43076c1e074359f11c85d7d1b85ede1bbb8ee6b9 upstream.
+
+When I moved around the code here, I neglected that we could still
+call register_netdev() or similar without the wiphy mutex held,
+which then calls cfg80211_register_wdev() - that's also done from
+cfg80211_register_netdevice(), but the phy80211 symlink creation
+was only there. Now, the symlink isn't needed for a *pure* wdev,
+but a netdev not registered via cfg80211_register_wdev() should
+still have the symlink, so move the creation to the right place.
+
+Cc: stable@vger.kernel.org
+Fixes: 2fe8ef106238 ("cfg80211: change netdev registration/unregistration semantics")
+Link: https://lore.kernel.org/r/20210608113226.a5dc4c1e488c.Ia42fe663cefe47b0883af78c98f284c5555bbe5d@changeid
+Signed-off-by: Johannes Berg
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/wireless/core.c | 13 +++++--------
+ 1 file changed, 5 insertions(+), 8 deletions(-)
+
+--- a/net/wireless/core.c
++++ b/net/wireless/core.c
+@@ -1339,6 +1339,11 @@ void cfg80211_register_wdev(struct cfg80
+ rdev->devlist_generation++;
+ wdev->registered = true;
+
++ if (wdev->netdev &&
++ sysfs_create_link(&wdev->netdev->dev.kobj, &rdev->wiphy.dev.kobj,
++ "phy80211"))
++ pr_err("failed to add phy80211 symlink to netdev!\n");
++
+ nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
+ }
+
+@@ -1364,14 +1369,6 @@ int cfg80211_register_netdevice(struct n
+ if (ret)
+ goto out;
+
+- if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj,
+- "phy80211")) {
+- pr_err("failed to add phy80211 symlink to netdev!\n");
+- unregister_netdevice(dev);
+- ret = -EINVAL;
+- goto out;
+- }
+-
+ cfg80211_register_wdev(rdev, wdev);
+ ret = 0;
+ out:
diff --git a/queue-5.12/cfg80211-make-certificate-generation-more-robust.patch b/queue-5.12/cfg80211-make-certificate-generation-more-robust.patch
new file mode 100644
index 00000000000..f2b0b74f2bd
--- /dev/null
+++ b/queue-5.12/cfg80211-make-certificate-generation-more-robust.patch
@@ -0,0 +1,35 @@
+From b5642479b0f7168fe16d156913533fe65ab4f8d5 Mon Sep 17 00:00:00 2001
+From: Johannes Berg
+Date: Fri, 18 Jun 2021 13:41:29 +0300
+Subject: cfg80211: make certificate generation more robust
+
+From: Johannes Berg
+
+commit b5642479b0f7168fe16d156913533fe65ab4f8d5 upstream.
+
+If all net/wireless/certs/*.hex files are deleted, the build
+will hang at this point since the 'cat' command will have no
+arguments. Do "echo | cat - ..." so that even if the "..."
+part is empty, the whole thing won't hang.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Johannes Berg
+Signed-off-by: Luca Coelho
+Link: https://lore.kernel.org/r/iwlwifi.20210618133832.c989056c3664.Ic3b77531d00b30b26dcd69c64e55ae2f60c3f31e@changeid
+Signed-off-by: Johannes Berg
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/wireless/Makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/wireless/Makefile
++++ b/net/wireless/Makefile
+@@ -28,7 +28,7 @@ $(obj)/shipped-certs.c: $(wildcard $(src
+ @$(kecho) "  GEN     $@"
+ @(echo '#include "reg.h"'; \
+ echo 'const u8 shipped_regdb_certs[] = {'; \
+- cat $^ ; \
++ echo | cat - $^ ; \
+ echo '};'; \
+ echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \
+ ) > $@
diff --git a/queue-5.12/cfg80211-shut-down-interfaces-on-failed-resume.patch b/queue-5.12/cfg80211-shut-down-interfaces-on-failed-resume.patch
new file mode 100644
index 00000000000..fc4d523c027
--- /dev/null
+++ b/queue-5.12/cfg80211-shut-down-interfaces-on-failed-resume.patch
@@ -0,0 +1,36 @@
+From 65bec836da8394b1d56bdec2c478dcac21cf12a4 Mon Sep 17 00:00:00 2001
+From: Johannes Berg
+Date: Tue, 8 Jun 2021 11:32:29 +0200
+Subject: cfg80211: shut down interfaces on failed resume
+
+From: Johannes Berg
+
+commit 65bec836da8394b1d56bdec2c478dcac21cf12a4 upstream.
+
+If resume fails, we should shut down all interfaces as the
+hardware is probably dead. This was/is already done now in
+mac80211, but we need to change that due to locking issues,
+so move it here and do it without the wiphy lock held.
+
+Cc: stable@vger.kernel.org
+Fixes: 2fe8ef106238 ("cfg80211: change netdev registration/unregistration semantics")
+Link: https://lore.kernel.org/r/20210608113226.d564ca69de7c.I2e3c3e5d410b72a4f63bade4fb075df041b3d92f@changeid
+Signed-off-by: Johannes Berg
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/wireless/sysfs.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/net/wireless/sysfs.c
++++ b/net/wireless/sysfs.c
+@@ -133,6 +133,10 @@ static int wiphy_resume(struct device *d
+ if (rdev->wiphy.registered && rdev->ops->resume)
+ ret = rdev_resume(rdev);
+ wiphy_unlock(&rdev->wiphy);
++
++ if (ret)
++ cfg80211_shutdown_all_interfaces(&rdev->wiphy);
++
+ rtnl_unlock();
+
+ return ret;
diff --git a/queue-5.12/crash_core-vmcoreinfo-append-section_size_bits-to-vmcoreinfo.patch b/queue-5.12/crash_core-vmcoreinfo-append-section_size_bits-to-vmcoreinfo.patch
new file mode 100644
index 00000000000..6af9117a823
--- /dev/null
+++ b/queue-5.12/crash_core-vmcoreinfo-append-section_size_bits-to-vmcoreinfo.patch
@@ -0,0 +1,60 @@
+From 4f5aecdff25f59fb5ea456d5152a913906ecf287 Mon Sep 17 00:00:00 2001
+From: Pingfan Liu
+Date: Tue, 15 Jun 2021 18:23:36 -0700
+Subject: crash_core, vmcoreinfo: append 'SECTION_SIZE_BITS' to vmcoreinfo
+
+From: Pingfan Liu
+
+commit 4f5aecdff25f59fb5ea456d5152a913906ecf287 upstream.
+
+As mentioned in kernel commit 1d50e5d0c505 ("crash_core, vmcoreinfo:
+Append 'MAX_PHYSMEM_BITS' to vmcoreinfo"), SECTION_SIZE_BITS in the
+formula:
+
+  #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
+
+Besides SECTIONS_SHIFT, SECTION_SIZE_BITS is also used to calculate
+PAGES_PER_SECTION in makedumpfile just like kernel.
+
+Unfortunately, this arch-dependent macro SECTION_SIZE_BITS changes, e.g.
+recently in kernel commit f0b13ee23241 ("arm64/sparsemem: reduce
+SECTION_SIZE_BITS"). But user space wants a stable interface to get
+this info. Such info is impossible to be deduced from a crashdump
+vmcore. Hence append SECTION_SIZE_BITS to vmcoreinfo.
+
+Link: https://lkml.kernel.org/r/20210608103359.84907-1-kernelfans@gmail.com
+Link: http://lists.infradead.org/pipermail/kexec/2021-June/022676.html
+Signed-off-by: Pingfan Liu
+Acked-by: Baoquan He
+Cc: Bhupesh Sharma
+Cc: Kazuhito Hagio
+Cc: Dave Young
+Cc: Boris Petkov
+Cc: Ingo Molnar
+Cc: Thomas Gleixner
+Cc: James Morse
+Cc: Mark Rutland
+Cc: Will Deacon
+Cc: Catalin Marinas
+Cc: Michael Ellerman
+Cc: Paul Mackerras
+Cc: Benjamin Herrenschmidt
+Cc: Dave Anderson
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ kernel/crash_core.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/kernel/crash_core.c
++++ b/kernel/crash_core.c
+@@ -464,6 +464,7 @@ static int __init crash_save_vmcoreinfo_
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
++ VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
+ VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
+ #endif
+ VMCOREINFO_STRUCT_SIZE(page);
diff --git a/queue-5.12/dmaengine-pl330-fix-wrong-usage-of-spinlock-flags-in-dma_cyclc.patch b/queue-5.12/dmaengine-pl330-fix-wrong-usage-of-spinlock-flags-in-dma_cyclc.patch
new file mode 100644
index 00000000000..426821e6252
--- /dev/null
+++ b/queue-5.12/dmaengine-pl330-fix-wrong-usage-of-spinlock-flags-in-dma_cyclc.patch
@@ -0,0 +1,52 @@
+From 4ad5dd2d7876d79507a20f026507d1a93b8fff10 Mon Sep 17 00:00:00 2001
+From: Bumyong Lee
+Date: Fri, 7 May 2021 15:36:47 +0900
+Subject: dmaengine: pl330: fix wrong usage of spinlock flags in dma_cyclc
+
+From: Bumyong Lee
+
+commit 4ad5dd2d7876d79507a20f026507d1a93b8fff10 upstream.
+
+flags varible which is the input parameter of pl330_prep_dma_cyclic()
+should not be used by spinlock_irq[save/restore] function.
+
+Signed-off-by: Jongho Park
+Signed-off-by: Bumyong Lee
+Signed-off-by: Chanho Park
+Link: https://lore.kernel.org/r/20210507063647.111209-1-chanho61.park@samsung.com
+Fixes: f6f2421c0a1c ("dmaengine: pl330: Merge dma_pl330_dmac and pl330_dmac structs")
+Cc: stable@vger.kernel.org
+Signed-off-by: Vinod Koul
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/dma/pl330.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/dma/pl330.c
++++ b/drivers/dma/pl330.c
+@@ -2694,13 +2694,15 @@ static struct dma_async_tx_descriptor *p
+ for (i = 0; i < len / period_len; i++) {
+ desc = pl330_get_desc(pch);
+ if (!desc) {
++ unsigned long iflags;
++
+ dev_err(pch->dmac->ddma.dev, "%s:%d Unable to fetch desc\n",
+ __func__, __LINE__);
+
+ if (!first)
+ return NULL;
+
+- spin_lock_irqsave(&pl330->pool_lock, flags);
++ spin_lock_irqsave(&pl330->pool_lock, iflags);
+
+ while (!list_empty(&first->node)) {
+ desc = list_entry(first->node.next,
+@@ -2710,7 +2712,7 @@ static struct dma_async_tx_descriptor *p
+
+ list_move_tail(&first->node, &pl330->desc_pool);
+
+- spin_unlock_irqrestore(&pl330->pool_lock, flags);
++ spin_unlock_irqrestore(&pl330->pool_lock, iflags);
+
+ return NULL;
+ }
diff --git a/queue-5.12/drm-amdgpu-gfx10-enlarge-cp_mec_doorbell_range_upper-to-cover-full-doorbell.patch b/queue-5.12/drm-amdgpu-gfx10-enlarge-cp_mec_doorbell_range_upper-to-cover-full-doorbell.patch
new file mode 100644
index 00000000000..6776e18fa2c
--- /dev/null
+++ b/queue-5.12/drm-amdgpu-gfx10-enlarge-cp_mec_doorbell_range_upper-to-cover-full-doorbell.patch
@@ -0,0 +1,38 @@
+From 1c0b0efd148d5b24c4932ddb3fa03c8edd6097b3 Mon Sep 17 00:00:00 2001
+From: Yifan Zhang
+Date: Thu, 10 Jun 2021 10:10:07 +0800
+Subject: drm/amdgpu/gfx10: enlarge CP_MEC_DOORBELL_RANGE_UPPER to cover full doorbell.
+
+From: Yifan Zhang
+
+commit 1c0b0efd148d5b24c4932ddb3fa03c8edd6097b3 upstream.
+
+If GC has entered CGPG, ringing doorbell > first page doesn't wakeup GC.
+Enlarge CP_MEC_DOORBELL_RANGE_UPPER to workaround this issue.
+
+Signed-off-by: Yifan Zhang
+Reviewed-by: Felix Kuehling
+Reviewed-by: Alex Deucher
+Signed-off-by: Alex Deucher
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+@@ -6769,8 +6769,12 @@ static int gfx_v10_0_kiq_init_register(s
+ if (ring->use_doorbell) {
+ WREG32_SOC15(GC, 0, mmCP_MEC_DOORBELL_RANGE_LOWER,
+ (adev->doorbell_index.kiq * 2) << 2);
++ /* If GC has entered CGPG, ringing doorbell > first page doesn't
++ * wakeup GC. Enlarge CP_MEC_DOORBELL_RANGE_UPPER to workaround
++ * this issue.
++ */
+ WREG32_SOC15(GC, 0, mmCP_MEC_DOORBELL_RANGE_UPPER,
+- (adev->doorbell_index.userqueue_end * 2) << 2);
++ (adev->doorbell.size - 4));
+ }
+
+ WREG32_SOC15(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL,
diff --git a/queue-5.12/drm-amdgpu-gfx9-fix-the-doorbell-missing-when-in-cgpg-issue.patch b/queue-5.12/drm-amdgpu-gfx9-fix-the-doorbell-missing-when-in-cgpg-issue.patch
new file mode 100644
index 00000000000..f95bf7b38ec
--- /dev/null
+++ b/queue-5.12/drm-amdgpu-gfx9-fix-the-doorbell-missing-when-in-cgpg-issue.patch
@@ -0,0 +1,38 @@
+From 4cbbe34807938e6e494e535a68d5ff64edac3f20 Mon Sep 17 00:00:00 2001
+From: Yifan Zhang
+Date: Thu, 10 Jun 2021 09:55:01 +0800
+Subject: drm/amdgpu/gfx9: fix the doorbell missing when in CGPG issue.
+
+From: Yifan Zhang
+
+commit 4cbbe34807938e6e494e535a68d5ff64edac3f20 upstream.
+
+If GC has entered CGPG, ringing doorbell > first page doesn't wakeup GC.
+Enlarge CP_MEC_DOORBELL_RANGE_UPPER to workaround this issue.
+
+Signed-off-by: Yifan Zhang
+Reviewed-by: Felix Kuehling
+Reviewed-by: Alex Deucher
+Signed-off-by: Alex Deucher
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+@@ -3623,8 +3623,12 @@ static int gfx_v9_0_kiq_init_register(st
+ if (ring->use_doorbell) {
+ WREG32_SOC15(GC, 0, mmCP_MEC_DOORBELL_RANGE_LOWER,
+ (adev->doorbell_index.kiq * 2) << 2);
++ /* If GC has entered CGPG, ringing doorbell > first page doesn't
++ * wakeup GC. Enlarge CP_MEC_DOORBELL_RANGE_UPPER to workaround
++ * this issue.
++ */
+ WREG32_SOC15(GC, 0, mmCP_MEC_DOORBELL_RANGE_UPPER,
+- (adev->doorbell_index.userqueue_end * 2) << 2);
++ (adev->doorbell.size - 4));
+ }
+
+ WREG32_SOC15_RLC(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL,
diff --git a/queue-5.12/mac80211-fix-deadlock-in-ap-vlan-handling.patch b/queue-5.12/mac80211-fix-deadlock-in-ap-vlan-handling.patch
new file mode 100644
index 00000000000..85031fb3573
--- /dev/null
+++ b/queue-5.12/mac80211-fix-deadlock-in-ap-vlan-handling.patch
@@ -0,0 +1,77 @@
+From d5befb224edbe53056c2c18999d630dafb4a08b9 Mon Sep 17 00:00:00 2001
+From: Johannes Berg
+Date: Mon, 17 May 2021 16:03:23 +0200
+Subject: mac80211: fix deadlock in AP/VLAN handling
+
+From: Johannes Berg
+
+commit d5befb224edbe53056c2c18999d630dafb4a08b9 upstream.
+
+Syzbot reports that when you have AP_VLAN interfaces that are up
+and close the AP interface they belong to, we get a deadlock. No
+surprise - since we dev_close() them with the wiphy mutex held,
+which goes back into the netdev notifier in cfg80211 and tries to
+acquire the wiphy mutex there.
+
+To fix this, we need to do two things:
+ 1) prevent changing iftype while AP_VLANs are up, we can't
+    easily fix this case since cfg80211 already calls us with
+    the wiphy mutex held, but change_interface() is relatively
+    rare in drivers anyway, so changing iftype isn't used much
+    (and userspace has to fall back to down/change/up anyway)
+ 2) pull the dev_close() loop over VLANs out of the wiphy mutex
+    section in the normal stop case
+
+Cc: stable@vger.kernel.org
+Reported-by: syzbot+452ea4fbbef700ff0a56@syzkaller.appspotmail.com
+Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver")
+Link: https://lore.kernel.org/r/20210517160322.9b8f356c0222.I392cb0e2fa5a1a94cf2e637555d702c7e512c1ff@changeid
+Signed-off-by: Johannes Berg
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/mac80211/iface.c | 19 ++++++++++++-------
+ 1 file changed, 12 insertions(+), 7 deletions(-)
+
+--- a/net/mac80211/iface.c
++++ b/net/mac80211/iface.c
+@@ -475,14 +475,7 @@ static void ieee80211_do_stop(struct iee
+ GFP_KERNEL);
+ }
+
+- /* APs need special treatment */
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+- struct ieee80211_sub_if_data *vlan, *tmpsdata;
+-
+- /* down all dependent devices, that is VLANs */
+- list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans,
+- u.vlan.list)
+- dev_close(vlan->dev);
+ WARN_ON(!list_empty(&sdata->u.ap.vlans));
+ } else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ /* remove all packets in parent bc_buf pointing to this dev */
+@@ -640,6 +633,15 @@ static int ieee80211_stop(struct net_dev
+ {
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
++ /* close all dependent VLAN interfaces before locking wiphy */
++ if (sdata->vif.type == NL80211_IFTYPE_AP) {
++ struct ieee80211_sub_if_data *vlan, *tmpsdata;
++
++ list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans,
++ u.vlan.list)
++ dev_close(vlan->dev);
++ }
++
+ wiphy_lock(sdata->local->hw.wiphy);
+ ieee80211_do_stop(sdata, true);
+ wiphy_unlock(sdata->local->hw.wiphy);
+@@ -1589,6 +1591,9 @@ static int ieee80211_runtime_change_ifty
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP:
++ if (!list_empty(&sdata->u.ap.vlans))
++ return -EBUSY;
++ break;
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_OCB:
diff --git a/queue-5.12/mac80211-fix-null-ptr-deref-for-injected-rate-info.patch b/queue-5.12/mac80211-fix-null-ptr-deref-for-injected-rate-info.patch
new file mode 100644
index 00000000000..6f575cf7c11
--- /dev/null
+++ b/queue-5.12/mac80211-fix-null-ptr-deref-for-injected-rate-info.patch
@@ -0,0 +1,164 @@
+From bddc0c411a45d3718ac535a070f349be8eca8d48 Mon Sep 17 00:00:00 2001
+From: Mathy Vanhoef
+Date: Sun, 30 May 2021 15:32:26 +0200
+Subject: mac80211: Fix NULL ptr deref for injected rate info
+
+From: Mathy Vanhoef
+
+commit bddc0c411a45d3718ac535a070f349be8eca8d48 upstream.
+
+The commit cb17ed29a7a5 ("mac80211: parse radiotap header when selecting Tx
+queue") moved the code to validate the radiotap header from
+ieee80211_monitor_start_xmit to ieee80211_parse_tx_radiotap. This made is
+possible to share more code with the new Tx queue selection code for
+injected frames. But at the same time, it now required the call of
+ieee80211_parse_tx_radiotap at the beginning of functions which wanted to
+handle the radiotap header. And this broke the rate parser for radiotap
+header parser.
+
+The radiotap parser for rates is operating most of the time only on the
+data in the actual radiotap header. But for the 802.11a/b/g rates, it must
+also know the selected band from the chandef information. But this
+information is only written to the ieee80211_tx_info at the end of the
+ieee80211_monitor_start_xmit - long after ieee80211_parse_tx_radiotap was
+already called. The info->band information was therefore always 0
+(NL80211_BAND_2GHZ) when the parser code tried to access it.
+
+For a 5GHz only device, injecting a frame with 802.11a rates would cause a
+NULL pointer dereference because local->hw.wiphy->bands[NL80211_BAND_2GHZ]
+would most likely have been NULL when the radiotap parser searched for the
+correct rate index of the driver.
+
+Cc: stable@vger.kernel.org
+Reported-by: Ben Greear
+Fixes: cb17ed29a7a5 ("mac80211: parse radiotap header when selecting Tx queue")
+Signed-off-by: Mathy Vanhoef
+[sven@narfation.org: added commit message]
+Signed-off-by: Sven Eckelmann
+Link: https://lore.kernel.org/r/20210530133226.40587-1-sven@narfation.org
+Signed-off-by: Johannes Berg
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/net/mac80211.h | 7 +++++-
+ net/mac80211/tx.c | 52 +++++++++++++++++++++++++++++++++----------------
+ 2 files changed, 42 insertions(+), 17 deletions(-)
+
+--- a/include/net/mac80211.h
++++ b/include/net/mac80211.h
+@@ -6388,7 +6388,12 @@ bool ieee80211_tx_prepare_skb(struct iee
+
+ /**
+ * ieee80211_parse_tx_radiotap - Sanity-check and parse the radiotap header
+- * of injected frames
++ * of injected frames.
++ *
++ * To accurately parse and take into account rate and retransmission fields,
++ * you must initialize the chandef field in the ieee80211_tx_info structure
++ * of the skb before calling this function.
++ *
+ * @skb: packet injected by userspace
+ * @dev: the &struct device of this 802.11 device
+ */
+--- a/net/mac80211/tx.c
++++ b/net/mac80211/tx.c
+@@ -2002,6 +2002,26 @@ void ieee80211_xmit(struct ieee80211_sub
+ ieee80211_tx(sdata, sta, skb, false);
+ }
+
++static bool ieee80211_validate_radiotap_len(struct sk_buff *skb)
++{
++ struct ieee80211_radiotap_header *rthdr =
++ (struct ieee80211_radiotap_header *)skb->data;
++
++ /* check for not even having the fixed radiotap header part */
++ if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header)))
++ return false; /* too short to be possibly valid */
++
++ /* is it a header version we can trust to find length from? */
++ if (unlikely(rthdr->it_version))
++ return false; /* only version 0 is supported */
++
++ /* does the skb contain enough to deliver on the alleged length? */
++ if (unlikely(skb->len < ieee80211_get_radiotap_len(skb->data)))
++ return false; /* skb too short for claimed rt header extent */
++
++ return true;
++}
++
+ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb,
+ struct net_device *dev)
+ {
+@@ -2010,8 +2030,6 @@ bool ieee80211_parse_tx_radiotap(struct
+ struct ieee80211_radiotap_header *rthdr =
+ (struct ieee80211_radiotap_header *) skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+- struct ieee80211_supported_band *sband =
+- local->hw.wiphy->bands[info->band];
+ int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len,
+ NULL);
+ u16 txflags;
+@@ -2024,17 +2042,8 @@ bool ieee80211_parse_tx_radiotap(struct
+ u8 vht_mcs = 0, vht_nss = 0;
+ int i;
+
+- /* check for not even having the fixed radiotap header part */
+- if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header)))
+- return false; /* too short to be possibly valid */
+-
+- /* is it a header version we can trust to find length from? */
+- if (unlikely(rthdr->it_version))
+- return false; /* only version 0 is supported */
+-
+- /* does the skb contain enough to deliver on the alleged length? */
+- if (unlikely(skb->len < ieee80211_get_radiotap_len(skb->data)))
+- return false; /* skb too short for claimed rt header extent */
++ if (!ieee80211_validate_radiotap_len(skb))
++ return false;
+
+ info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
+ IEEE80211_TX_CTL_DONTFRAG;
+@@ -2174,6 +2183,9 @@ bool ieee80211_parse_tx_radiotap(struct
+ return false;
+
+ if (rate_found) {
++ struct ieee80211_supported_band *sband =
++ local->hw.wiphy->bands[info->band];
++
+ info->control.flags |= IEEE80211_TX_CTRL_RATE_INJECT;
+
+ for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+@@ -2187,7 +2199,7 @@ bool ieee80211_parse_tx_radiotap(struct
+ } else if (rate_flags & IEEE80211_TX_RC_VHT_MCS) {
+ ieee80211_rate_set_vht(info->control.rates, vht_mcs,
+ vht_nss);
+- } else {
++ } else if (sband) {
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if (rate * 5 != sband->bitrates[i].bitrate)
+ continue;
+@@ -2224,8 +2236,8 @@ netdev_tx_t ieee80211_monitor_start_xmit
+ info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
+ IEEE80211_TX_CTL_INJECTED;
+
+- /* Sanity-check and process the injection radiotap header */
+- if (!ieee80211_parse_tx_radiotap(skb, dev))
++ /* Sanity-check the length of the radiotap header */
++ if (!ieee80211_validate_radiotap_len(skb))
+ goto fail;
+
+ /* we now know there is a radiotap header with a length we can use */
+@@ -2339,6 +2351,14 @@ netdev_tx_t ieee80211_monitor_start_xmit
+ ieee80211_select_queue_80211(sdata, skb, hdr);
+ skb_set_queue_mapping(skb, ieee80211_ac_from_tid(skb->priority));
+
++ /*
++ * Process the radiotap header. This will now take into account the
++ * selected chandef above to accurately set injection rates and
++ * retransmissions.
++ */
++ if (!ieee80211_parse_tx_radiotap(skb, dev))
++ goto fail_rcu;
++
+ /* remove the injection radiotap header */
+ skb_pull(skb, len_rthdr);
+
diff --git a/queue-5.12/mac80211-fix-reset-debugfs-locking.patch b/queue-5.12/mac80211-fix-reset-debugfs-locking.patch
new file mode 100644
index 00000000000..e9fd3b86921
--- /dev/null
+++ b/queue-5.12/mac80211-fix-reset-debugfs-locking.patch
@@ -0,0 +1,44 @@
+From adaed1b9daf5a045be71e923e04b5069d2bee664 Mon Sep 17 00:00:00 2001
+From: Johannes Berg
+Date: Tue, 8 Jun 2021 11:32:27 +0200
+Subject: mac80211: fix 'reset' debugfs locking
+
+From: Johannes Berg
+
+commit adaed1b9daf5a045be71e923e04b5069d2bee664 upstream.
+
+cfg80211 now calls suspend/resume with the wiphy lock
+held, and while there's a problem with that needing
+to be fixed, we should do the same in debugfs.
+
+Cc: stable@vger.kernel.org
+Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver")
+Link: https://lore.kernel.org/r/20210608113226.14020430e449.I78e19db0a55a8295a376e15ac4cf77dbb4c6fb51@changeid
+Signed-off-by: Johannes Berg
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/mac80211/debugfs.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/net/mac80211/debugfs.c
++++ b/net/mac80211/debugfs.c
+@@ -4,7 +4,7 @@
+ *
+ * Copyright 2007 Johannes Berg
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+- * Copyright (C) 2018 - 2019 Intel Corporation
++ * Copyright (C) 2018 - 2019, 2021 Intel Corporation
+ */
+
+ #include
+@@ -389,8 +389,10 @@ static ssize_t reset_write(struct file *
+ struct ieee80211_local *local = file->private_data;
+
+ rtnl_lock();
++ wiphy_lock(local->hw.wiphy);
+ __ieee80211_suspend(&local->hw, NULL);
+ __ieee80211_resume(&local->hw);
++ wiphy_unlock(local->hw.wiphy);
+ rtnl_unlock();
+
+ return count;
diff --git a/queue-5.12/mac80211-minstrel_ht-fix-sample-time-check.patch b/queue-5.12/mac80211-minstrel_ht-fix-sample-time-check.patch
new file mode 100644
index 00000000000..660d9fa0001
--- /dev/null
+++ b/queue-5.12/mac80211-minstrel_ht-fix-sample-time-check.patch
@@ -0,0 +1,34 @@
+From 1236af327af476731aa548dfcbbefb1a3ec6726a Mon Sep 17 00:00:00 2001
+From: Felix Fietkau
+Date: Thu, 17 Jun 2021 12:38:54 +0200
+Subject: mac80211: minstrel_ht: fix sample time check
+
+From: Felix Fietkau
+
+commit 1236af327af476731aa548dfcbbefb1a3ec6726a upstream.
+
+We need to skip sampling if the next sample time is after jiffies, not before.
+This patch fixes an issue where in some cases only very little sampling (or none
+at all) is performed, leading to really bad data rates
+
+Fixes: 80d55154b2f8 ("mac80211: minstrel_ht: significantly redesign the rate probing strategy")
+Cc: stable@vger.kernel.org
+Signed-off-by: Felix Fietkau
+Link: https://lore.kernel.org/r/20210617103854.61875-1-nbd@nbd.name
+Signed-off-by: Johannes Berg
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/mac80211/rc80211_minstrel_ht.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/mac80211/rc80211_minstrel_ht.c
++++ b/net/mac80211/rc80211_minstrel_ht.c
+@@ -1516,7 +1516,7 @@ minstrel_ht_get_rate(void *priv, struct
+ (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO))
+ return;
+
+- if (time_is_before_jiffies(mi->sample_time))
++ if (time_is_after_jiffies(mi->sample_time))
+ return;
+
+ mi->sample_time = jiffies + MINSTREL_SAMPLE_INTERVAL;
diff --git a/queue-5.12/mac80211-move-interface-shutdown-out-of-wiphy-lock.patch b/queue-5.12/mac80211-move-interface-shutdown-out-of-wiphy-lock.patch
new file mode 100644
index 00000000000..6f5351bf0a3
--- /dev/null
+++ b/queue-5.12/mac80211-move-interface-shutdown-out-of-wiphy-lock.patch
@@ -0,0 +1,82 @@
+From f5baf287f5da5641099ad5c809b3b4ebfc08506d Mon Sep 17 00:00:00 2001
+From: Johannes Berg
+Date: Tue, 8 Jun 2021 11:32:30 +0200
+Subject: mac80211: move interface shutdown out of wiphy lock
+
+From: Johannes Berg
+
+commit f5baf287f5da5641099ad5c809b3b4ebfc08506d upstream.
+
+When reconfiguration fails, we shut down everything, but we
+cannot call cfg80211_shutdown_all_interfaces() with the wiphy
+mutex held. Since cfg80211 now calls it on resume errors, we
+only need to do likewise for where we call reconfig (whether
+directly or indirectly), but not under the wiphy lock.
+
+Cc: stable@vger.kernel.org
+Fixes: 2fe8ef106238 ("cfg80211: change netdev registration/unregistration semantics")
+Link: https://lore.kernel.org/r/20210608113226.78233c80f548.Iecc104aceb89f0568f50e9670a9cb191a1c8887b@changeid
+Signed-off-by: Johannes Berg
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/mac80211/debugfs.c | 7 ++++++-
+ net/mac80211/main.c | 7 ++++++-
+ net/mac80211/util.c | 2 --
+ 3 files changed, 12 insertions(+), 4 deletions(-)
+
+--- a/net/mac80211/debugfs.c
++++ b/net/mac80211/debugfs.c
+@@ -387,12 +387,17 @@ static ssize_t reset_write(struct file *
+ size_t count, loff_t *ppos)
+ {
+ struct ieee80211_local *local = file->private_data;
++ int ret;
+
+ rtnl_lock();
+ wiphy_lock(local->hw.wiphy);
+ __ieee80211_suspend(&local->hw, NULL);
+- __ieee80211_resume(&local->hw);
++ ret = __ieee80211_resume(&local->hw);
+ wiphy_unlock(local->hw.wiphy);
++
++ if (ret)
++ cfg80211_shutdown_all_interfaces(local->hw.wiphy);
++
+ rtnl_unlock();
+
+ return count;
+--- a/net/mac80211/main.c
++++ b/net/mac80211/main.c
+@@ -252,6 +252,7 @@ static void ieee80211_restart_work(struc
+ struct ieee80211_local *local =
+ container_of(work, struct ieee80211_local, restart_work);
+ struct ieee80211_sub_if_data *sdata;
++ int ret;
+
+ /* wait for scan work complete */
+ flush_workqueue(local->workqueue);
+@@ -294,8 +295,12 @@ static void ieee80211_restart_work(struc
+ /* wait for all packet processing to be done */
+ synchronize_net();
+
+- ieee80211_reconfig(local);
++ ret = ieee80211_reconfig(local);
+ wiphy_unlock(local->hw.wiphy);
++
++ if (ret)
++ cfg80211_shutdown_all_interfaces(local->hw.wiphy);
++
+ rtnl_unlock();
+ }
+
+--- a/net/mac80211/util.c
++++ b/net/mac80211/util.c
+@@ -2186,8 +2186,6 @@ static void ieee80211_handle_reconfig_fa
+ list_for_each_entry(ctx, &local->chanctx_list, list)
+ ctx->driver_present = false;
+ mutex_unlock(&local->chanctx_mtx);
+-
+- cfg80211_shutdown_all_interfaces(local->hw.wiphy);
+ }
+
+ static void ieee80211_assign_chanctx(struct ieee80211_local *local,
diff --git a/queue-5.12/makefile-lto-pass-warn-stack-size-only-on-lld-13.0.0.patch b/queue-5.12/makefile-lto-pass-warn-stack-size-only-on-lld-13.0.0.patch
new file mode 100644
index 00000000000..8ca4f853c84
--- /dev/null
+++ b/queue-5.12/makefile-lto-pass-warn-stack-size-only-on-lld-13.0.0.patch
@@ -0,0 +1,54 @@
+From 0236526d76b87c1dc2cbe3eb31ae29be5b0ca151 Mon Sep 17 00:00:00 2001
+From: Tor Vic
+Date: Sun, 13 Jun 2021 13:07:49 +0000
+Subject: Makefile: lto: Pass -warn-stack-size only on LLD < 13.0.0
+
+From: Tor Vic
+
+commit 0236526d76b87c1dc2cbe3eb31ae29be5b0ca151 upstream.
+
+Since LLVM commit fc018eb, the '-warn-stack-size' flag has been dropped
+[1], leading to the following error message when building with Clang-13
+and LLD-13:
+
+    ld.lld: error: -plugin-opt=-: ld.lld: Unknown command line argument
+    '-warn-stack-size=2048'.  Try: 'ld.lld --help'
+    ld.lld: Did you mean '--asan-stack=2048'?
+
+In the same way as with commit 2398ce80152a ("x86, lto: Pass
+-stack-alignment only on LLD < 13.0.0") , make '-warn-stack-size'
+conditional on LLD < 13.0.0.
+
+[1] https://reviews.llvm.org/D103928
+
+Fixes: 24845dcb170e ("Makefile: LTO: have linker check -Wframe-larger-than")
+Cc: stable@vger.kernel.org
+Link: https://github.com/ClangBuiltLinux/linux/issues/1377
+Signed-off-by: Tor Vic
+Reviewed-by: Nathan Chancellor
+Reviewed-by: Nick Desaulniers
+Signed-off-by: Kees Cook
+Link: https://lore.kernel.org/r/7631bab7-a8ab-f884-ab54-f4198976125c@mailbox.org
+Signed-off-by: Greg Kroah-Hartman
+---
+ Makefile | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/Makefile
++++ b/Makefile
+@@ -913,11 +913,14 @@ CC_FLAGS_LTO += -fvisibility=hidden
+ # Limit inlining across translation units to reduce binary size
+ KBUILD_LDFLAGS += -mllvm -import-instr-limit=5
+
+-# Check for frame size exceeding threshold during prolog/epilog insertion.
++# Check for frame size exceeding threshold during prolog/epilog insertion
++# when using lld < 13.0.0.
+ ifneq ($(CONFIG_FRAME_WARN),0)
++ifeq ($(shell test $(CONFIG_LLD_VERSION) -lt 130000; echo $$?),0)
+ KBUILD_LDFLAGS += -plugin-opt=-warn-stack-size=$(CONFIG_FRAME_WARN)
+ endif
+ endif
++endif
+
+ ifdef CONFIG_LTO
+ KBUILD_CFLAGS += -fno-lto $(CC_FLAGS_LTO)
diff --git a/queue-5.12/mm-hugetlb-expand-restore_reserve_on_error-functionality.patch b/queue-5.12/mm-hugetlb-expand-restore_reserve_on_error-functionality.patch
new file mode 100644
index 00000000000..95170a15019
--- /dev/null
+++ b/queue-5.12/mm-hugetlb-expand-restore_reserve_on_error-functionality.patch
@@ -0,0 +1,286 @@
+From 846be08578edb81f02bc8534577e6c367ef34f41 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz
+Date: Tue, 15 Jun 2021 18:23:29 -0700
+Subject: mm/hugetlb: expand restore_reserve_on_error functionality
+
+From: Mike Kravetz
+
+commit 846be08578edb81f02bc8534577e6c367ef34f41 upstream.
+
+The routine restore_reserve_on_error is called to restore reservation
+information when an error occurs after page allocation. The routine
+alloc_huge_page modifies the mapping reserve map and potentially the
+reserve count during allocation. If code calling alloc_huge_page
+encounters an error after allocation and needs to free the page, the
+reservation information needs to be adjusted.
+
+Currently, restore_reserve_on_error only takes action on pages for which
+the reserve count was adjusted(HPageRestoreReserve flag). There is
+nothing wrong with these adjustments. However, alloc_huge_page ALWAYS
+modifies the reserve map during allocation even if the reserve count is
+not adjusted. This can cause issues as observed during development of
+this patch [1].
+
+One specific series of operations causing an issue is:
+
+- Create a shared hugetlb mapping
+  Reservations for all pages created by default
+
+- Fault in a page in the mapping
+  Reservation exists so reservation count is decremented
+
+- Punch a hole in the file/mapping at index previously faulted
+  Reservation and any associated pages will be removed
+
+- Allocate a page to fill the hole
+  No reservation entry, so reserve count unmodified
+  Reservation entry added to map by alloc_huge_page
+
+- Error after allocation and before instantiating the page
+  Reservation entry remains in map
+
+- Allocate a page to fill the hole
+  Reservation entry exists, so decrement reservation count
+
+This will cause a reservation count underflow as the reservation count
+was decremented twice for the same index.
+
+A user would observe a very large number for HugePages_Rsvd in
+/proc/meminfo. This would also likely cause subsequent allocations of
+hugetlb pages to fail as it would 'appear' that all pages are reserved.
+
+This sequence of operations is unlikely to happen, however they were
+easily reproduced and observed using hacked up code as described in [1].
+
+Address the issue by having the routine restore_reserve_on_error take
+action on pages where HPageRestoreReserve is not set. In this case, we
+need to remove any reserve map entry created by alloc_huge_page. A new
+helper routine vma_del_reservation assists with this operation.
+
+There are three callers of alloc_huge_page which do not currently call
+restore_reserve_on error before freeing a page on error paths. Add
+those missing calls.
+
+[1] https://lore.kernel.org/linux-mm/20210528005029.88088-1-almasrymina@google.com/
+
+Link: https://lkml.kernel.org/r/20210607204510.22617-1-mike.kravetz@oracle.com
+Fixes: 96b96a96ddee ("mm/hugetlb: fix huge page reservation leak in private mapping error paths"
+Signed-off-by: Mike Kravetz
+Reviewed-by: Mina Almasry
+Cc: Axel Rasmussen
+Cc: Peter Xu
+Cc: Muchun Song
+Cc: Michal Hocko
+Cc: Naoya Horiguchi
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/hugetlbfs/inode.c | 1
+ include/linux/hugetlb.h | 2
+ mm/hugetlb.c | 120 ++++++++++++++++++++++++++++++++++++++----------
+ 3 files changed, 100 insertions(+), 23 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -738,6 +738,7 @@ static long hugetlbfs_fallocate(struct f
+ __SetPageUptodate(page);
+ error = huge_add_to_page_cache(page, mapping, index);
+ if (unlikely(error)) {
++ restore_reserve_on_error(h, &pseudo_vma, addr, page);
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -597,6 +597,8 @@ struct page *alloc_huge_page_vma(struct
+ unsigned long address);
+ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t idx);
++void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
++ unsigned long address, struct page *page);
+
+ /* arch callback */
+ int __init __alloc_bootmem_huge_page(struct hstate *h);
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -2127,12 +2127,18 @@ out:
+ * be restored when a newly allocated huge page must be freed. It is
+ * to be called after calling vma_needs_reservation to determine if a
+ * reservation exists.
++ *
++ * vma_del_reservation is used in error paths where an entry in the reserve
++ * map was created during huge page allocation and must be removed. It is to
++ * be called after calling vma_needs_reservation to determine if a reservation
++ * exists.
+ */
+ enum vma_resv_mode {
+ VMA_NEEDS_RESV,
+ VMA_COMMIT_RESV,
+ VMA_END_RESV,
+ VMA_ADD_RESV,
++ VMA_DEL_RESV,
+ };
+ static long __vma_reservation_common(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr,
+@@ -2176,11 +2182,21 @@ static long __vma_reservation_common(str
+ ret = region_del(resv, idx, idx + 1);
+ }
+ break;
++ case VMA_DEL_RESV:
++ if (vma->vm_flags & VM_MAYSHARE) {
++ region_abort(resv, idx, idx + 1, 1);
++ ret = region_del(resv, idx, idx + 1);
++ } else {
++ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
++ /* region_add calls of range 1 should never fail. */
++ VM_BUG_ON(ret < 0);
++ }
++ break;
+ default:
+ BUG();
+ }
+
+- if (vma->vm_flags & VM_MAYSHARE)
++ if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
+ return ret;
+ else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
+ /*
+@@ -2229,25 +2245,39 @@ static long vma_add_reservation(struct h
+ return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
+ }
+
++static long vma_del_reservation(struct hstate *h,
++ struct vm_area_struct *vma, unsigned long addr)
++{
++ return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
++}
++
+ /*
+- * This routine is called to restore a reservation on error paths. In the
+- * specific error paths, a huge page was allocated (via alloc_huge_page)
+- * and is about to be freed. If a reservation for the page existed,
+- * alloc_huge_page would have consumed the reservation and set
+- * HPageRestoreReserve in the newly allocated page. When the page is freed
+- * via free_huge_page, the global reservation count will be incremented if
+- * HPageRestoreReserve is set. However, free_huge_page can not adjust the
+- * reserve map. Adjust the reserve map here to be consistent with global
+- * reserve count adjustments to be made by free_huge_page.
++ * This routine is called to restore reservation information on error paths.
++ * It should ONLY be called for pages allocated via alloc_huge_page(), and
++ * the hugetlb mutex should remain held when calling this routine.
++ *
++ * It handles two specific cases:
++ * 1) A reservation was in place and the page consumed the reservation.
++ * HPageRestoreReserve is set in the page.
++ * 2) No reservation was in place for the page, so HPageRestoreReserve is
++ * not set. However, alloc_huge_page always updates the reserve map.
++ *
++ * In case 1, free_huge_page later in the error path will increment the
++ * global reserve count. But, free_huge_page does not have enough context
++ * to adjust the reservation map. This case deals primarily with private
++ * mappings. Adjust the reserve map here to be consistent with global
++ * reserve count adjustments to be made by free_huge_page. Make sure the
++ * reserve map indicates there is a reservation present.
++ *
++ * In case 2, simply undo reserve map modifications done by alloc_huge_page.
+ */
+-static void restore_reserve_on_error(struct hstate *h,
+- struct vm_area_struct *vma, unsigned long address,
+- struct page *page)
++void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
++ unsigned long address, struct page *page)
+ {
+- if (unlikely(HPageRestoreReserve(page))) {
+- long rc = vma_needs_reservation(h, vma, address);
++ long rc = vma_needs_reservation(h, vma, address);
+
+- if (unlikely(rc < 0)) {
++ if (HPageRestoreReserve(page)) {
++ if (unlikely(rc < 0))
+ /*
+ * Rare out of memory condition in reserve map
+ * manipulation. Clear HPageRestoreReserve so that
+@@ -2260,16 +2290,57 @@ static void restore_reserve_on_error(str
+ * accounting of reserve counts.
+ */
+ ClearHPageRestoreReserve(page);
+- } else if (rc) {
+- rc = vma_add_reservation(h, vma, address);
+- if (unlikely(rc < 0))
++ else if (rc)
++ (void)vma_add_reservation(h, vma, address);
++ else
++ vma_end_reservation(h, vma, address);
++ } else {
++ if (!rc) {
++ /*
++ * This indicates there is an entry in the reserve map
++ * added by alloc_huge_page. We know it was added
++ * before the alloc_huge_page call, otherwise
++ * HPageRestoreReserve would be set on the page.
++ * Remove the entry so that a subsequent allocation
++ * does not consume a reservation.
++ */
++ rc = vma_del_reservation(h, vma, address);
++ if (rc < 0)
+ /*
+- * See above comment about rare out of
+- * memory condition.
++ * VERY rare out of memory condition. Since
++ * we can not delete the entry, set
++ * HPageRestoreReserve so that the reserve
++ * count will be incremented when the page
++ * is freed. This reserve will be consumed
++ * on a subsequent allocation.
+ */
+- ClearHPageRestoreReserve(page);
++ SetHPageRestoreReserve(page);
++ } else if (rc < 0) {
++ /*
++ * Rare out of memory condition from
++ * vma_needs_reservation call. Memory allocation is
++ * only attempted if a new entry is needed. Therefore,
++ * this implies there is not an entry in the
++ * reserve map.
++ *
++ * For shared mappings, no entry in the map indicates
++ * no reservation. We are done.
++ */
++ if (!(vma->vm_flags & VM_MAYSHARE))
++ /*
++ * For private mappings, no entry indicates
++ * a reservation is present. Since we can
++ * not add an entry, set SetHPageRestoreReserve
++ * on the page so reserve count will be
++ * incremented when freed. This reserve will
++ * be consumed on a subsequent allocation.
++ */
++ SetHPageRestoreReserve(page);
+ } else
+- vma_end_reservation(h, vma, address);
++ /*
++ * No reservation present, do nothing
++ */
++ vma_end_reservation(h, vma, address);
+ }
+
+@@ -3886,6 +3957,8 @@ again:
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ entry = huge_ptep_get(src_pte);
+ if (!pte_same(src_pte_old, entry)) {
++ restore_reserve_on_error(h, vma, addr,
++ new);
+ put_page(new);
+ /* dst_entry won't change as in child */
+ goto again;
+@@ -4820,6 +4893,7 @@ out_release_unlock:
+ if (vm_shared)
+ unlock_page(page);
+ out_release_nounlock:
++ restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ put_page(page);
+ goto out;
+ }
diff --git a/queue-5.12/mm-hwpoison-fix-race-with-hugetlb-page-allocation.patch b/queue-5.12/mm-hwpoison-fix-race-with-hugetlb-page-allocation.patch
new file mode 100644
index 00000000000..9b2be81e45a
--- /dev/null
+++ b/queue-5.12/mm-hwpoison-fix-race-with-hugetlb-page-allocation.patch
@@ -0,0 +1,155 @@
+From 25182f05ffed0b45602438693e4eed5d7f3ebadd Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi
+Date: Tue, 15 Jun 2021 18:23:13 -0700
+Subject: mm,hwpoison: fix race with hugetlb page allocation
+
+From: Naoya Horiguchi
+
+commit 25182f05ffed0b45602438693e4eed5d7f3ebadd upstream.
+
+When hugetlb page fault (under overcommitting situation) and
+memory_failure() race, VM_BUG_ON_PAGE() is triggered by the following
+race:
+
+    CPU0:                           CPU1:
+
+                                    gather_surplus_pages()
+                                      page = alloc_surplus_huge_page()
+    memory_failure_hugetlb()
+      get_hwpoison_page(page)
+        __get_hwpoison_page(page)
+          get_page_unless_zero(page)
+                                    zero = put_page_testzero(page)
+                                    VM_BUG_ON_PAGE(!zero, page)
+                                    enqueue_huge_page(h, page)
+    put_page(page)
+
+__get_hwpoison_page() only checks the page refcount before taking an
+additional one for memory error handling, which is not enough because
+there's a time window where compound pages have non-zero refcount during
+hugetlb page initialization.
+
+So make __get_hwpoison_page() check page status a bit more for hugetlb
+pages with get_hwpoison_huge_page(). Checking hugetlb-specific flags
+under hugetlb_lock makes sure that the hugetlb page is not transitive.
+It's notable that another new function, HWPoisonHandlable(), is helpful
+to prevent a race against other transitive page states (like a generic
+compound page just before PageHuge becomes true).
+
+Link: https://lkml.kernel.org/r/20210603233632.2964832-2-nao.horiguchi@gmail.com
+Fixes: ead07f6a867b ("mm/memory-failure: introduce get_hwpoison_page() for consistent refcount handling")
+Signed-off-by: Naoya Horiguchi
+Reported-by: Muchun Song
+Acked-by: Mike Kravetz
+Cc: Oscar Salvador
+Cc: Michal Hocko
+Cc: Tony Luck
+Cc: [5.12+]
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/linux/hugetlb.h | 6 ++++++
+ mm/hugetlb.c | 15 +++++++++++++++
+ mm/memory-failure.c | 29 +++++++++++++++++++++++++++--
+ 3 files changed, 48 insertions(+), 2 deletions(-)
+
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -145,6 +145,7 @@ bool hugetlb_reserve_pages(struct inode
+ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+ long freed);
+ bool isolate_huge_page(struct page *page, struct list_head *list);
++int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
+ void putback_active_hugepage(struct page *page);
+ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
+ void free_huge_page(struct page *page);
+@@ -330,6 +331,11 @@ static inline bool isolate_huge_page(str
+ return false;
+ }
+
++static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
++{
++ return 0;
++}
++
+ static inline void putback_active_hugepage(struct page *page)
+ {
+ }
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -5664,6 +5664,21 @@ unlock:
+ return ret;
+ }
+
++int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
++{
++ int ret = 0;
++
++ *hugetlb = false;
++ spin_lock_irq(&hugetlb_lock);
++ if (PageHeadHuge(page)) {
++ *hugetlb = true;
++ if (HPageFreed(page) || HPageMigratable(page))
++ ret = get_page_unless_zero(page);
++ }
++ spin_unlock_irq(&hugetlb_lock);
++ return ret;
++}
++
+ void putback_active_hugepage(struct page *page)
+ {
+ spin_lock(&hugetlb_lock);
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -949,6 +949,17 @@ static int page_action(struct page_state
+ return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
+ }
+
++/*
++ * Return true if a page type of a given page is supported by hwpoison
++ * mechanism (while handling could fail), otherwise false. This function
++ * does not return true for hugetlb or device memory pages, so it's assumed
++ * to be called only in the context where we never have such pages.
++ */
++static inline bool HWPoisonHandlable(struct page *page)
++{
++ return PageLRU(page) || __PageMovable(page);
++}
++
+ /**
+ * __get_hwpoison_page() - Get refcount for memory error handling:
+ * @page: raw error page (hit by memory error)
+@@ -959,8 +970,22 @@ static int page_action(struct page_state
+ static int __get_hwpoison_page(struct page *page)
+ {
+ struct page *head = compound_head(page);
++ int ret = 0;
++ bool hugetlb = false;
++
++ ret = get_hwpoison_huge_page(head, &hugetlb);
++ if (hugetlb)
++ return ret;
++
++ /*
++ * This check prevents from calling get_hwpoison_unless_zero()
++ * for any unsupported type of page in order to reduce the risk of
++ * unexpected races caused by taking a page refcount.
++ */
++ if (!HWPoisonHandlable(head))
++ return 0;
+
+- if (!PageHuge(head) && PageTransHuge(head)) {
++ if (PageTransHuge(head)) {
+ /*
+ * Non anonymous thp exists only in allocation/free time. We
+ * can't handle such a case correctly, so let's give it up.
+@@ -1017,7 +1042,7 @@ try_again:
+ ret = -EIO;
+ }
+ } else {
+- if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
++ if (PageHuge(p) || HWPoisonHandlable(p)) {
+ ret = 1;
+ } else {
+ /*
diff --git a/queue-5.12/mm-slub-actually-fix-freelist-pointer-vs-redzoning.patch b/queue-5.12/mm-slub-actually-fix-freelist-pointer-vs-redzoning.patch
new file mode 100644
index 00000000000..ec7cf5c0353
--- /dev/null
+++ b/queue-5.12/mm-slub-actually-fix-freelist-pointer-vs-redzoning.patch
@@ -0,0 +1,104 @@
+From e41a49fadbc80b60b48d3c095d9e2ee7ef7c9a8e Mon Sep 17 00:00:00 2001
+From: Kees Cook
+Date: Tue, 15 Jun 2021 18:23:26 -0700
+Subject: mm/slub: actually fix freelist pointer vs redzoning
+
+From: Kees Cook
+
+commit e41a49fadbc80b60b48d3c095d9e2ee7ef7c9a8e upstream.
+
+It turns out that SLUB redzoning ("slub_debug=Z") checks from
+s->object_size rather than from s->inuse (which is normally bumped to
+make room for the freelist pointer), so a cache created with an object
+size less than 24 would have the freelist pointer written beyond
+s->object_size, causing the redzone to be corrupted by the freelist
+pointer. This was very visible with "slub_debug=ZF":
+
+  BUG test (Tainted: G    B            ): Right Redzone overwritten
+  -----------------------------------------------------------------------------
+
+  INFO: 0xffff957ead1c05de-0xffff957ead1c05df @offset=1502. First byte 0x1a instead of 0xbb
+  INFO: Slab 0xffffef3950b47000 objects=170 used=170 fp=0x0000000000000000 flags=0x8000000000000200
+  INFO: Object 0xffff957ead1c05d8 @offset=1496 fp=0xffff957ead1c0620
+
+  Redzone  (____ptrval____): bb bb bb bb bb bb bb bb    ........
+  Object   (____ptrval____): 00 00 00 00 00 f6 f4 a5    ........
+  Redzone  (____ptrval____): 40 1d e8 1a aa             @....
+  Padding  (____ptrval____): 00 00 00 00 00 00 00 00    ........
+
+Adjust the offset to stay within s->object_size.
+
+(Note that no caches of in this size range are known to exist in the
+kernel currently.)
+
+Link: https://lkml.kernel.org/r/20210608183955.280836-4-keescook@chromium.org
+Link: https://lore.kernel.org/linux-mm/20200807160627.GA1420741@elver.google.com/
+Link: https://lore.kernel.org/lkml/0f7dd7b2-7496-5e2d-9488-2ec9f8e90441@suse.cz/Fixes: 89b83f282d8b (slub: avoid redzone when choosing freepointer location)
+Link: https://lore.kernel.org/lkml/CANpmjNOwZ5VpKQn+SYWovTkFB4VsT-RPwyENBmaK0dLcpqStkA@mail.gmail.com
+Signed-off-by: Kees Cook
+Reported-by: Marco Elver
+Reported-by: "Lin, Zhenpeng"
+Tested-by: Marco Elver
+Acked-by: Vlastimil Babka
+Cc: Christoph Lameter
+Cc: David Rientjes
+Cc: Joonsoo Kim
+Cc: Pekka Enberg
+Cc: Roman Gushchin
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/slub.c | 14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -3687,7 +3687,6 @@ static int calculate_sizes(struct kmem_c
+ {
+ slab_flags_t flags = s->flags;
+ unsigned int size = s->object_size;
+- unsigned int freepointer_area;
+ unsigned int order;
+
+ /*
+@@ -3696,13 +3695,6 @@ static int calculate_sizes(struct kmem_c
+ * the possible location of the free pointer.
+ */
+ size = ALIGN(size, sizeof(void *));
+- /*
+- * This is the area of the object where a freepointer can be
+- * safely written. If redzoning adds more to the inuse size, we
+- * can't use that portion for writing the freepointer, so
+- * s->offset must be limited within this for the general case.
+- */
+- freepointer_area = size;
+
+ #ifdef CONFIG_SLUB_DEBUG
+ /*
+@@ -3728,7 +3720,7 @@ static int calculate_sizes(struct kmem_c
+
+ /*
+ * With that we have determined the number of bytes in actual use
+- * by the object. This is the potential offset to the free pointer.
++ * by the object and redzoning.
+ */
+ s->inuse = size;
+
+@@ -3751,13 +3743,13 @@ static int calculate_sizes(struct kmem_c
+ */
+ s->offset = size;
+ size += sizeof(void *);
+- } else if (freepointer_area > sizeof(void *)) {
++ } else {
+ /*
+ * Store freelist pointer near middle of object to keep
+ * it away from the edges of the object to avoid small
+ * sized over/underflows from neighboring allocations.
+ */
+- s->offset = ALIGN(freepointer_area / 2, sizeof(void *));
++ s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
+ }
+
+ #ifdef CONFIG_SLUB_DEBUG
diff --git a/queue-5.12/mm-slub-clarify-verification-reporting.patch b/queue-5.12/mm-slub-clarify-verification-reporting.patch
new file mode 100644
index 00000000000..61ccb16b8d3
--- /dev/null
+++ b/queue-5.12/mm-slub-clarify-verification-reporting.patch
@@ -0,0 +1,147 @@
+From 8669dbab2ae56085c128894b181c2aa50f97e368 Mon Sep 17 00:00:00 2001
+From: Kees Cook
+Date: Tue, 15 Jun 2021 18:23:19 -0700
+Subject: mm/slub: clarify verification reporting
+
+From: Kees Cook
+
+commit 8669dbab2ae56085c128894b181c2aa50f97e368 upstream.
+
+Patch series "Actually fix freelist pointer vs redzoning", v4.
+
+This fixes redzoning vs the freelist pointer (both for middle-position
+and very small caches). Both are "theoretical" fixes, in that I see no
+evidence of such small-sized caches actually be used in the kernel, but
+that's no reason to let the bugs continue to exist, especially since
+people doing local development keep tripping over it. :)
+
+This patch (of 3):
+
+Instead of repeating "Redzone" and "Poison", clarify which sides of
+those zones got tripped. Additionally fix column alignment in the
+trailer.
+
+Before:
+
+  BUG test (Tainted: G    B            ): Redzone overwritten
+  ...
+  Redzone (____ptrval____): bb bb bb bb bb bb bb bb    ........
+  Object (____ptrval____): f6 f4 a5 40 1d e8           ...@..
+  Redzone (____ptrval____): 1a aa                      ..
+  Padding (____ptrval____): 00 00 00 00 00 00 00 00    ........
+
+After:
+
+  BUG test (Tainted: G    B            ): Right Redzone overwritten
+  ...
+  Redzone  (____ptrval____): bb bb bb bb bb bb bb bb    ........
+  Object   (____ptrval____): f6 f4 a5 40 1d e8          ...@..
+  Redzone  (____ptrval____): 1a aa                      ..
+  Padding  (____ptrval____): 00 00 00 00 00 00 00 00    ........
+ +The earlier commits that slowly resulted in the "Before" reporting were: + + d86bd1bece6f ("mm/slub: support left redzone") + ffc79d288000 ("slub: use print_hex_dump") + 2492268472e7 ("SLUB: change error reporting format to follow lockdep loosely") + +Link: https://lkml.kernel.org/r/20210608183955.280836-1-keescook@chromium.org +Link: https://lkml.kernel.org/r/20210608183955.280836-2-keescook@chromium.org +Link: https://lore.kernel.org/lkml/cfdb11d7-fb8e-e578-c939-f7f5fb69a6bd@suse.cz/ +Signed-off-by: Kees Cook +Acked-by: Vlastimil Babka +Cc: Marco Elver +Cc: "Lin, Zhenpeng" +Cc: Christoph Lameter +Cc: Pekka Enberg +Cc: David Rientjes +Cc: Joonsoo Kim +Cc: Roman Gushchin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/vm/slub.rst | 10 +++++----- + mm/slub.c | 14 +++++++------- + 2 files changed, 12 insertions(+), 12 deletions(-) + +--- a/Documentation/vm/slub.rst ++++ b/Documentation/vm/slub.rst +@@ -181,7 +181,7 @@ SLUB Debug output + Here is a sample of slub debug output:: + + ==================================================================== +- BUG kmalloc-8: Redzone overwritten ++ BUG kmalloc-8: Right Redzone overwritten + -------------------------------------------------------------------- + + INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc +@@ -189,10 +189,10 @@ Here is a sample of slub debug output:: + INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58 + INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554 + +- Bytes b4 0xc90f6d10: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ +- Object 0xc90f6d20: 31 30 31 39 2e 30 30 35 1019.005 +- Redzone 0xc90f6d28: 00 cc cc cc . +- Padding 0xc90f6d50: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ ++ Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ ++ Object (0xc90f6d20): 31 30 31 39 2e 30 30 35 1019.005 ++ Redzone (0xc90f6d28): 00 cc cc cc . 
++ Padding (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ + + [] dump_trace+0x63/0x1eb + [] show_trace_log_lvl+0x1a/0x2f +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -710,15 +710,15 @@ static void print_trailer(struct kmem_ca + p, p - addr, get_freepointer(s, p)); + + if (s->flags & SLAB_RED_ZONE) +- print_section(KERN_ERR, "Redzone ", p - s->red_left_pad, ++ print_section(KERN_ERR, "Redzone ", p - s->red_left_pad, + s->red_left_pad); + else if (p > addr + 16) + print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); + +- print_section(KERN_ERR, "Object ", p, ++ print_section(KERN_ERR, "Object ", p, + min_t(unsigned int, s->object_size, PAGE_SIZE)); + if (s->flags & SLAB_RED_ZONE) +- print_section(KERN_ERR, "Redzone ", p + s->object_size, ++ print_section(KERN_ERR, "Redzone ", p + s->object_size, + s->inuse - s->object_size); + + off = get_info_end(s); +@@ -730,7 +730,7 @@ static void print_trailer(struct kmem_ca + + if (off != size_from_object(s)) + /* Beginning of the filler is the free pointer */ +- print_section(KERN_ERR, "Padding ", p + off, ++ print_section(KERN_ERR, "Padding ", p + off, + size_from_object(s) - off); + + dump_stack(); +@@ -907,11 +907,11 @@ static int check_object(struct kmem_cach + u8 *endobject = object + s->object_size; + + if (s->flags & SLAB_RED_ZONE) { +- if (!check_bytes_and_report(s, page, object, "Redzone", ++ if (!check_bytes_and_report(s, page, object, "Left Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + +- if (!check_bytes_and_report(s, page, object, "Redzone", ++ if (!check_bytes_and_report(s, page, object, "Right Redzone", + endobject, val, s->inuse - s->object_size)) + return 0; + } else { +@@ -926,7 +926,7 @@ static int check_object(struct kmem_cach + if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && + (!check_bytes_and_report(s, page, p, "Poison", p, + POISON_FREE, s->object_size - 1) || +- !check_bytes_and_report(s, page, p, "Poison", ++ !check_bytes_and_report(s, page, p, "End Poison", + p + s->object_size - 1, POISON_END, 1))) + return 0; + /* diff --git a/queue-5.12/mm-slub-fix-redzoning-for-small-allocations.patch b/queue-5.12/mm-slub-fix-redzoning-for-small-allocations.patch new file mode 100644 index 00000000000..8f36a112680 --- /dev/null +++ b/queue-5.12/mm-slub-fix-redzoning-for-small-allocations.patch @@ -0,0 +1,92 @@ +From 74c1d3e081533825f2611e46edea1fcdc0701985 Mon Sep 17 00:00:00 2001 +From: Kees Cook +Date: Tue, 15 Jun 2021 18:23:22 -0700 +Subject: mm/slub: fix redzoning for small allocations + +From: Kees Cook + +commit 74c1d3e081533825f2611e46edea1fcdc0701985 upstream. + +The redzone area for SLUB exists between s->object_size and s->inuse +(which is at least the word-aligned object_size). If a cache were +created with an object_size smaller than sizeof(void *), the in-object +stored freelist pointer would overwrite the redzone (e.g. with boot +param "slub_debug=ZF"): + + BUG test (Tainted: G B ): Right Redzone overwritten + ----------------------------------------------------------------------------- + + INFO: 0xffff957ead1c05de-0xffff957ead1c05df @offset=1502. First byte 0x1a instead of 0xbb + INFO: Slab 0xffffef3950b47000 objects=170 used=170 fp=0x0000000000000000 flags=0x8000000000000200 + INFO: Object 0xffff957ead1c05d8 @offset=1496 fp=0xffff957ead1c0620 + + Redzone (____ptrval____): bb bb bb bb bb bb bb bb ........ + Object (____ptrval____): f6 f4 a5 40 1d e8 ...@.. + Redzone (____ptrval____): 1a aa .. + Padding (____ptrval____): 00 00 00 00 00 00 00 00 ........ 
+
+Store the freelist pointer out of line when object_size is smaller than
+sizeof(void *) and redzoning is enabled.
+
+Additionally remove the "smaller than sizeof(void *)" check under
+CONFIG_DEBUG_VM in kmem_cache_sanity_check() as it is now redundant:
+SLAB and SLOB both handle small sizes.
+
+(Note that no caches within this size range are known to exist in the
+kernel currently.)
+
+Link: https://lkml.kernel.org/r/20210608183955.280836-3-keescook@chromium.org
+Fixes: 81819f0fc828 ("SLUB core")
+Signed-off-by: Kees Cook
+Acked-by: Vlastimil Babka
+Cc: Christoph Lameter
+Cc: David Rientjes
+Cc: Joonsoo Kim
+Cc: "Lin, Zhenpeng"
+Cc: Marco Elver
+Cc: Pekka Enberg
+Cc: Roman Gushchin
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/slab_common.c | 3 +--
+ mm/slub.c | 8 +++++---
+ 2 files changed, 6 insertions(+), 5 deletions(-)
+
+--- a/mm/slab_common.c
++++ b/mm/slab_common.c
+@@ -89,8 +89,7 @@ EXPORT_SYMBOL(kmem_cache_size);
+ #ifdef CONFIG_DEBUG_VM
+ static int kmem_cache_sanity_check(const char *name, unsigned int size)
+ {
+- if (!name || in_interrupt() || size < sizeof(void *) ||
+- size > KMALLOC_MAX_SIZE) {
++ if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
+ pr_err("kmem_cache_create(%s) integrity check failed\n", name);
+ return -EINVAL;
+ }
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -3732,15 +3732,17 @@ static int calculate_sizes(struct kmem_c
+ */
+ s->inuse = size;
+
+- if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+- s->ctor)) {
++ if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
++ ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
++ s->ctor) {
+ /*
+ * Relocate free pointer after the object if it is not
+ * permitted to overwrite the first word of the object on
+ * kmem_cache_free.
+ *
+ * This is the case if we do RCU, have a constructor or
+- * destructor or are poisoning the objects.
++ * destructor, are poisoning the objects, or are
++ * redzoning an object smaller than sizeof(void *).
+ *
+ * The assumption that s->offset >= s->inuse means free
+ * pointer is outside of the object is used in the
diff --git a/queue-5.12/mm-slub.c-include-swab.h.patch b/queue-5.12/mm-slub.c-include-swab.h.patch
new file mode 100644
index 00000000000..b74f61550b2
--- /dev/null
+++ b/queue-5.12/mm-slub.c-include-swab.h.patch
@@ -0,0 +1,35 @@
+From 1b3865d016815cbd69a1879ca1c8a8901fda1072 Mon Sep 17 00:00:00 2001
+From: Andrew Morton
+Date: Tue, 15 Jun 2021 18:23:39 -0700
+Subject: mm/slub.c: include swab.h
+
+From: Andrew Morton
+
+commit 1b3865d016815cbd69a1879ca1c8a8901fda1072 upstream.
+
+Fixes build with CONFIG_SLAB_FREELIST_HARDENED=y.
+
+Hopefully. But it's the right thing to do anyway.
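+
+For context, the code that needs this header is the hardened freelist
+pointer obfuscation. A rough sketch of it (paraphrased from mm/slub.c;
+details vary by kernel version) shows where swab() comes in:
+
+	/*
+	 * With CONFIG_SLAB_FREELIST_HARDENED, the stored freelist pointer
+	 * is XORed with a per-cache random value and the byte-swapped
+	 * storage address; swab() is declared in <linux/swab.h>.
+	 */
+	static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
+					 unsigned long ptr_addr)
+	{
+	#ifdef CONFIG_SLAB_FREELIST_HARDENED
+		return (void *)((unsigned long)ptr ^ s->random ^ swab(ptr_addr));
+	#else
+		return ptr;
+	#endif
+	}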
+
+Fixes: 1ad53d9fa3f61 ("slub: improve bit diffusion for freelist ptr obfuscation")
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=213417
+Reported-by:
+Acked-by: Kees Cook
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/slub.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -15,6 +15,7 @@
+ #include
+ #include
+ #include
++#include <linux/swab.h>
+ #include
+ #include
+ #include "slab.h"
diff --git a/queue-5.12/mm-swap-fix-pte_same_as_swp-not-removing-uffd-wp-bit-when-compare.patch b/queue-5.12/mm-swap-fix-pte_same_as_swp-not-removing-uffd-wp-bit-when-compare.patch
new file mode 100644
index 00000000000..02b7c502726
--- /dev/null
+++ b/queue-5.12/mm-swap-fix-pte_same_as_swp-not-removing-uffd-wp-bit-when-compare.patch
@@ -0,0 +1,70 @@
+From 099dd6878b9b12d6bbfa6bf29ce0c8ddd38f6901 Mon Sep 17 00:00:00 2001
+From: Peter Xu
+Date: Tue, 15 Jun 2021 18:23:16 -0700
+Subject: mm/swap: fix pte_same_as_swp() not removing uffd-wp bit when compare
+
+From: Peter Xu
+
+commit 099dd6878b9b12d6bbfa6bf29ce0c8ddd38f6901 upstream.
+
+I found by pure code review that pte_same_as_swp(), used by unuse_vma(),
+didn't take the uffd-wp bit into account when comparing ptes.
+pte_same_as_swp() returning a false negative could cause failure to
+swapoff swap ptes that were wr-protected by userfaultfd.
+
+Link: https://lkml.kernel.org/r/20210603180546.9083-1-peterx@redhat.com
+Fixes: f45ec5ff16a7 ("userfaultfd: wp: support swap and page migration")
+Signed-off-by: Peter Xu
+Acked-by: Hugh Dickins
+Cc: Andrea Arcangeli
+Cc: [5.7+]
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/linux/swapops.h | 15 +++++++++++----
+ mm/swapfile.c | 2 +-
+ 2 files changed, 12 insertions(+), 5 deletions(-)
+
+--- a/include/linux/swapops.h
++++ b/include/linux/swapops.h
+@@ -23,6 +23,16 @@
+ #define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
+ #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
+
++/* Clear all flags but only keep swp_entry_t related information */
++static inline pte_t pte_swp_clear_flags(pte_t pte)
++{
++ if (pte_swp_soft_dirty(pte))
++ pte = pte_swp_clear_soft_dirty(pte);
++ if (pte_swp_uffd_wp(pte))
++ pte = pte_swp_clear_uffd_wp(pte);
++ return pte;
++}
++
+ /*
+ * Store a type+offset into a swp_entry_t in an arch-independent format
+ */
+@@ -66,10 +76,7 @@ static inline swp_entry_t pte_to_swp_ent
+ {
+ swp_entry_t arch_entry;
+
+- if (pte_swp_soft_dirty(pte))
+- pte = pte_swp_clear_soft_dirty(pte);
+- if (pte_swp_uffd_wp(pte))
+- pte = pte_swp_clear_uffd_wp(pte);
++ pte = pte_swp_clear_flags(pte);
+ arch_entry = __pte_to_swp_entry(pte);
+ return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
+ }
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -1900,7 +1900,7 @@ unsigned int count_swap_pages(int type,
+
+ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
+ {
+- return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
++ return pte_same(pte_swp_clear_flags(pte), swp_pte);
+ }
+
+ /*
diff --git a/queue-5.12/net-bridge-fix-vlan-tunnel-dst-null-pointer-dereference.patch b/queue-5.12/net-bridge-fix-vlan-tunnel-dst-null-pointer-dereference.patch
new file mode 100644
index 00000000000..d5916f11f8a
--- /dev/null
+++ b/queue-5.12/net-bridge-fix-vlan-tunnel-dst-null-pointer-dereference.patch
@@ -0,0 +1,135 @@
+From 58e2071742e38f29f051b709a5cca014ba51166f Mon Sep 17 00:00:00 2001
+From: Nikolay Aleksandrov
+Date: Thu, 10 Jun 2021 15:04:10 +0300
+Subject: net: bridge: fix vlan tunnel dst null pointer dereference
+
+From: Nikolay Aleksandrov
+
+commit 58e2071742e38f29f051b709a5cca014ba51166f upstream.
+
+This patch fixes a tunnel_dst null pointer dereference due to lockless
+access in the tunnel egress path. When deleting a vlan tunnel the
+tunnel_dst pointer is set to NULL without waiting a grace period (i.e.
+while it's still usable) and packets egressing are dereferencing it
+without checking. Use READ/WRITE_ONCE to annotate the lockless use of
+tunnel_id, use RCU for accessing tunnel_dst, and make sure both are read
+only once and checked in the egress path. The dst itself is already
+properly RCU protected, so nothing fancier than that is needed.
+
+Cc: stable@vger.kernel.org
+Fixes: 11538d039ac6 ("bridge: vlan dst_metadata hooks in ingress and egress paths")
+Signed-off-by: Nikolay Aleksandrov
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/bridge/br_private.h | 4 ++--
+ net/bridge/br_vlan_tunnel.c | 38 ++++++++++++++++++++++++--------------
+ 2 files changed, 26 insertions(+), 16 deletions(-)
+
+--- a/net/bridge/br_private.h
++++ b/net/bridge/br_private.h
+@@ -90,8 +90,8 @@ struct bridge_mcast_stats {
+ #endif
+
+ struct br_tunnel_info {
+- __be64 tunnel_id;
+- struct metadata_dst *tunnel_dst;
++ __be64 tunnel_id;
++ struct metadata_dst __rcu *tunnel_dst;
+ };
+
+ /* private vlan flags */
+--- a/net/bridge/br_vlan_tunnel.c
++++ b/net/bridge/br_vlan_tunnel.c
+@@ -41,26 +41,33 @@ static struct net_bridge_vlan *br_vlan_t
+ br_vlan_tunnel_rht_params);
+ }
+
++static void vlan_tunnel_info_release(struct net_bridge_vlan *vlan)
++{
++ struct metadata_dst *tdst = rtnl_dereference(vlan->tinfo.tunnel_dst);
++
++ WRITE_ONCE(vlan->tinfo.tunnel_id, 0);
++ RCU_INIT_POINTER(vlan->tinfo.tunnel_dst, NULL);
++ dst_release(&tdst->dst);
++}
++
+ void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan)
+ {
+- if (!vlan->tinfo.tunnel_dst)
++ if (!rcu_access_pointer(vlan->tinfo.tunnel_dst))
+ return;
+ rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode,
+ br_vlan_tunnel_rht_params);
+- vlan->tinfo.tunnel_id = 0;
+- dst_release(&vlan->tinfo.tunnel_dst->dst);
+- vlan->tinfo.tunnel_dst = NULL;
++ vlan_tunnel_info_release(vlan);
+ }
+
+ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan, u32 tun_id)
+ {
+- struct metadata_dst *metadata = NULL;
++ struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst);
+ __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id));
+ int err;
+
+- if (vlan->tinfo.tunnel_dst)
++ if (metadata)
+ return -EEXIST;
+
+ metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY,
+@@ -69,8 +76,8 @@ static int __vlan_tunnel_info_add(struct
+ return -EINVAL;
+
+ metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE;
+- vlan->tinfo.tunnel_dst = metadata;
+- vlan->tinfo.tunnel_id = key;
++ rcu_assign_pointer(vlan->tinfo.tunnel_dst, metadata);
++ WRITE_ONCE(vlan->tinfo.tunnel_id, key);
+
+ err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode,
+ br_vlan_tunnel_rht_params);
+@@ -79,9 +86,7 @@ static int __vlan_tunnel_info_add(struct
+
+ return 0;
+ out:
+- dst_release(&vlan->tinfo.tunnel_dst->dst);
+- vlan->tinfo.tunnel_dst = NULL;
+- vlan->tinfo.tunnel_id = 0;
++ vlan_tunnel_info_release(vlan);
+
+ return err;
+ }
+@@ -182,12 +187,15 @@ int br_handle_ingress_vlan_tunnel(struct
+ int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_vlan *vlan)
+ {
++ struct metadata_dst *tunnel_dst;
++ __be64 tunnel_id;
+ int err;
+
+- if (!vlan || !vlan->tinfo.tunnel_id)
++ if (!vlan)
+ return 0;
+
+- if (unlikely(!skb_vlan_tag_present(skb)))
++ tunnel_id = READ_ONCE(vlan->tinfo.tunnel_id);
++ if (!tunnel_id || unlikely(!skb_vlan_tag_present(skb)))
+ return 0;
+
+ skb_dst_drop(skb);
+@@ -195,7 +203,9 @@ int br_handle_egress_vlan_tunnel(struct
+ if (err)
+ return err;
+
+- skb_dst_set(skb, dst_clone(&vlan->tinfo.tunnel_dst->dst));
++ tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst);
++ if (tunnel_dst)
++ skb_dst_set(skb, dst_clone(&tunnel_dst->dst));
+
+ return 0;
+ }
diff --git a/queue-5.12/net-bridge-fix-vlan-tunnel-dst-refcnt-when-egressing.patch b/queue-5.12/net-bridge-fix-vlan-tunnel-dst-refcnt-when-egressing.patch
new file mode 100644
index 00000000000..b1c112f5bdd
--- /dev/null
+++ b/queue-5.12/net-bridge-fix-vlan-tunnel-dst-refcnt-when-egressing.patch
@@ -0,0 +1,87 @@
+From cfc579f9d89af4ada58c69b03bcaa4887840f3b3 Mon Sep 17 00:00:00 2001
+From: Nikolay Aleksandrov
+Date: Thu, 10 Jun 2021 15:04:11 +0300
+Subject: net: bridge: fix vlan tunnel dst refcnt when egressing
+
+From: Nikolay Aleksandrov
+
+commit cfc579f9d89af4ada58c69b03bcaa4887840f3b3 upstream.
+
+The egress tunnel code uses dst_clone() and directly sets the result
+which is wrong because the entry might have 0 refcnt or be already deleted,
+causing a number of problems. It also triggers the WARN_ON() in dst_hold()[1]
+when a refcnt couldn't be taken. Fix it by using dst_hold_safe() and
+checking if a reference was actually taken before setting the dst.
+
+[1] dmesg WARN_ON log and following refcnt errors
+ WARNING: CPU: 5 PID: 38 at include/net/dst.h:230 br_handle_egress_vlan_tunnel+0x10b/0x134 [bridge]
+ Modules linked in: 8021q garp mrp bridge stp llc bonding ipv6 virtio_net
+ CPU: 5 PID: 38 Comm: ksoftirqd/5 Kdump: loaded Tainted: G W 5.13.0-rc3+ #360
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014
+ RIP: 0010:br_handle_egress_vlan_tunnel+0x10b/0x134 [bridge]
+ Code: e8 85 bc 01 e1 45 84 f6 74 90 45 31 f6 85 db 48 c7 c7 a0 02 19 a0 41 0f 94 c6 31 c9 31 d2 44 89 f6 e8 64 bc 01 e1 85 db 75 02 <0f> 0b 31 c9 31 d2 44 89 f6 48 c7 c7 70 02 19 a0 e8 4b bc 01 e1 49
+ RSP: 0018:ffff8881003d39e8 EFLAGS: 00010246
+ RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
+ RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffa01902a0
+ RBP: ffff8881040c6700 R08: 0000000000000000 R09: 0000000000000001
+ R10: 2ce93d0054fe0d00 R11: 54fe0d00000e0000 R12: ffff888109515000
+ R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000401
+ FS: 0000000000000000(0000) GS:ffff88822bf40000(0000) knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 00007f42ba70f030 CR3: 0000000109926000 CR4: 00000000000006e0
+ Call Trace:
+ br_handle_vlan+0xbc/0xca [bridge]
+ __br_forward+0x23/0x164 [bridge]
+ deliver_clone+0x41/0x48 [bridge]
+ br_handle_frame_finish+0x36f/0x3aa [bridge]
+ ? skb_dst+0x2e/0x38 [bridge]
+ ? br_handle_ingress_vlan_tunnel+0x3e/0x1c8 [bridge]
+ ? br_handle_frame_finish+0x3aa/0x3aa [bridge]
+ br_handle_frame+0x2c3/0x377 [bridge]
+ ? __skb_pull+0x33/0x51
+ ? vlan_do_receive+0x4f/0x36a
+ ? br_handle_frame_finish+0x3aa/0x3aa [bridge]
+ __netif_receive_skb_core+0x539/0x7c6
+ ? __list_del_entry_valid+0x16e/0x1c2
+ __netif_receive_skb_list_core+0x6d/0xd6
+ netif_receive_skb_list_internal+0x1d9/0x1fa
+ gro_normal_list+0x22/0x3e
+ dev_gro_receive+0x55b/0x600
+ ? detach_buf_split+0x58/0x140
+ napi_gro_receive+0x94/0x12e
+ virtnet_poll+0x15d/0x315 [virtio_net]
+ __napi_poll+0x2c/0x1c9
+ net_rx_action+0xe6/0x1fb
+ __do_softirq+0x115/0x2d8
+ run_ksoftirqd+0x18/0x20
+ smpboot_thread_fn+0x183/0x19c
+ ? smpboot_unregister_percpu_thread+0x66/0x66
+ kthread+0x10a/0x10f
+ ? kthread_mod_delayed_work+0xb6/0xb6
+ ret_from_fork+0x22/0x30
+ ---[ end trace 49f61b07f775fd2b ]---
+ dst_release: dst:00000000c02d677a refcnt:-1
+ dst_release underflow
+
+Cc: stable@vger.kernel.org
+Fixes: 11538d039ac6 ("bridge: vlan dst_metadata hooks in ingress and egress paths")
+Signed-off-by: Nikolay Aleksandrov
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/bridge/br_vlan_tunnel.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/bridge/br_vlan_tunnel.c
++++ b/net/bridge/br_vlan_tunnel.c
+@@ -204,8 +204,8 @@ int br_handle_egress_vlan_tunnel(struct
+ return err;
+
+ tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst);
+- if (tunnel_dst)
+- skb_dst_set(skb, dst_clone(&tunnel_dst->dst));
++ if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst))
++ skb_dst_set(skb, &tunnel_dst->dst);
+
+ return 0;
+ }
diff --git a/queue-5.12/net-ll_temac-fix-tx-bd-buffer-overwrite.patch b/queue-5.12/net-ll_temac-fix-tx-bd-buffer-overwrite.patch
new file mode 100644
index 00000000000..8035548fc67
--- /dev/null
+++ b/queue-5.12/net-ll_temac-fix-tx-bd-buffer-overwrite.patch
@@ -0,0 +1,36 @@
+From c364df2489b8ef2f5e3159b1dff1ff1fdb16040d Mon Sep 17 00:00:00 2001
+From: Esben Haabendal
+Date: Fri, 18 Jun 2021 12:52:33 +0200
+Subject: net: ll_temac: Fix TX BD buffer overwrite
+
+From: Esben Haabendal
+
+commit c364df2489b8ef2f5e3159b1dff1ff1fdb16040d upstream.
+
+Just as with the initial check, we need to ensure that num_frag+1 buffers
+are available, as that is the number of buffers we are going to use.
+
+This fixes a buffer overflow, which might be seen during heavy network
+load. Complete lockup of TEMAC was reproducible within about 10 minutes of
+a particular load.
+
+Fixes: 84823ff80f74 ("net: ll_temac: Fix race condition causing TX hang")
+Cc: stable@vger.kernel.org # v5.4+
+Signed-off-by: Esben Haabendal
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/net/ethernet/xilinx/ll_temac_main.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
++++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
+@@ -849,7 +849,7 @@ temac_start_xmit(struct sk_buff *skb, st
+ smp_mb();
+
+ /* Space might have just been freed - check again */
+- if (temac_check_tx_bd_space(lp, num_frag))
++ if (temac_check_tx_bd_space(lp, num_frag + 1))
+ return NETDEV_TX_BUSY;
+
+ netif_wake_queue(ndev);
diff --git a/queue-5.12/net-ll_temac-make-sure-to-free-skb-when-it-is-completely-used.patch b/queue-5.12/net-ll_temac-make-sure-to-free-skb-when-it-is-completely-used.patch
new file mode 100644
index 00000000000..ac1e38de387
--- /dev/null
+++ b/queue-5.12/net-ll_temac-make-sure-to-free-skb-when-it-is-completely-used.patch
@@ -0,0 +1,49 @@
+From 6aa32217a9a446275440ee8724b1ecaf1838df47 Mon Sep 17 00:00:00 2001
+From: Esben Haabendal
+Date: Fri, 18 Jun 2021 12:52:23 +0200
+Subject: net: ll_temac: Make sure to free skb when it is completely used
+
+From: Esben Haabendal
+
+commit 6aa32217a9a446275440ee8724b1ecaf1838df47 upstream.
+
+With the skb pointer piggy-backed on the TX BD, we have a simple and
+efficient way to free the skb buffer when the frame has been transmitted.
+But in order to avoid freeing the skb while there are still fragments from
+the skb in use, we need to piggy-back on the TX BD of the last fragment,
+not the first.
+
+Without this, we are doing use-after-free on the DMA side, when the first
+BD of a multi TX BD packet is seen as completed in xmit_done, and the
+remaining BDs are still being processed.
+
+Cc: stable@vger.kernel.org # v5.4+
+Signed-off-by: Esben Haabendal
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/net/ethernet/xilinx/ll_temac_main.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
++++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
+@@ -876,7 +876,6 @@ temac_start_xmit(struct sk_buff *skb, st
+ return NETDEV_TX_OK;
+ }
+ cur_p->phys = cpu_to_be32(skb_dma_addr);
+- ptr_to_txbd((void *)skb, cur_p);
+
+ for (ii = 0; ii < num_frag; ii++) {
+ if (++lp->tx_bd_tail >= lp->tx_bd_num)
+@@ -915,6 +914,11 @@ temac_start_xmit(struct sk_buff *skb, st
+ }
+ cur_p->app0 |= cpu_to_be32(STS_CTRL_APP0_EOP);
+
++ /* Mark last fragment with skb address, so it can be consumed
++ * in temac_start_xmit_done()
++ */
++ ptr_to_txbd((void *)skb, cur_p);
++
+ tail_p = lp->tx_bd_p + sizeof(*lp->tx_bd_v) * lp->tx_bd_tail;
+ lp->tx_bd_tail++;
+ if (lp->tx_bd_tail >= lp->tx_bd_num)
diff --git a/queue-5.12/powerpc-perf-fix-crash-in-perf_instruction_pointer-when-ppmu-is-not-set.patch b/queue-5.12/powerpc-perf-fix-crash-in-perf_instruction_pointer-when-ppmu-is-not-set.patch
new file mode 100644
index 00000000000..b164a190083
--- /dev/null
+++ b/queue-5.12/powerpc-perf-fix-crash-in-perf_instruction_pointer-when-ppmu-is-not-set.patch
@@ -0,0 +1,62 @@
+From 60b7ed54a41b550d50caf7f2418db4a7e75b5bdc Mon Sep 17 00:00:00 2001
+From: Athira Rajeev
+Date: Thu, 17 Jun 2021 13:55:06 -0400
+Subject: powerpc/perf: Fix crash in perf_instruction_pointer() when ppmu is not set
+
+From: Athira Rajeev
+
+commit 60b7ed54a41b550d50caf7f2418db4a7e75b5bdc upstream.
+
+On systems without any specific PMU driver support registered, running
+perf record causes an Oops.
+
+The relevant portion of the call trace:
+
+ BUG: Kernel NULL pointer dereference on read at 0x00000040
+ Faulting instruction address: 0xc0021f0c
+ Oops: Kernel access of bad area, sig: 11 [#1]
+ BE PAGE_SIZE=4K PREEMPT CMPCPRO
+ SAF3000 DIE NOTIFICATION
+ CPU: 0 PID: 442 Comm: null_syscall Not tainted 5.13.0-rc6-s3k-dev-01645-g7649ee3d2957 #5164
+ NIP: c0021f0c LR: c00e8ad8 CTR: c00d8a5c
+ NIP perf_instruction_pointer+0x10/0x60
+ LR perf_prepare_sample+0x344/0x674
+ Call Trace:
+ perf_prepare_sample+0x7c/0x674 (unreliable)
+ perf_event_output_forward+0x3c/0x94
+ __perf_event_overflow+0x74/0x14c
+ perf_swevent_hrtimer+0xf8/0x170
+ __hrtimer_run_queues.constprop.0+0x160/0x318
+ hrtimer_interrupt+0x148/0x3b0
+ timer_interrupt+0xc4/0x22c
+ Decrementer_virt+0xb8/0xbc
+
+During a perf record session, perf_instruction_pointer() is called to
+capture the sample IP. This function in core-book3s accesses
+ppmu->flags. If a platform-specific PMU driver is not registered, ppmu
+is set to NULL and accessing its members results in a crash. Fix this
+crash by checking if ppmu is set.
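+
+Note that no hardware PMU support is needed to reach this path: a
+software sampling event is enough, since PERF_SAMPLE_IP makes
+perf_prepare_sample() call perf_instruction_pointer(). A minimal
+userspace reproducer sketch (hypothetical values, not taken from the
+report above):
+
+	struct perf_event_attr attr = {
+		.type          = PERF_TYPE_SOFTWARE,
+		.config        = PERF_COUNT_SW_CPU_CLOCK,
+		.sample_period = 100000,
+		.sample_type   = PERF_SAMPLE_IP,
+	};
+	/* hrtimer-driven sampling, matching the call trace above */
+	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);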
+ +Fixes: 2ca13a4cc56c ("powerpc/perf: Use regs->nip when SIAR is zero") +Cc: stable@vger.kernel.org # v5.11+ +Reported-by: Christophe Leroy +Signed-off-by: Athira Rajeev +Tested-by: Christophe Leroy +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/1623952506-1431-1-git-send-email-atrajeev@linux.vnet.ibm.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/perf/core-book3s.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/powerpc/perf/core-book3s.c ++++ b/arch/powerpc/perf/core-book3s.c +@@ -2242,7 +2242,7 @@ unsigned long perf_instruction_pointer(s + bool use_siar = regs_use_siar(regs); + unsigned long siar = mfspr(SPRN_SIAR); + +- if (ppmu->flags & PPMU_P10_DD1) { ++ if (ppmu && (ppmu->flags & PPMU_P10_DD1)) { + if (siar) + return siar; + else diff --git a/queue-5.12/series b/queue-5.12/series index 1213163cf4d..a5d4907d785 100644 --- a/queue-5.12/series +++ b/queue-5.12/series @@ -137,3 +137,36 @@ kvm-x86-fix-x86_emulator-slab-cache-leak.patch s390-mcck-fix-calculation-of-sie-critical-section-size.patch s390-ap-fix-hanging-ioctl-caused-by-wrong-msg-counter.patch arcv2-save-abi-registers-across-signal-handling.patch +x86-mm-avoid-truncating-memblocks-for-sgx-memory.patch +x86-process-check-pf_kthread-and-not-current-mm-for-kernel-threads.patch +x86-ioremap-map-efi-reserved-memory-as-encrypted-for-sev.patch +x86-pkru-write-hardware-init-value-to-pkru-when-xstate-is-init.patch +x86-fpu-prevent-state-corruption-in-__fpu__restore_sig.patch +x86-fpu-invalidate-fpu-state-after-a-failed-xrstor-from-a-user-buffer.patch +x86-fpu-reset-state-for-all-signal-restore-failures.patch +powerpc-perf-fix-crash-in-perf_instruction_pointer-when-ppmu-is-not-set.patch +makefile-lto-pass-warn-stack-size-only-on-lld-13.0.0.patch +crash_core-vmcoreinfo-append-section_size_bits-to-vmcoreinfo.patch +dmaengine-pl330-fix-wrong-usage-of-spinlock-flags-in-dma_cyclc.patch +mac80211-fix-deadlock-in-ap-vlan-handling.patch +mac80211-fix-null-ptr-deref-for-injected-rate-info.patch +mac80211-fix-reset-debugfs-locking.patch +cfg80211-fix-phy80211-symlink-creation.patch +cfg80211-shut-down-interfaces-on-failed-resume.patch +mac80211-move-interface-shutdown-out-of-wiphy-lock.patch +mac80211-minstrel_ht-fix-sample-time-check.patch +cfg80211-make-certificate-generation-more-robust.patch +cfg80211-avoid-double-free-of-pmsr-request.patch +drm-amdgpu-gfx10-enlarge-cp_mec_doorbell_range_upper-to-cover-full-doorbell.patch +drm-amdgpu-gfx9-fix-the-doorbell-missing-when-in-cgpg-issue.patch +net-ll_temac-make-sure-to-free-skb-when-it-is-completely-used.patch +net-ll_temac-fix-tx-bd-buffer-overwrite.patch +net-bridge-fix-vlan-tunnel-dst-null-pointer-dereference.patch +net-bridge-fix-vlan-tunnel-dst-refcnt-when-egressing.patch +mm-hwpoison-fix-race-with-hugetlb-page-allocation.patch +mm-swap-fix-pte_same_as_swp-not-removing-uffd-wp-bit-when-compare.patch +mm-hugetlb-expand-restore_reserve_on_error-functionality.patch +mm-slub-clarify-verification-reporting.patch +mm-slub-fix-redzoning-for-small-allocations.patch +mm-slub-actually-fix-freelist-pointer-vs-redzoning.patch +mm-slub.c-include-swab.h.patch diff --git a/queue-5.12/x86-fpu-invalidate-fpu-state-after-a-failed-xrstor-from-a-user-buffer.patch b/queue-5.12/x86-fpu-invalidate-fpu-state-after-a-failed-xrstor-from-a-user-buffer.patch new file mode 100644 index 00000000000..07700f32c35 --- /dev/null +++ b/queue-5.12/x86-fpu-invalidate-fpu-state-after-a-failed-xrstor-from-a-user-buffer.patch @@ -0,0 +1,74 @@ +From 
d8778e393afa421f1f117471144f8ce6deb6953a Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Tue, 8 Jun 2021 16:36:19 +0200 +Subject: x86/fpu: Invalidate FPU state after a failed XRSTOR from a user buffer + +From: Andy Lutomirski + +commit d8778e393afa421f1f117471144f8ce6deb6953a upstream. + +Both Intel and AMD consider it to be architecturally valid for XRSTOR to +fail with #PF but nonetheless change the register state. The actual +conditions under which this might occur are unclear [1], but it seems +plausible that this might be triggered if one sibling thread unmaps a page +and invalidates the shared TLB while another sibling thread is executing +XRSTOR on the page in question. + +__fpu__restore_sig() can execute XRSTOR while the hardware registers +are preserved on behalf of a different victim task (using the +fpu_fpregs_owner_ctx mechanism), and, in theory, XRSTOR could fail but +modify the registers. + +If this happens, then there is a window in which __fpu__restore_sig() +could schedule out and the victim task could schedule back in without +reloading its own FPU registers. This would result in part of the FPU +state that __fpu__restore_sig() was attempting to load leaking into the +victim task's user-visible state. + +Invalidate preserved FPU registers on XRSTOR failure to prevent this +situation from corrupting any state. + +[1] Frequent readers of the errata lists might imagine "complex + microarchitectural conditions". + +Fixes: 1d731e731c4c ("x86/fpu: Add a fastpath to __fpu__restore_sig()") +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Signed-off-by: Borislav Petkov +Acked-by: Dave Hansen +Acked-by: Rik van Riel +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20210608144345.758116583@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/fpu/signal.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +--- a/arch/x86/kernel/fpu/signal.c ++++ b/arch/x86/kernel/fpu/signal.c +@@ -369,6 +369,25 @@ static int __fpu__restore_sig(void __use + fpregs_unlock(); + return 0; + } ++ ++ /* ++ * The above did an FPU restore operation, restricted to ++ * the user portion of the registers, and failed, but the ++ * microcode might have modified the FPU registers ++ * nevertheless. ++ * ++ * If the FPU registers do not belong to current, then ++ * invalidate the FPU register state otherwise the task might ++ * preempt current and return to user space with corrupted ++ * FPU registers. ++ * ++ * In case current owns the FPU registers then no further ++ * action is required. The fixup below will handle it ++ * correctly. ++ */ ++ if (test_thread_flag(TIF_NEED_FPU_LOAD)) ++ __cpu_invalidate_fpregs_state(); ++ + fpregs_unlock(); + } else { + /* diff --git a/queue-5.12/x86-fpu-prevent-state-corruption-in-__fpu__restore_sig.patch b/queue-5.12/x86-fpu-prevent-state-corruption-in-__fpu__restore_sig.patch new file mode 100644 index 00000000000..3b9caf02c8a --- /dev/null +++ b/queue-5.12/x86-fpu-prevent-state-corruption-in-__fpu__restore_sig.patch @@ -0,0 +1,64 @@ +From 484cea4f362e1eeb5c869abbfb5f90eae6421b38 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Tue, 8 Jun 2021 16:36:18 +0200 +Subject: x86/fpu: Prevent state corruption in __fpu__restore_sig() + +From: Thomas Gleixner + +commit 484cea4f362e1eeb5c869abbfb5f90eae6421b38 upstream. + +The non-compacted slowpath uses __copy_from_user() and copies the entire +user buffer into the kernel buffer, verbatim. 
This means that the kernel
+buffer may now contain entirely invalid state on which XRSTOR will #GP.
+validate_user_xstate_header() can detect some of that corruption, but that
+leaves the onus on callers to clear the buffer.
+
+Prior to XSAVES support, it was possible to just reinitialize the buffer
+completely, but with supervisor states that is no longer possible as the
+buffer clearing code split got it backwards. Fixing that is possible but
+not corrupting the state in the first place is more robust.
+
+Avoid corruption of the kernel XSAVE buffer by using copy_user_to_xstate()
+which validates the XSAVE header contents before copying the actual states
+to the kernel. copy_user_to_xstate() was previously only called for
+compacted-format kernel buffers, but it works for both compacted and
+non-compacted forms.
+
+Using it for the non-compacted form is slower because of multiple
+__copy_from_user() operations, but that cost is less important than robust
+code in an already slow path.
+
+[ Changelog polished by Dave Hansen ]
+
+Fixes: b860eb8dce59 ("x86/fpu/xstate: Define new functions for clearing fpregs and xstates")
+Reported-by: syzbot+2067e764dbcd10721e2e@syzkaller.appspotmail.com
+Signed-off-by: Thomas Gleixner
+Signed-off-by: Borislav Petkov
+Reviewed-by: Borislav Petkov
+Acked-by: Dave Hansen
+Acked-by: Rik van Riel
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/20210608144345.611833074@linutronix.de
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/kernel/fpu/signal.c | 9 +--------
+ 1 file changed, 1 insertion(+), 8 deletions(-)
+
+--- a/arch/x86/kernel/fpu/signal.c
++++ b/arch/x86/kernel/fpu/signal.c
+@@ -405,14 +405,7 @@ static int __fpu__restore_sig(void __use
+ if (use_xsave() && !fx_only) {
+ u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
+
+- if (using_compacted_format()) {
+- ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
+- } else {
+- ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
+-
+- if (!ret && state_size > offsetof(struct xregs_state, header))
+- ret = validate_user_xstate_header(&fpu->state.xsave.header);
+- }
++ ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
+ if (ret)
+ goto err_out;
+
diff --git a/queue-5.12/x86-fpu-reset-state-for-all-signal-restore-failures.patch b/queue-5.12/x86-fpu-reset-state-for-all-signal-restore-failures.patch
new file mode 100644
index 00000000000..34e989aafc3
--- /dev/null
+++ b/queue-5.12/x86-fpu-reset-state-for-all-signal-restore-failures.patch
@@ -0,0 +1,96 @@
+From efa165504943f2128d50f63de0c02faf6dcceb0d Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner
+Date: Wed, 9 Jun 2021 21:18:00 +0200
+Subject: x86/fpu: Reset state for all signal restore failures
+
+From: Thomas Gleixner
+
+commit efa165504943f2128d50f63de0c02faf6dcceb0d upstream.
+
+If access_ok() or fpregs_soft_set() fails in __fpu__restore_sig() then the
+function just returns but does not clear the FPU state as it does for all
+other fatal failures.
+
+Clear the FPU state for these failures as well.
+ +Fixes: 72a671ced66d ("x86, fpu: Unify signal handling code paths for x86 and x86_64 kernels") +Signed-off-by: Thomas Gleixner +Signed-off-by: Borislav Petkov +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/87mtryyhhz.ffs@nanos.tec.linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/fpu/signal.c | 26 +++++++++++++++----------- + 1 file changed, 15 insertions(+), 11 deletions(-) + +--- a/arch/x86/kernel/fpu/signal.c ++++ b/arch/x86/kernel/fpu/signal.c +@@ -307,13 +307,17 @@ static int __fpu__restore_sig(void __use + return 0; + } + +- if (!access_ok(buf, size)) +- return -EACCES; ++ if (!access_ok(buf, size)) { ++ ret = -EACCES; ++ goto out; ++ } + +- if (!static_cpu_has(X86_FEATURE_FPU)) +- return fpregs_soft_set(current, NULL, +- 0, sizeof(struct user_i387_ia32_struct), +- NULL, buf) != 0; ++ if (!static_cpu_has(X86_FEATURE_FPU)) { ++ ret = fpregs_soft_set(current, NULL, 0, ++ sizeof(struct user_i387_ia32_struct), ++ NULL, buf); ++ goto out; ++ } + + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; +@@ -396,7 +400,7 @@ static int __fpu__restore_sig(void __use + */ + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) +- goto err_out; ++ goto out; + envp = &env; + } + +@@ -426,7 +430,7 @@ static int __fpu__restore_sig(void __use + + ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); + if (ret) +- goto err_out; ++ goto out; + + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, + fx_only); +@@ -446,7 +450,7 @@ static int __fpu__restore_sig(void __use + ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); + if (ret) { + ret = -EFAULT; +- goto err_out; ++ goto out; + } + + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, +@@ -464,7 +468,7 @@ static int __fpu__restore_sig(void __use + } else { + ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); + if (ret) +- goto err_out; ++ goto out; + + fpregs_lock(); + ret = copy_kernel_to_fregs_err(&fpu->state.fsave); +@@ -475,7 +479,7 @@ static int __fpu__restore_sig(void __use + fpregs_deactivate(fpu); + fpregs_unlock(); + +-err_out: ++out: + if (ret) + fpu__clear_user_states(fpu); + return ret; diff --git a/queue-5.12/x86-ioremap-map-efi-reserved-memory-as-encrypted-for-sev.patch b/queue-5.12/x86-ioremap-map-efi-reserved-memory-as-encrypted-for-sev.patch new file mode 100644 index 00000000000..0f030876da7 --- /dev/null +++ b/queue-5.12/x86-ioremap-map-efi-reserved-memory-as-encrypted-for-sev.patch @@ -0,0 +1,67 @@ +From 8d651ee9c71bb12fc0c8eb2786b66cbe5aa3e43b Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Tue, 8 Jun 2021 11:54:33 +0200 +Subject: x86/ioremap: Map EFI-reserved memory as encrypted for SEV + +From: Tom Lendacky + +commit 8d651ee9c71bb12fc0c8eb2786b66cbe5aa3e43b upstream. + +Some drivers require memory that is marked as EFI boot services +data. In order for this memory to not be re-used by the kernel +after ExitBootServices(), efi_mem_reserve() is used to preserve it +by inserting a new EFI memory descriptor and marking it with the +EFI_MEMORY_RUNTIME attribute. 
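+
+Such a reservation is typically made early in boot, along these lines
+(illustrative sketch only; table_addr and table_size are placeholders,
+not taken from this patch):
+
+	/*
+	 * Keep a firmware table that lives in EFI boot services data
+	 * alive across ExitBootServices(): efi_mem_reserve() inserts a
+	 * new memory descriptor marked EFI_MEMORY_RUNTIME for it.
+	 */
+	if (table_addr != EFI_INVALID_TABLE_ADDR)
+		efi_mem_reserve(table_addr, table_size);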
+ +Under SEV, memory marked with the EFI_MEMORY_RUNTIME attribute needs to +be mapped encrypted by Linux, otherwise the kernel might crash at boot +like below: + + EFI Variables Facility v0.08 2004-May-17 + general protection fault, probably for non-canonical address 0x3597688770a868b2: 0000 [#1] SMP NOPTI + CPU: 13 PID: 1 Comm: swapper/0 Not tainted 5.12.4-2-default #1 openSUSE Tumbleweed + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 + RIP: 0010:efi_mokvar_entry_next + [...] + Call Trace: + efi_mokvar_sysfs_init + ? efi_mokvar_table_init + do_one_initcall + ? __kmalloc + kernel_init_freeable + ? rest_init + kernel_init + ret_from_fork + +Expand the __ioremap_check_other() function to additionally check for +this other type of boot data reserved at runtime and indicate that it +should be mapped encrypted for an SEV guest. + + [ bp: Massage commit message. ] + +Fixes: 58c909022a5a ("efi: Support for MOK variable config table") +Reported-by: Joerg Roedel +Signed-off-by: Tom Lendacky +Signed-off-by: Joerg Roedel +Signed-off-by: Borislav Petkov +Tested-by: Joerg Roedel +Cc: # 5.10+ +Link: https://lkml.kernel.org/r/20210608095439.12668-2-joro@8bytes.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/mm/ioremap.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/arch/x86/mm/ioremap.c ++++ b/arch/x86/mm/ioremap.c +@@ -118,7 +118,9 @@ static void __ioremap_check_other(resour + if (!IS_ENABLED(CONFIG_EFI)) + return; + +- if (efi_mem_type(addr) == EFI_RUNTIME_SERVICES_DATA) ++ if (efi_mem_type(addr) == EFI_RUNTIME_SERVICES_DATA || ++ (efi_mem_type(addr) == EFI_BOOT_SERVICES_DATA && ++ efi_mem_attributes(addr) & EFI_MEMORY_RUNTIME)) + desc->flags |= IORES_MAP_ENCRYPTED; + } + diff --git a/queue-5.12/x86-mm-avoid-truncating-memblocks-for-sgx-memory.patch b/queue-5.12/x86-mm-avoid-truncating-memblocks-for-sgx-memory.patch new file mode 100644 index 00000000000..91e80e5f7b5 --- /dev/null +++ b/queue-5.12/x86-mm-avoid-truncating-memblocks-for-sgx-memory.patch @@ -0,0 +1,95 @@ +From 28e5e44aa3f4e0e0370864ed008fb5e2d85f4dc8 Mon Sep 17 00:00:00 2001 +From: Fan Du +Date: Thu, 17 Jun 2021 12:46:57 -0700 +Subject: x86/mm: Avoid truncating memblocks for SGX memory + +From: Fan Du + +commit 28e5e44aa3f4e0e0370864ed008fb5e2d85f4dc8 upstream. + +tl;dr: + +Several SGX users reported seeing the following message on NUMA systems: + + sgx: [Firmware Bug]: Unable to map EPC section to online node. Fallback to the NUMA node 0. + +This turned out to be the memblock code mistakenly throwing away SGX +memory. + +=== Full Changelog === + +The 'max_pfn' variable represents the highest known RAM address. It can +be used, for instance, to quickly determine for which physical addresses +there is mem_map[] space allocated. The numa_meminfo code makes an +effort to throw out ("trim") all memory blocks which are above 'max_pfn'. + +SGX memory is not considered RAM (it is marked as "Reserved" in the +e820) and is not taken into account by max_pfn. Despite this, SGX memory +areas have NUMA affinity and are enumerated in the ACPI SRAT table. The +existing SGX code uses the numa_meminfo mechanism to look up the NUMA +affinity for its memory areas. + +In cases where SGX memory was above max_pfn (usually just the one EPC +section in the last highest NUMA node), the numa_memblock is truncated +at 'max_pfn', which is below the SGX memory. 
When the SGX code tries to +look up the affinity of this memory, it fails and produces an error message: + + sgx: [Firmware Bug]: Unable to map EPC section to online node. Fallback to the NUMA node 0. + +and assigns the memory to NUMA node 0. + +Instead of silently truncating the memory block at 'max_pfn' and +dropping the SGX memory, add the truncated portion to +'numa_reserved_meminfo'. This allows the SGX code to later determine +the NUMA affinity of its 'Reserved' area. + +Before, numa_meminfo looked like this (from 'crash'): + + blk = { start = 0x0, end = 0x2080000000, nid = 0x0 } + { start = 0x2080000000, end = 0x4000000000, nid = 0x1 } + +numa_reserved_meminfo is empty. + +With this, numa_meminfo looks like this: + + blk = { start = 0x0, end = 0x2080000000, nid = 0x0 } + { start = 0x2080000000, end = 0x4000000000, nid = 0x1 } + +and numa_reserved_meminfo has an entry for node 1's SGX memory: + + blk = { start = 0x4000000000, end = 0x4080000000, nid = 0x1 } + + [ daveh: completely rewrote/reworked changelog ] + +Fixes: 5d30f92e7631 ("x86/NUMA: Provide a range-to-target_node lookup facility") +Reported-by: Reinette Chatre +Signed-off-by: Fan Du +Signed-off-by: Dave Hansen +Signed-off-by: Borislav Petkov +Reviewed-by: Jarkko Sakkinen +Reviewed-by: Dan Williams +Reviewed-by: Dave Hansen +Cc: +Link: https://lkml.kernel.org/r/20210617194657.0A99CB22@viggo.jf.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/mm/numa.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/arch/x86/mm/numa.c ++++ b/arch/x86/mm/numa.c +@@ -254,7 +254,13 @@ int __init numa_cleanup_meminfo(struct n + + /* make sure all non-reserved blocks are inside the limits */ + bi->start = max(bi->start, low); +- bi->end = min(bi->end, high); ++ ++ /* preserve info for non-RAM areas above 'max_pfn': */ ++ if (bi->end > high) { ++ numa_add_memblk_to(bi->nid, high, bi->end, ++ &numa_reserved_meminfo); ++ bi->end = high; ++ } + + /* and there's no empty block */ + if (bi->start >= bi->end) diff --git a/queue-5.12/x86-pkru-write-hardware-init-value-to-pkru-when-xstate-is-init.patch b/queue-5.12/x86-pkru-write-hardware-init-value-to-pkru-when-xstate-is-init.patch new file mode 100644 index 00000000000..70ff8700edd --- /dev/null +++ b/queue-5.12/x86-pkru-write-hardware-init-value-to-pkru-when-xstate-is-init.patch @@ -0,0 +1,93 @@ +From 510b80a6a0f1a0d114c6e33bcea64747d127973c Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Tue, 8 Jun 2021 16:36:21 +0200 +Subject: x86/pkru: Write hardware init value to PKRU when xstate is init + +From: Thomas Gleixner + +commit 510b80a6a0f1a0d114c6e33bcea64747d127973c upstream. + +When user space brings PKRU into init state, then the kernel handling is +broken: + + T1 user space + xsave(state) + state.header.xfeatures &= ~XFEATURE_MASK_PKRU; + xrstor(state) + + T1 -> kernel + schedule() + XSAVE(S) -> T1->xsave.header.xfeatures[PKRU] == 0 + T1->flags |= TIF_NEED_FPU_LOAD; + + wrpkru(); + + schedule() + ... + pk = get_xsave_addr(&T1->fpu->state.xsave, XFEATURE_PKRU); + if (pk) + wrpkru(pk->pkru); + else + wrpkru(DEFAULT_PKRU); + +Because the xfeatures bit is 0 and therefore the value in the xsave +storage is not valid, get_xsave_addr() returns NULL and switch_to() +writes the default PKRU. -> FAIL #1! + +So that wrecks any copy_to/from_user() on the way back to user space +which hits memory which is protected by the default PKRU value. 
+
+Assuming that this does not fail (pure luck), T1 goes back
+to user space, and because TIF_NEED_FPU_LOAD is set it ends up in
+
+ switch_fpu_return()
+ __fpregs_load_activate()
+ if (!fpregs_state_valid()) {
+ load_XSTATE_from_task();
+ }
+
+But if nothing touched the FPU between T1 scheduling out and back in,
+then the fpregs_state is still valid, which means switch_fpu_return()
+does nothing and just clears TIF_NEED_FPU_LOAD. Back to user space with
+DEFAULT_PKRU loaded. -> FAIL #2!
+
+The fix is simple: if get_xsave_addr() returns NULL then set the
+PKRU value to 0 instead of the restrictive default PKRU value in
+init_pkru_value.
+
+ [ bp: Massage in minor nitpicks from folks. ]
+
+Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state")
+Signed-off-by: Thomas Gleixner
+Signed-off-by: Borislav Petkov
+Acked-by: Dave Hansen
+Acked-by: Rik van Riel
+Tested-by: Babu Moger
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/20210608144346.045616965@linutronix.de
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/include/asm/fpu/internal.h | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/fpu/internal.h
++++ b/arch/x86/include/asm/fpu/internal.h
+@@ -579,9 +579,16 @@ static inline void switch_fpu_finish(str
+ * return to userland e.g. for a copy_to_user() operation.
+ */
+ if (!(current->flags & PF_KTHREAD)) {
++ /*
++ * If the PKRU bit in xsave.header.xfeatures is not set,
++ * then the PKRU component was in init state, which means
++ * XRSTOR will set PKRU to 0. If the bit is not set then
++ * get_xsave_addr() will return NULL because the PKRU value
++ * in memory is not valid. This means pkru_val has to be
++ * set to 0 and not to init_pkru_value.
++ */
+ pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU);
+- if (pk)
+- pkru_val = pk->pkru;
++ pkru_val = pk ? pk->pkru : 0;
+ }
+ __write_pkru(pkru_val);
+ }
diff --git a/queue-5.12/x86-process-check-pf_kthread-and-not-current-mm-for-kernel-threads.patch b/queue-5.12/x86-process-check-pf_kthread-and-not-current-mm-for-kernel-threads.patch
new file mode 100644
index 00000000000..bbc1e444c89
--- /dev/null
+++ b/queue-5.12/x86-process-check-pf_kthread-and-not-current-mm-for-kernel-threads.patch
@@ -0,0 +1,38 @@
+From 12f7764ac61200e32c916f038bdc08f884b0b604 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner
+Date: Tue, 8 Jun 2021 16:36:20 +0200
+Subject: x86/process: Check PF_KTHREAD and not current->mm for kernel threads
+
+From: Thomas Gleixner
+
+commit 12f7764ac61200e32c916f038bdc08f884b0b604 upstream.
+
+switch_fpu_finish() checks current->mm as an indicator for kernel threads.
+That's wrong because kernel threads can temporarily use a mm of a user
+process via kthread_use_mm().
+
+Check the task flags for PF_KTHREAD instead.
+
+Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state")
+Signed-off-by: Thomas Gleixner
+Signed-off-by: Borislav Petkov
+Acked-by: Dave Hansen
+Acked-by: Rik van Riel
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/20210608144345.912645927@linutronix.de
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/include/asm/fpu/internal.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/fpu/internal.h
++++ b/arch/x86/include/asm/fpu/internal.h
+@@ -578,7 +578,7 @@ static inline void switch_fpu_finish(str
+ * PKRU state is switched eagerly because it needs to be valid before we
+ * return to userland e.g. for a copy_to_user() operation.
+ */ +- if (current->mm) { ++ if (!(current->flags & PF_KTHREAD)) { + pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); + if (pk) + pkru_val = pk->pkru;