From: Greg Kroah-Hartman Date: Mon, 22 Nov 2021 12:42:17 +0000 (+0100) Subject: 5.15-stable patches X-Git-Tag: v5.15.5~56 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=5d71db18878ce1f1d5781e2140794c2533dc4f46;p=thirdparty%2Fkernel%2Fstable-queue.git 5.15-stable patches added patches: ata-libata-add-missing-ata_identify_page_supported-calls.patch ata-libata-improve-ata_read_log_page-error-message.patch block-check-admin-before-nice-for-ioprio_class_rt.patch fbdev-prevent-probing-generic-drivers-if-a-fb-is-already-registered.patch hexagon-clean-up-timer-regs.h.patch hexagon-export-raw-i-o-routines-for-modules.patch hugetlb-userfaultfd-fix-reservation-restore-on-userfaultfd-error.patch ipc-warn-if-trying-to-remove-ipc-object-which-is-absent.patch kmap_local-don-t-assume-kmap-ptes-are-linear-arrays-in-memory.patch kvm-x86-assume-a-64-bit-hypercall-for-guests-with-protected-state.patch kvm-x86-fix-uninitialized-eoi_exit_bitmap-usage-in-vcpu_load_eoi_exitmap.patch kvm-x86-mmu-include-efer.lma-in-extended-mmu-role.patch kvm-x86-xen-fix-get_attr-of-kvm_xen_attr_type_shared_info.patch mm-damon-dbgfs-fix-missed-use-of-damon_dbgfs_lock.patch mm-damon-dbgfs-use-__gfp_nowarn-for-user-specified-size-buffer-allocation.patch mm-kmemleak-slob-respect-slab_noleaktrace-flag.patch pinctrl-ralink-include-ralink_regs.h-in-pinctrl-mt7620.c.patch powerpc-8xx-fix-pinned-tlbs-with-config_strict_kernel_rwx.patch powerpc-signal32-fix-sigset_t-copy.patch powerpc-xive-change-irq-domain-to-a-tree-domain.patch pstore-blk-use-lu-to-format-unsigned-long.patch revert-drm-i915-tgl-dsi-gate-the-ddi-clocks-after-pll-mapping.patch revert-mark-pstore-blk-as-broken.patch revert-parisc-reduce-sigreturn-trampoline-to-3-instructions.patch s390-boot-simplify-and-fix-kernel-memory-layout-setup.patch s390-dump-fix-copying-to-user-space-of-swapped-kdump-oldmem.patch s390-kexec-fix-memory-leak-of-ipl-report-buffer.patch s390-setup-avoid-reserving-memory-above-identity-mapping.patch 
s390-vdso-filter-out-mstack-guard-and-mstack-size.patch scsi-qla2xxx-fix-mailbox-direction-flags-in-qla2xxx_get_adapter_id.patch shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch tipc-check-for-null-after-calling-kmemdup.patch x86-boot-pull-up-cmdline-preparation-and-early-param-parsing.patch x86-hyperv-fix-null-deref-in-set_hv_tscchange_cb-if-hyper-v-setup-fails.patch x86-sgx-fix-free-page-accounting.patch --- diff --git a/queue-5.15/ata-libata-add-missing-ata_identify_page_supported-calls.patch b/queue-5.15/ata-libata-add-missing-ata_identify_page_supported-calls.patch new file mode 100644 index 00000000000..56b4dfe68b2 --- /dev/null +++ b/queue-5.15/ata-libata-add-missing-ata_identify_page_supported-calls.patch @@ -0,0 +1,47 @@ +From 06f6c4c6c3e8354dceddd77bd58f9a7a84c67246 Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Mon, 15 Nov 2021 12:47:26 +0900 +Subject: ata: libata: add missing ata_identify_page_supported() calls + +From: Damien Le Moal + +commit 06f6c4c6c3e8354dceddd77bd58f9a7a84c67246 upstream. + +ata_dev_config_ncq_prio() and ata_dev_config_devslp() both access pages +of the IDENTIFY DEVICE data log. Before calling ata_read_log_page(), +make sure to check for the existence of the IDENTIFY DEVICE data log and +of the log page accessed using ata_identify_page_supported(). This +avoids useless error messages from ata_read_log_page() and failures with +some LLDD scsi drivers using libsas. 
+ +Reported-by: Nikolay +Cc: stable@kernel.org # 5.15 +Signed-off-by: Damien Le Moal +Tested-by: Matthew Perkowski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/ata/libata-core.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/ata/libata-core.c ++++ b/drivers/ata/libata-core.c +@@ -2167,6 +2167,9 @@ static void ata_dev_config_ncq_prio(stru + struct ata_port *ap = dev->link->ap; + unsigned int err_mask; + ++ if (!ata_identify_page_supported(dev, ATA_LOG_SATA_SETTINGS)) ++ return; ++ + err_mask = ata_read_log_page(dev, + ATA_LOG_IDENTIFY_DEVICE, + ATA_LOG_SATA_SETTINGS, +@@ -2443,7 +2446,8 @@ static void ata_dev_config_devslp(struct + * Check device sleep capability. Get DevSlp timing variables + * from SATA Settings page of Identify Device Data Log. + */ +- if (!ata_id_has_devslp(dev->id)) ++ if (!ata_id_has_devslp(dev->id) || ++ !ata_identify_page_supported(dev, ATA_LOG_SATA_SETTINGS)) + return; + + err_mask = ata_read_log_page(dev, diff --git a/queue-5.15/ata-libata-improve-ata_read_log_page-error-message.patch b/queue-5.15/ata-libata-improve-ata_read_log_page-error-message.patch new file mode 100644 index 00000000000..6854c0a54b4 --- /dev/null +++ b/queue-5.15/ata-libata-improve-ata_read_log_page-error-message.patch @@ -0,0 +1,35 @@ +From 23ef63d5e14f916c5bba39128ebef395859d7c0f Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Mon, 15 Nov 2021 12:37:46 +0900 +Subject: ata: libata: improve ata_read_log_page() error message + +From: Damien Le Moal + +commit 23ef63d5e14f916c5bba39128ebef395859d7c0f upstream. + +If ata_read_log_page() fails to read a log page, the ata_dev_err() error +message only print the page number, omitting the log number. In case of +error, facilitate debugging by also printing the log number. 
+ +Cc: stable@kernel.org # 5.15 +Signed-off-by: Damien Le Moal +Tested-by: Matthew Perkowski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/ata/libata-core.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/drivers/ata/libata-core.c ++++ b/drivers/ata/libata-core.c +@@ -2031,8 +2031,9 @@ retry: + dev->horkage |= ATA_HORKAGE_NO_DMA_LOG; + goto retry; + } +- ata_dev_err(dev, "Read log page 0x%02x failed, Emask 0x%x\n", +- (unsigned int)page, err_mask); ++ ata_dev_err(dev, ++ "Read log 0x%02x page 0x%02x failed, Emask 0x%x\n", ++ (unsigned int)log, (unsigned int)page, err_mask); + } + + return err_mask; diff --git a/queue-5.15/block-check-admin-before-nice-for-ioprio_class_rt.patch b/queue-5.15/block-check-admin-before-nice-for-ioprio_class_rt.patch new file mode 100644 index 00000000000..646af9341d2 --- /dev/null +++ b/queue-5.15/block-check-admin-before-nice-for-ioprio_class_rt.patch @@ -0,0 +1,58 @@ +From 94c4b4fd25e6c3763941bdec3ad54f2204afa992 Mon Sep 17 00:00:00 2001 +From: Alistair Delva +Date: Mon, 15 Nov 2021 18:16:55 +0000 +Subject: block: Check ADMIN before NICE for IOPRIO_CLASS_RT + +From: Alistair Delva + +commit 94c4b4fd25e6c3763941bdec3ad54f2204afa992 upstream. + +Booting to Android userspace on 5.14 or newer triggers the following +SELinux denial: + +avc: denied { sys_nice } for comm="init" capability=23 + scontext=u:r:init:s0 tcontext=u:r:init:s0 tclass=capability + permissive=0 + +Init is PID 0 running as root, so it already has CAP_SYS_ADMIN. For +better compatibility with older SEPolicy, check ADMIN before NICE. 
+ +Fixes: 9d3a39a5f1e4 ("block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE") +Signed-off-by: Alistair Delva +Cc: Khazhismel Kumykov +Cc: Bart Van Assche +Cc: Serge Hallyn +Cc: Jens Axboe +Cc: Greg Kroah-Hartman +Cc: Paul Moore +Cc: selinux@vger.kernel.org +Cc: linux-security-module@vger.kernel.org +Cc: kernel-team@android.com +Cc: stable@vger.kernel.org # v5.14+ +Reviewed-by: Bart Van Assche +Acked-by: Serge Hallyn +Link: https://lore.kernel.org/r/20211115181655.3608659-1-adelva@google.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + block/ioprio.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/block/ioprio.c ++++ b/block/ioprio.c +@@ -69,7 +69,14 @@ int ioprio_check_cap(int ioprio) + + switch (class) { + case IOPRIO_CLASS_RT: +- if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN)) ++ /* ++ * Originally this only checked for CAP_SYS_ADMIN, ++ * which was implicitly allowed for pid 0 by security ++ * modules such as SELinux. Make sure we check ++ * CAP_SYS_ADMIN first to avoid a denial/avc for ++ * possibly missing CAP_SYS_NICE permission. ++ */ ++ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) + return -EPERM; + fallthrough; + /* rt has prio field too */ diff --git a/queue-5.15/fbdev-prevent-probing-generic-drivers-if-a-fb-is-already-registered.patch b/queue-5.15/fbdev-prevent-probing-generic-drivers-if-a-fb-is-already-registered.patch new file mode 100644 index 00000000000..806d80ce58a --- /dev/null +++ b/queue-5.15/fbdev-prevent-probing-generic-drivers-if-a-fb-is-already-registered.patch @@ -0,0 +1,74 @@ +From fb561bf9abde49f7e00fdbf9ed2ccf2d86cac8ee Mon Sep 17 00:00:00 2001 +From: Javier Martinez Canillas +Date: Thu, 11 Nov 2021 12:57:57 +0100 +Subject: fbdev: Prevent probing generic drivers if a FB is already registered + +From: Javier Martinez Canillas + +commit fb561bf9abde49f7e00fdbf9ed2ccf2d86cac8ee upstream. 
+ +The efifb and simplefb drivers just render to a pre-allocated frame buffer +and rely on the display hardware being initialized before the kernel boots. + +But if another driver already probed correctly and registered a fbdev, the +generic drivers shouldn't be probed since an actual driver for the display +hardware is already present. + +This is more likely to occur after commit d391c5827107 ("drivers/firmware: +move x86 Generic System Framebuffers support") since the "efi-framebuffer" +and "simple-framebuffer" platform devices are registered at a later time. + +Link: https://lore.kernel.org/r/20211110200253.rfudkt3edbd3nsyj@lahvuun/ +Fixes: d391c5827107 ("drivers/firmware: move x86 Generic System Framebuffers support") +Reported-by: Ilya Trukhanov +Cc: # 5.15.x +Signed-off-by: Javier Martinez Canillas +Reviewed-by: Daniel Vetter +Tested-by: Ilya Trukhanov +Link: https://patchwork.freedesktop.org/patch/msgid/20211111115757.1351045-1-javierm@redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/video/fbdev/efifb.c | 11 +++++++++++ + drivers/video/fbdev/simplefb.c | 11 +++++++++++ + 2 files changed, 22 insertions(+) + +--- a/drivers/video/fbdev/efifb.c ++++ b/drivers/video/fbdev/efifb.c +@@ -351,6 +351,17 @@ static int efifb_probe(struct platform_d + char *option = NULL; + efi_memory_desc_t md; + ++ /* ++ * Generic drivers must not be registered if a framebuffer exists. ++ * If a native driver was probed, the display hardware was already ++ * taken and attempting to use the system framebuffer is dangerous. 
++ */ ++ if (num_registered_fb > 0) { ++ dev_err(&dev->dev, ++ "efifb: a framebuffer is already registered\n"); ++ return -EINVAL; ++ } ++ + if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || pci_dev_disabled) + return -ENODEV; + +--- a/drivers/video/fbdev/simplefb.c ++++ b/drivers/video/fbdev/simplefb.c +@@ -407,6 +407,17 @@ static int simplefb_probe(struct platfor + struct simplefb_par *par; + struct resource *mem; + ++ /* ++ * Generic drivers must not be registered if a framebuffer exists. ++ * If a native driver was probed, the display hardware was already ++ * taken and attempting to use the system framebuffer is dangerous. ++ */ ++ if (num_registered_fb > 0) { ++ dev_err(&pdev->dev, ++ "simplefb: a framebuffer is already registered\n"); ++ return -EINVAL; ++ } ++ + if (fb_get_options("simplefb", NULL)) + return -ENODEV; + diff --git a/queue-5.15/hexagon-clean-up-timer-regs.h.patch b/queue-5.15/hexagon-clean-up-timer-regs.h.patch new file mode 100644 index 00000000000..567bb462c83 --- /dev/null +++ b/queue-5.15/hexagon-clean-up-timer-regs.h.patch @@ -0,0 +1,132 @@ +From 51f2ec593441d3d1ebc0d478fac3ea329c7c93ac Mon Sep 17 00:00:00 2001 +From: Nathan Chancellor +Date: Fri, 19 Nov 2021 16:43:31 -0800 +Subject: hexagon: clean up timer-regs.h + +From: Nathan Chancellor + +commit 51f2ec593441d3d1ebc0d478fac3ea329c7c93ac upstream. + +When building allmodconfig, there is a warning about TIMER_ENABLE being +redefined: + + drivers/clocksource/timer-oxnas-rps.c:39:9: error: 'TIMER_ENABLE' macro redefined [-Werror,-Wmacro-redefined] + #define TIMER_ENABLE BIT(7) + ^ + arch/hexagon/include/asm/timer-regs.h:13:9: note: previous definition is here + #define TIMER_ENABLE 0 + ^ + 1 error generated. + +The values in this header are only used in one file each, if they are +used at all. Remove the header and sink all of the constants into their +respective files. 
+ +TCX0_CLK_RATE is only used in arch/hexagon/include/asm/timex.h + +TIMER_ENABLE, RTOS_TIMER_INT, RTOS_TIMER_REGS_ADDR are only used in +arch/hexagon/kernel/time.c. + +SLEEP_CLK_RATE and TIMER_CLR_ON_MATCH have both been unused since the +file's introduction in commit 71e4a47f32f4 ("Hexagon: Add time and timer +functions"). + +TIMER_ENABLE is redefined as BIT(0) so the shift is moved into the +definition, rather than its use. + +Link: https://lkml.kernel.org/r/20211115174250.1994179-3-nathan@kernel.org +Signed-off-by: Nathan Chancellor +Acked-by: Brian Cain +Cc: Nick Desaulniers +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/hexagon/include/asm/timer-regs.h | 26 -------------------------- + arch/hexagon/include/asm/timex.h | 3 +-- + arch/hexagon/kernel/time.c | 12 ++++++++++-- + 3 files changed, 11 insertions(+), 30 deletions(-) + delete mode 100644 arch/hexagon/include/asm/timer-regs.h + +--- a/arch/hexagon/include/asm/timer-regs.h ++++ /dev/null +@@ -1,26 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-only */ +-/* +- * Timer support for Hexagon +- * +- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved. +- */ +- +-#ifndef _ASM_TIMER_REGS_H +-#define _ASM_TIMER_REGS_H +- +-/* This stuff should go into a platform specific file */ +-#define TCX0_CLK_RATE 19200 +-#define TIMER_ENABLE 0 +-#define TIMER_CLR_ON_MATCH 1 +- +-/* +- * 8x50 HDD Specs 5-8. Simulator co-sim not fixed until +- * release 1.1, and then it's "adjustable" and probably not defaulted. +- */ +-#define RTOS_TIMER_INT 3 +-#ifdef CONFIG_HEXAGON_COMET +-#define RTOS_TIMER_REGS_ADDR 0xAB000000UL +-#endif +-#define SLEEP_CLK_RATE 32000 +- +-#endif +--- a/arch/hexagon/include/asm/timex.h ++++ b/arch/hexagon/include/asm/timex.h +@@ -7,11 +7,10 @@ + #define _ASM_TIMEX_H + + #include +-#include + #include + + /* Using TCX0 as our clock. CLOCK_TICK_RATE scheduled to be removed. 
*/ +-#define CLOCK_TICK_RATE TCX0_CLK_RATE ++#define CLOCK_TICK_RATE 19200 + + #define ARCH_HAS_READ_CURRENT_TIMER + +--- a/arch/hexagon/kernel/time.c ++++ b/arch/hexagon/kernel/time.c +@@ -17,9 +17,10 @@ + #include + #include + +-#include + #include + ++#define TIMER_ENABLE BIT(0) ++ + /* + * For the clocksource we need: + * pcycle frequency (600MHz) +@@ -33,6 +34,13 @@ cycles_t pcycle_freq_mhz; + cycles_t thread_freq_mhz; + cycles_t sleep_clk_freq; + ++/* ++ * 8x50 HDD Specs 5-8. Simulator co-sim not fixed until ++ * release 1.1, and then it's "adjustable" and probably not defaulted. ++ */ ++#define RTOS_TIMER_INT 3 ++#define RTOS_TIMER_REGS_ADDR 0xAB000000UL ++ + static struct resource rtos_timer_resources[] = { + { + .start = RTOS_TIMER_REGS_ADDR, +@@ -80,7 +88,7 @@ static int set_next_event(unsigned long + iowrite32(0, &rtos_timer->clear); + + iowrite32(delta, &rtos_timer->match); +- iowrite32(1 << TIMER_ENABLE, &rtos_timer->enable); ++ iowrite32(TIMER_ENABLE, &rtos_timer->enable); + return 0; + } + diff --git a/queue-5.15/hexagon-export-raw-i-o-routines-for-modules.patch b/queue-5.15/hexagon-export-raw-i-o-routines-for-modules.patch new file mode 100644 index 00000000000..b4c6b01bb6d --- /dev/null +++ b/queue-5.15/hexagon-export-raw-i-o-routines-for-modules.patch @@ -0,0 +1,70 @@ +From ffb92ce826fd801acb0f4e15b75e4ddf0d189bde Mon Sep 17 00:00:00 2001 +From: Nathan Chancellor +Date: Fri, 19 Nov 2021 16:43:28 -0800 +Subject: hexagon: export raw I/O routines for modules + +From: Nathan Chancellor + +commit ffb92ce826fd801acb0f4e15b75e4ddf0d189bde upstream. + +Patch series "Fixes for ARCH=hexagon allmodconfig", v2. + +This series fixes some issues noticed with ARCH=hexagon allmodconfig. + +This patch (of 3): + +When building ARCH=hexagon allmodconfig, the following errors occur: + + ERROR: modpost: "__raw_readsl" [drivers/i3c/master/svc-i3c-master.ko] undefined! + ERROR: modpost: "__raw_writesl" [drivers/i3c/master/dw-i3c-master.ko] undefined! 
+ ERROR: modpost: "__raw_readsl" [drivers/i3c/master/dw-i3c-master.ko] undefined! + ERROR: modpost: "__raw_writesl" [drivers/i3c/master/i3c-master-cdns.ko] undefined! + ERROR: modpost: "__raw_readsl" [drivers/i3c/master/i3c-master-cdns.ko] undefined! + +Export these symbols so that modules can use them without any errors. + +Link: https://lkml.kernel.org/r/20211115174250.1994179-1-nathan@kernel.org +Link: https://lkml.kernel.org/r/20211115174250.1994179-2-nathan@kernel.org +Fixes: 013bf24c3829 ("Hexagon: Provide basic implementation and/or stubs for I/O routines.") +Signed-off-by: Nathan Chancellor +Acked-by: Brian Cain +Cc: Nick Desaulniers +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/hexagon/lib/io.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/arch/hexagon/lib/io.c ++++ b/arch/hexagon/lib/io.c +@@ -27,6 +27,7 @@ void __raw_readsw(const void __iomem *ad + *dst++ = *src; + + } ++EXPORT_SYMBOL(__raw_readsw); + + /* + * __raw_writesw - read words a short at a time +@@ -47,6 +48,7 @@ void __raw_writesw(void __iomem *addr, c + + + } ++EXPORT_SYMBOL(__raw_writesw); + + /* Pretty sure len is pre-adjusted for the length of the access already */ + void __raw_readsl(const void __iomem *addr, void *data, int len) +@@ -62,6 +64,7 @@ void __raw_readsl(const void __iomem *ad + + + } ++EXPORT_SYMBOL(__raw_readsl); + + void __raw_writesl(void __iomem *addr, const void *data, int len) + { +@@ -76,3 +79,4 @@ void __raw_writesl(void __iomem *addr, c + + + } ++EXPORT_SYMBOL(__raw_writesl); diff --git a/queue-5.15/hugetlb-userfaultfd-fix-reservation-restore-on-userfaultfd-error.patch b/queue-5.15/hugetlb-userfaultfd-fix-reservation-restore-on-userfaultfd-error.patch new file mode 100644 index 00000000000..31cfd919c69 --- /dev/null +++ b/queue-5.15/hugetlb-userfaultfd-fix-reservation-restore-on-userfaultfd-error.patch @@ -0,0 +1,73 @@ +From cc30042df6fcc82ea18acf0dace831503e60a0b7 Mon Sep 17 00:00:00 2001 
+From: Mina Almasry +Date: Fri, 19 Nov 2021 16:43:43 -0800 +Subject: hugetlb, userfaultfd: fix reservation restore on userfaultfd error + +From: Mina Almasry + +commit cc30042df6fcc82ea18acf0dace831503e60a0b7 upstream. + +Currently in the is_continue case in hugetlb_mcopy_atomic_pte(), if we +bail out using "goto out_release_unlock;" in the cases where idx >= +size, or !huge_pte_none(), the code will detect that new_pagecache_page +== false, and so call restore_reserve_on_error(). In this case I see +restore_reserve_on_error() delete the reservation, and the following +call to remove_inode_hugepages() will increment h->resv_hugepages +causing a 100% reproducible leak. + +We should treat the is_continue case similar to adding a page into the +pagecache and set new_pagecache_page to true, to indicate that there is +no reservation to restore on the error path, and we need not call +restore_reserve_on_error(). Rename new_pagecache_page to +page_in_pagecache to make that clear. + +Link: https://lkml.kernel.org/r/20211117193825.378528-1-almasrymina@google.com +Fixes: c7b1850dfb41 ("hugetlb: don't pass page cache pages to restore_reserve_on_error") +Signed-off-by: Mina Almasry +Reported-by: James Houghton +Reviewed-by: Mike Kravetz +Cc: Wei Xu +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/hugetlb.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -5236,13 +5236,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_s + int ret = -ENOMEM; + struct page *page; + int writable; +- bool new_pagecache_page = false; ++ bool page_in_pagecache = false; + + if (is_continue) { + ret = -EFAULT; + page = find_lock_page(mapping, idx); + if (!page) + goto out; ++ page_in_pagecache = true; + } else if (!*pagep) { + /* If a page already exists, then it's UFFDIO_COPY for + * a non-missing case. Return -EEXIST. 
+@@ -5330,7 +5331,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_s + ret = huge_add_to_page_cache(page, mapping, idx); + if (ret) + goto out_release_nounlock; +- new_pagecache_page = true; ++ page_in_pagecache = true; + } + + ptl = huge_pte_lockptr(h, dst_mm, dst_pte); +@@ -5394,7 +5395,7 @@ out_release_unlock: + if (vm_shared || is_continue) + unlock_page(page); + out_release_nounlock: +- if (!new_pagecache_page) ++ if (!page_in_pagecache) + restore_reserve_on_error(h, dst_vma, dst_addr, page); + put_page(page); + goto out; diff --git a/queue-5.15/ipc-warn-if-trying-to-remove-ipc-object-which-is-absent.patch b/queue-5.15/ipc-warn-if-trying-to-remove-ipc-object-which-is-absent.patch new file mode 100644 index 00000000000..7d2f991f541 --- /dev/null +++ b/queue-5.15/ipc-warn-if-trying-to-remove-ipc-object-which-is-absent.patch @@ -0,0 +1,115 @@ +From 126e8bee943e9926238c891e2df5b5573aee76bc Mon Sep 17 00:00:00 2001 +From: Alexander Mikhalitsyn +Date: Fri, 19 Nov 2021 16:43:18 -0800 +Subject: ipc: WARN if trying to remove ipc object which is absent + +From: Alexander Mikhalitsyn + +commit 126e8bee943e9926238c891e2df5b5573aee76bc upstream. + +Patch series "shm: shm_rmid_forced feature fixes". + +Some time ago I met kernel crash after CRIU restore procedure, +fortunately, it was CRIU restore, so, I had dump files and could do +restore many times and crash reproduced easily. After some +investigation I've constructed the minimal reproducer. It was found +that it's use-after-free and it happens only if sysctl +kernel.shm_rmid_forced = 1. + +The key of the problem is that the exit_shm() function not handles shp's +object destroy when task->sysvshm.shm_clist contains items from +different IPC namespaces. In most cases this list will contain only +items from one IPC namespace. + +How can this list contain object from different namespaces? The +exit_shm() function is designed to clean up this list always when +process leaves IPC namespace. 
But we made a mistake a long time ago and +did not add an exit_shm() call into the setns() syscall procedures. + +The first idea was just to add this call to setns() syscall but it +obviously changes semantics of setns() syscall and that's +userspace-visible change. So, I gave up on this idea. + +The first real attempt to address the issue was just to omit forced +destroy if we meet shp object not from current task IPC namespace [1]. +But that was not the best idea because task->sysvshm.shm_clist was +protected by rwsem which belongs to current task IPC namespace. It +means that list corruption may occur. + +Second approach is just extend exit_shm() to properly handle shp's from +different IPC namespaces [2]. This is really non-trivial thing, I've +put a lot of effort into that but not believed that it's possible to +make it fully safe, clean and clear. + +Thanks to the efforts of Manfred Spraul working an elegant solution was +designed. Thanks a lot, Manfred! + +Eric also suggested the way to address the issue in ("[RFC][PATCH] shm: +In shm_exit destroy all created and never attached segments") Eric's +idea was to maintain a list of shm_clists one per IPC namespace, use +lock-less lists. But there is some extra memory consumption-related +concerns. + +An alternative solution which was suggested by me was implemented in +("shm: reset shm_clist on setns but omit forced shm destroy"). The idea +is pretty simple, we add exit_shm() syscall to setns() but DO NOT +destroy shm segments even if sysctl kernel.shm_rmid_forced = 1, we just +clean up the task->sysvshm.shm_clist list. + +This changes semantics of setns() syscall a little bit but in comparison +to the "naive" solution when we just add exit_shm() without any special +exclusions this looks like a safer option. 
+ +[1] https://lkml.org/lkml/2021/7/6/1108 +[2] https://lkml.org/lkml/2021/7/14/736 + +This patch (of 2): + +Let's produce a warning if we trying to remove non-existing IPC object +from IPC namespace kht/idr structures. + +This allows us to catch possible bugs when the ipc_rmid() function was +called with inconsistent struct ipc_ids*, struct kern_ipc_perm* +arguments. + +Link: https://lkml.kernel.org/r/20211027224348.611025-1-alexander.mikhalitsyn@virtuozzo.com +Link: https://lkml.kernel.org/r/20211027224348.611025-2-alexander.mikhalitsyn@virtuozzo.com +Co-developed-by: Manfred Spraul +Signed-off-by: Manfred Spraul +Signed-off-by: Alexander Mikhalitsyn +Cc: "Eric W. Biederman" +Cc: Davidlohr Bueso +Cc: Greg KH +Cc: Andrei Vagin +Cc: Pavel Tikhomirov +Cc: Vasily Averin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + ipc/util.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/ipc/util.c ++++ b/ipc/util.c +@@ -447,8 +447,8 @@ static int ipcget_public(struct ipc_name + static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) + { + if (ipcp->key != IPC_PRIVATE) +- rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, +- ipc_kht_params); ++ WARN_ON_ONCE(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, ++ ipc_kht_params)); + } + + /** +@@ -498,7 +498,7 @@ void ipc_rmid(struct ipc_ids *ids, struc + { + int idx = ipcid_to_idx(ipcp->id); + +- idr_remove(&ids->ipcs_idr, idx); ++ WARN_ON_ONCE(idr_remove(&ids->ipcs_idr, idx) != ipcp); + ipc_kht_remove(ids, ipcp); + ids->in_use--; + ipcp->deleted = true; diff --git a/queue-5.15/kmap_local-don-t-assume-kmap-ptes-are-linear-arrays-in-memory.patch b/queue-5.15/kmap_local-don-t-assume-kmap-ptes-are-linear-arrays-in-memory.patch new file mode 100644 index 00000000000..29bd2b343eb --- /dev/null +++ b/queue-5.15/kmap_local-don-t-assume-kmap-ptes-are-linear-arrays-in-memory.patch @@ -0,0 +1,167 @@ +From 
825c43f50e3aa811a291ffcb40e02fbf6d91ba86 Mon Sep 17 00:00:00 2001 +From: Ard Biesheuvel +Date: Fri, 19 Nov 2021 16:43:55 -0800 +Subject: kmap_local: don't assume kmap PTEs are linear arrays in memory + +From: Ard Biesheuvel + +commit 825c43f50e3aa811a291ffcb40e02fbf6d91ba86 upstream. + +The kmap_local conversion broke the ARM architecture, because the new +code assumes that all PTEs used for creating kmaps form a linear array +in memory, and uses array indexing to look up the kmap PTE belonging to +a certain kmap index. + +On ARM, this cannot work, not only because the PTE pages may be +non-adjacent in memory, but also because ARM/!LPAE interleaves hardware +entries and extended entries (carrying software-only bits) in a way that +is not compatible with array indexing. + +Fortunately, this only seems to affect configurations with more than 8 +CPUs, due to the way the per-CPU kmap slots are organized in memory. + +Work around this by permitting an architecture to set a Kconfig symbol +that signifies that the kmap PTEs do not form a linear array in memory, +and so the only way to locate the appropriate one is to walk the page +tables. 
+ +Link: https://lore.kernel.org/linux-arm-kernel/20211026131249.3731275-1-ardb@kernel.org/ +Link: https://lkml.kernel.org/r/20211116094737.7391-1-ardb@kernel.org +Fixes: 2a15ba82fa6c ("ARM: highmem: Switch to generic kmap atomic") +Signed-off-by: Ard Biesheuvel +Reported-by: Quanyang Wang +Reviewed-by: Linus Walleij +Acked-by: Russell King (Oracle) +Cc: Thomas Gleixner +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm/Kconfig | 1 + + mm/Kconfig | 3 +++ + mm/highmem.c | 32 +++++++++++++++++++++----------- + 3 files changed, 25 insertions(+), 11 deletions(-) + +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -1455,6 +1455,7 @@ config HIGHMEM + bool "High Memory Support" + depends on MMU + select KMAP_LOCAL ++ select KMAP_LOCAL_NON_LINEAR_PTE_ARRAY + help + The address space of ARM processors is only 4 Gigabytes large + and it has to accommodate user address space, kernel address +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -887,6 +887,9 @@ config MAPPING_DIRTY_HELPERS + config KMAP_LOCAL + bool + ++config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY ++ bool ++ + # struct io_mapping based helper. Selected by drivers that need them + config IO_MAPPING + bool +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -504,16 +504,22 @@ static inline int kmap_local_calc_idx(in + + static pte_t *__kmap_pte; + +-static pte_t *kmap_get_pte(void) ++static pte_t *kmap_get_pte(unsigned long vaddr, int idx) + { ++ if (IS_ENABLED(CONFIG_KMAP_LOCAL_NON_LINEAR_PTE_ARRAY)) ++ /* ++ * Set by the arch if __kmap_pte[-idx] does not produce ++ * the correct entry. 
++ */ ++ return virt_to_kpte(vaddr); + if (!__kmap_pte) + __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); +- return __kmap_pte; ++ return &__kmap_pte[-idx]; + } + + void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) + { +- pte_t pteval, *kmap_pte = kmap_get_pte(); ++ pte_t pteval, *kmap_pte; + unsigned long vaddr; + int idx; + +@@ -525,9 +531,10 @@ void *__kmap_local_pfn_prot(unsigned lon + preempt_disable(); + idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- BUG_ON(!pte_none(*(kmap_pte - idx))); ++ kmap_pte = kmap_get_pte(vaddr, idx); ++ BUG_ON(!pte_none(*kmap_pte)); + pteval = pfn_pte(pfn, prot); +- arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte - idx, pteval); ++ arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval); + arch_kmap_local_post_map(vaddr, pteval); + current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; + preempt_enable(); +@@ -560,7 +567,7 @@ EXPORT_SYMBOL(__kmap_local_page_prot); + void kunmap_local_indexed(void *vaddr) + { + unsigned long addr = (unsigned long) vaddr & PAGE_MASK; +- pte_t *kmap_pte = kmap_get_pte(); ++ pte_t *kmap_pte; + int idx; + + if (addr < __fix_to_virt(FIX_KMAP_END) || +@@ -585,8 +592,9 @@ void kunmap_local_indexed(void *vaddr) + idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr); + WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + ++ kmap_pte = kmap_get_pte(addr, idx); + arch_kmap_local_pre_unmap(addr); +- pte_clear(&init_mm, addr, kmap_pte - idx); ++ pte_clear(&init_mm, addr, kmap_pte); + arch_kmap_local_post_unmap(addr); + current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); + kmap_local_idx_pop(); +@@ -608,7 +616,7 @@ EXPORT_SYMBOL(kunmap_local_indexed); + void __kmap_local_sched_out(void) + { + struct task_struct *tsk = current; +- pte_t *kmap_pte = kmap_get_pte(); ++ pte_t *kmap_pte; + int i; + + /* Clear kmaps */ +@@ -635,8 +643,9 @@ void __kmap_local_sched_out(void) + idx = arch_kmap_local_map_idx(i, 
pte_pfn(pteval)); + + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); ++ kmap_pte = kmap_get_pte(addr, idx); + arch_kmap_local_pre_unmap(addr); +- pte_clear(&init_mm, addr, kmap_pte - idx); ++ pte_clear(&init_mm, addr, kmap_pte); + arch_kmap_local_post_unmap(addr); + } + } +@@ -644,7 +653,7 @@ void __kmap_local_sched_out(void) + void __kmap_local_sched_in(void) + { + struct task_struct *tsk = current; +- pte_t *kmap_pte = kmap_get_pte(); ++ pte_t *kmap_pte; + int i; + + /* Restore kmaps */ +@@ -664,7 +673,8 @@ void __kmap_local_sched_in(void) + /* See comment in __kmap_local_sched_out() */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- set_pte_at(&init_mm, addr, kmap_pte - idx, pteval); ++ kmap_pte = kmap_get_pte(addr, idx); ++ set_pte_at(&init_mm, addr, kmap_pte, pteval); + arch_kmap_local_post_map(addr, pteval); + } + } diff --git a/queue-5.15/kvm-x86-assume-a-64-bit-hypercall-for-guests-with-protected-state.patch b/queue-5.15/kvm-x86-assume-a-64-bit-hypercall-for-guests-with-protected-state.patch new file mode 100644 index 00000000000..6a9b69a44b4 --- /dev/null +++ b/queue-5.15/kvm-x86-assume-a-64-bit-hypercall-for-guests-with-protected-state.patch @@ -0,0 +1,110 @@ +From b5aead0064f33ae5e693a364e3204fe1c0ac9af2 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Mon, 24 May 2021 12:48:57 -0500 +Subject: KVM: x86: Assume a 64-bit hypercall for guests with protected state + +From: Tom Lendacky + +commit b5aead0064f33ae5e693a364e3204fe1c0ac9af2 upstream. + +When processing a hypercall for a guest with protected state, currently +SEV-ES guests, the guest CS segment register can't be checked to +determine if the guest is in 64-bit mode. For an SEV-ES guest, it is +expected that communication between the guest and the hypervisor is +performed to shared memory using the GHCB. 
In order to use the GHCB, the +guest must have been in long mode, otherwise writes by the guest to the +GHCB would be encrypted and not be able to be comprehended by the +hypervisor. + +Create a new helper function, is_64_bit_hypercall(), that assumes the +guest is in 64-bit mode when the guest has protected state, and returns +true, otherwise invoking is_64_bit_mode() to determine the mode. Update +the hypercall related routines to use is_64_bit_hypercall() instead of +is_64_bit_mode(). + +Add a WARN_ON_ONCE() to is_64_bit_mode() to catch occurences of calls to +this helper function for a guest running with protected state. + +Fixes: f1c6366e3043 ("KVM: SVM: Add required changes to support intercepts under SEV-ES") +Reported-by: Sean Christopherson +Signed-off-by: Tom Lendacky +Message-Id: +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/hyperv.c | 4 ++-- + arch/x86/kvm/x86.c | 2 +- + arch/x86/kvm/x86.h | 12 ++++++++++++ + arch/x86/kvm/xen.c | 2 +- + 4 files changed, 16 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/hyperv.c ++++ b/arch/x86/kvm/hyperv.c +@@ -2022,7 +2022,7 @@ static void kvm_hv_hypercall_set_result( + { + bool longmode; + +- longmode = is_64_bit_mode(vcpu); ++ longmode = is_64_bit_hypercall(vcpu); + if (longmode) + kvm_rax_write(vcpu, result); + else { +@@ -2171,7 +2171,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vc + } + + #ifdef CONFIG_X86_64 +- if (is_64_bit_mode(vcpu)) { ++ if (is_64_bit_hypercall(vcpu)) { + hc.param = kvm_rcx_read(vcpu); + hc.ingpa = kvm_rdx_read(vcpu); + hc.outgpa = kvm_r8_read(vcpu); +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -8737,7 +8737,7 @@ int kvm_emulate_hypercall(struct kvm_vcp + + trace_kvm_hypercall(nr, a0, a1, a2, a3); + +- op_64_bit = is_64_bit_mode(vcpu); ++ op_64_bit = is_64_bit_hypercall(vcpu); + if (!op_64_bit) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; +--- a/arch/x86/kvm/x86.h ++++ b/arch/x86/kvm/x86.h +@@ -153,12 +153,24 @@ 
static inline bool is_64_bit_mode(struct + { + int cs_db, cs_l; + ++ WARN_ON_ONCE(vcpu->arch.guest_state_protected); ++ + if (!is_long_mode(vcpu)) + return false; + static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + return cs_l; + } + ++static inline bool is_64_bit_hypercall(struct kvm_vcpu *vcpu) ++{ ++ /* ++ * If running with protected guest state, the CS register is not ++ * accessible. The hypercall register values will have had to been ++ * provided in 64-bit mode, so assume the guest is in 64-bit. ++ */ ++ return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu); ++} ++ + static inline bool x86_exception_has_error_code(unsigned int vector) + { + static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) | +--- a/arch/x86/kvm/xen.c ++++ b/arch/x86/kvm/xen.c +@@ -698,7 +698,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *v + kvm_hv_hypercall_enabled(vcpu)) + return kvm_hv_hypercall(vcpu); + +- longmode = is_64_bit_mode(vcpu); ++ longmode = is_64_bit_hypercall(vcpu); + if (!longmode) { + params[0] = (u32)kvm_rbx_read(vcpu); + params[1] = (u32)kvm_rcx_read(vcpu); diff --git a/queue-5.15/kvm-x86-fix-uninitialized-eoi_exit_bitmap-usage-in-vcpu_load_eoi_exitmap.patch b/queue-5.15/kvm-x86-fix-uninitialized-eoi_exit_bitmap-usage-in-vcpu_load_eoi_exitmap.patch new file mode 100644 index 00000000000..8aff5fe09f1 --- /dev/null +++ b/queue-5.15/kvm-x86-fix-uninitialized-eoi_exit_bitmap-usage-in-vcpu_load_eoi_exitmap.patch @@ -0,0 +1,51 @@ +From c5adbb3af051079f35abfa26551107e2c653087f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=BB=84=E4=B9=90?= +Date: Mon, 15 Nov 2021 14:08:29 +0000 +Subject: KVM: x86: Fix uninitialized eoi_exit_bitmap usage in vcpu_load_eoi_exitmap() + +From: 黄乐 + +commit c5adbb3af051079f35abfa26551107e2c653087f upstream. 
+ +In vcpu_load_eoi_exitmap(), currently the eoi_exit_bitmap[4] array is +initialized only when Hyper-V context is available, in other paths it is +just passed to kvm_x86_ops.load_eoi_exitmap() directly from the stack, +which would cause unexpected interrupt delivery/handling issues, e.g. an +*old* linux kernel that relies on PIT to do clock calibration on KVM might +randomly fail to boot. + +Fix it by passing ioapic_handled_vectors to load_eoi_exitmap() when Hyper-V +context is not available. + +Fixes: f2bc14b69c38 ("KVM: x86: hyper-v: Prepare to meet unallocated Hyper-V context") +Cc: stable@vger.kernel.org +Reviewed-by: Vitaly Kuznetsov +Signed-off-by: Huang Le +Message-Id: <62115b277dab49ea97da5633f8522daf@jd.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/x86.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -9429,12 +9429,16 @@ static void vcpu_load_eoi_exitmap(struct + if (!kvm_apic_hw_enabled(vcpu->arch.apic)) + return; + +- if (to_hv_vcpu(vcpu)) ++ if (to_hv_vcpu(vcpu)) { + bitmap_or((ulong *)eoi_exit_bitmap, + vcpu->arch.ioapic_handled_vectors, + to_hv_synic(vcpu)->vec_bitmap, 256); ++ static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); ++ return; ++ } + +- static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); ++ static_call(kvm_x86_load_eoi_exitmap)( ++ vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); + } + + void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, diff --git a/queue-5.15/kvm-x86-mmu-include-efer.lma-in-extended-mmu-role.patch b/queue-5.15/kvm-x86-mmu-include-efer.lma-in-extended-mmu-role.patch new file mode 100644 index 00000000000..5ac20155532 --- /dev/null +++ b/queue-5.15/kvm-x86-mmu-include-efer.lma-in-extended-mmu-role.patch @@ -0,0 +1,62 @@ +From b8453cdcf26020030da182f0156d7bf59ae5719f Mon Sep 17 00:00:00 2001 +From: Maxim Levitsky +Date: Mon, 15 Nov 2021 15:18:37 +0200 +Subject: KVM: x86/mmu:
include EFER.LMA in extended mmu role + +From: Maxim Levitsky + +commit b8453cdcf26020030da182f0156d7bf59ae5719f upstream. + +Incorporate EFER.LMA into kvm_mmu_extended_role, as it is used to compute the +guest root level and is not reflected in kvm_mmu_page_role.level when TDP +is in use. When simply running the guest, it is impossible for EFER.LMA +and kvm_mmu.root_level to get out of sync, as the guest cannot transition +from PAE paging to 64-bit paging without toggling CR0.PG, i.e. without +first bouncing through a different MMU context. And stuffing guest state +via KVM_SET_SREGS{,2} also ensures a full MMU context reset. + +However, if KVM_SET_SREGS{,2} is followed by KVM_SET_NESTED_STATE, e.g. to +set guest state when migrating the VM while L2 is active, the vCPU state +will reflect L2, not L1. If L1 is using TDP for L2, then root_mmu will +have been configured using L2's state, despite not being used for L2. If +L2.EFER.LMA != L1.EFER.LMA, and L2 is using PAE paging, then root_mmu will +be configured for guest PAE paging, but will match the mmu_role for 64-bit +paging and cause KVM to not reconfigure root_mmu on the next nested VM-Exit. + +Alternatively, the root_mmu's role could be invalidated after a successful +KVM_SET_NESTED_STATE that yields vcpu->arch.mmu != vcpu->arch.root_mmu, +i.e. that switches the active mmu to guest_mmu, but doing so is unnecessarily +tricky, and not even needed if L1 and L2 do have the same role (e.g., they +are both 64-bit guests and run with the same CR4).
+ +Suggested-by: Sean Christopherson +Signed-off-by: Maxim Levitsky +Message-Id: <20211115131837.195527-3-mlevitsk@redhat.com> +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/include/asm/kvm_host.h | 1 + + arch/x86/kvm/mmu/mmu.c | 1 + + 2 files changed, 2 insertions(+) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -364,6 +364,7 @@ union kvm_mmu_extended_role { + unsigned int cr4_smap:1; + unsigned int cr4_smep:1; + unsigned int cr4_la57:1; ++ unsigned int efer_lma:1; + }; + }; + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -4679,6 +4679,7 @@ static union kvm_mmu_extended_role kvm_c + /* PKEY and LA57 are active iff long mode is active. */ + ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); + ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); ++ ext.efer_lma = ____is_efer_lma(regs); + } + + ext.valid = 1; diff --git a/queue-5.15/kvm-x86-xen-fix-get_attr-of-kvm_xen_attr_type_shared_info.patch b/queue-5.15/kvm-x86-xen-fix-get_attr-of-kvm_xen_attr_type_shared_info.patch new file mode 100644 index 00000000000..f7d05b39a87 --- /dev/null +++ b/queue-5.15/kvm-x86-xen-fix-get_attr-of-kvm_xen_attr_type_shared_info.patch @@ -0,0 +1,35 @@ +From 4e8436479ad3be76a3823e6ce466ae464ce71300 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Mon, 15 Nov 2021 16:50:21 +0000 +Subject: KVM: x86/xen: Fix get_attr of KVM_XEN_ATTR_TYPE_SHARED_INFO + +From: David Woodhouse + +commit 4e8436479ad3be76a3823e6ce466ae464ce71300 upstream. + +In commit 319afe68567b ("KVM: xen: do not use struct gfn_to_hva_cache") we +stopped storing this in-kernel as a GPA, and started storing it as a GFN. +Which means we probably should have stopped calling gpa_to_gfn() on it +when userspace asks for it back. 
+ +Cc: stable@vger.kernel.org +Fixes: 319afe68567b ("KVM: xen: do not use struct gfn_to_hva_cache") +Signed-off-by: David Woodhouse +Message-Id: <20211115165030.7422-2-dwmw2@infradead.org> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/xen.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kvm/xen.c ++++ b/arch/x86/kvm/xen.c +@@ -299,7 +299,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm + break; + + case KVM_XEN_ATTR_TYPE_SHARED_INFO: +- data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn); ++ data->u.shared_info.gfn = kvm->arch.xen.shinfo_gfn; + r = 0; + break; + diff --git a/queue-5.15/mm-damon-dbgfs-fix-missed-use-of-damon_dbgfs_lock.patch b/queue-5.15/mm-damon-dbgfs-fix-missed-use-of-damon_dbgfs_lock.patch new file mode 100644 index 00000000000..6dbeb0bd2c3 --- /dev/null +++ b/queue-5.15/mm-damon-dbgfs-fix-missed-use-of-damon_dbgfs_lock.patch @@ -0,0 +1,71 @@ +From d78f3853f831eee46c6dbe726debf3be9e9c0d05 Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Fri, 19 Nov 2021 16:43:52 -0800 +Subject: mm/damon/dbgfs: fix missed use of damon_dbgfs_lock + +From: SeongJae Park + +commit d78f3853f831eee46c6dbe726debf3be9e9c0d05 upstream. + +DAMON debugfs is supposed to protect dbgfs_ctxs, dbgfs_nr_ctxs, and +dbgfs_dirs using damon_dbgfs_lock. However, some of the code is +accessing the variables without the protection. This fixes it by +protecting all such accesses. 
+ +Link: https://lkml.kernel.org/r/20211110145758.16558-3-sj@kernel.org +Fixes: 75c1c2b53c78 ("mm/damon/dbgfs: support multiple contexts") +Signed-off-by: SeongJae Park +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/damon/dbgfs.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/mm/damon/dbgfs.c ++++ b/mm/damon/dbgfs.c +@@ -538,12 +538,14 @@ static ssize_t dbgfs_monitor_on_write(st + return -EINVAL; + } + ++ mutex_lock(&damon_dbgfs_lock); + if (!strncmp(kbuf, "on", count)) + err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); + else if (!strncmp(kbuf, "off", count)) + err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); + else + err = -EINVAL; ++ mutex_unlock(&damon_dbgfs_lock); + + if (err) + ret = err; +@@ -596,15 +598,16 @@ static int __init __damon_dbgfs_init(voi + + static int __init damon_dbgfs_init(void) + { +- int rc; ++ int rc = -ENOMEM; + ++ mutex_lock(&damon_dbgfs_lock); + dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); + if (!dbgfs_ctxs) +- return -ENOMEM; ++ goto out; + dbgfs_ctxs[0] = dbgfs_new_ctx(); + if (!dbgfs_ctxs[0]) { + kfree(dbgfs_ctxs); +- return -ENOMEM; ++ goto out; + } + dbgfs_nr_ctxs = 1; + +@@ -615,6 +618,8 @@ static int __init damon_dbgfs_init(void) + pr_err("%s: dbgfs init failed\n", __func__); + } + ++out: ++ mutex_unlock(&damon_dbgfs_lock); + return rc; + } + diff --git a/queue-5.15/mm-damon-dbgfs-use-__gfp_nowarn-for-user-specified-size-buffer-allocation.patch b/queue-5.15/mm-damon-dbgfs-use-__gfp_nowarn-for-user-specified-size-buffer-allocation.patch new file mode 100644 index 00000000000..87cf35654b4 --- /dev/null +++ b/queue-5.15/mm-damon-dbgfs-use-__gfp_nowarn-for-user-specified-size-buffer-allocation.patch @@ -0,0 +1,60 @@ +From db7a347b26fe05d2e8c115bb24dfd908d0252bc3 Mon Sep 17 00:00:00 2001 +From: SeongJae Park +Date: Fri, 19 Nov 2021 16:43:49 -0800 +Subject: mm/damon/dbgfs: use '__GFP_NOWARN' for user-specified size buffer allocation + +From: 
SeongJae Park + +commit db7a347b26fe05d2e8c115bb24dfd908d0252bc3 upstream. + +Patch series "DAMON fixes". + +This patch (of 2): + +DAMON users can trigger below warning in '__alloc_pages()' by invoking +write() to some DAMON debugfs files with arbitrarily high count +argument, because DAMON debugfs interface allocates some buffers based +on the user-specified 'count'. + + if (unlikely(order >= MAX_ORDER)) { + WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); + return NULL; + } + +Because the DAMON debugfs interface code checks failure of the +'kmalloc()', this commit simply suppresses the warnings by adding +'__GFP_NOWARN' flag. + +Link: https://lkml.kernel.org/r/20211110145758.16558-1-sj@kernel.org +Link: https://lkml.kernel.org/r/20211110145758.16558-2-sj@kernel.org +Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") +Signed-off-by: SeongJae Park +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/damon/dbgfs.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/mm/damon/dbgfs.c ++++ b/mm/damon/dbgfs.c +@@ -32,7 +32,7 @@ static char *user_input_str(const char _ + if (*ppos) + return ERR_PTR(-EINVAL); + +- kbuf = kmalloc(count + 1, GFP_KERNEL); ++ kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN); + if (!kbuf) + return ERR_PTR(-ENOMEM); + +@@ -247,7 +247,7 @@ static ssize_t dbgfs_kdamond_pid_read(st + char *kbuf; + ssize_t len; + +- kbuf = kmalloc(count, GFP_KERNEL); ++ kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); + if (!kbuf) + return -ENOMEM; + diff --git a/queue-5.15/mm-kmemleak-slob-respect-slab_noleaktrace-flag.patch b/queue-5.15/mm-kmemleak-slob-respect-slab_noleaktrace-flag.patch new file mode 100644 index 00000000000..ce7154046af --- /dev/null +++ b/queue-5.15/mm-kmemleak-slob-respect-slab_noleaktrace-flag.patch @@ -0,0 +1,51 @@ +From 34dbc3aaf5d9e89ba6cc5e24add9458c21ab1950 Mon Sep 17 00:00:00 2001 +From: Rustam Kovhaev +Date: Fri, 19 Nov 2021 16:43:37 
-0800 +Subject: mm: kmemleak: slob: respect SLAB_NOLEAKTRACE flag + +From: Rustam Kovhaev + +commit 34dbc3aaf5d9e89ba6cc5e24add9458c21ab1950 upstream. + +When kmemleak is enabled for SLOB, system does not boot and does not +print anything to the console. At the very early stage in the boot +process we hit infinite recursion from kmemleak_init() and eventually +kernel crashes. + +kmemleak_init() specifies SLAB_NOLEAKTRACE for KMEM_CACHE(), but +kmem_cache_create_usercopy() removes it because CACHE_CREATE_MASK is not +valid for SLOB. + +Let's fix CACHE_CREATE_MASK and make kmemleak work with SLOB + +Link: https://lkml.kernel.org/r/20211115020850.3154366-1-rkovhaev@gmail.com +Fixes: d8843922fba4 ("slab: Ignore internal flags in cache creation") +Signed-off-by: Rustam Kovhaev +Acked-by: Vlastimil Babka +Reviewed-by: Muchun Song +Cc: Christoph Lameter +Cc: Pekka Enberg +Cc: David Rientjes +Cc: Joonsoo Kim +Cc: Catalin Marinas +Cc: Greg Kroah-Hartman +Cc: Glauber Costa +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/slab.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -147,7 +147,7 @@ static inline slab_flags_t kmem_cache_fl + #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT) + #else +-#define SLAB_CACHE_FLAGS (0) ++#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) + #endif + + /* Common flags available with current configuration */ diff --git a/queue-5.15/pinctrl-ralink-include-ralink_regs.h-in-pinctrl-mt7620.c.patch b/queue-5.15/pinctrl-ralink-include-ralink_regs.h-in-pinctrl-mt7620.c.patch new file mode 100644 index 00000000000..6b55578df1e --- /dev/null +++ b/queue-5.15/pinctrl-ralink-include-ralink_regs.h-in-pinctrl-mt7620.c.patch @@ -0,0 +1,32 @@ +From a5b9703fe11cd1d6d7a60102aa2abe686dc1867f Mon Sep 17 00:00:00 2001 +From: Sergio Paracuellos +Date: Sun, 31 Oct 2021 07:40:46 +0100 +Subject: pinctrl: ralink: 
include 'ralink_regs.h' in 'pinctrl-mt7620.c' + +From: Sergio Paracuellos + +commit a5b9703fe11cd1d6d7a60102aa2abe686dc1867f upstream. + +mt7620.h, included by pinctrl-mt7620.c, mentions MT762X_SOC_MT7628AN +declared in ralink_regs.h. + +Fixes: 745ec436de72 ("pinctrl: ralink: move MT7620 SoC pinmux config into a new 'pinctrl-mt7620.c' file") +Cc: stable@vger.kernel.org +Signed-off-by: Luiz Angelo Daros de Luca +Signed-off-by: Sergio Paracuellos +Link: https://lore.kernel.org/r/20211031064046.13533-1-sergio.paracuellos@gmail.com +Signed-off-by: Linus Walleij +Signed-off-by: Greg Kroah-Hartman +--- + drivers/pinctrl/ralink/pinctrl-mt7620.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/pinctrl/ralink/pinctrl-mt7620.c ++++ b/drivers/pinctrl/ralink/pinctrl-mt7620.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0-only + ++#include + #include + #include + #include diff --git a/queue-5.15/powerpc-8xx-fix-pinned-tlbs-with-config_strict_kernel_rwx.patch b/queue-5.15/powerpc-8xx-fix-pinned-tlbs-with-config_strict_kernel_rwx.patch new file mode 100644 index 00000000000..c5cbec27558 --- /dev/null +++ b/queue-5.15/powerpc-8xx-fix-pinned-tlbs-with-config_strict_kernel_rwx.patch @@ -0,0 +1,90 @@ +From 1e35eba4055149c578baf0318d2f2f89ea3c44a0 Mon Sep 17 00:00:00 2001 +From: Christophe Leroy +Date: Mon, 15 Nov 2021 09:08:36 +0100 +Subject: powerpc/8xx: Fix pinned TLBs with CONFIG_STRICT_KERNEL_RWX + +From: Christophe Leroy + +commit 1e35eba4055149c578baf0318d2f2f89ea3c44a0 upstream. + +As spotted and explained in commit c12ab8dbc492 ("powerpc/8xx: Fix +Oops with STRICT_KERNEL_RWX without DEBUG_RODATA_TEST"), the selection +of STRICT_KERNEL_RWX without selecting DEBUG_RODATA_TEST has spotted +the lack of the DIRTY bit in the pinned kernel data TLBs. + +This problem should have been detected a lot earlier if things had +been working as expected. 
But due to an incredible level of chance or +mishap, this went undetected because of a set of bugs: In fact the +DTLBs were not pinned, because instead of setting the reserve bit +in MD_CTR, it was set in MI_CTR that is the register for ITLBs. + +But then, another huge bug was there: the physical address was +reset to 0 at the boundary between RO and RW areas, leading to the +same physical space being mapped at both 0xc0000000 and 0xc8000000. +This had by miracle no consequence until now because the entry was +not really pinned so it was overwritten soon enough to go undetected. + +Of course, now that we really pin the DTLBs, it must be fixed as well. + +Fixes: f76c8f6d257c ("powerpc/8xx: Add function to set pinned TLBs") +Cc: stable@vger.kernel.org # v5.8+ +Signed-off-by: Christophe Leroy +Depends-on: c12ab8dbc492 ("powerpc/8xx: Fix Oops with STRICT_KERNEL_RWX without DEBUG_RODATA_TEST") +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/a21e9a057fe2d247a535aff0d157a54eefee017a.1636963688.git.christophe.leroy@csgroup.eu +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/kernel/head_8xx.S | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/arch/powerpc/kernel/head_8xx.S ++++ b/arch/powerpc/kernel/head_8xx.S +@@ -733,6 +733,7 @@ _GLOBAL(mmu_pin_tlb) + #ifdef CONFIG_PIN_TLB_DATA + LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET) + LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG | _PMD_ACCESSED) ++ li r8, 0 + #ifdef CONFIG_PIN_TLB_IMMR + li r0, 3 + #else +@@ -741,26 +742,26 @@ _GLOBAL(mmu_pin_tlb) + mtctr r0 + cmpwi r4, 0 + beq 4f +- LOAD_REG_IMMEDIATE(r8, 0xf0 | _PAGE_RO | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT) + LOAD_REG_ADDR(r9, _sinittext) + + 2: ori r0, r6, MD_EVALID ++ ori r12, r8, 0xf0 | _PAGE_RO | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT + mtspr SPRN_MD_CTR, r5 + mtspr SPRN_MD_EPN, r0 + mtspr SPRN_MD_TWC, r7 +- mtspr SPRN_MD_RPN, r8 ++ mtspr SPRN_MD_RPN, r12 + addi r5, r5, 0x100 + addis r6, r6, SZ_8M@h + addis r8, r8, SZ_8M@h + cmplw 
r6, r9 + bdnzt lt, 2b +- +-4: LOAD_REG_IMMEDIATE(r8, 0xf0 | _PAGE_DIRTY | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT) ++4: + 2: ori r0, r6, MD_EVALID ++ ori r12, r8, 0xf0 | _PAGE_DIRTY | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT + mtspr SPRN_MD_CTR, r5 + mtspr SPRN_MD_EPN, r0 + mtspr SPRN_MD_TWC, r7 +- mtspr SPRN_MD_RPN, r8 ++ mtspr SPRN_MD_RPN, r12 + addi r5, r5, 0x100 + addis r6, r6, SZ_8M@h + addis r8, r8, SZ_8M@h +@@ -781,7 +782,7 @@ _GLOBAL(mmu_pin_tlb) + #endif + #if defined(CONFIG_PIN_TLB_IMMR) || defined(CONFIG_PIN_TLB_DATA) + lis r0, (MD_RSV4I | MD_TWAM)@h +- mtspr SPRN_MI_CTR, r0 ++ mtspr SPRN_MD_CTR, r0 + #endif + mtspr SPRN_SRR1, r10 + mtspr SPRN_SRR0, r11 diff --git a/queue-5.15/powerpc-signal32-fix-sigset_t-copy.patch b/queue-5.15/powerpc-signal32-fix-sigset_t-copy.patch new file mode 100644 index 00000000000..e03bbefee29 --- /dev/null +++ b/queue-5.15/powerpc-signal32-fix-sigset_t-copy.patch @@ -0,0 +1,64 @@ +From 5499802b2284331788a440585869590f1bd63f7f Mon Sep 17 00:00:00 2001 +From: Christophe Leroy +Date: Mon, 15 Nov 2021 09:52:55 +0100 +Subject: powerpc/signal32: Fix sigset_t copy + +From: Christophe Leroy + +commit 5499802b2284331788a440585869590f1bd63f7f upstream. + +The conversion from __copy_from_user() to __get_user() by +commit d3ccc9781560 ("powerpc/signal: Use __get_user() to copy +sigset_t") introduced a regression in __get_user_sigset() for +powerpc/32. The bug was subsequently moved into +unsafe_get_user_sigset(). + +The bug is due to the copied 64 bit value being truncated to +32 bits while being assigned to dst->sig[0] + +The regression was reported by users of the Xorg packages distributed in +Debian/powerpc -- + + "The symptoms are that the fb screen goes blank, with the backlight + remaining on and no errors logged in /var/log; wdm (or startx) run + with no effect (I tried logging in in the blind, with no effect). 
+ And they are hard to kill, requiring 'kill -KILL ...'" + +Fix the regression by copying each word of the sigset, not only the +first one. + +__get_user_sigset() was tentatively optimised to copy 64 bits at once +in order to minimise KUAP unlock/lock impact, but the unsafe variant +doesn't suffer that, so it can just copy words. + +Fixes: 887f3ceb51cd ("powerpc/signal32: Convert do_setcontext[_tm]() to user access block") +Cc: stable@vger.kernel.org # v5.13+ +Reported-by: Finn Thain +Reported-and-tested-by: Stan Johnson +Signed-off-by: Christophe Leroy +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/99ef38d61c0eb3f79c68942deb0c35995a93a777.1636966353.git.christophe.leroy@csgroup.eu +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/kernel/signal.h | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/arch/powerpc/kernel/signal.h ++++ b/arch/powerpc/kernel/signal.h +@@ -25,8 +25,14 @@ static inline int __get_user_sigset(sigs + + return __get_user(dst->sig[0], (u64 __user *)&src->sig[0]); + } +-#define unsafe_get_user_sigset(dst, src, label) \ +- unsafe_get_user((dst)->sig[0], (u64 __user *)&(src)->sig[0], label) ++#define unsafe_get_user_sigset(dst, src, label) do { \ ++ sigset_t *__dst = dst; \ ++ const sigset_t __user *__src = src; \ ++ int i; \ ++ \ ++ for (i = 0; i < _NSIG_WORDS; i++) \ ++ unsafe_get_user(__dst->sig[i], &__src->sig[i], label); \ ++} while (0) + + #ifdef CONFIG_VSX + extern unsigned long copy_vsx_to_user(void __user *to, diff --git a/queue-5.15/powerpc-xive-change-irq-domain-to-a-tree-domain.patch b/queue-5.15/powerpc-xive-change-irq-domain-to-a-tree-domain.patch new file mode 100644 index 00000000000..46161aaad1f --- /dev/null +++ b/queue-5.15/powerpc-xive-change-irq-domain-to-a-tree-domain.patch @@ -0,0 +1,63 @@ +From 8e80a73fa9a7747e3e8255cb149c543aabf65a24 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Tue, 16 Nov 2021 14:40:22 +0100 +Subject: powerpc/xive: Change IRQ 
domain to a tree domain +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Cédric Le Goater + +commit 8e80a73fa9a7747e3e8255cb149c543aabf65a24 upstream. + +Commit 4f86a06e2d6e ("irqdomain: Make normal and nomap irqdomains +exclusive") introduced an IRQ_DOMAIN_FLAG_NO_MAP flag to isolate the +'nomap' domains still in use under the powerpc arch. With this new +flag, the revmap_tree of the IRQ domain is not used anymore. This +change broke the support of shared LSIs [1] in the XIVE driver because +it was relying on a lookup in the revmap_tree to query previously +mapped interrupts. Linux now creates two distinct IRQ mappings on the +same HW IRQ which can lead to unexpected behavior in the drivers. + +The XIVE IRQ domain is not a direct mapping domain and its HW IRQ +interrupt number space is rather large : 1M/socket on POWER9 and +POWER10, change the XIVE driver to use a 'tree' domain type instead. + +[1] For instance, a linux KVM guest with virtio-rng and virtio-balloon + devices. 
+ +Fixes: 4f86a06e2d6e ("irqdomain: Make normal and nomap irqdomains exclusive") +Cc: stable@vger.kernel.org # v5.14+ +Signed-off-by: Cédric Le Goater +Tested-by: Greg Kurz +Acked-by: Marc Zyngier +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20211116134022.420412-1-clg@kaod.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/sysdev/xive/Kconfig | 1 - + arch/powerpc/sysdev/xive/common.c | 3 +-- + 2 files changed, 1 insertion(+), 3 deletions(-) + +--- a/arch/powerpc/sysdev/xive/Kconfig ++++ b/arch/powerpc/sysdev/xive/Kconfig +@@ -3,7 +3,6 @@ config PPC_XIVE + bool + select PPC_SMP_MUXED_IPI + select HARDIRQS_SW_RESEND +- select IRQ_DOMAIN_NOMAP + + config PPC_XIVE_NATIVE + bool +--- a/arch/powerpc/sysdev/xive/common.c ++++ b/arch/powerpc/sysdev/xive/common.c +@@ -1443,8 +1443,7 @@ static const struct irq_domain_ops xive_ + + static void __init xive_init_host(struct device_node *np) + { +- xive_irq_domain = irq_domain_add_nomap(np, XIVE_MAX_IRQ, +- &xive_irq_domain_ops, NULL); ++ xive_irq_domain = irq_domain_add_tree(np, &xive_irq_domain_ops, NULL); + if (WARN_ON(xive_irq_domain == NULL)) + return; + irq_set_default_host(xive_irq_domain); diff --git a/queue-5.15/pstore-blk-use-lu-to-format-unsigned-long.patch b/queue-5.15/pstore-blk-use-lu-to-format-unsigned-long.patch new file mode 100644 index 00000000000..a38bc3d2273 --- /dev/null +++ b/queue-5.15/pstore-blk-use-lu-to-format-unsigned-long.patch @@ -0,0 +1,52 @@ +From 61eb495c83bf6ebde490992bf888ca15b9babc39 Mon Sep 17 00:00:00 2001 +From: Geert Uytterhoeven +Date: Thu, 18 Nov 2021 10:26:21 -0800 +Subject: pstore/blk: Use "%lu" to format unsigned long +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Geert Uytterhoeven + +commit 61eb495c83bf6ebde490992bf888ca15b9babc39 upstream. 
+ +On 32-bit: + + fs/pstore/blk.c: In function ‘__best_effort_init’: + include/linux/kern_levels.h:5:18: warning: format ‘%zu’ expects argument of type ‘size_t’, but argument 3 has type ‘long unsigned int’ [-Wformat=] + 5 | #define KERN_SOH "\001" /* ASCII Start Of Header */ + | ^~~~~~ + include/linux/kern_levels.h:14:19: note: in expansion of macro ‘KERN_SOH’ + 14 | #define KERN_INFO KERN_SOH "6" /* informational */ + | ^~~~~~~~ + include/linux/printk.h:373:9: note: in expansion of macro ‘KERN_INFO’ + 373 | printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) + | ^~~~~~~~~ + fs/pstore/blk.c:314:3: note: in expansion of macro ‘pr_info’ + 314 | pr_info("attached %s (%zu) (no dedicated panic_write!)\n", + | ^~~~~~~ + +Cc: stable@vger.kernel.org +Fixes: 7bb9557b48fcabaa ("pstore/blk: Use the normal block device I/O path") +Signed-off-by: Geert Uytterhoeven +Signed-off-by: Kees Cook +Link: https://lore.kernel.org/r/20210629103700.1935012-1-geert@linux-m68k.org +Cc: Jens Axboe +Reviewed-by: Christoph Hellwig +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/pstore/blk.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/pstore/blk.c ++++ b/fs/pstore/blk.c +@@ -311,7 +311,7 @@ static int __init __best_effort_init(voi + if (ret) + kfree(best_effort_dev); + else +- pr_info("attached %s (%zu) (no dedicated panic_write!)\n", ++ pr_info("attached %s (%lu) (no dedicated panic_write!)\n", + blkdev, best_effort_dev->zone.total_size); + + return ret; diff --git a/queue-5.15/revert-drm-i915-tgl-dsi-gate-the-ddi-clocks-after-pll-mapping.patch b/queue-5.15/revert-drm-i915-tgl-dsi-gate-the-ddi-clocks-after-pll-mapping.patch new file mode 100644 index 00000000000..40973c45f58 --- /dev/null +++ b/queue-5.15/revert-drm-i915-tgl-dsi-gate-the-ddi-clocks-after-pll-mapping.patch @@ -0,0 +1,59 @@ +From f15863b27752682bb700c21de5f83f613a0fb77e Mon Sep 17 00:00:00 2001 +From: Vandita Kulkarni +Date: Tue, 9 Nov 2021 17:34:28 +0530 +Subject: Revert 
"drm/i915/tgl/dsi: Gate the ddi clocks after pll mapping" + +From: Vandita Kulkarni + +commit f15863b27752682bb700c21de5f83f613a0fb77e upstream. + +This reverts commit 991d9557b0c4 ("drm/i915/tgl/dsi: Gate the ddi clocks +after pll mapping"). The Bspec was updated recently with the pll ungate +sequence similar to that of icl dsi enable sequence. Hence reverting. + +Bspec: 49187 +Fixes: 991d9557b0c4 ("drm/i915/tgl/dsi: Gate the ddi clocks after pll mapping") +Cc: # v5.4+ +Signed-off-by: Vandita Kulkarni +Signed-off-by: Jani Nikula +Link: https://patchwork.freedesktop.org/patch/msgid/20211109120428.15211-1-vandita.kulkarni@intel.com +(cherry picked from commit 4579509ef181480f4e4510d436c691519167c5c2) +Signed-off-by: Rodrigo Vivi +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/i915/display/icl_dsi.c | 10 ++-------- + 1 file changed, 2 insertions(+), 8 deletions(-) + +--- a/drivers/gpu/drm/i915/display/icl_dsi.c ++++ b/drivers/gpu/drm/i915/display/icl_dsi.c +@@ -711,10 +711,7 @@ static void gen11_dsi_map_pll(struct int + intel_de_write(dev_priv, ICL_DPCLKA_CFGCR0, val); + + for_each_dsi_phy(phy, intel_dsi->phys) { +- if (DISPLAY_VER(dev_priv) >= 12) +- val |= ICL_DPCLKA_CFGCR0_DDI_CLK_OFF(phy); +- else +- val &= ~ICL_DPCLKA_CFGCR0_DDI_CLK_OFF(phy); ++ val &= ~ICL_DPCLKA_CFGCR0_DDI_CLK_OFF(phy); + } + intel_de_write(dev_priv, ICL_DPCLKA_CFGCR0, val); + +@@ -1150,8 +1147,6 @@ static void + gen11_dsi_enable_port_and_phy(struct intel_encoder *encoder, + const struct intel_crtc_state *crtc_state) + { +- struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); +- + /* step 4a: power up all lanes of the DDI used by DSI */ + gen11_dsi_power_up_lanes(encoder); + +@@ -1177,8 +1172,7 @@ gen11_dsi_enable_port_and_phy(struct int + gen11_dsi_configure_transcoder(encoder, crtc_state); + + /* Step 4l: Gate DDI clocks */ +- if (DISPLAY_VER(dev_priv) == 11) +- gen11_dsi_gate_clocks(encoder); ++ gen11_dsi_gate_clocks(encoder); + } + + static void 
gen11_dsi_powerup_panel(struct intel_encoder *encoder) diff --git a/queue-5.15/revert-mark-pstore-blk-as-broken.patch b/queue-5.15/revert-mark-pstore-blk-as-broken.patch new file mode 100644 index 00000000000..63d9c22f72f --- /dev/null +++ b/queue-5.15/revert-mark-pstore-blk-as-broken.patch @@ -0,0 +1,36 @@ +From d1faacbf67b1944f0e0c618dc581d929263f6fe9 Mon Sep 17 00:00:00 2001 +From: Kees Cook +Date: Tue, 16 Nov 2021 10:15:59 -0800 +Subject: Revert "mark pstore-blk as broken" + +From: Kees Cook + +commit d1faacbf67b1944f0e0c618dc581d929263f6fe9 upstream. + +This reverts commit d07f3b081ee632268786601f55e1334d1f68b997. + +pstore-blk was fixed to avoid the unwanted APIs in commit 7bb9557b48fc +("pstore/blk: Use the normal block device I/O path"), which landed in +the same release as the commit adding BROKEN. + +Cc: Jens Axboe +Cc: Christoph Hellwig +Cc: stable@vger.kernel.org +Signed-off-by: Kees Cook +Link: https://lore.kernel.org/r/20211116181559.3975566-1-keescook@chromium.org +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/pstore/Kconfig | 1 - + 1 file changed, 1 deletion(-) + +--- a/fs/pstore/Kconfig ++++ b/fs/pstore/Kconfig +@@ -173,7 +173,6 @@ config PSTORE_BLK + tristate "Log panic/oops to a block device" + depends on PSTORE + depends on BLOCK +- depends on BROKEN + select PSTORE_ZONE + default n + help diff --git a/queue-5.15/revert-parisc-reduce-sigreturn-trampoline-to-3-instructions.patch b/queue-5.15/revert-parisc-reduce-sigreturn-trampoline-to-3-instructions.patch new file mode 100644 index 00000000000..8f32b061cfc --- /dev/null +++ b/queue-5.15/revert-parisc-reduce-sigreturn-trampoline-to-3-instructions.patch @@ -0,0 +1,75 @@ +From 79df39d535c7a3770856fe9f5aba8c0ad1eebdb6 Mon Sep 17 00:00:00 2001 +From: Helge Deller +Date: Wed, 17 Nov 2021 11:05:07 +0100 +Subject: Revert "parisc: Reduce sigreturn trampoline to 3 instructions" + +From: Helge Deller + +commit 79df39d535c7a3770856fe9f5aba8c0ad1eebdb6 upstream. 
+ +This reverts commit e4f2006f1287e7ea17660490569cff323772dac4. + +This patch shows problems with signal handling. Revert it for now. + +Signed-off-by: Helge Deller +Cc: # v5.15 +Signed-off-by: Greg Kroah-Hartman +--- + arch/parisc/include/asm/rt_sigframe.h | 2 +- + arch/parisc/kernel/signal.c | 13 +++++++------ + arch/parisc/kernel/signal32.h | 2 +- + 3 files changed, 9 insertions(+), 8 deletions(-) + +--- a/arch/parisc/include/asm/rt_sigframe.h ++++ b/arch/parisc/include/asm/rt_sigframe.h +@@ -2,7 +2,7 @@ + #ifndef _ASM_PARISC_RT_SIGFRAME_H + #define _ASM_PARISC_RT_SIGFRAME_H + +-#define SIGRETURN_TRAMP 3 ++#define SIGRETURN_TRAMP 4 + #define SIGRESTARTBLOCK_TRAMP 5 + #define TRAMP_SIZE (SIGRETURN_TRAMP + SIGRESTARTBLOCK_TRAMP) + +--- a/arch/parisc/kernel/signal.c ++++ b/arch/parisc/kernel/signal.c +@@ -288,21 +288,22 @@ setup_rt_frame(struct ksignal *ksig, sig + already in userspace. The first words of tramp are used to + save the previous sigrestartblock trampoline that might be + on the stack. We start the sigreturn trampoline at +- SIGRESTARTBLOCK_TRAMP. */ ++ SIGRESTARTBLOCK_TRAMP+X. */ + err |= __put_user(in_syscall ? 
INSN_LDI_R25_1 : INSN_LDI_R25_0, + &frame->tramp[SIGRESTARTBLOCK_TRAMP+0]); +- err |= __put_user(INSN_BLE_SR2_R0, ++ err |= __put_user(INSN_LDI_R20, + &frame->tramp[SIGRESTARTBLOCK_TRAMP+1]); +- err |= __put_user(INSN_LDI_R20, ++ err |= __put_user(INSN_BLE_SR2_R0, + &frame->tramp[SIGRESTARTBLOCK_TRAMP+2]); ++ err |= __put_user(INSN_NOP, &frame->tramp[SIGRESTARTBLOCK_TRAMP+3]); + +- start = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP+0]; +- end = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP+3]; ++ start = (unsigned long) &frame->tramp[0]; ++ end = (unsigned long) &frame->tramp[TRAMP_SIZE]; + flush_user_dcache_range_asm(start, end); + flush_user_icache_range_asm(start, end); + + /* TRAMP Words 0-4, Length 5 = SIGRESTARTBLOCK_TRAMP +- * TRAMP Words 5-7, Length 3 = SIGRETURN_TRAMP ++ * TRAMP Words 5-9, Length 4 = SIGRETURN_TRAMP + * So the SIGRETURN_TRAMP is at the end of SIGRESTARTBLOCK_TRAMP + */ + rp = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP]; +--- a/arch/parisc/kernel/signal32.h ++++ b/arch/parisc/kernel/signal32.h +@@ -36,7 +36,7 @@ struct compat_regfile { + compat_int_t rf_sar; + }; + +-#define COMPAT_SIGRETURN_TRAMP 3 ++#define COMPAT_SIGRETURN_TRAMP 4 + #define COMPAT_SIGRESTARTBLOCK_TRAMP 5 + #define COMPAT_TRAMP_SIZE (COMPAT_SIGRETURN_TRAMP + \ + COMPAT_SIGRESTARTBLOCK_TRAMP) diff --git a/queue-5.15/s390-boot-simplify-and-fix-kernel-memory-layout-setup.patch b/queue-5.15/s390-boot-simplify-and-fix-kernel-memory-layout-setup.patch new file mode 100644 index 00000000000..8058bf50c39 --- /dev/null +++ b/queue-5.15/s390-boot-simplify-and-fix-kernel-memory-layout-setup.patch @@ -0,0 +1,165 @@ +From 9a39abb7c9aab50eec4ac4421e9ee7f3de013d24 Mon Sep 17 00:00:00 2001 +From: Vasily Gorbik +Date: Thu, 14 Oct 2021 13:53:54 +0200 +Subject: s390/boot: simplify and fix kernel memory layout setup + +From: Vasily Gorbik + +commit 9a39abb7c9aab50eec4ac4421e9ee7f3de013d24 upstream. 
+ +Initial KASAN shadow memory range was picked to preserve original kernel +modules area position. With protected execution support, which might +impose addressing limitation on vmalloc area and hence affect modules +area position, current fixed KASAN shadow memory range is only making +kernel memory layout setup more complex. So move it to the very end of +available virtual space and simplify calculations. + +At the same time return to previous kernel address space split. In +particular commit 0c4f2623b957 ("s390: setup kernel memory layout +early") introduced precise identity map size calculation and keeping +vmemmap left most starting from a fresh region table entry. This didn't +take into account additional mapping region requirement for potential +DCSS mapping above available physical memory. So go back to virtual +space split between 1:1 mapping & vmemmap array once vmalloc area size +is subtracted. + +Cc: stable@vger.kernel.org +Fixes: 0c4f2623b957 ("s390: setup kernel memory layout early") +Reported-by: Gerald Schaefer +Reviewed-by: Heiko Carstens +Reviewed-by: Alexander Gordeev +Signed-off-by: Vasily Gorbik +Signed-off-by: Heiko Carstens +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/Kconfig | 2 - + arch/s390/boot/startup.c | 88 ++++++++++++++++------------------------------- + 2 files changed, 32 insertions(+), 58 deletions(-) + +--- a/arch/s390/Kconfig ++++ b/arch/s390/Kconfig +@@ -47,7 +47,7 @@ config ARCH_SUPPORTS_UPROBES + config KASAN_SHADOW_OFFSET + hex + depends on KASAN +- default 0x18000000000000 ++ default 0x1C000000000000 + + config S390 + def_bool y +--- a/arch/s390/boot/startup.c ++++ b/arch/s390/boot/startup.c +@@ -148,82 +148,56 @@ static void setup_ident_map_size(unsigne + + static void setup_kernel_memory_layout(void) + { +- bool vmalloc_size_verified = false; +- unsigned long vmemmap_off; +- unsigned long vspace_left; ++ unsigned long vmemmap_start; + unsigned long rte_size; + unsigned long pages; +- unsigned long vmax; + + pages = 
ident_map_size / PAGE_SIZE; + /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */ + vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); + + /* choose kernel address space layout: 4 or 3 levels. */ +- vmemmap_off = round_up(ident_map_size, _REGION3_SIZE); ++ vmemmap_start = round_up(ident_map_size, _REGION3_SIZE); + if (IS_ENABLED(CONFIG_KASAN) || + vmalloc_size > _REGION2_SIZE || +- vmemmap_off + vmemmap_size + vmalloc_size + MODULES_LEN > _REGION2_SIZE) +- vmax = _REGION1_SIZE; +- else +- vmax = _REGION2_SIZE; +- +- /* keep vmemmap_off aligned to a top level region table entry */ +- rte_size = vmax == _REGION1_SIZE ? _REGION2_SIZE : _REGION3_SIZE; +- MODULES_END = vmax; +- if (is_prot_virt_host()) { +- /* +- * forcing modules and vmalloc area under the ultravisor +- * secure storage limit, so that any vmalloc allocation +- * we do could be used to back secure guest storage. +- */ +- adjust_to_uv_max(&MODULES_END); +- } +- +-#ifdef CONFIG_KASAN +- if (MODULES_END < vmax) { +- /* force vmalloc and modules below kasan shadow */ +- MODULES_END = min(MODULES_END, KASAN_SHADOW_START); ++ vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN > ++ _REGION2_SIZE) { ++ MODULES_END = _REGION1_SIZE; ++ rte_size = _REGION2_SIZE; + } else { +- /* +- * leave vmalloc and modules above kasan shadow but make +- * sure they don't overlap with it +- */ +- vmalloc_size = min(vmalloc_size, vmax - KASAN_SHADOW_END - MODULES_LEN); +- vmalloc_size_verified = true; +- vspace_left = KASAN_SHADOW_START; ++ MODULES_END = _REGION2_SIZE; ++ rte_size = _REGION3_SIZE; + } ++ /* ++ * forcing modules and vmalloc area under the ultravisor ++ * secure storage limit, so that any vmalloc allocation ++ * we do could be used to back secure guest storage. 
++ */ ++ adjust_to_uv_max(&MODULES_END); ++#ifdef CONFIG_KASAN ++ /* force vmalloc and modules below kasan shadow */ ++ MODULES_END = min(MODULES_END, KASAN_SHADOW_START); + #endif + MODULES_VADDR = MODULES_END - MODULES_LEN; + VMALLOC_END = MODULES_VADDR; + +- if (vmalloc_size_verified) { +- VMALLOC_START = VMALLOC_END - vmalloc_size; +- } else { +- vmemmap_off = round_up(ident_map_size, rte_size); +- +- if (vmemmap_off + vmemmap_size > VMALLOC_END || +- vmalloc_size > VMALLOC_END - vmemmap_off - vmemmap_size) { +- /* +- * allow vmalloc area to occupy up to 1/2 of +- * the rest virtual space left. +- */ +- vmalloc_size = min(vmalloc_size, VMALLOC_END / 2); +- } +- VMALLOC_START = VMALLOC_END - vmalloc_size; +- vspace_left = VMALLOC_START; +- } ++ /* allow vmalloc area to occupy up to about 1/2 of the rest virtual space left */ ++ vmalloc_size = min(vmalloc_size, round_down(VMALLOC_END / 2, _REGION3_SIZE)); ++ VMALLOC_START = VMALLOC_END - vmalloc_size; + +- pages = vspace_left / (PAGE_SIZE + sizeof(struct page)); ++ /* split remaining virtual space between 1:1 mapping & vmemmap array */ ++ pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); + pages = SECTION_ALIGN_UP(pages); +- vmemmap_off = round_up(vspace_left - pages * sizeof(struct page), rte_size); +- /* keep vmemmap left most starting from a fresh region table entry */ +- vmemmap_off = min(vmemmap_off, round_up(ident_map_size, rte_size)); +- /* take care that identity map is lower then vmemmap */ +- ident_map_size = min(ident_map_size, vmemmap_off); ++ /* keep vmemmap_start aligned to a top level region table entry */ ++ vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size); ++ /* vmemmap_start is the future VMEM_MAX_PHYS, make sure it is within MAX_PHYSMEM */ ++ vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); ++ /* make sure identity map doesn't overlay with vmemmap */ ++ ident_map_size = min(ident_map_size, vmemmap_start); + vmemmap_size = 
SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page); +- VMALLOC_START = max(vmemmap_off + vmemmap_size, VMALLOC_START); +- vmemmap = (struct page *)vmemmap_off; ++ /* make sure vmemmap doesn't overlay with vmalloc area */ ++ VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); ++ vmemmap = (struct page *)vmemmap_start; + } + + /* diff --git a/queue-5.15/s390-dump-fix-copying-to-user-space-of-swapped-kdump-oldmem.patch b/queue-5.15/s390-dump-fix-copying-to-user-space-of-swapped-kdump-oldmem.patch new file mode 100644 index 00000000000..fc173afc696 --- /dev/null +++ b/queue-5.15/s390-dump-fix-copying-to-user-space-of-swapped-kdump-oldmem.patch @@ -0,0 +1,41 @@ +From 3b90954419d4c05651de9cce6d7632bcf6977678 Mon Sep 17 00:00:00 2001 +From: Alexander Egorenkov +Date: Mon, 15 Nov 2021 07:40:25 +0100 +Subject: s390/dump: fix copying to user-space of swapped kdump oldmem + +From: Alexander Egorenkov + +commit 3b90954419d4c05651de9cce6d7632bcf6977678 upstream. + +This commit fixes a bug introduced by commit e9e7870f90e3 ("s390/dump: +introduce boot data 'oldmem_data'"). +OLDMEM_BASE was mistakenly replaced by oldmem_data.size instead of +oldmem_data.start. + +This bug caused the following error during kdump: +kdump.sh[878]: No program header covering vaddr 0x3434f5245found kexec bug? 
+ +Fixes: e9e7870f90e3 ("s390/dump: introduce boot data 'oldmem_data'") +Cc: stable@vger.kernel.org # 5.15+ +Signed-off-by: Alexander Egorenkov +Reviewed-by: Marc Hartmayer +Reviewed-by: Heiko Carstens +Signed-off-by: Heiko Carstens +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/kernel/crash_dump.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/s390/kernel/crash_dump.c ++++ b/arch/s390/kernel/crash_dump.c +@@ -191,8 +191,8 @@ static int copy_oldmem_user(void __user + return rc; + } else { + /* Check for swapped kdump oldmem areas */ +- if (oldmem_data.start && from - oldmem_data.size < oldmem_data.size) { +- from -= oldmem_data.size; ++ if (oldmem_data.start && from - oldmem_data.start < oldmem_data.size) { ++ from -= oldmem_data.start; + len = min(count, oldmem_data.size - from); + } else if (oldmem_data.start && from < oldmem_data.size) { + len = min(count, oldmem_data.size - from); diff --git a/queue-5.15/s390-kexec-fix-memory-leak-of-ipl-report-buffer.patch b/queue-5.15/s390-kexec-fix-memory-leak-of-ipl-report-buffer.patch new file mode 100644 index 00000000000..e14e44f2950 --- /dev/null +++ b/queue-5.15/s390-kexec-fix-memory-leak-of-ipl-report-buffer.patch @@ -0,0 +1,85 @@ +From 4aa9340584e37debef06fa99b56d064beb723891 Mon Sep 17 00:00:00 2001 +From: Baoquan He +Date: Tue, 16 Nov 2021 11:31:01 +0800 +Subject: s390/kexec: fix memory leak of ipl report buffer + +From: Baoquan He + +commit 4aa9340584e37debef06fa99b56d064beb723891 upstream. + +unreferenced object 0x38000195000 (size 4096): + comm "kexec", pid 8548, jiffies 4294953647 (age 32443.270s) + hex dump (first 32 bytes): + 00 00 00 c8 20 00 00 00 00 00 00 c0 02 80 00 00 .... ........... + 40 40 40 40 40 40 40 40 00 00 00 00 00 00 00 00 @@@@@@@@........ 
+ backtrace: + [<0000000011a2f199>] __vmalloc_node_range+0xc0/0x140 + [<0000000081fa2752>] vzalloc+0x5a/0x70 + [<0000000063a4c92d>] ipl_report_finish+0x2c/0x180 + [<00000000553304da>] kexec_file_add_ipl_report+0xf4/0x150 + [<00000000862d033f>] kexec_file_add_components+0x124/0x160 + [<000000000d2717bb>] arch_kexec_kernel_image_load+0x62/0x90 + [<000000002e0373b6>] kimage_file_alloc_init+0x1aa/0x2e0 + [<0000000060f2d14f>] __do_sys_kexec_file_load+0x17c/0x2c0 + [<000000008c86fe5a>] __s390x_sys_kexec_file_load+0x40/0x50 + [<000000001fdb9dac>] __do_syscall+0x1bc/0x1f0 + [<000000003ee4258d>] system_call+0x78/0xa0 + +Signed-off-by: Baoquan He +Reviewed-by: Philipp Rudo +Fixes: 99feaa717e55 ("s390/kexec_file: Create ipl report and pass to next kernel") +Cc: # v5.2: 20c76e242e70: s390/kexec: fix return code handling +Cc: # v5.2 +Link: https://lore.kernel.org/r/20211116033101.GD21646@MiWiFi-R3L-srv +Signed-off-by: Heiko Carstens +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/include/asm/kexec.h | 6 ++++++ + arch/s390/kernel/machine_kexec_file.c | 10 ++++++++++ + 2 files changed, 16 insertions(+) + +--- a/arch/s390/include/asm/kexec.h ++++ b/arch/s390/include/asm/kexec.h +@@ -74,6 +74,12 @@ void *kexec_file_add_components(struct k + int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, + unsigned long addr); + ++#define ARCH_HAS_KIMAGE_ARCH ++ ++struct kimage_arch { ++ void *ipl_buf; ++}; ++ + extern const struct kexec_file_ops s390_kexec_image_ops; + extern const struct kexec_file_ops s390_kexec_elf_ops; + +--- a/arch/s390/kernel/machine_kexec_file.c ++++ b/arch/s390/kernel/machine_kexec_file.c +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -206,6 +207,7 @@ static int kexec_file_add_ipl_report(str + goto out; + buf.bufsz = data->report->size; + buf.memsz = buf.bufsz; ++ image->arch.ipl_buf = buf.buffer; + + data->memsz += buf.memsz; + +@@ -327,3 +329,11 @@ int arch_kexec_kernel_image_probe(struct + + 
return kexec_image_probe_default(image, buf, buf_len); + } ++ ++int arch_kimage_file_post_load_cleanup(struct kimage *image) ++{ ++ vfree(image->arch.ipl_buf); ++ image->arch.ipl_buf = NULL; ++ ++ return kexec_image_post_load_cleanup_default(image); ++} diff --git a/queue-5.15/s390-setup-avoid-reserving-memory-above-identity-mapping.patch b/queue-5.15/s390-setup-avoid-reserving-memory-above-identity-mapping.patch new file mode 100644 index 00000000000..9ac06b9a149 --- /dev/null +++ b/queue-5.15/s390-setup-avoid-reserving-memory-above-identity-mapping.patch @@ -0,0 +1,59 @@ +From 420f48f636b98fd685f44a3acc4c0a7c0840910d Mon Sep 17 00:00:00 2001 +From: Vasily Gorbik +Date: Thu, 14 Oct 2021 13:33:45 +0200 +Subject: s390/setup: avoid reserving memory above identity mapping + +From: Vasily Gorbik + +commit 420f48f636b98fd685f44a3acc4c0a7c0840910d upstream. + +Such reserved memory region, if not cleaned up later causes problems when +memblock_free_all() is called to release free pages to the buddy allocator +and those reserved regions are carried over to reserve_bootmem_region() +which marks the pages as PageReserved. + +Instead use memblock_set_current_limit() to make sure memblock allocations +do not go over identity mapping (which could happen when "mem=" option +is used or during kdump). 
+ +Cc: stable@vger.kernel.org +Fixes: 73045a08cf55 ("s390: unify identity mapping limits handling") +Reported-by: Gerald Schaefer +Reviewed-by: Heiko Carstens +Signed-off-by: Vasily Gorbik +Signed-off-by: Heiko Carstens +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/kernel/setup.c | 10 +--------- + 1 file changed, 1 insertion(+), 9 deletions(-) + +--- a/arch/s390/kernel/setup.c ++++ b/arch/s390/kernel/setup.c +@@ -634,14 +634,6 @@ static struct notifier_block kdump_mem_n + #endif + + /* +- * Make sure that the area above identity mapping is protected +- */ +-static void __init reserve_above_ident_map(void) +-{ +- memblock_reserve(ident_map_size, ULONG_MAX); +-} +- +-/* + * Reserve memory for kdump kernel to be loaded with kexec + */ + static void __init reserve_crashkernel(void) +@@ -1005,11 +997,11 @@ void __init setup_arch(char **cmdline_p) + setup_control_program_code(); + + /* Do some memory reservations *before* memory is added to memblock */ +- reserve_above_ident_map(); + reserve_kernel(); + reserve_initrd(); + reserve_certificate_list(); + reserve_mem_detect_info(); ++ memblock_set_current_limit(ident_map_size); + memblock_allow_resize(); + + /* Get information about *all* installed memory */ diff --git a/queue-5.15/s390-vdso-filter-out-mstack-guard-and-mstack-size.patch b/queue-5.15/s390-vdso-filter-out-mstack-guard-and-mstack-size.patch new file mode 100644 index 00000000000..28c12918f7b --- /dev/null +++ b/queue-5.15/s390-vdso-filter-out-mstack-guard-and-mstack-size.patch @@ -0,0 +1,62 @@ +From 00b55eaf45549ce26424224d069a091c7e5d8bac Mon Sep 17 00:00:00 2001 +From: Sven Schnelle +Date: Thu, 11 Nov 2021 10:58:26 +0100 +Subject: s390/vdso: filter out -mstack-guard and -mstack-size + +From: Sven Schnelle + +commit 00b55eaf45549ce26424224d069a091c7e5d8bac upstream. + +When CONFIG_VMAP_STACK is disabled, the user can enable CONFIG_STACK_CHECK, +which adds a stack overflow check to each C function in the kernel. 
This is +also done for functions in the vdso page. These functions are run in user +context and user stack sizes are usually different to what the kernel uses. +This might trigger the stack check although the stack size is valid. +Therefore filter the -mstack-guard and -mstack-size flags when compiling +vdso C files. + +Cc: stable@kernel.org # 5.10+ +Fixes: 4bff8cb54502 ("s390: convert to GENERIC_VDSO") +Reported-by: Janosch Frank +Signed-off-by: Sven Schnelle +Reviewed-by: Heiko Carstens +Signed-off-by: Heiko Carstens +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/Makefile | 10 ++++++---- + arch/s390/kernel/vdso64/Makefile | 5 +++-- + 2 files changed, 9 insertions(+), 6 deletions(-) + +--- a/arch/s390/Makefile ++++ b/arch/s390/Makefile +@@ -79,10 +79,12 @@ KBUILD_AFLAGS_DECOMPRESSOR += $(aflags-y + KBUILD_CFLAGS_DECOMPRESSOR += $(cflags-y) + + ifneq ($(call cc-option,-mstack-size=8192 -mstack-guard=128),) +-cflags-$(CONFIG_CHECK_STACK) += -mstack-size=$(STACK_SIZE) +-ifeq ($(call cc-option,-mstack-size=8192),) +-cflags-$(CONFIG_CHECK_STACK) += -mstack-guard=$(CONFIG_STACK_GUARD) +-endif ++ CC_FLAGS_CHECK_STACK := -mstack-size=$(STACK_SIZE) ++ ifeq ($(call cc-option,-mstack-size=8192),) ++ CC_FLAGS_CHECK_STACK += -mstack-guard=$(CONFIG_STACK_GUARD) ++ endif ++ export CC_FLAGS_CHECK_STACK ++ cflags-$(CONFIG_CHECK_STACK) += $(CC_FLAGS_CHECK_STACK) + endif + + ifdef CONFIG_EXPOLINE +--- a/arch/s390/kernel/vdso64/Makefile ++++ b/arch/s390/kernel/vdso64/Makefile +@@ -8,8 +8,9 @@ ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT + include $(srctree)/lib/vdso/Makefile + obj-vdso64 = vdso_user_wrapper.o note.o + obj-cvdso64 = vdso64_generic.o getcpu.o +-CFLAGS_REMOVE_getcpu.o = -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) +-CFLAGS_REMOVE_vdso64_generic.o = -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) ++VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) $(CC_FLAGS_CHECK_STACK) ++CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) ++CFLAGS_REMOVE_vdso64_generic.o = 
$(VDSO_CFLAGS_REMOVE) + + # Build rules + diff --git a/queue-5.15/scsi-qla2xxx-fix-mailbox-direction-flags-in-qla2xxx_get_adapter_id.patch b/queue-5.15/scsi-qla2xxx-fix-mailbox-direction-flags-in-qla2xxx_get_adapter_id.patch new file mode 100644 index 00000000000..2804d0a33e2 --- /dev/null +++ b/queue-5.15/scsi-qla2xxx-fix-mailbox-direction-flags-in-qla2xxx_get_adapter_id.patch @@ -0,0 +1,39 @@ +From 392006871bb26166bcfafa56faf49431c2cfaaa8 Mon Sep 17 00:00:00 2001 +From: "Ewan D. Milne" +Date: Mon, 8 Nov 2021 13:30:12 -0500 +Subject: scsi: qla2xxx: Fix mailbox direction flags in qla2xxx_get_adapter_id() + +From: Ewan D. Milne + +commit 392006871bb26166bcfafa56faf49431c2cfaaa8 upstream. + +The SCM changes set the flags in mcp->out_mb instead of mcp->in_mb so the +data was not actually being read into the mcp->mb[] array from the adapter. + +Link: https://lore.kernel.org/r/20211108183012.13895-1-emilne@redhat.com +Fixes: 9f2475fe7406 ("scsi: qla2xxx: SAN congestion management implementation") +Cc: stable@vger.kernel.org +Reviewed-by: Himanshu Madhani +Reviewed-by: Arun Easi +Signed-off-by: Ewan D. Milne +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/scsi/qla2xxx/qla_mbx.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +--- a/drivers/scsi/qla2xxx/qla_mbx.c ++++ b/drivers/scsi/qla2xxx/qla_mbx.c +@@ -1695,10 +1695,8 @@ qla2x00_get_adapter_id(scsi_qla_host_t * + mcp->in_mb |= MBX_13|MBX_12|MBX_11|MBX_10; + if (IS_FWI2_CAPABLE(vha->hw)) + mcp->in_mb |= MBX_19|MBX_18|MBX_17|MBX_16; +- if (IS_QLA27XX(vha->hw) || IS_QLA28XX(vha->hw)) { +- mcp->in_mb |= MBX_15; +- mcp->out_mb |= MBX_7|MBX_21|MBX_22|MBX_23; +- } ++ if (IS_QLA27XX(vha->hw) || IS_QLA28XX(vha->hw)) ++ mcp->in_mb |= MBX_15|MBX_21|MBX_22|MBX_23; + + mcp->tov = MBX_TOV_SECONDS; + mcp->flags = 0; diff --git a/queue-5.15/series b/queue-5.15/series index 1d51243d00c..d080d9d1978 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -188,3 +188,38 @@ s390-kexec-fix-return-code-handling.patch blk-cgroup-fix-missing-put-device-in-error-path-from.patch dmaengine-remove-debugfs-ifdef.patch tun-fix-bonding-active-backup-with-arp-monitoring.patch +revert-mark-pstore-blk-as-broken.patch +pstore-blk-use-lu-to-format-unsigned-long.patch +hexagon-export-raw-i-o-routines-for-modules.patch +hexagon-clean-up-timer-regs.h.patch +tipc-check-for-null-after-calling-kmemdup.patch +ipc-warn-if-trying-to-remove-ipc-object-which-is-absent.patch +shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch +mm-kmemleak-slob-respect-slab_noleaktrace-flag.patch +hugetlb-userfaultfd-fix-reservation-restore-on-userfaultfd-error.patch +kmap_local-don-t-assume-kmap-ptes-are-linear-arrays-in-memory.patch +mm-damon-dbgfs-use-__gfp_nowarn-for-user-specified-size-buffer-allocation.patch +mm-damon-dbgfs-fix-missed-use-of-damon_dbgfs_lock.patch +x86-boot-pull-up-cmdline-preparation-and-early-param-parsing.patch +x86-sgx-fix-free-page-accounting.patch +x86-hyperv-fix-null-deref-in-set_hv_tscchange_cb-if-hyper-v-setup-fails.patch +kvm-x86-assume-a-64-bit-hypercall-for-guests-with-protected-state.patch 
+kvm-x86-fix-uninitialized-eoi_exit_bitmap-usage-in-vcpu_load_eoi_exitmap.patch +kvm-x86-mmu-include-efer.lma-in-extended-mmu-role.patch +kvm-x86-xen-fix-get_attr-of-kvm_xen_attr_type_shared_info.patch +powerpc-signal32-fix-sigset_t-copy.patch +powerpc-xive-change-irq-domain-to-a-tree-domain.patch +powerpc-8xx-fix-pinned-tlbs-with-config_strict_kernel_rwx.patch +revert-drm-i915-tgl-dsi-gate-the-ddi-clocks-after-pll-mapping.patch +revert-parisc-reduce-sigreturn-trampoline-to-3-instructions.patch +ata-libata-improve-ata_read_log_page-error-message.patch +ata-libata-add-missing-ata_identify_page_supported-calls.patch +scsi-qla2xxx-fix-mailbox-direction-flags-in-qla2xxx_get_adapter_id.patch +pinctrl-ralink-include-ralink_regs.h-in-pinctrl-mt7620.c.patch +s390-setup-avoid-reserving-memory-above-identity-mapping.patch +s390-boot-simplify-and-fix-kernel-memory-layout-setup.patch +s390-vdso-filter-out-mstack-guard-and-mstack-size.patch +s390-kexec-fix-memory-leak-of-ipl-report-buffer.patch +s390-dump-fix-copying-to-user-space-of-swapped-kdump-oldmem.patch +block-check-admin-before-nice-for-ioprio_class_rt.patch +fbdev-prevent-probing-generic-drivers-if-a-fb-is-already-registered.patch diff --git a/queue-5.15/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch b/queue-5.15/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch new file mode 100644 index 00000000000..c0625ea4cd9 --- /dev/null +++ b/queue-5.15/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch @@ -0,0 +1,386 @@ +From 85b6d24646e4125c591639841169baa98a2da503 Mon Sep 17 00:00:00 2001 +From: Alexander Mikhalitsyn +Date: Fri, 19 Nov 2021 16:43:21 -0800 +Subject: shm: extend forced shm destroy to support objects from several IPC nses + +From: Alexander Mikhalitsyn + +commit 85b6d24646e4125c591639841169baa98a2da503 upstream. 
+ +Currently, the exit_shm() function is not designed to work properly when +task->sysvshm.shm_clist holds shm objects from different IPC namespaces. + +This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it +leads to use-after-free (reproducer exists). + +This is an attempt to fix the problem by extending exit_shm mechanism to +handle shm's destroy from several IPC ns'es. + +To achieve that we do several things: + +1. add a namespace (non-refcounted) pointer to the struct shmid_kernel + +2. during new shm object creation (newseg()/shmget syscall) we + initialize this pointer by current task IPC ns + +3. exit_shm() is fully reworked such that it traverses over all shp's in + task->sysvshm.shm_clist and gets IPC namespace not from current task + as it was before but from shp's object itself, then call + shm_destroy(shp, ns). + +Note: We need to be really careful here, because as it was said before +(1), our pointer to IPC ns is non-refcounted. To be on the safe side we +use the special helper get_ipc_ns_not_zero(), which takes an IPC ns +reference only if the IPC ns is not in the "state of destruction". + +Q/A + +Q: Why can we access shp->ns memory using non-refcounted pointer? +A: Because shp object lifetime is always shorter than IPC namespace + lifetime, so, if we get shp object from the task->sysvshm.shm_clist + while holding task_lock(task) nobody can steal our namespace. + +Q: Does this patch change semantics of unshare/setns/clone syscalls? +A: No. It just fixes the non-covered case where a process may leave an IPC + namespace without getting task->sysvshm.shm_clist list cleaned up. + +Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com +Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com +Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity") +Co-developed-by: Manfred Spraul +Signed-off-by: Manfred Spraul +Signed-off-by: Alexander Mikhalitsyn +Cc: "Eric W. 
Biederman" +Cc: Davidlohr Bueso +Cc: Greg KH +Cc: Andrei Vagin +Cc: Pavel Tikhomirov +Cc: Vasily Averin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/ipc_namespace.h | 15 +++ + include/linux/sched/task.h | 2 + ipc/shm.c | 189 +++++++++++++++++++++++++++++++----------- + 3 files changed, 159 insertions(+), 47 deletions(-) + +--- a/include/linux/ipc_namespace.h ++++ b/include/linux/ipc_namespace.h +@@ -131,6 +131,16 @@ static inline struct ipc_namespace *get_ + return ns; + } + ++static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) ++{ ++ if (ns) { ++ if (refcount_inc_not_zero(&ns->ns.count)) ++ return ns; ++ } ++ ++ return NULL; ++} ++ + extern void put_ipc_ns(struct ipc_namespace *ns); + #else + static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +@@ -146,6 +156,11 @@ static inline struct ipc_namespace *get_ + { + return ns; + } ++ ++static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) ++{ ++ return ns; ++} + + static inline void put_ipc_ns(struct ipc_namespace *ns) + { +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -158,7 +158,7 @@ static inline struct vm_struct *task_sta + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring + * subscriptions and synchronises with wait4(). Also used in procfs. Also + * pins the final release of task.io_context. Also protects ->cpuset and +- * ->cgroup.subsys[]. And ->vfork_done. ++ * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. + * + * Nests both inside and outside of read_lock(&tasklist_lock). + * It must not be nested with write_lock_irq(&tasklist_lock), +--- a/ipc/shm.c ++++ b/ipc/shm.c +@@ -62,9 +62,18 @@ struct shmid_kernel /* private to the ke + struct pid *shm_lprid; + struct ucounts *mlock_ucounts; + +- /* The task created the shm object. NULL if the task is dead. 
*/ ++ /* ++ * The task created the shm object, for ++ * task_lock(shp->shm_creator) ++ */ + struct task_struct *shm_creator; +- struct list_head shm_clist; /* list by creator */ ++ ++ /* ++ * List by creator. task_lock(->shm_creator) required for read/write. ++ * If list_empty(), then the creator is dead already. ++ */ ++ struct list_head shm_clist; ++ struct ipc_namespace *ns; + } __randomize_layout; + + /* shm_mode upper byte flags */ +@@ -115,6 +124,7 @@ static void do_shm_rmid(struct ipc_names + struct shmid_kernel *shp; + + shp = container_of(ipcp, struct shmid_kernel, shm_perm); ++ WARN_ON(ns != shp->ns); + + if (shp->shm_nattch) { + shp->shm_perm.mode |= SHM_DEST; +@@ -225,10 +235,43 @@ static void shm_rcu_free(struct rcu_head + kfree(shp); + } + +-static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) ++/* ++ * It has to be called with shp locked. ++ * It must be called before ipc_rmid() ++ */ ++static inline void shm_clist_rm(struct shmid_kernel *shp) ++{ ++ struct task_struct *creator; ++ ++ /* ensure that shm_creator does not disappear */ ++ rcu_read_lock(); ++ ++ /* ++ * A concurrent exit_shm may do a list_del_init() as well. ++ * Just do nothing if exit_shm already did the work ++ */ ++ if (!list_empty(&shp->shm_clist)) { ++ /* ++ * shp->shm_creator is guaranteed to be valid *only* ++ * if shp->shm_clist is not empty. ++ */ ++ creator = shp->shm_creator; ++ ++ task_lock(creator); ++ /* ++ * list_del_init() is a nop if the entry was already removed ++ * from the list. 
++ */ ++ list_del_init(&shp->shm_clist); ++ task_unlock(creator); ++ } ++ rcu_read_unlock(); ++} ++ ++static inline void shm_rmid(struct shmid_kernel *s) + { +- list_del(&s->shm_clist); +- ipc_rmid(&shm_ids(ns), &s->shm_perm); ++ shm_clist_rm(s); ++ ipc_rmid(&shm_ids(s->ns), &s->shm_perm); + } + + +@@ -283,7 +326,7 @@ static void shm_destroy(struct ipc_names + shm_file = shp->shm_file; + shp->shm_file = NULL; + ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; +- shm_rmid(ns, shp); ++ shm_rmid(shp); + shm_unlock(shp); + if (!is_file_hugepages(shm_file)) + shmem_lock(shm_file, 0, shp->mlock_ucounts); +@@ -306,10 +349,10 @@ static void shm_destroy(struct ipc_names + * + * 2) sysctl kernel.shm_rmid_forced is set to 1. + */ +-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) ++static bool shm_may_destroy(struct shmid_kernel *shp) + { + return (shp->shm_nattch == 0) && +- (ns->shm_rmid_forced || ++ (shp->ns->shm_rmid_forced || + (shp->shm_perm.mode & SHM_DEST)); + } + +@@ -340,7 +383,7 @@ static void shm_close(struct vm_area_str + ipc_update_pid(&shp->shm_lprid, task_tgid(current)); + shp->shm_dtim = ktime_get_real_seconds(); + shp->shm_nattch--; +- if (shm_may_destroy(ns, shp)) ++ if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); +@@ -361,10 +404,10 @@ static int shm_try_destroy_orphaned(int + * + * As shp->* are changed under rwsem, it's safe to skip shp locking. 
+ */ +- if (shp->shm_creator != NULL) ++ if (!list_empty(&shp->shm_clist)) + return 0; + +- if (shm_may_destroy(ns, shp)) { ++ if (shm_may_destroy(shp)) { + shm_lock_by_ptr(shp); + shm_destroy(ns, shp); + } +@@ -382,48 +425,97 @@ void shm_destroy_orphaned(struct ipc_nam + /* Locking assumes this will only be called with task == current */ + void exit_shm(struct task_struct *task) + { +- struct ipc_namespace *ns = task->nsproxy->ipc_ns; +- struct shmid_kernel *shp, *n; ++ for (;;) { ++ struct shmid_kernel *shp; ++ struct ipc_namespace *ns; + +- if (list_empty(&task->sysvshm.shm_clist)) +- return; ++ task_lock(task); ++ ++ if (list_empty(&task->sysvshm.shm_clist)) { ++ task_unlock(task); ++ break; ++ } ++ ++ shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, ++ shm_clist); + +- /* +- * If kernel.shm_rmid_forced is not set then only keep track of +- * which shmids are orphaned, so that a later set of the sysctl +- * can clean them up. +- */ +- if (!ns->shm_rmid_forced) { +- down_read(&shm_ids(ns).rwsem); +- list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist) +- shp->shm_creator = NULL; + /* +- * Only under read lock but we are only called on current +- * so no entry on the list will be shared. ++ * 1) Get pointer to the ipc namespace. It is worth to say ++ * that this pointer is guaranteed to be valid because ++ * shp lifetime is always shorter than namespace lifetime ++ * in which shp lives. ++ * We taken task_lock it means that shp won't be freed. + */ +- list_del(&task->sysvshm.shm_clist); +- up_read(&shm_ids(ns).rwsem); +- return; +- } ++ ns = shp->ns; + +- /* +- * Destroy all already created segments, that were not yet mapped, +- * and mark any mapped as orphan to cover the sysctl toggling. +- * Destroy is skipped if shm_may_destroy() returns false. 
+- */ +- down_write(&shm_ids(ns).rwsem); +- list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { +- shp->shm_creator = NULL; ++ /* ++ * 2) If kernel.shm_rmid_forced is not set then only keep track of ++ * which shmids are orphaned, so that a later set of the sysctl ++ * can clean them up. ++ */ ++ if (!ns->shm_rmid_forced) ++ goto unlink_continue; + +- if (shm_may_destroy(ns, shp)) { +- shm_lock_by_ptr(shp); +- shm_destroy(ns, shp); ++ /* ++ * 3) get a reference to the namespace. ++ * The refcount could be already 0. If it is 0, then ++ * the shm objects will be free by free_ipc_work(). ++ */ ++ ns = get_ipc_ns_not_zero(ns); ++ if (!ns) { ++unlink_continue: ++ list_del_init(&shp->shm_clist); ++ task_unlock(task); ++ continue; + } +- } + +- /* Remove the list head from any segments still attached. */ +- list_del(&task->sysvshm.shm_clist); +- up_write(&shm_ids(ns).rwsem); ++ /* ++ * 4) get a reference to shp. ++ * This cannot fail: shm_clist_rm() is called before ++ * ipc_rmid(), thus the refcount cannot be 0. ++ */ ++ WARN_ON(!ipc_rcu_getref(&shp->shm_perm)); ++ ++ /* ++ * 5) unlink the shm segment from the list of segments ++ * created by current. ++ * This must be done last. After unlinking, ++ * only the refcounts obtained above prevent IPC_RMID ++ * from destroying the segment or the namespace. ++ */ ++ list_del_init(&shp->shm_clist); ++ ++ task_unlock(task); ++ ++ /* ++ * 6) we have all references ++ * Thus lock & if needed destroy shp. ++ */ ++ down_write(&shm_ids(ns).rwsem); ++ shm_lock_by_ptr(shp); ++ /* ++ * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's ++ * safe to call ipc_rcu_putref here ++ */ ++ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); ++ ++ if (ipc_valid_object(&shp->shm_perm)) { ++ if (shm_may_destroy(shp)) ++ shm_destroy(ns, shp); ++ else ++ shm_unlock(shp); ++ } else { ++ /* ++ * Someone else deleted the shp from namespace ++ * idr/kht while we have waited. ++ * Just unlock and continue. 
++ */ ++ shm_unlock(shp); ++ } ++ ++ up_write(&shm_ids(ns).rwsem); ++ put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */ ++ } + } + + static vm_fault_t shm_fault(struct vm_fault *vmf) +@@ -680,7 +772,11 @@ static int newseg(struct ipc_namespace * + if (error < 0) + goto no_id; + ++ shp->ns = ns; ++ ++ task_lock(current); + list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); ++ task_unlock(current); + + /* + * shmid gets reported as "inode#" in /proc/pid/maps. +@@ -1573,7 +1669,8 @@ out_nattch: + down_write(&shm_ids(ns).rwsem); + shp = shm_lock(ns, shmid); + shp->shm_nattch--; +- if (shm_may_destroy(ns, shp)) ++ ++ if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); diff --git a/queue-5.15/tipc-check-for-null-after-calling-kmemdup.patch b/queue-5.15/tipc-check-for-null-after-calling-kmemdup.patch new file mode 100644 index 00000000000..5d04eec3add --- /dev/null +++ b/queue-5.15/tipc-check-for-null-after-calling-kmemdup.patch @@ -0,0 +1,42 @@ +From 3e6db079751afd527bf3db32314ae938dc571916 Mon Sep 17 00:00:00 2001 +From: Tadeusz Struk +Date: Mon, 15 Nov 2021 08:01:43 -0800 +Subject: tipc: check for null after calling kmemdup + +From: Tadeusz Struk + +commit 3e6db079751afd527bf3db32314ae938dc571916 upstream. + +kmemdup can return a null pointer so need to check for it, otherwise +the null key will be dereferenced later in tipc_crypto_key_xmit as +can be seen in the trace [1]. 
+ +Cc: tipc-discussion@lists.sourceforge.net +Cc: stable@vger.kernel.org # 5.15, 5.14, 5.10 + +[1] https://syzkaller.appspot.com/bug?id=bca180abb29567b189efdbdb34cbf7ba851c2a58 + +Reported-by: Dmitry Vyukov +Signed-off-by: Tadeusz Struk +Acked-by: Ying Xue +Acked-by: Jon Maloy +Link: https://lore.kernel.org/r/20211115160143.5099-1-tadeusz.struk@linaro.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/tipc/crypto.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/net/tipc/crypto.c ++++ b/net/tipc/crypto.c +@@ -597,6 +597,10 @@ static int tipc_aead_init(struct tipc_ae + tmp->cloned = NULL; + tmp->authsize = TIPC_AES_GCM_TAG_SIZE; + tmp->key = kmemdup(ukey, tipc_aead_key_size(ukey), GFP_KERNEL); ++ if (!tmp->key) { ++ tipc_aead_free(&tmp->rcu); ++ return -ENOMEM; ++ } + memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE); + atomic_set(&tmp->users, 0); + atomic64_set(&tmp->seqno, 0); diff --git a/queue-5.15/x86-boot-pull-up-cmdline-preparation-and-early-param-parsing.patch b/queue-5.15/x86-boot-pull-up-cmdline-preparation-and-early-param-parsing.patch new file mode 100644 index 00000000000..cbeff6d44a8 --- /dev/null +++ b/queue-5.15/x86-boot-pull-up-cmdline-preparation-and-early-param-parsing.patch @@ -0,0 +1,137 @@ +From 8d48bf8206f77aa8687f0e241e901e5197e52423 Mon Sep 17 00:00:00 2001 +From: Borislav Petkov +Date: Fri, 5 Nov 2021 10:41:51 +0100 +Subject: x86/boot: Pull up cmdline preparation and early param parsing + +From: Borislav Petkov + +commit 8d48bf8206f77aa8687f0e241e901e5197e52423 upstream. + +Dan reports that Anjaneya Chagam can no longer use the efi=nosoftreserve +kernel command line parameter to suppress "soft reservation" behavior. 
+ +This is due to the fact that the following call-chain happens at boot: + +early_reserve_memory +|-> efi_memblock_x86_reserve_range + |-> efi_fake_memmap_early + +which does + + if (!efi_soft_reserve_enabled()) + return; + +and that would have set EFI_MEM_NO_SOFT_RESERVE after having parsed +"nosoftreserve". + +However, parse_early_param() gets called *after* it, leading to the boot +cmdline not being taken into account. + +Therefore, carve out the command line preparation into a separate +function which does the early param parsing too. So that it all goes +together. + +And then call that function before early_reserve_memory() so that the +params would have been parsed by then. + +Fixes: 8aa83e6395ce ("x86/setup: Call early_reserve_memory() earlier") +Reported-by: Dan Williams +Reviewed-by: Dan Williams +Signed-off-by: Borislav Petkov +Tested-by: Anjaneya Chagam +Cc: +Link: https://lore.kernel.org/r/e8dd8993c38702ee6dd73b3c11f158617e665607.camel@intel.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/setup.c | 66 ++++++++++++++++++++++++++++-------------------- + 1 file changed, 39 insertions(+), 27 deletions(-) + +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -742,6 +742,28 @@ dump_kernel_offset(struct notifier_block + return 0; + } + ++static char *prepare_command_line(void) ++{ ++#ifdef CONFIG_CMDLINE_BOOL ++#ifdef CONFIG_CMDLINE_OVERRIDE ++ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); ++#else ++ if (builtin_cmdline[0]) { ++ /* append boot loader cmdline to builtin */ ++ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); ++ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); ++ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); ++ } ++#endif ++#endif ++ ++ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); ++ ++ parse_early_param(); ++ ++ return command_line; ++} ++ + /* + * Determine if we were loaded by an EFI loader. 
If so, then we have also been + * passed the efi memmap, systab, etc., so we should use these data structures +@@ -831,6 +853,23 @@ void __init setup_arch(char **cmdline_p) + x86_init.oem.arch_setup(); + + /* ++ * x86_configure_nx() is called before parse_early_param() (called by ++ * prepare_command_line()) to detect whether hardware doesn't support ++ * NX (so that the early EHCI debug console setup can safely call ++ * set_fixmap()). It may then be called again from within noexec_setup() ++ * during parsing early parameters to honor the respective command line ++ * option. ++ */ ++ x86_configure_nx(); ++ ++ /* ++ * This parses early params and it needs to run before ++ * early_reserve_memory() because latter relies on such settings ++ * supplied as early params. ++ */ ++ *cmdline_p = prepare_command_line(); ++ ++ /* + * Do some memory reservations *before* memory is added to memblock, so + * memblock allocations won't overwrite it. + * +@@ -863,33 +902,6 @@ void __init setup_arch(char **cmdline_p) + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; + +-#ifdef CONFIG_CMDLINE_BOOL +-#ifdef CONFIG_CMDLINE_OVERRIDE +- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +-#else +- if (builtin_cmdline[0]) { +- /* append boot loader cmdline to builtin */ +- strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); +- strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); +- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +- } +-#endif +-#endif +- +- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); +- *cmdline_p = command_line; +- +- /* +- * x86_configure_nx() is called before parse_early_param() to detect +- * whether hardware doesn't support NX (so that the early EHCI debug +- * console setup can safely call set_fixmap()). It may then be called +- * again from within noexec_setup() during parsing early parameters +- * to honor the respective command line option. 
+- */ +- x86_configure_nx(); +- +- parse_early_param(); +- + #ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux diff --git a/queue-5.15/x86-hyperv-fix-null-deref-in-set_hv_tscchange_cb-if-hyper-v-setup-fails.patch b/queue-5.15/x86-hyperv-fix-null-deref-in-set_hv_tscchange_cb-if-hyper-v-setup-fails.patch new file mode 100644 index 00000000000..c7bfdd1e8b9 --- /dev/null +++ b/queue-5.15/x86-hyperv-fix-null-deref-in-set_hv_tscchange_cb-if-hyper-v-setup-fails.patch @@ -0,0 +1,57 @@ +From daf972118c517b91f74ff1731417feb4270625a4 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 4 Nov 2021 18:22:38 +0000 +Subject: x86/hyperv: Fix NULL deref in set_hv_tscchange_cb() if Hyper-V setup fails + +From: Sean Christopherson + +commit daf972118c517b91f74ff1731417feb4270625a4 upstream. + +Check for a valid hv_vp_index array prior to dereferencing hv_vp_index when +setting Hyper-V's TSC change callback. If Hyper-V setup failed in +hyperv_init(), the kernel will still report that it's running under +Hyper-V, but will have silently disabled nearly all functionality. + + BUG: kernel NULL pointer dereference, address: 0000000000000010 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 0 P4D 0 + Oops: 0000 [#1] SMP + CPU: 4 PID: 1 Comm: swapper/0 Not tainted 5.15.0-rc2+ #75 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 + RIP: 0010:set_hv_tscchange_cb+0x15/0xa0 + Code: <8b> 04 82 8b 15 12 17 85 01 48 c1 e0 20 48 0d ee 00 01 00 f6 c6 08 + ...
+ Call Trace: + kvm_arch_init+0x17c/0x280 + kvm_init+0x31/0x330 + vmx_init+0xba/0x13a + do_one_initcall+0x41/0x1c0 + kernel_init_freeable+0x1f2/0x23b + kernel_init+0x16/0x120 + ret_from_fork+0x22/0x30 + +Fixes: 93286261de1b ("x86/hyperv: Reenlightenment notifications support") +Cc: stable@vger.kernel.org +Cc: Vitaly Kuznetsov +Signed-off-by: Sean Christopherson +Reviewed-by: Vitaly Kuznetsov +Link: https://lore.kernel.org/r/20211104182239.1302956-2-seanjc@google.com +Signed-off-by: Wei Liu +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/hyperv/hv_init.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/x86/hyperv/hv_init.c ++++ b/arch/x86/hyperv/hv_init.c +@@ -147,6 +147,9 @@ void set_hv_tscchange_cb(void (*cb)(void + return; + } + ++ if (!hv_vp_index) ++ return; ++ + hv_reenlightenment_cb = cb; + + /* Make sure callback is registered before we write to MSRs */ diff --git a/queue-5.15/x86-sgx-fix-free-page-accounting.patch b/queue-5.15/x86-sgx-fix-free-page-accounting.patch new file mode 100644 index 00000000000..a1d6a8ef1e1 --- /dev/null +++ b/queue-5.15/x86-sgx-fix-free-page-accounting.patch @@ -0,0 +1,167 @@ +From ac5d272a0ad0419f52e08c91953356e32b075af7 Mon Sep 17 00:00:00 2001 +From: Reinette Chatre +Date: Mon, 15 Nov 2021 11:29:04 -0800 +Subject: x86/sgx: Fix free page accounting + +From: Reinette Chatre + +commit ac5d272a0ad0419f52e08c91953356e32b075af7 upstream. + +The SGX driver maintains a single global free page counter, +sgx_nr_free_pages, that reflects the number of free pages available +across all NUMA nodes. Correspondingly, a list of free pages is +associated with each NUMA node and sgx_nr_free_pages is updated +every time a page is added or removed from any of the free page +lists. The main usage of sgx_nr_free_pages is by the reclaimer +that runs when it (sgx_nr_free_pages) goes below a watermark +to ensure that there are always some free pages available to, for +example, support efficient page faults. 
+ +With sgx_nr_free_pages accessed and modified from a few places +it is essential to ensure that these accesses are done safely but +this is not the case. sgx_nr_free_pages is read without any +protection and updated with inconsistent protection by any one +of the spin locks associated with the individual NUMA nodes. +For example: + + CPU_A CPU_B + ----- ----- + spin_lock(&nodeA->lock); spin_lock(&nodeB->lock); + ... ... + sgx_nr_free_pages--; /* NOT SAFE */ sgx_nr_free_pages--; + + spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock); + +Since sgx_nr_free_pages may be protected by different spin locks +while being modified from different CPUs, the following scenario +is possible: + + CPU_A CPU_B + ----- ----- +{sgx_nr_free_pages = 100} + spin_lock(&nodeA->lock); spin_lock(&nodeB->lock); + sgx_nr_free_pages--; sgx_nr_free_pages--; + /* LOAD sgx_nr_free_pages = 100 */ /* LOAD sgx_nr_free_pages = 100 */ + /* sgx_nr_free_pages-- */ /* sgx_nr_free_pages-- */ + /* STORE sgx_nr_free_pages = 99 */ /* STORE sgx_nr_free_pages = 99 */ + spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock); + +In the above scenario, sgx_nr_free_pages is decremented from two CPUs +but instead of sgx_nr_free_pages ending with a value that is two less +than it started with, it was only decremented by one while the number +of free pages were actually reduced by two. The consequence of +sgx_nr_free_pages not being protected is that its value may not +accurately reflect the actual number of free pages on the system, +impacting the availability of free pages in support of many flows. + +The problematic scenario is when the reclaimer does not run because it +believes there to be sufficient free pages while any attempt to allocate +a page fails because there are no free pages available. 
In the SGX driver +the reclaimer's watermark is only 32 pages so after encountering the +above example scenario 32 times a user space hang is possible when there +are no more free pages because of repeated page faults caused by no +free pages made available. + +The following flow was encountered: +asm_exc_page_fault + ... + sgx_vma_fault() + sgx_encl_load_page() + sgx_encl_eldu() // Encrypted page needs to be loaded from backing + // storage into newly allocated SGX memory page + sgx_alloc_epc_page() // Allocate a page of SGX memory + __sgx_alloc_epc_page() // Fails, no free SGX memory + ... + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) // Wake reclaimer + wake_up(&ksgxd_waitq); + return -EBUSY; // Return -EBUSY giving reclaimer time to run + return -EBUSY; + return -EBUSY; + return VM_FAULT_NOPAGE; + +The reclaimer is triggered in above flow with the following code: + +static bool sgx_should_reclaim(unsigned long watermark) +{ + return sgx_nr_free_pages < watermark && + !list_empty(&sgx_active_page_list); +} + +In the problematic scenario there were no free pages available yet the +value of sgx_nr_free_pages was above the watermark. The allocation of +SGX memory thus always failed because of a lack of free pages while no +free pages were made available because the reclaimer is never started +because of sgx_nr_free_pages' incorrect value. The consequence was that +user space kept encountering VM_FAULT_NOPAGE that caused the same +address to be accessed repeatedly with the same result. + +Change the global free page counter to an atomic type that +ensures simultaneous updates are done safely. While doing so, move +the updating of the variable outside of the spin lock critical +section to which it does not belong. 
+ +Cc: stable@vger.kernel.org +Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()") +Suggested-by: Dave Hansen +Signed-off-by: Reinette Chatre +Signed-off-by: Dave Hansen +Reviewed-by: Tony Luck +Acked-by: Jarkko Sakkinen +Link: https://lkml.kernel.org/r/a95a40743bbd3f795b465f30922dde7f1ea9e0eb.1637004094.git.reinette.chatre@intel.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/cpu/sgx/main.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/arch/x86/kernel/cpu/sgx/main.c ++++ b/arch/x86/kernel/cpu/sgx/main.c +@@ -28,8 +28,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_wai + static LIST_HEAD(sgx_active_page_list); + static DEFINE_SPINLOCK(sgx_reclaimer_lock); + +-/* The free page list lock protected variables prepend the lock. */ +-static unsigned long sgx_nr_free_pages; ++static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); + + /* Nodes with one or more EPC sections. */ + static nodemask_t sgx_numa_mask; +@@ -403,14 +402,15 @@ skip: + + spin_lock(&node->lock); + list_add_tail(&epc_page->list, &node->free_page_list); +- sgx_nr_free_pages++; + spin_unlock(&node->lock); ++ atomic_long_inc(&sgx_nr_free_pages); + } + } + + static bool sgx_should_reclaim(unsigned long watermark) + { +- return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); ++ return atomic_long_read(&sgx_nr_free_pages) < watermark && ++ !list_empty(&sgx_active_page_list); + } + + static int ksgxd(void *p) +@@ -471,9 +471,9 @@ static struct sgx_epc_page *__sgx_alloc_ + + page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); + list_del_init(&page->list); +- sgx_nr_free_pages--; + + spin_unlock(&node->lock); ++ atomic_long_dec(&sgx_nr_free_pages); + + return page; + } +@@ -625,9 +625,9 @@ void sgx_free_epc_page(struct sgx_epc_pa + spin_lock(&node->lock); + + list_add_tail(&page->list, &node->free_page_list); +- sgx_nr_free_pages++; + + spin_unlock(&node->lock); ++ 
atomic_long_inc(&sgx_nr_free_pages); + } + + static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,