From: Greg Kroah-Hartman
Date: Sun, 16 May 2021 09:32:50 +0000 (+0200)
Subject: 5.11-stable patches
X-Git-Tag: v5.4.120~78
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=523e03212e8698e42db15fc5c5f6ca71edba935b;p=thirdparty%2Fkernel%2Fstable-queue.git

5.11-stable patches

added patches:
	arc-entry-fix-off-by-one-error-in-syscall-number-validation.patch
	arc-mm-pae-use-40-bit-physical-page-mask.patch
	arc-mm-use-max_high_pfn-as-a-highmem-zone-border.patch
	arm64-fix-race-condition-on-pg_dcache_clean-in-__sync_icache_dcache.patch
	arm64-mte-initialize-rgsr_el1.seed-in-__cpu_setup.patch
	blk-iocost-fix-weight-updates-of-inner-active-iocgs.patch
	btrfs-fix-deadlock-when-cloning-inline-extents-and-using-qgroups.patch
	btrfs-fix-race-leading-to-unpersisted-data-and-metadata-on-fsync.patch
	drm-amd-display-initialize-attribute-for-hdcp_srm-sysfs-file.patch
	drm-i915-avoid-div-by-zero-on-gen2.patch
	drm-radeon-dpm-disable-sclk-switching-on-oland-when-two-4k-60hz-monitors-are-connected.patch
	hfsplus-prevent-corruption-in-shrinking-truncate.patch
	kasan-fix-unit-tests-with-config_ubsan_local_bounds-enabled.patch
	kvm-exit-halt-polling-on-need_resched-as-well.patch
	mm-hugetlb-fix-f_seal_future_write.patch
	powerpc-64s-fix-crashes-when-toggling-entry-flush-barrier.patch
	powerpc-64s-fix-crashes-when-toggling-stf-barrier.patch
	sh-remove-unused-variable.patch
	squashfs-fix-divide-error-in-calculate_skip.patch
	userfaultfd-release-page-in-error-path-to-avoid-bug_on.patch
	x86-sched-fix-the-amd-cppc-maximum-performance-value-on-certain-amd-ryzen-generations.patch
---

diff --git a/queue-5.11/arc-entry-fix-off-by-one-error-in-syscall-number-validation.patch b/queue-5.11/arc-entry-fix-off-by-one-error-in-syscall-number-validation.patch
new file mode 100644
index 00000000000..1790674df64
--- /dev/null
+++ b/queue-5.11/arc-entry-fix-off-by-one-error-in-syscall-number-validation.patch
@@ -0,0 +1,51 @@
+From 3433adc8bd09fc9f29b8baddf33b4ecd1ecd2cdc Mon Sep 17 00:00:00 2001
+From: Vineet Gupta
+Date: Fri, 23 Apr 2021 12:16:25 -0700
+Subject: ARC: entry: fix off-by-one error in syscall number validation
+
+From: Vineet Gupta
+
+commit 3433adc8bd09fc9f29b8baddf33b4ecd1ecd2cdc upstream.
+
+We have NR_syscalls syscalls from [0 .. NR_syscalls-1].
+However the check for an invalid syscall number is "> NR_syscalls" as
+opposed to >=. This off-by-one error erroneously allows "NR_syscalls"
+to be treated as a valid syscall, causing out-of-bounds access into
+the syscall table and an ensuing crash (holes within the syscall table
+have an invalid-entry handler, but this is beyond the array
+implementing the table).
+
+This problem showed up on the v5.6 kernel when testing glibc 2.33 (v5.10
+kernel capable, including faccessat2 syscall 439). The v5.6 kernel has
+NR_syscalls=439 (0 to 438). Due to the bug, 439 passed by glibc was
+not handled as -ENOSYS but processed, leading to a crash.
+
+Link: https://github.com/foss-for-synopsys-dwc-arc-processors/linux/issues/48
+Reported-by: Shahab Vahedi
+Cc:
+Signed-off-by: Vineet Gupta
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/arc/kernel/entry.S | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/arc/kernel/entry.S
++++ b/arch/arc/kernel/entry.S
+@@ -177,7 +177,7 @@ tracesys:
+
+ 	; Do the Sys Call as we normally would.
+ ; Validate the Sys Call number +- cmp r8, NR_syscalls ++ cmp r8, NR_syscalls - 1 + mov.hi r0, -ENOSYS + bhi tracesys_exit + +@@ -255,7 +255,7 @@ ENTRY(EV_Trap) + ;============ Normal syscall case + + ; syscall num shd not exceed the total system calls avail +- cmp r8, NR_syscalls ++ cmp r8, NR_syscalls - 1 + mov.hi r0, -ENOSYS + bhi .Lret_from_system_call + diff --git a/queue-5.11/arc-mm-pae-use-40-bit-physical-page-mask.patch b/queue-5.11/arc-mm-pae-use-40-bit-physical-page-mask.patch new file mode 100644 index 00000000000..17e5bfb356b --- /dev/null +++ b/queue-5.11/arc-mm-pae-use-40-bit-physical-page-mask.patch @@ -0,0 +1,133 @@ +From c5f756d8c6265ebb1736a7787231f010a3b782e5 Mon Sep 17 00:00:00 2001 +From: Vladimir Isaev +Date: Tue, 27 Apr 2021 15:12:37 +0300 +Subject: ARC: mm: PAE: use 40-bit physical page mask + +From: Vladimir Isaev + +commit c5f756d8c6265ebb1736a7787231f010a3b782e5 upstream. + +32-bit PAGE_MASK can not be used as a mask for physical addresses +when PAE is enabled. PAGE_MASK_PHYS must be used for physical +addresses instead of PAGE_MASK. + +Without this, init gets SIGSEGV if pte_modify was called: + +| potentially unexpected fatal signal 11. +| Path: /bin/busybox +| CPU: 0 PID: 1 Comm: init Not tainted 5.12.0-rc5-00003-g1e43c377a79f-dirty +| Insn could not be fetched +| @No matching VMA found +| ECR: 0x00040000 EFA: 0x00000000 ERET: 0x00000000 +| STAT: 0x80080082 [IE U ] BTA: 0x00000000 +| SP: 0x5f9ffe44 FP: 0x00000000 BLK: 0xaf3d4 +| LPS: 0x000d093e LPE: 0x000d0950 LPC: 0x00000000 +| r00: 0x00000002 r01: 0x5f9fff14 r02: 0x5f9fff20 +| ... +| Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b + +Signed-off-by: Vladimir Isaev +Reported-by: kernel test robot +Cc: Vineet Gupta +Cc: stable@vger.kernel.org +Signed-off-by: Vineet Gupta +Signed-off-by: Greg Kroah-Hartman +--- + arch/arc/include/asm/page.h | 12 ++++++++++++ + arch/arc/include/asm/pgtable.h | 12 +++--------- + arch/arc/include/uapi/asm/page.h | 1 - + arch/arc/mm/ioremap.c | 5 +++-- + arch/arc/mm/tlb.c | 2 +- + 5 files changed, 19 insertions(+), 13 deletions(-) + +--- a/arch/arc/include/asm/page.h ++++ b/arch/arc/include/asm/page.h +@@ -7,6 +7,18 @@ + + #include + ++#ifdef CONFIG_ARC_HAS_PAE40 ++ ++#define MAX_POSSIBLE_PHYSMEM_BITS 40 ++#define PAGE_MASK_PHYS (0xff00000000ull | PAGE_MASK) ++ ++#else /* CONFIG_ARC_HAS_PAE40 */ ++ ++#define MAX_POSSIBLE_PHYSMEM_BITS 32 ++#define PAGE_MASK_PHYS PAGE_MASK ++ ++#endif /* CONFIG_ARC_HAS_PAE40 */ ++ + #ifndef __ASSEMBLY__ + + #define clear_page(paddr) memset((paddr), 0, PAGE_SIZE) +--- a/arch/arc/include/asm/pgtable.h ++++ b/arch/arc/include/asm/pgtable.h +@@ -107,8 +107,8 @@ + #define ___DEF (_PAGE_PRESENT | _PAGE_CACHEABLE) + + /* Set of bits not changed in pte_modify */ +-#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_SPECIAL) +- ++#define _PAGE_CHG_MASK (PAGE_MASK_PHYS | _PAGE_ACCESSED | _PAGE_DIRTY | \ ++ _PAGE_SPECIAL) + /* More Abbrevaited helpers */ + #define PAGE_U_NONE __pgprot(___DEF) + #define PAGE_U_R __pgprot(___DEF | _PAGE_READ) +@@ -132,13 +132,7 @@ + #define PTE_BITS_IN_PD0 (_PAGE_GLOBAL | _PAGE_PRESENT | _PAGE_HW_SZ) + #define PTE_BITS_RWX (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ) + +-#ifdef CONFIG_ARC_HAS_PAE40 +-#define PTE_BITS_NON_RWX_IN_PD1 (0xff00000000 | PAGE_MASK | _PAGE_CACHEABLE) +-#define MAX_POSSIBLE_PHYSMEM_BITS 40 +-#else +-#define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK | _PAGE_CACHEABLE) +-#define MAX_POSSIBLE_PHYSMEM_BITS 32 +-#endif ++#define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK_PHYS | 
_PAGE_CACHEABLE) + + /************************************************************************** + * Mapping of vm_flags (Generic VM) to PTE flags (arch specific) +--- a/arch/arc/include/uapi/asm/page.h ++++ b/arch/arc/include/uapi/asm/page.h +@@ -33,5 +33,4 @@ + + #define PAGE_MASK (~(PAGE_SIZE-1)) + +- + #endif /* _UAPI__ASM_ARC_PAGE_H */ +--- a/arch/arc/mm/ioremap.c ++++ b/arch/arc/mm/ioremap.c +@@ -53,9 +53,10 @@ EXPORT_SYMBOL(ioremap); + void __iomem *ioremap_prot(phys_addr_t paddr, unsigned long size, + unsigned long flags) + { ++ unsigned int off; + unsigned long vaddr; + struct vm_struct *area; +- phys_addr_t off, end; ++ phys_addr_t end; + pgprot_t prot = __pgprot(flags); + + /* Don't allow wraparound, zero size */ +@@ -72,7 +73,7 @@ void __iomem *ioremap_prot(phys_addr_t p + + /* Mappings have to be page-aligned */ + off = paddr & ~PAGE_MASK; +- paddr &= PAGE_MASK; ++ paddr &= PAGE_MASK_PHYS; + size = PAGE_ALIGN(end + 1) - paddr; + + /* +--- a/arch/arc/mm/tlb.c ++++ b/arch/arc/mm/tlb.c +@@ -576,7 +576,7 @@ void update_mmu_cache(struct vm_area_str + pte_t *ptep) + { + unsigned long vaddr = vaddr_unaligned & PAGE_MASK; +- phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK; ++ phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; + struct page *page = pfn_to_page(pte_pfn(*ptep)); + + create_tlb(vma, vaddr, ptep); diff --git a/queue-5.11/arc-mm-use-max_high_pfn-as-a-highmem-zone-border.patch b/queue-5.11/arc-mm-use-max_high_pfn-as-a-highmem-zone-border.patch new file mode 100644 index 00000000000..062c31d993e --- /dev/null +++ b/queue-5.11/arc-mm-use-max_high_pfn-as-a-highmem-zone-border.patch @@ -0,0 +1,56 @@ +From 1d5e4640e5df15252398c1b621f6bd432f2d7f17 Mon Sep 17 00:00:00 2001 +From: Vladimir Isaev +Date: Tue, 27 Apr 2021 15:13:54 +0300 +Subject: ARC: mm: Use max_high_pfn as a HIGHMEM zone border + +From: Vladimir Isaev + +commit 1d5e4640e5df15252398c1b621f6bd432f2d7f17 upstream. + +Commit 4af22ded0ecf ("arc: fix memory initialization for systems +with two memory banks") fixed highmem, but for the PAE case it causes +bug messages: + +| BUG: Bad page state in process swapper pfn:80000 +| page:(ptrval) refcount:0 mapcount:1 mapping:00000000 index:0x0 pfn:0x80000 flags: 0x0() +| raw: 00000000 00000100 00000122 00000000 00000000 00000000 00000000 00000000 +| raw: 00000000 +| page dumped because: nonzero mapcount +| Modules linked in: +| CPU: 0 PID: 0 Comm: swapper Not tainted 5.12.0-rc5-00003-g1e43c377a79f #1 + +This is because the fix expects highmem to be always less than +lowmem and uses min_low_pfn as an upper zone border for highmem. + +max_high_pfn should be ok for both highmem and highmem+PAE cases. + +Fixes: 4af22ded0ecf ("arc: fix memory initialization for systems with two memory banks") +Signed-off-by: Vladimir Isaev +Cc: Mike Rapoport +Cc: stable@vger.kernel.org #5.8 onwards +Signed-off-by: Vineet Gupta +Signed-off-by: Greg Kroah-Hartman +--- + arch/arc/mm/init.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/arch/arc/mm/init.c ++++ b/arch/arc/mm/init.c +@@ -157,7 +157,16 @@ void __init setup_arch_memory(void) + min_high_pfn = PFN_DOWN(high_mem_start); + max_high_pfn = PFN_DOWN(high_mem_start + high_mem_sz); + +- max_zone_pfn[ZONE_HIGHMEM] = min_low_pfn; ++ /* ++ * max_high_pfn should be ok here for both HIGHMEM and HIGHMEM+PAE. ++ * For HIGHMEM without PAE max_high_pfn should be less than ++ * min_low_pfn to guarantee that these two regions don't overlap. ++ * For PAE case highmem is greater than lowmem, so it is natural ++ * to use max_high_pfn. 
++ * ++ * In both cases, holes should be handled by pfn_valid(). ++ */ ++ max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn; + + high_memory = (void *)(min_high_pfn << PAGE_SHIFT); + diff --git a/queue-5.11/arm64-fix-race-condition-on-pg_dcache_clean-in-__sync_icache_dcache.patch b/queue-5.11/arm64-fix-race-condition-on-pg_dcache_clean-in-__sync_icache_dcache.patch new file mode 100644 index 00000000000..5dbb8c685d8 --- /dev/null +++ b/queue-5.11/arm64-fix-race-condition-on-pg_dcache_clean-in-__sync_icache_dcache.patch @@ -0,0 +1,53 @@ +From 588a513d34257fdde95a9f0df0202e31998e85c6 Mon Sep 17 00:00:00 2001 +From: Catalin Marinas +Date: Fri, 14 May 2021 10:50:01 +0100 +Subject: arm64: Fix race condition on PG_dcache_clean in __sync_icache_dcache() + +From: Catalin Marinas + +commit 588a513d34257fdde95a9f0df0202e31998e85c6 upstream. + +To ensure that instructions are observable in a new mapping, the arm64 +set_pte_at() implementation cleans the D-cache and invalidates the +I-cache to the PoU. As an optimisation, this is only done on executable +mappings and the PG_dcache_clean page flag is set to avoid future cache +maintenance on the same page. + +When two different processes map the same page (e.g. private executable +file or shared mapping) there's a potential race on checking and setting +PG_dcache_clean via set_pte_at() -> __sync_icache_dcache(). While on the +fault paths the page is locked (PG_locked), mprotect() does not take the +page lock. The result is that one process may see the PG_dcache_clean +flag set but the I/D cache maintenance not yet performed. + +Avoid test_and_set_bit(PG_dcache_clean) in favour of separate test_bit() +and set_bit(). In the rare event of a race, the cache maintenance is +done twice. + +Signed-off-by: Catalin Marinas +Cc: +Cc: Will Deacon +Cc: Steven Price +Reviewed-by: Steven Price +Acked-by: Will Deacon +Link: https://lore.kernel.org/r/20210514095001.13236-1-catalin.marinas@arm.com +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/mm/flush.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/arch/arm64/mm/flush.c ++++ b/arch/arm64/mm/flush.c +@@ -55,8 +55,10 @@ void __sync_icache_dcache(pte_t pte) + { + struct page *page = pte_page(pte); + +- if (!test_and_set_bit(PG_dcache_clean, &page->flags)) ++ if (!test_bit(PG_dcache_clean, &page->flags)) { + sync_icache_aliases(page_address(page), page_size(page)); ++ set_bit(PG_dcache_clean, &page->flags); ++ } + } + EXPORT_SYMBOL_GPL(__sync_icache_dcache); + diff --git a/queue-5.11/arm64-mte-initialize-rgsr_el1.seed-in-__cpu_setup.patch b/queue-5.11/arm64-mte-initialize-rgsr_el1.seed-in-__cpu_setup.patch new file mode 100644 index 00000000000..03a9017588b --- /dev/null +++ b/queue-5.11/arm64-mte-initialize-rgsr_el1.seed-in-__cpu_setup.patch @@ -0,0 +1,51 @@ +From 37a8024d265564eba680575df6421f19db21dfce Mon Sep 17 00:00:00 2001 +From: Peter Collingbourne +Date: Fri, 7 May 2021 11:59:05 -0700 +Subject: arm64: mte: initialize RGSR_EL1.SEED in __cpu_setup + +From: Peter Collingbourne + +commit 37a8024d265564eba680575df6421f19db21dfce upstream. + +A valid implementation choice for the ChooseRandomNonExcludedTag() +pseudocode function used by IRG is to behave in the same way as with +GCR_EL1.RRND=0. This would mean that RGSR_EL1.SEED is used as an LFSR +which must have a non-zero value in order for IRG to properly produce +pseudorandom numbers. However, RGSR_EL1 is reset to an UNKNOWN value +on soft reset and thus may reset to 0. 
Therefore we must initialize
+RGSR_EL1.SEED to a non-zero value in order to ensure that IRG behaves
+as expected.
+
+Signed-off-by: Peter Collingbourne
+Fixes: 3b714d24ef17 ("arm64: mte: CPU feature detection and initial sysreg configuration")
+Cc: # 5.10
+Link: https://linux-review.googlesource.com/id/I2b089b6c7d6f17ee37e2f0db7df5ad5bcc04526c
+Acked-by: Mark Rutland
+Link: https://lore.kernel.org/r/20210507185905.1745402-1-pcc@google.com
+Signed-off-by: Catalin Marinas
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/arm64/mm/proc.S | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/arch/arm64/mm/proc.S
++++ b/arch/arm64/mm/proc.S
+@@ -454,6 +454,18 @@ SYM_FUNC_START(__cpu_setup)
+ 	mov	x10, #(SYS_GCR_EL1_RRND | SYS_GCR_EL1_EXCL_MASK)
+ 	msr_s	SYS_GCR_EL1, x10
+
++	/*
++	 * If GCR_EL1.RRND=1 is implemented the same way as RRND=0, then
++	 * RGSR_EL1.SEED must be non-zero for IRG to produce
++	 * pseudorandom numbers. As RGSR_EL1 is UNKNOWN out of reset, we
++	 * must initialize it.
++	 */
++	mrs	x10, CNTVCT_EL0
++	ands	x10, x10, #SYS_RGSR_EL1_SEED_MASK
++	csinc	x10, x10, xzr, ne
++	lsl	x10, x10, #SYS_RGSR_EL1_SEED_SHIFT
++	msr_s	SYS_RGSR_EL1, x10
++
+ 	/* clear any pending tag check faults in TFSR*_EL1 */
+ 	msr_s	SYS_TFSR_EL1, xzr
+ 	msr_s	SYS_TFSRE0_EL1, xzr
diff --git a/queue-5.11/blk-iocost-fix-weight-updates-of-inner-active-iocgs.patch b/queue-5.11/blk-iocost-fix-weight-updates-of-inner-active-iocgs.patch
new file mode 100644
index 00000000000..7aeee23acdc
--- /dev/null
+++ b/queue-5.11/blk-iocost-fix-weight-updates-of-inner-active-iocgs.patch
@@ -0,0 +1,90 @@
+From e9f4eee9a0023ba22db9560d4cc6ee63f933dae8 Mon Sep 17 00:00:00 2001
+From: Tejun Heo
+Date: Tue, 11 May 2021 21:38:36 -0400
+Subject: blk-iocost: fix weight updates of inner active iocgs
+
+From: Tejun Heo
+
+commit e9f4eee9a0023ba22db9560d4cc6ee63f933dae8 upstream.
+
+When the weight of an active iocg is updated, weight_updated() is called
+which in turn calls __propagate_weights() to update the active and inuse
+weights so that the effective hierarchical weights are updated accordingly.
+
+The current implementation is incorrect for inner active nodes. For an
+active leaf iocg, inuse can be any value between 1 and active and the
+difference represents how much the iocg is donating. When weight is updated,
+as long as inuse is clamped between 1 and the new weight, we're alright and
+this is what __propagate_weights() currently implements.
+
+However, that's not how an active inner node's inuse is set. An inner node's
+inuse is solely determined by the ratio between the sums of inuse's and
+active's of its children - ie. they're results of propagating the leaves'
+active and inuse weights upwards. __propagate_weights() incorrectly applies
+the same clamping as for a leaf when an active inner node's weight is
+updated. Consider a hierarchy which looks like the following with saturating
+workloads in AA and BB.
+
+        R
+       / \
+      A   B
+      |   |
+     AA   BB
+
+1. For both A and B, active=100, inuse=100, hwa=0.5, hwi=0.5.
+
+2. echo 200 > A/io.weight
+
+3. __propagate_weights() updates A's active to 200 and leaves inuse at 100
+   as it's already between 1 and the new active, making A:active=200,
+   A:inuse=100. As R's active_sum is updated along with A's active,
+   A:hwa=2/3, B:hwa=1/3. However, because the inuses didn't change, the
+   hwi's remain unchanged at 0.5.
+
+4. The weight of A is now twice that of B but AA and BB still have the same
+   hwi of 0.5 and thus are doing the same amount of IOs.
+
+Fix it by making __propagate_weights() always calculate the inuse of an
+active inner iocg based on the ratio of child_inuse_sum to child_active_sum.
+
+Signed-off-by: Tejun Heo
+Reported-by: Dan Schatzberg
+Fixes: 7caa47151ab2 ("blkcg: implement blk-iocost")
+Cc: stable@vger.kernel.org # v5.4+
+Link: https://lore.kernel.org/r/YJsxnLZV1MnBcqjj@slm.duckdns.org
+Signed-off-by: Jens Axboe
+Signed-off-by: Greg Kroah-Hartman
+---
+ block/blk-iocost.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/block/blk-iocost.c
++++ b/block/blk-iocost.c
+@@ -1073,7 +1073,17 @@ static void __propagate_weights(struct i
+
+ 	lockdep_assert_held(&ioc->lock);
+
+-	inuse = clamp_t(u32, inuse, 1, active);
++	/*
++	 * For an active leaf node, its inuse shouldn't be zero or exceed
++	 * @active. An active internal node's inuse is solely determined by the
++	 * inuse to active ratio of its children regardless of @inuse.
++	 */
++	if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
++		inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
++					   iocg->child_active_sum);
++	} else {
++		inuse = clamp_t(u32, inuse, 1, active);
++	}
+
+ 	iocg->last_inuse = iocg->inuse;
+ 	if (save)
+@@ -1090,7 +1100,7 @@ static void __propagate_weights(struct i
+ 	/* update the level sums */
+ 	parent->child_active_sum += (s32)(active - child->active);
+ 	parent->child_inuse_sum += (s32)(inuse - child->inuse);
+-	/* apply the udpates */
++	/* apply the updates */
+ 	child->active = active;
+ 	child->inuse = inuse;
+
diff --git a/queue-5.11/btrfs-fix-deadlock-when-cloning-inline-extents-and-using-qgroups.patch b/queue-5.11/btrfs-fix-deadlock-when-cloning-inline-extents-and-using-qgroups.patch
new file mode 100644
index 00000000000..0f2caa60b3f
--- /dev/null
+++ b/queue-5.11/btrfs-fix-deadlock-when-cloning-inline-extents-and-using-qgroups.patch
@@ -0,0 +1,229 @@
+From f9baa501b4fd6962257853d46ddffbc21f27e344 Mon Sep 17 00:00:00 2001
+From: Filipe Manana
+Date: Thu, 22 Apr 2021 12:08:05 +0100
+Subject: btrfs: fix deadlock when cloning inline extents and using qgroups
+
+From: Filipe Manana
+
+commit f9baa501b4fd6962257853d46ddffbc21f27e344 upstream.
+
+There are a few exceptional cases where cloning an inline extent needs to
+copy the inline extent data into a page of the destination inode.
+
+When this happens, we end up starting a transaction while having a dirty
+page for the destination inode and while having the range locked in the
+destination's inode iotree too. Because when reserving metadata space
+for a transaction we may need to flush existing delalloc in case there is
+not enough free space, we have a mechanism in place to prevent a deadlock,
+which was introduced in commit 3d45f221ce627d ("btrfs: fix deadlock when
+cloning inline extent and low on free metadata space").
+
+However when using qgroups, a transaction also reserves metadata qgroup
+space, which can also result in flushing delalloc in case there is not
+enough available space at the moment. When this happens we deadlock, since
+flushing delalloc requires locking the file range in the inode's iotree
+and the range was already locked at the very beginning of the clone
+operation, before attempting to start the transaction.
+ +When this issue happens, stack traces like the following are reported: + + [72747.556262] task:kworker/u81:9 state:D stack: 0 pid: 225 ppid: 2 flags:0x00004000 + [72747.556268] Workqueue: writeback wb_workfn (flush-btrfs-1142) + [72747.556271] Call Trace: + [72747.556273] __schedule+0x296/0x760 + [72747.556277] schedule+0x3c/0xa0 + [72747.556279] io_schedule+0x12/0x40 + [72747.556284] __lock_page+0x13c/0x280 + [72747.556287] ? generic_file_readonly_mmap+0x70/0x70 + [72747.556325] extent_write_cache_pages+0x22a/0x440 [btrfs] + [72747.556331] ? __set_page_dirty_nobuffers+0xe7/0x160 + [72747.556358] ? set_extent_buffer_dirty+0x5e/0x80 [btrfs] + [72747.556362] ? update_group_capacity+0x25/0x210 + [72747.556366] ? cpumask_next_and+0x1a/0x20 + [72747.556391] extent_writepages+0x44/0xa0 [btrfs] + [72747.556394] do_writepages+0x41/0xd0 + [72747.556398] __writeback_single_inode+0x39/0x2a0 + [72747.556403] writeback_sb_inodes+0x1ea/0x440 + [72747.556407] __writeback_inodes_wb+0x5f/0xc0 + [72747.556410] wb_writeback+0x235/0x2b0 + [72747.556414] ? get_nr_inodes+0x35/0x50 + [72747.556417] wb_workfn+0x354/0x490 + [72747.556420] ? newidle_balance+0x2c5/0x3e0 + [72747.556424] process_one_work+0x1aa/0x340 + [72747.556426] worker_thread+0x30/0x390 + [72747.556429] ? create_worker+0x1a0/0x1a0 + [72747.556432] kthread+0x116/0x130 + [72747.556435] ? kthread_park+0x80/0x80 + [72747.556438] ret_from_fork+0x1f/0x30 + + [72747.566958] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs] + [72747.566961] Call Trace: + [72747.566964] __schedule+0x296/0x760 + [72747.566968] ? finish_wait+0x80/0x80 + [72747.566970] schedule+0x3c/0xa0 + [72747.566995] wait_extent_bit.constprop.68+0x13b/0x1c0 [btrfs] + [72747.566999] ? finish_wait+0x80/0x80 + [72747.567024] lock_extent_bits+0x37/0x90 [btrfs] + [72747.567047] btrfs_invalidatepage+0x299/0x2c0 [btrfs] + [72747.567051] ? find_get_pages_range_tag+0x2cd/0x380 + [72747.567076] __extent_writepage+0x203/0x320 [btrfs] + [72747.567102] extent_write_cache_pages+0x2bb/0x440 [btrfs] + [72747.567106] ? update_load_avg+0x7e/0x5f0 + [72747.567109] ? enqueue_entity+0xf4/0x6f0 + [72747.567134] extent_writepages+0x44/0xa0 [btrfs] + [72747.567137] ? enqueue_task_fair+0x93/0x6f0 + [72747.567140] do_writepages+0x41/0xd0 + [72747.567144] __filemap_fdatawrite_range+0xc7/0x100 + [72747.567167] btrfs_run_delalloc_work+0x17/0x40 [btrfs] + [72747.567195] btrfs_work_helper+0xc2/0x300 [btrfs] + [72747.567200] process_one_work+0x1aa/0x340 + [72747.567202] worker_thread+0x30/0x390 + [72747.567205] ? create_worker+0x1a0/0x1a0 + [72747.567208] kthread+0x116/0x130 + [72747.567211] ? kthread_park+0x80/0x80 + [72747.567214] ret_from_fork+0x1f/0x30 + + [72747.569686] task:fsstress state:D stack: 0 pid:841421 ppid:841417 flags:0x00000000 + [72747.569689] Call Trace: + [72747.569691] __schedule+0x296/0x760 + [72747.569694] schedule+0x3c/0xa0 + [72747.569721] try_flush_qgroup+0x95/0x140 [btrfs] + [72747.569725] ? finish_wait+0x80/0x80 + [72747.569753] btrfs_qgroup_reserve_data+0x34/0x50 [btrfs] + [72747.569781] btrfs_check_data_free_space+0x5f/0xa0 [btrfs] + [72747.569804] btrfs_buffered_write+0x1f7/0x7f0 [btrfs] + [72747.569810] ? path_lookupat.isra.48+0x97/0x140 + [72747.569833] btrfs_file_write_iter+0x81/0x410 [btrfs] + [72747.569836] ? __kmalloc+0x16a/0x2c0 + [72747.569839] do_iter_readv_writev+0x160/0x1c0 + [72747.569843] do_iter_write+0x80/0x1b0 + [72747.569847] vfs_writev+0x84/0x140 + [72747.569869] ? 
btrfs_file_llseek+0x38/0x270 [btrfs]
+ [72747.569873] do_writev+0x65/0x100
+ [72747.569876] do_syscall_64+0x33/0x40
+ [72747.569879] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+ [72747.569899] task:fsstress state:D stack: 0 pid:841424 ppid:841417 flags:0x00004000
+ [72747.569903] Call Trace:
+ [72747.569906] __schedule+0x296/0x760
+ [72747.569909] schedule+0x3c/0xa0
+ [72747.569936] try_flush_qgroup+0x95/0x140 [btrfs]
+ [72747.569940] ? finish_wait+0x80/0x80
+ [72747.569967] __btrfs_qgroup_reserve_meta+0x36/0x50 [btrfs]
+ [72747.569989] start_transaction+0x279/0x580 [btrfs]
+ [72747.570014] clone_copy_inline_extent+0x332/0x490 [btrfs]
+ [72747.570041] btrfs_clone+0x5b7/0x7a0 [btrfs]
+ [72747.570068] ? lock_extent_bits+0x64/0x90 [btrfs]
+ [72747.570095] btrfs_clone_files+0xfc/0x150 [btrfs]
+ [72747.570122] btrfs_remap_file_range+0x3d8/0x4a0 [btrfs]
+ [72747.570126] do_clone_file_range+0xed/0x200
+ [72747.570131] vfs_clone_file_range+0x37/0x110
+ [72747.570134] ioctl_file_clone+0x7d/0xb0
+ [72747.570137] do_vfs_ioctl+0x138/0x630
+ [72747.570140] __x64_sys_ioctl+0x62/0xc0
+ [72747.570143] do_syscall_64+0x33/0x40
+ [72747.570146] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+So fix this by skipping the flush of delalloc for an inode that is
+flagged with BTRFS_INODE_NO_DELALLOC_FLUSH, meaning it is currently under
+such a special case of cloning an inline extent, when flushing delalloc
+during qgroup metadata reservation.
+
+The special cases for cloning inline extents were added in kernel 5.7
+by commit 05a5a7621ce66c ("Btrfs: implement full reflink support for
+inline extents"), while having qgroup metadata space reservation flushing
+delalloc when low on space was added in kernel 5.9 by commit
+c53e9653605dbf ("btrfs: qgroup: try to flush qgroup space when we get
+-EDQUOT"). So use a "Fixes:" tag for the later commit to ease stable
+kernel backports.
+ +Reported-by: Wang Yugui +Link: https://lore.kernel.org/linux-btrfs/20210421083137.31E3.409509F4@e16-tech.com/ +Fixes: c53e9653605dbf ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT") +CC: stable@vger.kernel.org # 5.9+ +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ctree.h | 2 +- + fs/btrfs/inode.c | 4 ++-- + fs/btrfs/ioctl.c | 2 +- + fs/btrfs/qgroup.c | 2 +- + fs/btrfs/send.c | 4 ++-- + 5 files changed, 7 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -3104,7 +3104,7 @@ int btrfs_truncate_inode_items(struct bt + struct btrfs_inode *inode, u64 new_size, + u32 min_type); + +-int btrfs_start_delalloc_snapshot(struct btrfs_root *root); ++int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); + int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, + bool in_reclaim_context); + int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -9475,7 +9475,7 @@ out: + return ret; + } + +-int btrfs_start_delalloc_snapshot(struct btrfs_root *root) ++int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) + { + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, +@@ -9488,7 +9488,7 @@ int btrfs_start_delalloc_snapshot(struct + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + return -EROFS; + +- return start_delalloc_inodes(root, &wbc, true, false); ++ return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); + } + + int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -1042,7 +1042,7 @@ static noinline int btrfs_mksnapshot(con + */ + btrfs_drew_read_lock(&root->snapshot_lock); + +- ret = btrfs_start_delalloc_snapshot(root); ++ ret = btrfs_start_delalloc_snapshot(root, false); + if (ret) + goto out; + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3579,7 +3579,7 @@ static int try_flush_qgroup(struct btrfs + return 0; + } + +- ret = btrfs_start_delalloc_snapshot(root); ++ ret = btrfs_start_delalloc_snapshot(root, true); + if (ret < 0) + goto out; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -7159,7 +7159,7 @@ static int flush_delalloc_roots(struct s + int i; + + if (root) { +- ret = btrfs_start_delalloc_snapshot(root); ++ ret = btrfs_start_delalloc_snapshot(root, false); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); +@@ -7167,7 +7167,7 @@ static int flush_delalloc_roots(struct s + + for (i = 0; i < sctx->clone_roots_cnt; i++) { + root = sctx->clone_roots[i].root; +- ret = btrfs_start_delalloc_snapshot(root); ++ ret = btrfs_start_delalloc_snapshot(root, false); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); diff --git a/queue-5.11/btrfs-fix-race-leading-to-unpersisted-data-and-metadata-on-fsync.patch b/queue-5.11/btrfs-fix-race-leading-to-unpersisted-data-and-metadata-on-fsync.patch new file mode 100644 index 00000000000..f79720522d5 --- /dev/null +++ b/queue-5.11/btrfs-fix-race-leading-to-unpersisted-data-and-metadata-on-fsync.patch @@ -0,0 +1,271 @@ +From 626e9f41f7c281ba3e02843702f68471706aa6d9 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 27 Apr 2021 11:27:20 +0100 +Subject: btrfs: fix race leading to unpersisted data and metadata on fsync + +From: 
Filipe Manana + +commit 626e9f41f7c281ba3e02843702f68471706aa6d9 upstream. + +When doing a fast fsync on a file, there is a race which can result in the +fsync returning success to user space without logging the inode and without +durably persisting new data. + +The following example shows one possible scenario for this: + + $ mkfs.btrfs -f /dev/sdc + $ mount /dev/sdc /mnt + + $ touch /mnt/bar + $ xfs_io -f -c "pwrite -S 0xab 0 1M" -c "fsync" /mnt/baz + + # Now we have: + # file bar == inode 257 + # file baz == inode 258 + + $ mv /mnt/baz /mnt/foo + + # Now we have: + # file bar == inode 257 + # file foo == inode 258 + + $ xfs_io -c "pwrite -S 0xcd 0 1M" /mnt/foo + + # fsync bar before foo, it is important to trigger the race. + $ xfs_io -c "fsync" /mnt/bar + $ xfs_io -c "fsync" /mnt/foo + + # After this: + # inode 257, file bar, is empty + # inode 258, file foo, has 1M filled with 0xcd + + + + # Replay the log: + $ mount /dev/sdc /mnt + + # After this point file foo should have 1M filled with 0xcd and not 0xab + +The following steps explain how the race happens: + +1) Before the first fsync of inode 258, when it has the "baz" name, its + ->logged_trans is 0, ->last_sub_trans is 0 and ->last_log_commit is -1. + The inode also has the full sync flag set; + +2) After the first fsync, we set inode 258 ->logged_trans to 6, which is + the generation of the current transaction, and set ->last_log_commit + to 0, which is the current value of ->last_sub_trans (done at + btrfs_log_inode()). + + The full sync flag is cleared from the inode during the fsync. + + The log sub transaction that was committed had an ID of 0 and when we + synced the log, at btrfs_sync_log(), we incremented root->log_transid + from 0 to 1; + +3) During the rename: + + We update inode 258, through btrfs_update_inode(), and that causes its + ->last_sub_trans to be set to 1 (the current log transaction ID), and + ->last_log_commit remains with a value of 0. + + After updating inode 258, because we have previously logged the inode + in the previous fsync, we log again the inode through the call to + btrfs_log_new_name(). This results in updating the inode's + ->last_log_commit from 0 to 1 (the current value of its + ->last_sub_trans). + + The ->last_sub_trans of inode 257 is updated to 1, which is the ID of + the next log transaction; + +4) Then a buffered write against inode 258 is made. This leaves the value + of ->last_sub_trans as 1 (the ID of the current log transaction, stored + at root->log_transid); + +5) Then an fsync against inode 257 (or any other inode other than 258), + happens. This results in committing the log transaction with ID 1, + which results in updating root->last_log_commit to 1 and bumping + root->log_transid from 1 to 2; + +6) Then an fsync against inode 258 starts. We flush delalloc and wait only + for writeback to complete, since the full sync flag is not set in the + inode's runtime flags - we do not wait for ordered extents to complete. + + Then, at btrfs_sync_file(), we call btrfs_inode_in_log() before the + ordered extent completes. The call returns true: + + static inline bool btrfs_inode_in_log(...) 
+ { + bool ret = false; + + spin_lock(&inode->lock); + if (inode->logged_trans == generation && + inode->last_sub_trans <= inode->last_log_commit && + inode->last_sub_trans <= inode->root->last_log_commit) + ret = true; + spin_unlock(&inode->lock); + return ret; + } + + generation has a value of 6 (fs_info->generation), ->logged_trans also + has a value of 6 (set when we logged the inode during the first fsync + and when logging it during the rename), ->last_sub_trans has a value + of 1, set during the rename (step 3), ->last_log_commit also has a + value of 1 (set in step 3) and root->last_log_commit has a value of 1, + which was set in step 5 when fsyncing inode 257. + + As a consequence we don't log the inode, any new extents and do not + sync the log, resulting in a data loss if a power failure happens + after the fsync and before the current transaction commits. + Also, because we do not log the inode, after a power failure the mtime + and ctime of the inode do not match those we had before. + + When the ordered extent completes before we call btrfs_inode_in_log(), + then the call returns false and we log the inode and sync the log, + since at the end of ordered extent completion we update the inode and + set ->last_sub_trans to 2 (the value of root->log_transid) and + ->last_log_commit to 1. + +This problem is found after removing the check for the emptiness of the +inode's list of modified extents in the recent commit 209ecbb8585bf6 +("btrfs: remove stale comment and logic from btrfs_inode_in_log()"), +added in the 5.13 merge window. However checking the emptiness of the +list is not really the way to solve this problem, and was never intended +to, because while that solves the problem for COW writes, the problem +persists for NOCOW writes because in that case the list is always empty. + +In the case of NOCOW writes, even though we wait for the writeback to +complete before returning from btrfs_sync_file(), we end up not logging +the inode, which has a new mtime/ctime, and because we don't sync the log, +we never issue disk barriers (send REQ_PREFLUSH to the device) since that +only happens when we sync the log (when we write super blocks at +btrfs_sync_log()). So effectively, for a NOCOW case, when we return from +btrfs_sync_file() to user space, we are not guaranteeing that the data is +durably persisted on disk. + +Also, while the example above uses a rename exchange to show how the +problem happens, it is not the only way to trigger it. An alternative +could be adding a new hard link to inode 258, since that also results +in calling btrfs_log_new_name() and updating the inode in the log. +An example reproducer using the addition of a hard link instead of a +rename operation: + + $ mkfs.btrfs -f /dev/sdc + $ mount /dev/sdc /mnt + + $ touch /mnt/bar + $ xfs_io -f -c "pwrite -S 0xab 0 1M" -c "fsync" /mnt/foo + + $ ln /mnt/foo /mnt/foo_link + $ xfs_io -c "pwrite -S 0xcd 0 1M" /mnt/foo + + $ xfs_io -c "fsync" /mnt/bar + $ xfs_io -c "fsync" /mnt/foo + + + + # Replay the log: + $ mount /dev/sdc /mnt + + # After this point file foo often has 1M filled with 0xab and not 0xcd + +The reasons leading to the final fsync of file foo, inode 258, not +persisting the new data are the same as for the previous example with +a rename operation. + +So fix by never skipping logging and log syncing when there are still any +ordered extents in flight. 
To avoid making the conditional if statement +that checks if logging an inode is needed harder to read, place all the +logic into an helper function with separate if statements to make it more +manageable and easier to read. + +A test case for fstests will follow soon. + +For NOCOW writes, the problem existed before commit b5e6c3e170b770 +("btrfs: always wait on ordered extents at fsync time"), introduced in +kernel 4.19, then it went away with that commit since we started to always +wait for ordered extent completion before logging. + +The problem came back again once the fast fsync path was changed again to +avoid waiting for ordered extent completion, in commit 487781796d3022 +("btrfs: make fast fsyncs wait only for writeback"), added in kernel 5.10. + +However, for COW writes, the race only happens after the recent +commit 209ecbb8585bf6 ("btrfs: remove stale comment and logic from +btrfs_inode_in_log()"), introduced in the 5.13 merge window. For NOCOW +writes, the bug existed before that commit. So tag 5.10+ as the release +for stable backports. + +CC: stable@vger.kernel.org # 5.10+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/file.c | 35 +++++++++++++++++++++++++---------- + fs/btrfs/tree-log.c | 3 ++- + 2 files changed, 27 insertions(+), 11 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -2082,6 +2082,30 @@ static int start_ordered_ops(struct inod + return ret; + } + ++static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) ++{ ++ struct btrfs_inode *inode = BTRFS_I(ctx->inode); ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ ++ if (btrfs_inode_in_log(inode, fs_info->generation) && ++ list_empty(&ctx->ordered_extents)) ++ return true; ++ ++ /* ++ * If we are doing a fast fsync we can not bail out if the inode's ++ * last_trans is <= then the last committed transaction, because we only ++ * update the last_trans of the inode during ordered extent completion, ++ * and for a fast fsync we don't wait for that, we only wait for the ++ * writeback to complete. ++ */ ++ if (inode->last_trans <= fs_info->last_trans_committed && ++ (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || ++ list_empty(&ctx->ordered_extents))) ++ return true; ++ ++ return false; ++} ++ + /* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. +@@ -2196,17 +2220,8 @@ int btrfs_sync_file(struct file *file, l + + atomic_inc(&root->log_batch); + +- /* +- * If we are doing a fast fsync we can not bail out if the inode's +- * last_trans is <= then the last committed transaction, because we only +- * update the last_trans of the inode during ordered extent completion, +- * and for a fast fsync we don't wait for that, we only wait for the +- * writeback to complete. +- */ + smp_mb(); +- if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || +- (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && +- (full_sync || list_empty(&ctx.ordered_extents)))) { ++ if (skip_inode_logging(&ctx)) { + /* + * We've had everything committed since the last time we were + * modified so clear this flag in case it was set for whatever +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -6066,7 +6066,8 @@ static int btrfs_log_inode_parent(struct + * (since logging them is pointless, a link count of 0 means they + * will never be accessible). 
+ */ +- if (btrfs_inode_in_log(inode, trans->transid) || ++ if ((btrfs_inode_in_log(inode, trans->transid) && ++ list_empty(&ctx->ordered_extents)) || + inode->vfs_inode.i_nlink == 0) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; diff --git a/queue-5.11/drm-amd-display-initialize-attribute-for-hdcp_srm-sysfs-file.patch b/queue-5.11/drm-amd-display-initialize-attribute-for-hdcp_srm-sysfs-file.patch new file mode 100644 index 00000000000..3c189971340 --- /dev/null +++ b/queue-5.11/drm-amd-display-initialize-attribute-for-hdcp_srm-sysfs-file.patch @@ -0,0 +1,38 @@ +From fe1c97d008f86f672f0e9265f180c22451ca3b9f Mon Sep 17 00:00:00 2001 +From: David Ward +Date: Mon, 10 May 2021 05:30:39 -0400 +Subject: drm/amd/display: Initialize attribute for hdcp_srm sysfs file + +From: David Ward + +commit fe1c97d008f86f672f0e9265f180c22451ca3b9f upstream. + +It is stored in dynamically allocated memory, so sysfs_bin_attr_init() must +be called to initialize it. (Note: "initialization" only sets the .attr.key +member in this struct; it does not change the value of any other members.) + +Otherwise, when CONFIG_DEBUG_LOCK_ALLOC=y this message appears during boot: + + BUG: key ffff9248900cd148 has not been registered! + +Fixes: 9037246bb2da ("drm/amd/display: Add sysfs interface for set/get srm") +Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1586 +Reported-by: Mikhail Gavrilov +Signed-off-by: David Ward +Signed-off-by: Alex Deucher +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c +@@ -643,6 +643,7 @@ struct hdcp_workqueue *hdcp_create_workq + + /* File created at /sys/class/drm/card0/device/hdcp_srm*/ + hdcp_work[0].attr = data_attr; ++ sysfs_bin_attr_init(&hdcp_work[0].attr); + + if (sysfs_create_bin_file(&adev->dev->kobj, &hdcp_work[0].attr)) + DRM_WARN("Failed to create device file hdcp_srm"); diff --git a/queue-5.11/drm-i915-avoid-div-by-zero-on-gen2.patch b/queue-5.11/drm-i915-avoid-div-by-zero-on-gen2.patch new file mode 100644 index 00000000000..f00654346d1 --- /dev/null +++ b/queue-5.11/drm-i915-avoid-div-by-zero-on-gen2.patch @@ -0,0 +1,47 @@ +From 4819d16d91145966ce03818a95169df1fd56b299 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= +Date: Wed, 21 Apr 2021 18:33:58 +0300 +Subject: drm/i915: Avoid div-by-zero on gen2 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Ville Syrjälä + +commit 4819d16d91145966ce03818a95169df1fd56b299 upstream. + +Gen2 tiles are 2KiB in size so i915_gem_object_get_tile_row_size() +can in fact return <4KiB, which leads to div-by-zero here. +Avoid that. + +Not sure i915_gem_object_get_tile_row_size() is entirely +sane anyway since it doesn't account for the different tile +layouts on i8xx/i915... + +I'm not able to hit this before commit 6846895fde05 ("drm/i915: +Replace PIN_NONFAULT with calls to PIN_NOEVICT") and it looks +like I also need to run recent version of Mesa. With those in +place xonotic trips on this quite easily on my 85x. 
+
+Cc: stable@vger.kernel.org
+Reviewed-by: Chris Wilson
+Signed-off-by: Ville Syrjälä
+Link: https://patchwork.freedesktop.org/patch/msgid/20210421153401.13847-2-ville.syrjala@linux.intel.com
+(cherry picked from commit ed52c62d386f764194e0184fdb905d5f24194cae)
+Signed-off-by: Jani Nikula
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/gpu/drm/i915/gem/i915_gem_mman.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
++++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+@@ -189,7 +189,7 @@ compute_partial_view(const struct drm_i9
+ 	struct i915_ggtt_view view;
+
+ 	if (i915_gem_object_is_tiled(obj))
+-		chunk = roundup(chunk, tile_row_pages(obj));
++		chunk = roundup(chunk, tile_row_pages(obj) ?: 1);
+
+ 	view.type = I915_GGTT_VIEW_PARTIAL;
+ 	view.partial.offset = rounddown(page_offset, chunk);
diff --git a/queue-5.11/drm-radeon-dpm-disable-sclk-switching-on-oland-when-two-4k-60hz-monitors-are-connected.patch b/queue-5.11/drm-radeon-dpm-disable-sclk-switching-on-oland-when-two-4k-60hz-monitors-are-connected.patch
new file mode 100644
index 00000000000..b6cdb6789d2
--- /dev/null
+++ b/queue-5.11/drm-radeon-dpm-disable-sclk-switching-on-oland-when-two-4k-60hz-monitors-are-connected.patch
@@ -0,0 +1,85 @@
+From 227545b9a08c68778ddd89428f99c351fc9315ac Mon Sep 17 00:00:00 2001
+From: Kai-Heng Feng
+Date: Fri, 30 Apr 2021 12:56:56 +0800
+Subject: drm/radeon/dpm: Disable sclk switching on Oland when two 4K 60Hz monitors are connected
+
+From: Kai-Heng Feng
+
+commit 227545b9a08c68778ddd89428f99c351fc9315ac upstream.
+
+Screen flickers rapidly when two 4K 60Hz monitors are in use. This issue
+doesn't happen when one monitor is 4K 60Hz (pixelclock 594MHz) and
+another one is 4K 30Hz (pixelclock 297MHz).
+
+The issue is gone after setting "power_dpm_force_performance_level" to
+"high". Following that indication, we found that the issue occurs when
+sclk is too low.
+
+So resolve the issue by disabling sclk switching when there are two
+monitors that require a high pixelclock (> 297MHz).
+
+v2:
+ - Only apply the fix to Oland.
+Signed-off-by: Kai-Heng Feng +Signed-off-by: Alex Deucher +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/radeon/radeon.h | 1 + + drivers/gpu/drm/radeon/radeon_pm.c | 8 ++++++++ + drivers/gpu/drm/radeon/si_dpm.c | 3 +++ + 3 files changed, 12 insertions(+) + +--- a/drivers/gpu/drm/radeon/radeon.h ++++ b/drivers/gpu/drm/radeon/radeon.h +@@ -1559,6 +1559,7 @@ struct radeon_dpm { + void *priv; + u32 new_active_crtcs; + int new_active_crtc_count; ++ int high_pixelclock_count; + u32 current_active_crtcs; + int current_active_crtc_count; + bool single_display; +--- a/drivers/gpu/drm/radeon/radeon_pm.c ++++ b/drivers/gpu/drm/radeon/radeon_pm.c +@@ -1775,6 +1775,7 @@ static void radeon_pm_compute_clocks_dpm + struct drm_device *ddev = rdev->ddev; + struct drm_crtc *crtc; + struct radeon_crtc *radeon_crtc; ++ struct radeon_connector *radeon_connector; + + if (!rdev->pm.dpm_enabled) + return; +@@ -1784,6 +1785,7 @@ static void radeon_pm_compute_clocks_dpm + /* update active crtc counts */ + rdev->pm.dpm.new_active_crtcs = 0; + rdev->pm.dpm.new_active_crtc_count = 0; ++ rdev->pm.dpm.high_pixelclock_count = 0; + if (rdev->num_crtc && rdev->mode_info.mode_config_initialized) { + list_for_each_entry(crtc, + &ddev->mode_config.crtc_list, head) { +@@ -1791,6 +1793,12 @@ static void radeon_pm_compute_clocks_dpm + if (crtc->enabled) { + rdev->pm.dpm.new_active_crtcs |= (1 << radeon_crtc->crtc_id); + rdev->pm.dpm.new_active_crtc_count++; ++ if (!radeon_crtc->connector) ++ continue; ++ ++ radeon_connector = to_radeon_connector(radeon_crtc->connector); ++ if (radeon_connector->pixelclock_for_modeset > 297000) ++ rdev->pm.dpm.high_pixelclock_count++; + } + } + } +--- a/drivers/gpu/drm/radeon/si_dpm.c ++++ b/drivers/gpu/drm/radeon/si_dpm.c +@@ -2979,6 +2979,9 @@ static void si_apply_state_adjust_rules( + (rdev->pdev->device == 0x6605)) { + max_sclk = 75000; + } ++ ++ if (rdev->pm.dpm.high_pixelclock_count > 1) ++ disable_sclk_switching = true; + } + + if (rps->vce_active) { diff --git a/queue-5.11/hfsplus-prevent-corruption-in-shrinking-truncate.patch b/queue-5.11/hfsplus-prevent-corruption-in-shrinking-truncate.patch new file mode 100644 index 00000000000..706a839af59 --- /dev/null +++ b/queue-5.11/hfsplus-prevent-corruption-in-shrinking-truncate.patch @@ -0,0 +1,89 @@ +From c3187cf32216313fb316084efac4dab3a8459b1d Mon Sep 17 00:00:00 2001 +From: Jouni Roivas +Date: Fri, 14 May 2021 17:27:33 -0700 +Subject: hfsplus: prevent corruption in shrinking truncate + +From: Jouni Roivas + +commit c3187cf32216313fb316084efac4dab3a8459b1d upstream. + +I believe there are some issues introduced by commit 31651c607151 +("hfsplus: avoid deadlock on file truncation") + +HFS+ has extent records which always contains 8 extents. In case the +first extent record in catalog file gets full, new ones are allocated from +extents overflow file. + +In case shrinking truncate happens to middle of an extent record which +locates in extents overflow file, the logic in hfsplus_file_truncate() was +changed so that call to hfs_brec_remove() is not guarded any more. + +Right action would be just freeing the extents that exceed the new size +inside extent record by calling hfsplus_free_extents(), and then check if +the whole extent record should be removed. However since the guard +(blk_cnt > start) is now after the call to hfs_brec_remove(), this has +unfortunate effect that the last matching extent record is removed +unconditionally. 
+
+To reproduce this issue, create a file which has at least 10 extents, and
+then perform a shrinking truncate into the middle of the last extent
+record, so that the number of remaining extents is not under or divisible
+by 8. This causes the last extent record (8 extents) to be removed totally
+instead of truncating into the middle of it. Thus this causes corruption,
+and lost data.
+
+The fix for this is simply checking if the new truncated end is below the
+start of this extent record, making it safe to remove the full extent
+record. However the call to hfs_brec_remove() can't be moved to its
+previous place since we're dropping ->tree_lock and it can cause a race
+condition and the cached info being invalidated, possibly corrupting the
+node data.
+
+Another issue is related to this one. When entering into the block
+(blk_cnt > start) we are not holding the ->tree_lock. We break out from
+the loop not holding the lock, but hfs_find_exit() does unlock it. Not
+sure if it's possible for someone else to take the lock under our feet,
+but it can cause hard to debug errors and premature unlocking. Even if
+there's no real risk of it, the locking should still always be kept in
+balance. Thus taking the lock now just before the check.
+
+Link: https://lkml.kernel.org/r/20210429165139.3082828-1-jouni.roivas@tuxera.com
+Fixes: 31651c607151f ("hfsplus: avoid deadlock on file truncation")
+Signed-off-by: Jouni Roivas
+Reviewed-by: Anton Altaparmakov
+Cc: Anatoly Trosinenko
+Cc: Viacheslav Dubeyko
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/hfsplus/extents.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/fs/hfsplus/extents.c
++++ b/fs/hfsplus/extents.c
+@@ -598,13 +598,15 @@ void hfsplus_file_truncate(struct inode
+ 		res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
+ 		if (res)
+ 			break;
+-		hfs_brec_remove(&fd);
+
+-		mutex_unlock(&fd.tree->tree_lock);
+ 		start = hip->cached_start;
++		if (blk_cnt <= start)
++			hfs_brec_remove(&fd);
++		mutex_unlock(&fd.tree->tree_lock);
+ 		hfsplus_free_extents(sb, hip->cached_extents,
+ 				alloc_cnt - start, alloc_cnt - blk_cnt);
+ 		hfsplus_dump_extent(hip->cached_extents);
++		mutex_lock(&fd.tree->tree_lock);
+ 		if (blk_cnt > start) {
+ 			hip->extent_state |= HFSPLUS_EXT_DIRTY;
+ 			break;
+@@ -612,7 +614,6 @@ void hfsplus_file_truncate(struct inode
+ 		alloc_cnt = start;
+ 		hip->cached_start = hip->cached_blocks = 0;
+ 		hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
+-		mutex_lock(&fd.tree->tree_lock);
+ 	}
+ 	hfs_find_exit(&fd);
+
diff --git a/queue-5.11/kasan-fix-unit-tests-with-config_ubsan_local_bounds-enabled.patch b/queue-5.11/kasan-fix-unit-tests-with-config_ubsan_local_bounds-enabled.patch
new file mode 100644
index 00000000000..574bcc95091
--- /dev/null
+++ b/queue-5.11/kasan-fix-unit-tests-with-config_ubsan_local_bounds-enabled.patch
@@ -0,0 +1,96 @@
+From f649dc0e0d7b509c75570ee403723660f5b72ec7 Mon Sep 17 00:00:00 2001
+From: Peter Collingbourne
+Date: Fri, 14 May 2021 17:27:27 -0700
+Subject: kasan: fix unit tests with CONFIG_UBSAN_LOCAL_BOUNDS enabled
+
+From: Peter Collingbourne
+
+commit f649dc0e0d7b509c75570ee403723660f5b72ec7 upstream.
+
+These tests deliberately access these arrays out of bounds, which will
+cause the dynamic local bounds checks inserted by
+CONFIG_UBSAN_LOCAL_BOUNDS to fail and panic the kernel. To avoid this
+problem, access the arrays via volatile pointers, which will prevent the
+compiler from being able to determine the array bounds.
+ +These accesses use volatile pointers to char (char *volatile) rather than +the more conventional pointers to volatile char (volatile char *) because +we want to prevent the compiler from making inferences about the pointer +itself (i.e. its array bounds), not the data that it refers to. + +Link: https://lkml.kernel.org/r/20210507025915.1464056-1-pcc@google.com +Link: https://linux-review.googlesource.com/id/I90b1713fbfa1bf68ff895aef099ea77b98a7c3b9 +Signed-off-by: Peter Collingbourne +Tested-by: Alexander Potapenko +Reviewed-by: Andrey Konovalov +Cc: Peter Collingbourne +Cc: George Popescu +Cc: Elena Petrova +Cc: Evgenii Stepanov +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + lib/test_kasan.c | 29 +++++++++++++++++++++++------ + 1 file changed, 23 insertions(+), 6 deletions(-) + +--- a/lib/test_kasan.c ++++ b/lib/test_kasan.c +@@ -449,8 +449,20 @@ static char global_array[10]; + + static void kasan_global_oob(struct kunit *test) + { +- volatile int i = 3; +- char *p = &global_array[ARRAY_SIZE(global_array) + i]; ++ /* ++ * Deliberate out-of-bounds access. To prevent CONFIG_UBSAN_LOCAL_BOUNDS ++ * from failing here and panicing the kernel, access the array via a ++ * volatile pointer, which will prevent the compiler from being able to ++ * determine the array bounds. ++ * ++ * This access uses a volatile pointer to char (char *volatile) rather ++ * than the more conventional pointer to volatile char (volatile char *) ++ * because we want to prevent the compiler from making inferences about ++ * the pointer itself (i.e. its array bounds), not the data that it ++ * refers to. ++ */ ++ char *volatile array = global_array; ++ char *p = &array[ARRAY_SIZE(global_array) + 3]; + + /* Only generic mode instruments globals. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { +@@ -479,8 +491,9 @@ static void ksize_unpoisons_memory(struc + static void kasan_stack_oob(struct kunit *test) + { + char stack_array[10]; +- volatile int i = OOB_TAG_OFF; +- char *p = &stack_array[ARRAY_SIZE(stack_array) + i]; ++ /* See comment in kasan_global_oob. */ ++ char *volatile array = stack_array; ++ char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF]; + + if (!IS_ENABLED(CONFIG_KASAN_STACK)) { + kunit_info(test, "CONFIG_KASAN_STACK is not enabled"); +@@ -494,7 +507,9 @@ static void kasan_alloca_oob_left(struct + { + volatile int i = 10; + char alloca_array[i]; +- char *p = alloca_array - 1; ++ /* See comment in kasan_global_oob. */ ++ char *volatile array = alloca_array; ++ char *p = array - 1; + + /* Only generic mode instruments dynamic allocas. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { +@@ -514,7 +529,9 @@ static void kasan_alloca_oob_right(struc + { + volatile int i = 10; + char alloca_array[i]; +- char *p = alloca_array + i; ++ /* See comment in kasan_global_oob. */ ++ char *volatile array = alloca_array; ++ char *p = array + i; + + /* Only generic mode instruments dynamic allocas. 
*/ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { diff --git a/queue-5.11/kvm-exit-halt-polling-on-need_resched-as-well.patch b/queue-5.11/kvm-exit-halt-polling-on-need_resched-as-well.patch new file mode 100644 index 00000000000..88ee88b5727 --- /dev/null +++ b/queue-5.11/kvm-exit-halt-polling-on-need_resched-as-well.patch @@ -0,0 +1,38 @@ +From 262de4102c7bb8e59f26a967a8ffe8cce85cc537 Mon Sep 17 00:00:00 2001 +From: Benjamin Segall +Date: Thu, 29 Apr 2021 16:22:34 +0000 +Subject: kvm: exit halt polling on need_resched() as well + +From: Benjamin Segall + +commit 262de4102c7bb8e59f26a967a8ffe8cce85cc537 upstream. + +single_task_running() is usually more general than need_resched() +but CFS_BANDWIDTH throttling will use resched_task() when there +is just one task to get the task to block. This was causing +long-need_resched warnings and was likely allowing VMs to +overrun their quota when halt polling. + +Signed-off-by: Ben Segall +Signed-off-by: Venkatesh Srinivas +Message-Id: <20210429162233.116849-1-venkateshs@chromium.org> +Signed-off-by: Paolo Bonzini +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Signed-off-by: Greg Kroah-Hartman +--- + virt/kvm/kvm_main.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -2814,7 +2814,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcp + goto out; + } + poll_end = cur = ktime_get(); +- } while (single_task_running() && ktime_before(cur, stop)); ++ } while (single_task_running() && !need_resched() && ++ ktime_before(cur, stop)); + } + + prepare_to_rcuwait(&vcpu->wait); diff --git a/queue-5.11/mm-hugetlb-fix-f_seal_future_write.patch b/queue-5.11/mm-hugetlb-fix-f_seal_future_write.patch new file mode 100644 index 00000000000..03d3c280d4e --- /dev/null +++ b/queue-5.11/mm-hugetlb-fix-f_seal_future_write.patch @@ -0,0 +1,150 @@ +From 22247efd822e6d263f3c8bd327f3f769aea9b1d9 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 14 May 2021 17:27:04 -0700 +Subject: mm/hugetlb: fix F_SEAL_FUTURE_WRITE + +From: Peter Xu + +commit 22247efd822e6d263f3c8bd327f3f769aea9b1d9 upstream. + +Patch series "mm/hugetlb: Fix issues on file sealing and fork", v2. + +Hugh reported issue with F_SEAL_FUTURE_WRITE not applied correctly to +hugetlbfs, which I can easily verify using the memfd_test program, which +seems that the program is hardly run with hugetlbfs pages (as by default +shmem). + +Meanwhile I found another probably even more severe issue on that hugetlb +fork won't wr-protect child cow pages, so child can potentially write to +parent private pages. Patch 2 addresses that. + +After this series applied, "memfd_test hugetlbfs" should start to pass. + +This patch (of 2): + +F_SEAL_FUTURE_WRITE is missing for hugetlb starting from the first day. +There is a test program for that and it fails constantly. + +$ ./memfd_test hugetlbfs +memfd-hugetlb: CREATE +memfd-hugetlb: BASIC +memfd-hugetlb: SEAL-WRITE +memfd-hugetlb: SEAL-FUTURE-WRITE +mmap() didn't fail as expected +Aborted (core dumped) + +I think it's probably because no one is really running the hugetlbfs test. + +Fix it by checking FUTURE_WRITE also in hugetlbfs_file_mmap() as what we +do in shmem_mmap(). Generalize a helper for that. 
+ +Link: https://lkml.kernel.org/r/20210503234356.9097-1-peterx@redhat.com +Link: https://lkml.kernel.org/r/20210503234356.9097-2-peterx@redhat.com +Fixes: ab3948f58ff84 ("mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd") +Signed-off-by: Peter Xu +Reported-by: Hugh Dickins +Reviewed-by: Mike Kravetz +Cc: Joel Fernandes (Google) +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/hugetlbfs/inode.c | 5 +++++ + include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ + mm/shmem.c | 22 ++++------------------ + 3 files changed, 41 insertions(+), 18 deletions(-) + +--- a/fs/hugetlbfs/inode.c ++++ b/fs/hugetlbfs/inode.c +@@ -131,6 +131,7 @@ static void huge_pagevec_release(struct + static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) + { + struct inode *inode = file_inode(file); ++ struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + loff_t len, vma_len; + int ret; + struct hstate *h = hstate_file(file); +@@ -146,6 +147,10 @@ static int hugetlbfs_file_mmap(struct fi + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + vma->vm_ops = &hugetlb_vm_ops; + ++ ret = seal_check_future_write(info->seals, vma); ++ if (ret) ++ return ret; ++ + /* + * page based offset in vm_pgoff could be sufficiently large to + * overflow a loff_t when converted to byte offset. This can +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3191,5 +3191,37 @@ unsigned long wp_shared_mapping_range(st + + extern int sysctl_nr_trim_pages; + ++/** ++ * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it ++ * @seals: the seals to check ++ * @vma: the vma to operate on ++ * ++ * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on ++ * the vma flags. Return 0 if check pass, or <0 for errors. ++ */ ++static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) ++{ ++ if (seals & F_SEAL_FUTURE_WRITE) { ++ /* ++ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when ++ * "future write" seal active. ++ */ ++ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) ++ return -EPERM; ++ ++ /* ++ * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as ++ * MAP_SHARED and read-only, take care to not allow mprotect to ++ * revert protections on such mappings. Do this only for shared ++ * mappings. For private mappings, don't need to mask ++ * VM_MAYWRITE as we still want them to be COW-writable. ++ */ ++ if (vma->vm_flags & VM_SHARED) ++ vma->vm_flags &= ~(VM_MAYWRITE); ++ } ++ ++ return 0; ++} ++ + #endif /* __KERNEL__ */ + #endif /* _LINUX_MM_H */ +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -2256,25 +2256,11 @@ out_nomem: + static int shmem_mmap(struct file *file, struct vm_area_struct *vma) + { + struct shmem_inode_info *info = SHMEM_I(file_inode(file)); ++ int ret; + +- if (info->seals & F_SEAL_FUTURE_WRITE) { +- /* +- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when +- * "future write" seal active. +- */ +- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) +- return -EPERM; +- +- /* +- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as +- * MAP_SHARED and read-only, take care to not allow mprotect to +- * revert protections on such mappings. Do this only for shared +- * mappings. For private mappings, don't need to mask +- * VM_MAYWRITE as we still want them to be COW-writable. 
+- */ +- if (vma->vm_flags & VM_SHARED) +- vma->vm_flags &= ~(VM_MAYWRITE); +- } ++ ret = seal_check_future_write(info->seals, vma); ++ if (ret) ++ return ret; + + /* arm64 - allow memory tagging on RAM-based files */ + vma->vm_flags |= VM_MTE_ALLOWED; diff --git a/queue-5.11/powerpc-64s-fix-crashes-when-toggling-entry-flush-barrier.patch b/queue-5.11/powerpc-64s-fix-crashes-when-toggling-entry-flush-barrier.patch new file mode 100644 index 00000000000..8f9cdcc4d6b --- /dev/null +++ b/queue-5.11/powerpc-64s-fix-crashes-when-toggling-entry-flush-barrier.patch @@ -0,0 +1,70 @@ +From aec86b052df6541cc97c5fca44e5934cbea4963b Mon Sep 17 00:00:00 2001 +From: Michael Ellerman +Date: Thu, 6 May 2021 14:49:59 +1000 +Subject: powerpc/64s: Fix crashes when toggling entry flush barrier + +From: Michael Ellerman + +commit aec86b052df6541cc97c5fca44e5934cbea4963b upstream. + +The entry flush mitigation can be enabled/disabled at runtime via a +debugfs file (entry_flush), which causes the kernel to patch itself to +enable/disable the relevant mitigations. + +However depending on which mitigation we're using, it may not be safe to +do that patching while other CPUs are active. For example the following +crash: + + sleeper[15639]: segfault (11) at c000000000004c20 nip c000000000004c20 lr c000000000004c20 + +Shows that we returned to userspace with a corrupted LR that points into +the kernel, due to executing the partially patched call to the fallback +entry flush (ie. we missed the LR restore). + +Fix it by doing the patching under stop machine. The CPUs that aren't +doing the patching will be spinning in the core of the stop machine +logic. That is currently sufficient for our purposes, because none of +the patching we do is to that code or anywhere in the vicinity. + +Fixes: f79643787e0a ("powerpc/64s: flush L1D on kernel entry") +Cc: stable@vger.kernel.org # v5.10+ +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210506044959.1298123-2-mpe@ellerman.id.au +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/lib/feature-fixups.c | 16 +++++++++++++++- + 1 file changed, 15 insertions(+), 1 deletion(-) + +--- a/arch/powerpc/lib/feature-fixups.c ++++ b/arch/powerpc/lib/feature-fixups.c +@@ -299,8 +299,9 @@ void do_uaccess_flush_fixups(enum l1d_fl + : "unknown"); + } + +-void do_entry_flush_fixups(enum l1d_flush_type types) ++static int __do_entry_flush_fixups(void *data) + { ++ enum l1d_flush_type types = *(enum l1d_flush_type *)data; + unsigned int instrs[3], *dest; + long *start, *end; + int i; +@@ -369,6 +370,19 @@ void do_entry_flush_fixups(enum l1d_flus + : "ori type" : + (types & L1D_FLUSH_MTTRIG) ? "mttrig type" + : "unknown"); ++ ++ return 0; ++} ++ ++void do_entry_flush_fixups(enum l1d_flush_type types) ++{ ++ /* ++ * The call to the fallback flush can not be safely patched in/out while ++ * other CPUs are executing it. So call __do_entry_flush_fixups() on one ++ * CPU while all other CPUs spin in the stop machine core with interrupts ++ * hard disabled. 
++ */ ++ stop_machine(__do_entry_flush_fixups, &types, NULL); + } + + void do_rfi_flush_fixups(enum l1d_flush_type types) diff --git a/queue-5.11/powerpc-64s-fix-crashes-when-toggling-stf-barrier.patch b/queue-5.11/powerpc-64s-fix-crashes-when-toggling-stf-barrier.patch new file mode 100644 index 00000000000..c9ef9030a6e --- /dev/null +++ b/queue-5.11/powerpc-64s-fix-crashes-when-toggling-stf-barrier.patch @@ -0,0 +1,78 @@ +From 8ec7791bae1327b1c279c5cd6e929c3b12daaf0a Mon Sep 17 00:00:00 2001 +From: Michael Ellerman +Date: Thu, 6 May 2021 14:49:58 +1000 +Subject: powerpc/64s: Fix crashes when toggling stf barrier + +From: Michael Ellerman + +commit 8ec7791bae1327b1c279c5cd6e929c3b12daaf0a upstream. + +The STF (store-to-load forwarding) barrier mitigation can be +enabled/disabled at runtime via a debugfs file (stf_barrier), which +causes the kernel to patch itself to enable/disable the relevant +mitigations. + +However depending on which mitigation we're using, it may not be safe to +do that patching while other CPUs are active. For example the following +crash: + + User access of kernel address (c00000003fff5af0) - exploit attempt? (uid: 0) + segfault (11) at c00000003fff5af0 nip 7fff8ad12198 lr 7fff8ad121f8 code 1 + code: 40820128 e93c00d0 e9290058 7c292840 40810058 38600000 4bfd9a81 e8410018 + code: 2c030006 41810154 3860ffb6 e9210098 7d295279 39400000 40820a3c + +Shows that we returned to userspace without restoring the user r13 +value, due to executing the partially patched STF exit code. + +Fix it by doing the patching under stop machine. The CPUs that aren't +doing the patching will be spinning in the core of the stop machine +logic. That is currently sufficient for our purposes, because none of +the patching we do is to that code or anywhere in the vicinity. + +Fixes: a048a07d7f45 ("powerpc/64s: Add support for a store forwarding barrier at kernel entry/exit") +Cc: stable@vger.kernel.org # v4.17+ +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210506044959.1298123-1-mpe@ellerman.id.au +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/lib/feature-fixups.c | 19 +++++++++++++++++-- + 1 file changed, 17 insertions(+), 2 deletions(-) + +--- a/arch/powerpc/lib/feature-fixups.c ++++ b/arch/powerpc/lib/feature-fixups.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -227,11 +228,25 @@ static void do_stf_exit_barrier_fixups(e + : "unknown"); + } + ++static int __do_stf_barrier_fixups(void *data) ++{ ++ enum stf_barrier_type *types = data; ++ ++ do_stf_entry_barrier_fixups(*types); ++ do_stf_exit_barrier_fixups(*types); ++ ++ return 0; ++} + + void do_stf_barrier_fixups(enum stf_barrier_type types) + { +- do_stf_entry_barrier_fixups(types); +- do_stf_exit_barrier_fixups(types); ++ /* ++ * The call to the fallback entry flush, and the fallback/sync-ori exit ++ * flush can not be safely patched in/out while other CPUs are executing ++ * them. So call __do_stf_barrier_fixups() on one CPU while all other CPUs ++ * spin in the stop machine core with interrupts hard disabled. 
++ */ ++ stop_machine(__do_stf_barrier_fixups, &types, NULL); + } + + void do_uaccess_flush_fixups(enum l1d_flush_type types) diff --git a/queue-5.11/series b/queue-5.11/series index 3e4350137b3..7dcd05bc762 100644 --- a/queue-5.11/series +++ b/queue-5.11/series @@ -226,3 +226,24 @@ i40e-fix-use-after-free-in-i40e_client_subtask.patch i40e-fix-the-restart-auto-negotiation-after-fec-modi.patch i40e-fix-phy-type-identifiers-for-2.5g-and-5g-adapte.patch mptcp-fix-splat-when-closing-unaccepted-socket.patch +arc-entry-fix-off-by-one-error-in-syscall-number-validation.patch +arc-mm-pae-use-40-bit-physical-page-mask.patch +arc-mm-use-max_high_pfn-as-a-highmem-zone-border.patch +sh-remove-unused-variable.patch +powerpc-64s-fix-crashes-when-toggling-stf-barrier.patch +powerpc-64s-fix-crashes-when-toggling-entry-flush-barrier.patch +hfsplus-prevent-corruption-in-shrinking-truncate.patch +squashfs-fix-divide-error-in-calculate_skip.patch +userfaultfd-release-page-in-error-path-to-avoid-bug_on.patch +kasan-fix-unit-tests-with-config_ubsan_local_bounds-enabled.patch +mm-hugetlb-fix-f_seal_future_write.patch +blk-iocost-fix-weight-updates-of-inner-active-iocgs.patch +x86-sched-fix-the-amd-cppc-maximum-performance-value-on-certain-amd-ryzen-generations.patch +arm64-mte-initialize-rgsr_el1.seed-in-__cpu_setup.patch +arm64-fix-race-condition-on-pg_dcache_clean-in-__sync_icache_dcache.patch +btrfs-fix-deadlock-when-cloning-inline-extents-and-using-qgroups.patch +btrfs-fix-race-leading-to-unpersisted-data-and-metadata-on-fsync.patch +drm-radeon-dpm-disable-sclk-switching-on-oland-when-two-4k-60hz-monitors-are-connected.patch +drm-amd-display-initialize-attribute-for-hdcp_srm-sysfs-file.patch +drm-i915-avoid-div-by-zero-on-gen2.patch +kvm-exit-halt-polling-on-need_resched-as-well.patch diff --git a/queue-5.11/sh-remove-unused-variable.patch b/queue-5.11/sh-remove-unused-variable.patch new file mode 100644 index 00000000000..12b0fd8f6bf --- /dev/null +++ b/queue-5.11/sh-remove-unused-variable.patch @@ -0,0 +1,38 @@ +From 0d3ae948741ac6d80e39ab27b45297367ee477de Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Wed, 14 Apr 2021 10:05:17 -0700 +Subject: sh: Remove unused variable +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Eric Dumazet + +commit 0d3ae948741ac6d80e39ab27b45297367ee477de upstream. 
+ +Removes this annoying warning: + +arch/sh/kernel/traps.c: In function ‘nmi_trap_handler’: +arch/sh/kernel/traps.c:183:15: warning: unused variable ‘cpu’ [-Wunused-variable] + 183 | unsigned int cpu = smp_processor_id(); + +Fixes: fe3f1d5d7cd3 ("sh: Get rid of nmi_count()") +Signed-off-by: Eric Dumazet +Signed-off-by: Thomas Gleixner +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20210414170517.1205430-1-eric.dumazet@gmail.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/sh/kernel/traps.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/arch/sh/kernel/traps.c ++++ b/arch/sh/kernel/traps.c +@@ -180,7 +180,6 @@ static inline void arch_ftrace_nmi_exit( + + BUILD_TRAP_HANDLER(nmi) + { +- unsigned int cpu = smp_processor_id(); + TRAP_HANDLER_DECL; + + arch_ftrace_nmi_enter(); diff --git a/queue-5.11/squashfs-fix-divide-error-in-calculate_skip.patch b/queue-5.11/squashfs-fix-divide-error-in-calculate_skip.patch new file mode 100644 index 00000000000..1894fad2bd4 --- /dev/null +++ b/queue-5.11/squashfs-fix-divide-error-in-calculate_skip.patch @@ -0,0 +1,53 @@ +From d6e621de1fceb3b098ebf435ef7ea91ec4838a1a Mon Sep 17 00:00:00 2001 +From: Phillip Lougher +Date: Fri, 14 May 2021 17:27:16 -0700 +Subject: squashfs: fix divide error in calculate_skip() + +From: Phillip Lougher + +commit d6e621de1fceb3b098ebf435ef7ea91ec4838a1a upstream. + +Sysbot has reported a "divide error" which has been identified as being +caused by a corrupted file_size value within the file inode. This value +has been corrupted to a much larger value than expected. + +Calculate_skip() is passed i_size_read(inode) >> msblk->block_log. Due to +the file_size value corruption this overflows the int argument/variable in +that function, leading to the divide error. + +This patch changes the function to use u64. This will accommodate any +unexpectedly large values due to corruption. + +The value returned from calculate_skip() is clamped to be never more than +SQUASHFS_CACHED_BLKS - 1, or 7. So file_size corruption does not lead to +an unexpectedly large return result here. + +Link: https://lkml.kernel.org/r/20210507152618.9447-1-phillip@squashfs.org.uk +Signed-off-by: Phillip Lougher +Reported-by: +Reported-by: +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/squashfs/file.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/squashfs/file.c ++++ b/fs/squashfs/file.c +@@ -211,11 +211,11 @@ failure: + * If the skip factor is limited in this way then the file will use multiple + * slots. + */ +-static inline int calculate_skip(int blocks) ++static inline int calculate_skip(u64 blocks) + { +- int skip = blocks / ((SQUASHFS_META_ENTRIES + 1) ++ u64 skip = blocks / ((SQUASHFS_META_ENTRIES + 1) + * SQUASHFS_META_INDEXES); +- return min(SQUASHFS_CACHED_BLKS - 1, skip + 1); ++ return min((u64) SQUASHFS_CACHED_BLKS - 1, skip + 1); + } + + diff --git a/queue-5.11/userfaultfd-release-page-in-error-path-to-avoid-bug_on.patch b/queue-5.11/userfaultfd-release-page-in-error-path-to-avoid-bug_on.patch new file mode 100644 index 00000000000..a7d97ea6e69 --- /dev/null +++ b/queue-5.11/userfaultfd-release-page-in-error-path-to-avoid-bug_on.patch @@ -0,0 +1,64 @@ +From 7ed9d238c7dbb1fdb63ad96a6184985151b0171c Mon Sep 17 00:00:00 2001 +From: Axel Rasmussen +Date: Fri, 14 May 2021 17:27:19 -0700 +Subject: userfaultfd: release page in error path to avoid BUG_ON + +From: Axel Rasmussen + +commit 7ed9d238c7dbb1fdb63ad96a6184985151b0171c upstream. 
+
+Consider the following sequence of events:
+
+1. Userspace issues a UFFD ioctl, which ends up calling into
+   shmem_mfill_atomic_pte(). We successfully account the blocks, we
+   shmem_alloc_page(), but then the copy_from_user() fails. We return
+   -ENOENT. We don't release the page we allocated.
+2. Our caller detects this error code, tries the copy_from_user() after
+   dropping the mmap_lock, and retries, calling back into
+   shmem_mfill_atomic_pte().
+3. Meanwhile, let's say another process filled up the tmpfs being used.
+4. So shmem_mfill_atomic_pte() fails to account blocks this time, and
+   immediately returns - without releasing the page.
+
+This triggers a BUG_ON in our caller, which asserts that the page
+should always be consumed, unless -ENOENT is returned.
+
+To fix this, detect if we have such a "dangling" page when accounting
+fails, and if so, release it before returning.
+
+Link: https://lkml.kernel.org/r/20210428230858.348400-1-axelrasmussen@google.com
+Fixes: cb658a453b93 ("userfaultfd: shmem: avoid leaking blocks and used blocks in UFFDIO_COPY")
+Signed-off-by: Axel Rasmussen
+Reported-by: Hugh Dickins
+Acked-by: Hugh Dickins
+Reviewed-by: Peter Xu
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/shmem.c | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -2373,8 +2373,18 @@ static int shmem_mfill_atomic_pte(struct
+ pgoff_t offset, max_off;
+
+ ret = -ENOMEM;
+- if (!shmem_inode_acct_block(inode, 1))
++ if (!shmem_inode_acct_block(inode, 1)) {
++ /*
++ * We may have got a page, returned -ENOENT triggering a retry,
++ * and now we find ourselves with -ENOMEM. Release the page, to
++ * avoid a BUG_ON in our caller.
++ */
++ if (unlikely(*pagep)) {
++ put_page(*pagep);
++ *pagep = NULL;
++ }
+ goto out;
++ }
+
+ if (!*pagep) {
+ page = shmem_alloc_page(gfp, info, pgoff);
diff --git a/queue-5.11/x86-sched-fix-the-amd-cppc-maximum-performance-value-on-certain-amd-ryzen-generations.patch b/queue-5.11/x86-sched-fix-the-amd-cppc-maximum-performance-value-on-certain-amd-ryzen-generations.patch
new file mode 100644
index 00000000000..148e5943a5d
--- /dev/null
+++ b/queue-5.11/x86-sched-fix-the-amd-cppc-maximum-performance-value-on-certain-amd-ryzen-generations.patch
@@ -0,0 +1,106 @@
+From 3743d55b289c203d8f77b7cd47c24926b9d186ae Mon Sep 17 00:00:00 2001
+From: Huang Rui
+Date: Sun, 25 Apr 2021 15:34:51 +0800
+Subject: x86, sched: Fix the AMD CPPC maximum performance value on certain AMD Ryzen generations
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Huang Rui
+
+commit 3743d55b289c203d8f77b7cd47c24926b9d186ae upstream.
+
+Some AMD Ryzen generations use a different calculation method for the
+maximum performance value. 255 is not correct for all ASICs; some
+generations should use 166 as the maximum performance value instead.
+Otherwise, an incorrect frequency value is reported, like below:
+
+ ~ → lscpu | grep MHz
+ CPU MHz: 3400.000
+ CPU max MHz: 7228.3198
+ CPU min MHz: 2200.0000
+
+[ mingo: Tidied up whitespace use. ]
+[ Alexander Monakov : fix 225 -> 255 typo. ]
+
+Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems")
+Fixes: 3c55e94c0ade ("cpufreq: ACPI: Extend frequency tables to cover boost frequencies")
+Reported-by: Jason Bagavatsingham
+Fixed-by: Alexander Monakov
+Reviewed-by: Rafael J. Wysocki
+Signed-off-by: Huang Rui
+Signed-off-by: Ingo Molnar
+Tested-by: Jason Bagavatsingham
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210425073451.2557394-1-ray.huang@amd.com
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=211791
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/include/asm/processor.h | 2 ++
+ arch/x86/kernel/cpu/amd.c | 16 ++++++++++++++++
+ arch/x86/kernel/smpboot.c | 2 +-
+ drivers/cpufreq/acpi-cpufreq.c | 6 +++++-
+ 4 files changed, 24 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -805,8 +805,10 @@ DECLARE_PER_CPU(u64, msr_misc_features_s
+
+ #ifdef CONFIG_CPU_SUP_AMD
+ extern u32 amd_get_nodes_per_socket(void);
++extern u32 amd_get_highest_perf(void);
+ #else
+ static inline u32 amd_get_nodes_per_socket(void) { return 0; }
++static inline u32 amd_get_highest_perf(void) { return 0; }
+ #endif
+
+ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
+--- a/arch/x86/kernel/cpu/amd.c
++++ b/arch/x86/kernel/cpu/amd.c
+@@ -1170,3 +1170,19 @@ void set_dr_addr_mask(unsigned long mask
+ break;
+ }
+ }
++
++u32 amd_get_highest_perf(void)
++{
++ struct cpuinfo_x86 *c = &boot_cpu_data;
++
++ if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
++ (c->x86_model >= 0x70 && c->x86_model < 0x80)))
++ return 166;
++
++ if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
++ (c->x86_model >= 0x40 && c->x86_model < 0x70)))
++ return 166;
++
++ return 255;
++}
++EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+--- a/arch/x86/kernel/smpboot.c
++++ b/arch/x86/kernel/smpboot.c
+@@ -2046,7 +2046,7 @@ static bool amd_set_max_freq_ratio(void)
+ return false;
+ }
+
+- highest_perf = perf_caps.highest_perf;
++ highest_perf = amd_get_highest_perf();
+ nominal_perf = perf_caps.nominal_perf;
+
+ if (!highest_perf || !nominal_perf) {
+--- a/drivers/cpufreq/acpi-cpufreq.c
++++ b/drivers/cpufreq/acpi-cpufreq.c
+@@ -646,7 +646,11 @@ static u64 get_max_boost_ratio(unsigned
+ return 0;
+ }
+
+- highest_perf = perf_caps.highest_perf;
++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
++ highest_perf = amd_get_highest_perf();
++ else
++ highest_perf = perf_caps.highest_perf;
++
+ nominal_perf = perf_caps.nominal_perf;
+
+ if (!highest_perf || !nominal_perf) {
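A back-of-the-envelope check of the numbers above (an illustration for this queue, not from the patch): the reported maximum frequency scales linearly with highest_perf, so if the bogus "CPU max MHz: 7228.3198" came from assuming highest_perf == 255 on a part whose correct value is 166, rescaling by 166/255 should land near the real boost clock.

#include <stdio.h>

int main(void)
{
	/* Value from the bug report quoted in the commit message. */
	double bogus_max_mhz = 7228.3198;

	/* Apply the same correction the fixed kernel makes: 166, not 255. */
	printf("rescaled max: %.1f MHz\n", bogus_max_mhz * 166.0 / 255.0);
	return 0;
}

This prints roughly 4705 MHz, a plausible Ryzen boost frequency, consistent with the 255 assumption having inflated the estimate by about 54%.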