--- /dev/null
+From 3433adc8bd09fc9f29b8baddf33b4ecd1ecd2cdc Mon Sep 17 00:00:00 2001
+From: Vineet Gupta <vgupta@synopsys.com>
+Date: Fri, 23 Apr 2021 12:16:25 -0700
+Subject: ARC: entry: fix off-by-one error in syscall number validation
+
+From: Vineet Gupta <vgupta@synopsys.com>
+
+commit 3433adc8bd09fc9f29b8baddf33b4ecd1ecd2cdc upstream.
+
+We have NR_syscalls syscalls, numbered [0 .. NR_syscalls-1].
+However the check for an invalid syscall number is "> NR_syscalls" as
+opposed to ">=". This off-by-one error erroneously allows "NR_syscalls"
+to be treated as a valid syscall, causing an out-of-bounds access into
+the syscall call table and ensuing a crash (holes within the syscall
+table have an invalid-entry handler, but this access is beyond the array
+implementing the table).
+
+This problem showed up on the v5.6 kernel when testing glibc 2.33 (which
+is v5.10 kernel capable, including the faccessat2 syscall, number 439).
+The v5.6 kernel has NR_syscalls=439 (0 to 438). Due to the bug, syscall
+number 439 passed by glibc was not handled as -ENOSYS but processed,
+leading to a crash.
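+
+For reference, the intended validation in a C dispatcher uses ">=" rather
+than ">". A minimal user-space sketch (hypothetical table, not the ARC
+assembly touched below):
+
+  #include <stdio.h>
+
+  #define NR_syscalls 3
+  static const char *sys_call_table[NR_syscalls] = { "sys_a", "sys_b", "sys_c" };
+
+  static const char *dispatch(unsigned int nr)
+  {
+          if (nr >= NR_syscalls)  /* the buggy check behaved like nr > NR_syscalls */
+                  return "ENOSYS";
+          return sys_call_table[nr];
+  }
+
+  int main(void)
+  {
+          /* nr == NR_syscalls must yield ENOSYS, not an out-of-bounds read */
+          printf("%s\n", dispatch(NR_syscalls));
+          return 0;
+  }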
+
+Link: https://github.com/foss-for-synopsys-dwc-arc-processors/linux/issues/48
+Reported-by: Shahab Vahedi <shahab@synopsys.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arc/kernel/entry.S | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/arc/kernel/entry.S
++++ b/arch/arc/kernel/entry.S
+@@ -177,7 +177,7 @@ tracesys:
+
+ ; Do the Sys Call as we normally would.
+ ; Validate the Sys Call number
+- cmp r8, NR_syscalls
++ cmp r8, NR_syscalls - 1
+ mov.hi r0, -ENOSYS
+ bhi tracesys_exit
+
+@@ -255,7 +255,7 @@ ENTRY(EV_Trap)
+ ;============ Normal syscall case
+
+ ; syscall num shd not exceed the total system calls avail
+- cmp r8, NR_syscalls
++ cmp r8, NR_syscalls - 1
+ mov.hi r0, -ENOSYS
+ bhi .Lret_from_system_call
+
--- /dev/null
+From c5f756d8c6265ebb1736a7787231f010a3b782e5 Mon Sep 17 00:00:00 2001
+From: Vladimir Isaev <isaev@synopsys.com>
+Date: Tue, 27 Apr 2021 15:12:37 +0300
+Subject: ARC: mm: PAE: use 40-bit physical page mask
+
+From: Vladimir Isaev <isaev@synopsys.com>
+
+commit c5f756d8c6265ebb1736a7787231f010a3b782e5 upstream.
+
+The 32-bit PAGE_MASK cannot be used as a mask for physical addresses
+when PAE is enabled. PAGE_MASK_PHYS must be used for physical
+addresses instead of PAGE_MASK.
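+
+A small user-space illustration of the truncation (assumes 8 KiB pages as
+on ARC; uint32_t stands in for the kernel's 32-bit unsigned long):
+
+  #include <stdio.h>
+  #include <stdint.h>
+
+  #define PAGE_SIZE       0x2000u
+  #define PAGE_MASK       ((uint32_t)~(PAGE_SIZE - 1))             /* 32-bit */
+  #define PAGE_MASK_PHYS  (0xff00000000ull | PAGE_MASK)            /* 40-bit */
+
+  int main(void)
+  {
+          uint64_t paddr = 0x1234567890ull;    /* PAE physical address > 4 GiB */
+
+          /* PAGE_MASK silently drops bits [39:32]: prints 0x34566000 */
+          printf("PAGE_MASK      -> %#llx\n", (unsigned long long)(paddr & PAGE_MASK));
+          /* PAGE_MASK_PHYS keeps them: prints 0x1234566000 */
+          printf("PAGE_MASK_PHYS -> %#llx\n", (unsigned long long)(paddr & PAGE_MASK_PHYS));
+          return 0;
+  }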
+
+Without this, init gets a SIGSEGV if pte_modify() is called:
+
+| potentially unexpected fatal signal 11.
+| Path: /bin/busybox
+| CPU: 0 PID: 1 Comm: init Not tainted 5.12.0-rc5-00003-g1e43c377a79f-dirty
+| Insn could not be fetched
+| @No matching VMA found
+| ECR: 0x00040000 EFA: 0x00000000 ERET: 0x00000000
+| STAT: 0x80080082 [IE U ] BTA: 0x00000000
+| SP: 0x5f9ffe44 FP: 0x00000000 BLK: 0xaf3d4
+| LPS: 0x000d093e LPE: 0x000d0950 LPC: 0x00000000
+| r00: 0x00000002 r01: 0x5f9fff14 r02: 0x5f9fff20
+| ...
+| Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
+
+Signed-off-by: Vladimir Isaev <isaev@synopsys.com>
+Reported-by: kernel test robot <lkp@intel.com>
+Cc: Vineet Gupta <vgupta@synopsys.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arc/include/asm/page.h | 12 ++++++++++++
+ arch/arc/include/asm/pgtable.h | 12 +++---------
+ arch/arc/include/uapi/asm/page.h | 1 -
+ arch/arc/mm/ioremap.c | 5 +++--
+ arch/arc/mm/tlb.c | 2 +-
+ 5 files changed, 19 insertions(+), 13 deletions(-)
+
+--- a/arch/arc/include/asm/page.h
++++ b/arch/arc/include/asm/page.h
+@@ -7,6 +7,18 @@
+
+ #include <uapi/asm/page.h>
+
++#ifdef CONFIG_ARC_HAS_PAE40
++
++#define MAX_POSSIBLE_PHYSMEM_BITS 40
++#define PAGE_MASK_PHYS (0xff00000000ull | PAGE_MASK)
++
++#else /* CONFIG_ARC_HAS_PAE40 */
++
++#define MAX_POSSIBLE_PHYSMEM_BITS 32
++#define PAGE_MASK_PHYS PAGE_MASK
++
++#endif /* CONFIG_ARC_HAS_PAE40 */
++
+ #ifndef __ASSEMBLY__
+
+ #define clear_page(paddr) memset((paddr), 0, PAGE_SIZE)
+--- a/arch/arc/include/asm/pgtable.h
++++ b/arch/arc/include/asm/pgtable.h
+@@ -107,8 +107,8 @@
+ #define ___DEF (_PAGE_PRESENT | _PAGE_CACHEABLE)
+
+ /* Set of bits not changed in pte_modify */
+-#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_SPECIAL)
+-
++#define _PAGE_CHG_MASK (PAGE_MASK_PHYS | _PAGE_ACCESSED | _PAGE_DIRTY | \
++ _PAGE_SPECIAL)
+ /* More Abbrevaited helpers */
+ #define PAGE_U_NONE __pgprot(___DEF)
+ #define PAGE_U_R __pgprot(___DEF | _PAGE_READ)
+@@ -132,13 +132,7 @@
+ #define PTE_BITS_IN_PD0 (_PAGE_GLOBAL | _PAGE_PRESENT | _PAGE_HW_SZ)
+ #define PTE_BITS_RWX (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ)
+
+-#ifdef CONFIG_ARC_HAS_PAE40
+-#define PTE_BITS_NON_RWX_IN_PD1 (0xff00000000 | PAGE_MASK | _PAGE_CACHEABLE)
+-#define MAX_POSSIBLE_PHYSMEM_BITS 40
+-#else
+-#define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK | _PAGE_CACHEABLE)
+-#define MAX_POSSIBLE_PHYSMEM_BITS 32
+-#endif
++#define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK_PHYS | _PAGE_CACHEABLE)
+
+ /**************************************************************************
+ * Mapping of vm_flags (Generic VM) to PTE flags (arch specific)
+--- a/arch/arc/include/uapi/asm/page.h
++++ b/arch/arc/include/uapi/asm/page.h
+@@ -33,5 +33,4 @@
+
+ #define PAGE_MASK (~(PAGE_SIZE-1))
+
+-
+ #endif /* _UAPI__ASM_ARC_PAGE_H */
+--- a/arch/arc/mm/ioremap.c
++++ b/arch/arc/mm/ioremap.c
+@@ -53,9 +53,10 @@ EXPORT_SYMBOL(ioremap);
+ void __iomem *ioremap_prot(phys_addr_t paddr, unsigned long size,
+ unsigned long flags)
+ {
++ unsigned int off;
+ unsigned long vaddr;
+ struct vm_struct *area;
+- phys_addr_t off, end;
++ phys_addr_t end;
+ pgprot_t prot = __pgprot(flags);
+
+ /* Don't allow wraparound, zero size */
+@@ -72,7 +73,7 @@ void __iomem *ioremap_prot(phys_addr_t p
+
+ /* Mappings have to be page-aligned */
+ off = paddr & ~PAGE_MASK;
+- paddr &= PAGE_MASK;
++ paddr &= PAGE_MASK_PHYS;
+ size = PAGE_ALIGN(end + 1) - paddr;
+
+ /*
+--- a/arch/arc/mm/tlb.c
++++ b/arch/arc/mm/tlb.c
+@@ -576,7 +576,7 @@ void update_mmu_cache(struct vm_area_str
+ pte_t *ptep)
+ {
+ unsigned long vaddr = vaddr_unaligned & PAGE_MASK;
+- phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK;
++ phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS;
+ struct page *page = pfn_to_page(pte_pfn(*ptep));
+
+ create_tlb(vma, vaddr, ptep);
--- /dev/null
+From 1d5e4640e5df15252398c1b621f6bd432f2d7f17 Mon Sep 17 00:00:00 2001
+From: Vladimir Isaev <isaev@synopsys.com>
+Date: Tue, 27 Apr 2021 15:13:54 +0300
+Subject: ARC: mm: Use max_high_pfn as a HIGHMEM zone border
+
+From: Vladimir Isaev <isaev@synopsys.com>
+
+commit 1d5e4640e5df15252398c1b621f6bd432f2d7f17 upstream.
+
+Commit 4af22ded0ecf ("arc: fix memory initialization for systems
+with two memory banks") fixed highmem, but for the PAE case it causes
+bug messages:
+
+| BUG: Bad page state in process swapper pfn:80000
+| page:(ptrval) refcount:0 mapcount:1 mapping:00000000 index:0x0 pfn:0x80000 flags: 0x0()
+| raw: 00000000 00000100 00000122 00000000 00000000 00000000 00000000 00000000
+| raw: 00000000
+| page dumped because: nonzero mapcount
+| Modules linked in:
+| CPU: 0 PID: 0 Comm: swapper Not tainted 5.12.0-rc5-00003-g1e43c377a79f #1
+
+This is because the fix expects highmem to always be less than lowmem
+and uses min_low_pfn as the upper zone border for highmem.
+
+max_high_pfn should be correct for both the highmem and highmem+PAE cases.
+
+Fixes: 4af22ded0ecf ("arc: fix memory initialization for systems with two memory banks")
+Signed-off-by: Vladimir Isaev <isaev@synopsys.com>
+Cc: Mike Rapoport <rppt@linux.ibm.com>
+Cc: stable@vger.kernel.org #5.8 onwards
+Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arc/mm/init.c | 11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/arch/arc/mm/init.c
++++ b/arch/arc/mm/init.c
+@@ -157,7 +157,16 @@ void __init setup_arch_memory(void)
+ min_high_pfn = PFN_DOWN(high_mem_start);
+ max_high_pfn = PFN_DOWN(high_mem_start + high_mem_sz);
+
+- max_zone_pfn[ZONE_HIGHMEM] = min_low_pfn;
++ /*
++ * max_high_pfn should be ok here for both HIGHMEM and HIGHMEM+PAE.
++ * For HIGHMEM without PAE max_high_pfn should be less than
++ * min_low_pfn to guarantee that these two regions don't overlap.
++ * For PAE case highmem is greater than lowmem, so it is natural
++ * to use max_high_pfn.
++ *
++ * In both cases, holes should be handled by pfn_valid().
++ */
++ max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn;
+
+ high_memory = (void *)(min_high_pfn << PAGE_SHIFT);
+
--- /dev/null
+From 588a513d34257fdde95a9f0df0202e31998e85c6 Mon Sep 17 00:00:00 2001
+From: Catalin Marinas <catalin.marinas@arm.com>
+Date: Fri, 14 May 2021 10:50:01 +0100
+Subject: arm64: Fix race condition on PG_dcache_clean in __sync_icache_dcache()
+
+From: Catalin Marinas <catalin.marinas@arm.com>
+
+commit 588a513d34257fdde95a9f0df0202e31998e85c6 upstream.
+
+To ensure that instructions are observable in a new mapping, the arm64
+set_pte_at() implementation cleans the D-cache and invalidates the
+I-cache to the PoU. As an optimisation, this is only done on executable
+mappings and the PG_dcache_clean page flag is set to avoid future cache
+maintenance on the same page.
+
+When two different processes map the same page (e.g. private executable
+file or shared mapping) there's a potential race on checking and setting
+PG_dcache_clean via set_pte_at() -> __sync_icache_dcache(). While on the
+fault paths the page is locked (PG_locked), mprotect() does not take the
+page lock. The result is that one process may see the PG_dcache_clean
+flag set but the I/D cache maintenance not yet performed.
+
+Avoid test_and_set_bit(PG_dcache_clean) in favour of separate test_bit()
+and set_bit(). In the rare event of a race, the cache maintenance is
+done twice.
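+
+A condensed view of the window being closed (illustrative only):
+
+  CPU A (fault path, page locked)         CPU B (mprotect path, no page lock)
+  test_and_set_bit(PG_dcache_clean) -> 0
+  D-cache clean / I-cache invalidate
+  still in progress...                    test_and_set_bit() -> 1, skips the
+                                          maintenance and installs the
+                                          executable pte too early
+
+With separate test_bit()/set_bit(), the flag is only set after the
+maintenance has completed, so a racing CPU either sees it clear and
+repeats the maintenance, or sees it set once it is truly safe to skip.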
+
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: <stable@vger.kernel.org>
+Cc: Will Deacon <will@kernel.org>
+Cc: Steven Price <steven.price@arm.com>
+Reviewed-by: Steven Price <steven.price@arm.com>
+Acked-by: Will Deacon <will@kernel.org>
+Link: https://lore.kernel.org/r/20210514095001.13236-1-catalin.marinas@arm.com
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/mm/flush.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/arch/arm64/mm/flush.c
++++ b/arch/arm64/mm/flush.c
+@@ -55,8 +55,10 @@ void __sync_icache_dcache(pte_t pte)
+ {
+ struct page *page = pte_page(pte);
+
+- if (!test_and_set_bit(PG_dcache_clean, &page->flags))
++ if (!test_bit(PG_dcache_clean, &page->flags)) {
+ sync_icache_aliases(page_address(page), page_size(page));
++ set_bit(PG_dcache_clean, &page->flags);
++ }
+ }
+ EXPORT_SYMBOL_GPL(__sync_icache_dcache);
+
--- /dev/null
+From 37a8024d265564eba680575df6421f19db21dfce Mon Sep 17 00:00:00 2001
+From: Peter Collingbourne <pcc@google.com>
+Date: Fri, 7 May 2021 11:59:05 -0700
+Subject: arm64: mte: initialize RGSR_EL1.SEED in __cpu_setup
+
+From: Peter Collingbourne <pcc@google.com>
+
+commit 37a8024d265564eba680575df6421f19db21dfce upstream.
+
+A valid implementation choice for the ChooseRandomNonExcludedTag()
+pseudocode function used by IRG is to behave in the same way as with
+GCR_EL1.RRND=0. This would mean that RGSR_EL1.SEED is used as an LFSR
+which must have a non-zero value in order for IRG to properly produce
+pseudorandom numbers. However, RGSR_EL1 is reset to an UNKNOWN value
+on soft reset and thus may reset to 0. Therefore we must initialize
+RGSR_EL1.SEED to a non-zero value in order to ensure that IRG behaves
+as expected.
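+
+In C-like pseudocode, the sequence added to __cpu_setup() below amounts to:
+
+  seed = CNTVCT_EL0 & SYS_RGSR_EL1_SEED_MASK;
+  if (seed == 0)          /* csinc: never program an all-zero LFSR seed */
+          seed = 1;
+  RGSR_EL1 = seed << SYS_RGSR_EL1_SEED_SHIFT;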
+
+Signed-off-by: Peter Collingbourne <pcc@google.com>
+Fixes: 3b714d24ef17 ("arm64: mte: CPU feature detection and initial sysreg configuration")
+Cc: <stable@vger.kernel.org> # 5.10
+Link: https://linux-review.googlesource.com/id/I2b089b6c7d6f17ee37e2f0db7df5ad5bcc04526c
+Acked-by: Mark Rutland <mark.rutland@arm.com>
+Link: https://lore.kernel.org/r/20210507185905.1745402-1-pcc@google.com
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/mm/proc.S | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/arch/arm64/mm/proc.S
++++ b/arch/arm64/mm/proc.S
+@@ -444,6 +444,18 @@ SYM_FUNC_START(__cpu_setup)
+ mov x10, #(SYS_GCR_EL1_RRND | SYS_GCR_EL1_EXCL_MASK)
+ msr_s SYS_GCR_EL1, x10
+
++ /*
++ * If GCR_EL1.RRND=1 is implemented the same way as RRND=0, then
++ * RGSR_EL1.SEED must be non-zero for IRG to produce
++ * pseudorandom numbers. As RGSR_EL1 is UNKNOWN out of reset, we
++ * must initialize it.
++ */
++ mrs x10, CNTVCT_EL0
++ ands x10, x10, #SYS_RGSR_EL1_SEED_MASK
++ csinc x10, x10, xzr, ne
++ lsl x10, x10, #SYS_RGSR_EL1_SEED_SHIFT
++ msr_s SYS_RGSR_EL1, x10
++
+ /* clear any pending tag check faults in TFSR*_EL1 */
+ msr_s SYS_TFSR_EL1, xzr
+ msr_s SYS_TFSRE0_EL1, xzr
--- /dev/null
+From e9f4eee9a0023ba22db9560d4cc6ee63f933dae8 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Tue, 11 May 2021 21:38:36 -0400
+Subject: blk-iocost: fix weight updates of inner active iocgs
+
+From: Tejun Heo <tj@kernel.org>
+
+commit e9f4eee9a0023ba22db9560d4cc6ee63f933dae8 upstream.
+
+When the weight of an active iocg is updated, weight_updated() is called,
+which in turn calls __propagate_weights() to update the active and inuse
+weights so that the effective hierarchical weights are updated accordingly.
+
+The current implementation is incorrect for inner active nodes. For an
+active leaf iocg, inuse can be any value between 1 and active and the
+difference represents how much the iocg is donating. When weight is updated,
+as long as inuse is clamped between 1 and the new weight, we're alright and
+this is what __propagate_weights() currently implements.
+
+However, that's not how an active inner node's inuse is set. An inner node's
+inuse is solely determined by the ratio between the sums of inuse's and
+active's of its children - i.e. they're the results of propagating the leaves'
+active and inuse weights upwards. __propagate_weights() incorrectly applies
+the same clamping as for a leaf when an active inner node's weight is
+updated. Consider a hierarchy which looks like the following with saturating
+workloads in AA and BB.
+
+ R
+ / \
+ A B
+ | |
+ AA BB
+
+1. For both A and B, active=100, inuse=100, hwa=0.5, hwi=0.5.
+
+2. echo 200 > A/io.weight
+
+3. __propagate_weights() updates A's active to 200 and leaves inuse at 100 as
+ it's already between 1 and the new active, making A:active=200,
+ A:inuse=100. As R's active_sum is updated along with A's active,
+ A:hwa=2/3, B:hwa=1/3. However, because the inuses didn't change, the
+ hwi's remain unchanged at 0.5.
+
+4. The weight of A is now twice that of B but AA and BB still have the same
+ hwi of 0.5 and thus are doing the same amount of IOs.
+
+Fix it by making __propagate_weights() always calculate the inuse of an
+active inner iocg based on the ratio of child_inuse_sum to child_active_sum.
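+
+Applied to the example above, step 3 would then recompute A's inuse as
+DIV64_U64_ROUND_UP(200 * 100, 100) = 200, so R's child_inuse_sum becomes
+300, A:hwi becomes 2/3 (B:hwi 1/3), and AA ends up doing twice the IOs of
+BB, matching the new weights.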
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reported-by: Dan Schatzberg <dschatzberg@fb.com>
+Fixes: 7caa47151ab2 ("blkcg: implement blk-iocost")
+Cc: stable@vger.kernel.org # v5.4+
+Link: https://lore.kernel.org/r/YJsxnLZV1MnBcqjj@slm.duckdns.org
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/blk-iocost.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/block/blk-iocost.c
++++ b/block/blk-iocost.c
+@@ -1073,7 +1073,17 @@ static void __propagate_weights(struct i
+
+ lockdep_assert_held(&ioc->lock);
+
+- inuse = clamp_t(u32, inuse, 1, active);
++ /*
++ * For an active leaf node, its inuse shouldn't be zero or exceed
++ * @active. An active internal node's inuse is solely determined by the
++ * inuse to active ratio of its children regardless of @inuse.
++ */
++ if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
++ inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
++ iocg->child_active_sum);
++ } else {
++ inuse = clamp_t(u32, inuse, 1, active);
++ }
+
+ iocg->last_inuse = iocg->inuse;
+ if (save)
+@@ -1090,7 +1100,7 @@ static void __propagate_weights(struct i
+ /* update the level sums */
+ parent->child_active_sum += (s32)(active - child->active);
+ parent->child_inuse_sum += (s32)(inuse - child->inuse);
+- /* apply the udpates */
++ /* apply the updates */
+ child->active = active;
+ child->inuse = inuse;
+
--- /dev/null
+From f9baa501b4fd6962257853d46ddffbc21f27e344 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 22 Apr 2021 12:08:05 +0100
+Subject: btrfs: fix deadlock when cloning inline extents and using qgroups
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit f9baa501b4fd6962257853d46ddffbc21f27e344 upstream.
+
+There are a few exceptional cases where cloning an inline extent needs to
+copy the inline extent data into a page of the destination inode.
+
+When this happens, we end up starting a transaction while having a dirty
+page for the destination inode and while having the range locked in the
+destination's inode iotree too. Because when reserving metadata space
+for a transaction we may need to flush existing delalloc in case there is
+not enough free space, we have a mechanism in place to prevent a deadlock,
+which was introduced in commit 3d45f221ce627d ("btrfs: fix deadlock when
+cloning inline extent and low on free metadata space").
+
+However when using qgroups, a transaction also reserves metadata qgroup
+space, which can also result in flushing delalloc in case there is not
+enough available space at the moment. When this happens we deadlock, since
+flushing delalloc requires locking the file range in the inode's iotree
+and the range was already locked at the very beginning of the clone
+operation, before attempting to start the transaction.
+
+When this issue happens, stack traces like the following are reported:
+
+ [72747.556262] task:kworker/u81:9 state:D stack: 0 pid: 225 ppid: 2 flags:0x00004000
+ [72747.556268] Workqueue: writeback wb_workfn (flush-btrfs-1142)
+ [72747.556271] Call Trace:
+ [72747.556273] __schedule+0x296/0x760
+ [72747.556277] schedule+0x3c/0xa0
+ [72747.556279] io_schedule+0x12/0x40
+ [72747.556284] __lock_page+0x13c/0x280
+ [72747.556287] ? generic_file_readonly_mmap+0x70/0x70
+ [72747.556325] extent_write_cache_pages+0x22a/0x440 [btrfs]
+ [72747.556331] ? __set_page_dirty_nobuffers+0xe7/0x160
+ [72747.556358] ? set_extent_buffer_dirty+0x5e/0x80 [btrfs]
+ [72747.556362] ? update_group_capacity+0x25/0x210
+ [72747.556366] ? cpumask_next_and+0x1a/0x20
+ [72747.556391] extent_writepages+0x44/0xa0 [btrfs]
+ [72747.556394] do_writepages+0x41/0xd0
+ [72747.556398] __writeback_single_inode+0x39/0x2a0
+ [72747.556403] writeback_sb_inodes+0x1ea/0x440
+ [72747.556407] __writeback_inodes_wb+0x5f/0xc0
+ [72747.556410] wb_writeback+0x235/0x2b0
+ [72747.556414] ? get_nr_inodes+0x35/0x50
+ [72747.556417] wb_workfn+0x354/0x490
+ [72747.556420] ? newidle_balance+0x2c5/0x3e0
+ [72747.556424] process_one_work+0x1aa/0x340
+ [72747.556426] worker_thread+0x30/0x390
+ [72747.556429] ? create_worker+0x1a0/0x1a0
+ [72747.556432] kthread+0x116/0x130
+ [72747.556435] ? kthread_park+0x80/0x80
+ [72747.556438] ret_from_fork+0x1f/0x30
+
+ [72747.566958] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
+ [72747.566961] Call Trace:
+ [72747.566964] __schedule+0x296/0x760
+ [72747.566968] ? finish_wait+0x80/0x80
+ [72747.566970] schedule+0x3c/0xa0
+ [72747.566995] wait_extent_bit.constprop.68+0x13b/0x1c0 [btrfs]
+ [72747.566999] ? finish_wait+0x80/0x80
+ [72747.567024] lock_extent_bits+0x37/0x90 [btrfs]
+ [72747.567047] btrfs_invalidatepage+0x299/0x2c0 [btrfs]
+ [72747.567051] ? find_get_pages_range_tag+0x2cd/0x380
+ [72747.567076] __extent_writepage+0x203/0x320 [btrfs]
+ [72747.567102] extent_write_cache_pages+0x2bb/0x440 [btrfs]
+ [72747.567106] ? update_load_avg+0x7e/0x5f0
+ [72747.567109] ? enqueue_entity+0xf4/0x6f0
+ [72747.567134] extent_writepages+0x44/0xa0 [btrfs]
+ [72747.567137] ? enqueue_task_fair+0x93/0x6f0
+ [72747.567140] do_writepages+0x41/0xd0
+ [72747.567144] __filemap_fdatawrite_range+0xc7/0x100
+ [72747.567167] btrfs_run_delalloc_work+0x17/0x40 [btrfs]
+ [72747.567195] btrfs_work_helper+0xc2/0x300 [btrfs]
+ [72747.567200] process_one_work+0x1aa/0x340
+ [72747.567202] worker_thread+0x30/0x390
+ [72747.567205] ? create_worker+0x1a0/0x1a0
+ [72747.567208] kthread+0x116/0x130
+ [72747.567211] ? kthread_park+0x80/0x80
+ [72747.567214] ret_from_fork+0x1f/0x30
+
+ [72747.569686] task:fsstress state:D stack: 0 pid:841421 ppid:841417 flags:0x00000000
+ [72747.569689] Call Trace:
+ [72747.569691] __schedule+0x296/0x760
+ [72747.569694] schedule+0x3c/0xa0
+ [72747.569721] try_flush_qgroup+0x95/0x140 [btrfs]
+ [72747.569725] ? finish_wait+0x80/0x80
+ [72747.569753] btrfs_qgroup_reserve_data+0x34/0x50 [btrfs]
+ [72747.569781] btrfs_check_data_free_space+0x5f/0xa0 [btrfs]
+ [72747.569804] btrfs_buffered_write+0x1f7/0x7f0 [btrfs]
+ [72747.569810] ? path_lookupat.isra.48+0x97/0x140
+ [72747.569833] btrfs_file_write_iter+0x81/0x410 [btrfs]
+ [72747.569836] ? __kmalloc+0x16a/0x2c0
+ [72747.569839] do_iter_readv_writev+0x160/0x1c0
+ [72747.569843] do_iter_write+0x80/0x1b0
+ [72747.569847] vfs_writev+0x84/0x140
+ [72747.569869] ? btrfs_file_llseek+0x38/0x270 [btrfs]
+ [72747.569873] do_writev+0x65/0x100
+ [72747.569876] do_syscall_64+0x33/0x40
+ [72747.569879] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+ [72747.569899] task:fsstress state:D stack: 0 pid:841424 ppid:841417 flags:0x00004000
+ [72747.569903] Call Trace:
+ [72747.569906] __schedule+0x296/0x760
+ [72747.569909] schedule+0x3c/0xa0
+ [72747.569936] try_flush_qgroup+0x95/0x140 [btrfs]
+ [72747.569940] ? finish_wait+0x80/0x80
+ [72747.569967] __btrfs_qgroup_reserve_meta+0x36/0x50 [btrfs]
+ [72747.569989] start_transaction+0x279/0x580 [btrfs]
+ [72747.570014] clone_copy_inline_extent+0x332/0x490 [btrfs]
+ [72747.570041] btrfs_clone+0x5b7/0x7a0 [btrfs]
+ [72747.570068] ? lock_extent_bits+0x64/0x90 [btrfs]
+ [72747.570095] btrfs_clone_files+0xfc/0x150 [btrfs]
+ [72747.570122] btrfs_remap_file_range+0x3d8/0x4a0 [btrfs]
+ [72747.570126] do_clone_file_range+0xed/0x200
+ [72747.570131] vfs_clone_file_range+0x37/0x110
+ [72747.570134] ioctl_file_clone+0x7d/0xb0
+ [72747.570137] do_vfs_ioctl+0x138/0x630
+ [72747.570140] __x64_sys_ioctl+0x62/0xc0
+ [72747.570143] do_syscall_64+0x33/0x40
+ [72747.570146] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+So fix this by skipping the flush of delalloc for an inode that is
+flagged with BTRFS_INODE_NO_DELALLOC_FLUSH, meaning it is currently under
+such a special case of cloning an inline extent, when flushing delalloc
+during qgroup metadata reservation.
+
+The special cases for cloning inline extents were added in kernel 5.7
+by commit 05a5a7621ce66c ("Btrfs: implement full reflink support for
+inline extents"), while having qgroup metadata space reservation flushing
+delalloc when low on space was added in kernel 5.9 by commit
+c53e9653605dbf ("btrfs: qgroup: try to flush qgroup space when we get
+-EDQUOT"). So use a "Fixes:" tag for the later commit to ease stable
+kernel backports.
+
+Reported-by: Wang Yugui <wangyugui@e16-tech.com>
+Link: https://lore.kernel.org/linux-btrfs/20210421083137.31E3.409509F4@e16-tech.com/
+Fixes: c53e9653605dbf ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT")
+CC: stable@vger.kernel.org # 5.9+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.h | 2 +-
+ fs/btrfs/inode.c | 4 ++--
+ fs/btrfs/ioctl.c | 2 +-
+ fs/btrfs/qgroup.c | 2 +-
+ fs/btrfs/send.c | 4 ++--
+ 5 files changed, 7 insertions(+), 7 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -3110,7 +3110,7 @@ int btrfs_truncate_inode_items(struct bt
+ struct btrfs_inode *inode, u64 new_size,
+ u32 min_type);
+
+-int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
++int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
+ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
+ bool in_reclaim_context);
+ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -9672,7 +9672,7 @@ out:
+ return ret;
+ }
+
+-int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
++int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
+ {
+ struct writeback_control wbc = {
+ .nr_to_write = LONG_MAX,
+@@ -9685,7 +9685,7 @@ int btrfs_start_delalloc_snapshot(struct
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ return -EROFS;
+
+- return start_delalloc_inodes(root, &wbc, true, false);
++ return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
+ }
+
+ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -1046,7 +1046,7 @@ static noinline int btrfs_mksnapshot(con
+ */
+ btrfs_drew_read_lock(&root->snapshot_lock);
+
+- ret = btrfs_start_delalloc_snapshot(root);
++ ret = btrfs_start_delalloc_snapshot(root, false);
+ if (ret)
+ goto out;
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3579,7 +3579,7 @@ static int try_flush_qgroup(struct btrfs
+ return 0;
+ }
+
+- ret = btrfs_start_delalloc_snapshot(root);
++ ret = btrfs_start_delalloc_snapshot(root, true);
+ if (ret < 0)
+ goto out;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -7139,7 +7139,7 @@ static int flush_delalloc_roots(struct s
+ int i;
+
+ if (root) {
+- ret = btrfs_start_delalloc_snapshot(root);
++ ret = btrfs_start_delalloc_snapshot(root, false);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+@@ -7147,7 +7147,7 @@ static int flush_delalloc_roots(struct s
+
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
+ root = sctx->clone_roots[i].root;
+- ret = btrfs_start_delalloc_snapshot(root);
++ ret = btrfs_start_delalloc_snapshot(root, false);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
--- /dev/null
+From 626e9f41f7c281ba3e02843702f68471706aa6d9 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 27 Apr 2021 11:27:20 +0100
+Subject: btrfs: fix race leading to unpersisted data and metadata on fsync
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 626e9f41f7c281ba3e02843702f68471706aa6d9 upstream.
+
+When doing a fast fsync on a file, there is a race which can result in the
+fsync returning success to user space without logging the inode and without
+durably persisting new data.
+
+The following example shows one possible scenario for this:
+
+ $ mkfs.btrfs -f /dev/sdc
+ $ mount /dev/sdc /mnt
+
+ $ touch /mnt/bar
+ $ xfs_io -f -c "pwrite -S 0xab 0 1M" -c "fsync" /mnt/baz
+
+ # Now we have:
+ # file bar == inode 257
+ # file baz == inode 258
+
+ $ mv /mnt/baz /mnt/foo
+
+ # Now we have:
+ # file bar == inode 257
+ # file foo == inode 258
+
+ $ xfs_io -c "pwrite -S 0xcd 0 1M" /mnt/foo
+
+ # fsync bar before foo, it is important to trigger the race.
+ $ xfs_io -c "fsync" /mnt/bar
+ $ xfs_io -c "fsync" /mnt/foo
+
+ # After this:
+ # inode 257, file bar, is empty
+ # inode 258, file foo, has 1M filled with 0xcd
+
+ <power failure>
+
+ # Replay the log:
+ $ mount /dev/sdc /mnt
+
+ # After this point file foo should have 1M filled with 0xcd and not 0xab
+
+The following steps explain how the race happens:
+
+1) Before the first fsync of inode 258, when it has the "baz" name, its
+ ->logged_trans is 0, ->last_sub_trans is 0 and ->last_log_commit is -1.
+ The inode also has the full sync flag set;
+
+2) After the first fsync, we set inode 258 ->logged_trans to 6, which is
+ the generation of the current transaction, and set ->last_log_commit
+ to 0, which is the current value of ->last_sub_trans (done at
+ btrfs_log_inode()).
+
+ The full sync flag is cleared from the inode during the fsync.
+
+ The log sub transaction that was committed had an ID of 0 and when we
+ synced the log, at btrfs_sync_log(), we incremented root->log_transid
+ from 0 to 1;
+
+3) During the rename:
+
+ We update inode 258, through btrfs_update_inode(), and that causes its
+ ->last_sub_trans to be set to 1 (the current log transaction ID), and
+ ->last_log_commit remains with a value of 0.
+
+ After updating inode 258, because we have previously logged the inode
+ in the previous fsync, we log again the inode through the call to
+ btrfs_log_new_name(). This results in updating the inode's
+ ->last_log_commit from 0 to 1 (the current value of its
+ ->last_sub_trans).
+
+ The ->last_sub_trans of inode 257 is updated to 1, which is the ID of
+ the next log transaction;
+
+4) Then a buffered write against inode 258 is made. This leaves the value
+ of ->last_sub_trans as 1 (the ID of the current log transaction, stored
+ at root->log_transid);
+
+5) Then an fsync against inode 257 (or any other inode other than 258),
+ happens. This results in committing the log transaction with ID 1,
+ which results in updating root->last_log_commit to 1 and bumping
+ root->log_transid from 1 to 2;
+
+6) Then an fsync against inode 258 starts. We flush delalloc and wait only
+ for writeback to complete, since the full sync flag is not set in the
+ inode's runtime flags - we do not wait for ordered extents to complete.
+
+ Then, at btrfs_sync_file(), we call btrfs_inode_in_log() before the
+ ordered extent completes. The call returns true:
+
+ static inline bool btrfs_inode_in_log(...)
+ {
+ bool ret = false;
+
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == generation &&
+ inode->last_sub_trans <= inode->last_log_commit &&
+ inode->last_sub_trans <= inode->root->last_log_commit)
+ ret = true;
+ spin_unlock(&inode->lock);
+ return ret;
+ }
+
+ generation has a value of 6 (fs_info->generation), ->logged_trans also
+ has a value of 6 (set when we logged the inode during the first fsync
+ and when logging it during the rename), ->last_sub_trans has a value
+ of 1, set during the rename (step 3), ->last_log_commit also has a
+ value of 1 (set in step 3) and root->last_log_commit has a value of 1,
+ which was set in step 5 when fsyncing inode 257.
+
+ As a consequence we don't log the inode, any new extents and do not
+ sync the log, resulting in a data loss if a power failure happens
+ after the fsync and before the current transaction commits.
+ Also, because we do not log the inode, after a power failure the mtime
+ and ctime of the inode do not match those we had before.
+
+ When the ordered extent completes before we call btrfs_inode_in_log(),
+ then the call returns false and we log the inode and sync the log,
+ since at the end of ordered extent completion we update the inode and
+ set ->last_sub_trans to 2 (the value of root->log_transid) and
+ ->last_log_commit to 1.
+
+This problem is found after removing the check for the emptiness of the
+inode's list of modified extents in the recent commit 209ecbb8585bf6
+("btrfs: remove stale comment and logic from btrfs_inode_in_log()"),
+added in the 5.13 merge window. However checking the emptiness of the
+list is not really the way to solve this problem, and was never intended
+to, because while that solves the problem for COW writes, the problem
+persists for NOCOW writes because in that case the list is always empty.
+
+In the case of NOCOW writes, even though we wait for the writeback to
+complete before returning from btrfs_sync_file(), we end up not logging
+the inode, which has a new mtime/ctime, and because we don't sync the log,
+we never issue disk barriers (send REQ_PREFLUSH to the device) since that
+only happens when we sync the log (when we write super blocks at
+btrfs_sync_log()). So effectively, for a NOCOW case, when we return from
+btrfs_sync_file() to user space, we are not guaranteeing that the data is
+durably persisted on disk.
+
+Also, while the example above uses a rename exchange to show how the
+problem happens, it is not the only way to trigger it. An alternative
+could be adding a new hard link to inode 258, since that also results
+in calling btrfs_log_new_name() and updating the inode in the log.
+An example reproducer using the addition of a hard link instead of a
+rename operation:
+
+ $ mkfs.btrfs -f /dev/sdc
+ $ mount /dev/sdc /mnt
+
+ $ touch /mnt/bar
+ $ xfs_io -f -c "pwrite -S 0xab 0 1M" -c "fsync" /mnt/foo
+
+ $ ln /mnt/foo /mnt/foo_link
+ $ xfs_io -c "pwrite -S 0xcd 0 1M" /mnt/foo
+
+ $ xfs_io -c "fsync" /mnt/bar
+ $ xfs_io -c "fsync" /mnt/foo
+
+ <power failure>
+
+ # Replay the log:
+ $ mount /dev/sdc /mnt
+
+ # After this point file foo often has 1M filled with 0xab and not 0xcd
+
+The reasons leading to the final fsync of file foo, inode 258, not
+persisting the new data are the same as for the previous example with
+a rename operation.
+
+So fix by never skipping logging and log syncing when there are still any
+ordered extents in flight. To avoid making the conditional if statement
+that checks if logging an inode is needed harder to read, place all the
+logic into a helper function with separate if statements to make it more
+manageable and easier to read.
+
+A test case for fstests will follow soon.
+
+For NOCOW writes, the problem existed before commit b5e6c3e170b770
+("btrfs: always wait on ordered extents at fsync time"), introduced in
+kernel 4.19, then it went away with that commit since we started to always
+wait for ordered extent completion before logging.
+
+The problem came back again once the fast fsync path was changed again to
+avoid waiting for ordered extent completion, in commit 487781796d3022
+("btrfs: make fast fsyncs wait only for writeback"), added in kernel 5.10.
+
+However, for COW writes, the race only happens after the recent
+commit 209ecbb8585bf6 ("btrfs: remove stale comment and logic from
+btrfs_inode_in_log()"), introduced in the 5.13 merge window. For NOCOW
+writes, the bug existed before that commit. So tag 5.10+ as the release
+for stable backports.
+
+CC: stable@vger.kernel.org # 5.10+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/file.c | 35 +++++++++++++++++++++++++----------
+ fs/btrfs/tree-log.c | 3 ++-
+ 2 files changed, 27 insertions(+), 11 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -2067,6 +2067,30 @@ static int start_ordered_ops(struct inod
+ return ret;
+ }
+
++static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
++{
++ struct btrfs_inode *inode = BTRFS_I(ctx->inode);
++ struct btrfs_fs_info *fs_info = inode->root->fs_info;
++
++ if (btrfs_inode_in_log(inode, fs_info->generation) &&
++ list_empty(&ctx->ordered_extents))
++ return true;
++
++ /*
++ * If we are doing a fast fsync we can not bail out if the inode's
++ * last_trans is <= then the last committed transaction, because we only
++ * update the last_trans of the inode during ordered extent completion,
++ * and for a fast fsync we don't wait for that, we only wait for the
++ * writeback to complete.
++ */
++ if (inode->last_trans <= fs_info->last_trans_committed &&
++ (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
++ list_empty(&ctx->ordered_extents)))
++ return true;
++
++ return false;
++}
++
+ /*
+ * fsync call for both files and directories. This logs the inode into
+ * the tree log instead of forcing full commits whenever possible.
+@@ -2185,17 +2209,8 @@ int btrfs_sync_file(struct file *file, l
+
+ atomic_inc(&root->log_batch);
+
+- /*
+- * If we are doing a fast fsync we can not bail out if the inode's
+- * last_trans is <= then the last committed transaction, because we only
+- * update the last_trans of the inode during ordered extent completion,
+- * and for a fast fsync we don't wait for that, we only wait for the
+- * writeback to complete.
+- */
+ smp_mb();
+- if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
+- (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed &&
+- (full_sync || list_empty(&ctx.ordered_extents)))) {
++ if (skip_inode_logging(&ctx)) {
+ /*
+ * We've had everything committed since the last time we were
+ * modified so clear this flag in case it was set for whatever
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -6060,7 +6060,8 @@ static int btrfs_log_inode_parent(struct
+ * (since logging them is pointless, a link count of 0 means they
+ * will never be accessible).
+ */
+- if (btrfs_inode_in_log(inode, trans->transid) ||
++ if ((btrfs_inode_in_log(inode, trans->transid) &&
++ list_empty(&ctx->ordered_extents)) ||
+ inode->vfs_inode.i_nlink == 0) {
+ ret = BTRFS_NO_LOG_SYNC;
+ goto end_no_trans;
--- /dev/null
+From 77364faf21b4105ee5adbb4844fdfb461334d249 Mon Sep 17 00:00:00 2001
+From: Tom Rix <trix@redhat.com>
+Date: Fri, 30 Apr 2021 11:06:55 -0700
+Subject: btrfs: initialize return variable in cleanup_free_space_cache_v1
+
+From: Tom Rix <trix@redhat.com>
+
+commit 77364faf21b4105ee5adbb4844fdfb461334d249 upstream.
+
+Static analysis reports this problem
+
+ free-space-cache.c:3965:2: warning: Undefined or garbage value returned
+ return ret;
+ ^~~~~~~~~~
+
+ret is set in the node handling loop. Treat doing nothing as a success
+and initialize ret to 0, although it's unlikely the loop would be
+skipped. We always have block groups, but as it could lead to a
+transaction abort in the caller, it's better to be safe.
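+
+The shape of the complaint, condensed (do_cleanup() is a stand-in):
+
+  int ret;                        /* indeterminate if the loop never runs */
+
+  for (node = rb_first(root); node; node = rb_next(node))
+          ret = do_cleanup(node);
+
+  return ret;                     /* the value static analysis warns about */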
+
+CC: stable@vger.kernel.org # 5.12+
+Signed-off-by: Tom Rix <trix@redhat.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/free-space-cache.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/free-space-cache.c
++++ b/fs/btrfs/free-space-cache.c
+@@ -3942,7 +3942,7 @@ static int cleanup_free_space_cache_v1(s
+ {
+ struct btrfs_block_group *block_group;
+ struct rb_node *node;
+- int ret;
++ int ret = 0;
+
+ btrfs_info(fs_info, "cleaning free space cache v1");
+
--- /dev/null
+From adbd914dcde0b03bfc08ffe40b81f31b0457833f Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 21 Apr 2021 14:31:50 +0100
+Subject: btrfs: zoned: fix silent data loss after failure splitting ordered extent
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit adbd914dcde0b03bfc08ffe40b81f31b0457833f upstream.
+
+On a zoned filesystem, sometimes we need to split an ordered extent into 3
+different ordered extents. The original ordered extent is shortened, at
+the front and at the rear, and we create two other new ordered extents to
+represent the trimmed parts of the original ordered extent.
+
+After adjusting the original ordered extent, we create an ordered extent
+to represent the pre-range, and that may fail with ENOMEM for example.
+After that we always try to create the ordered extent for the post-range,
+and if that happens to succeed we end up returning success to the caller
+as we overwrite the 'ret' variable which contained the previous error.
+
+This means we end up with a file range for which there is no ordered
+extent, which results in the range never getting a new file extent item
+pointing to the new data location. And since the split operation did
+not return an error, writeback does not fail and the inode's mapping is
+not flagged with an error, resulting in a subsequent fsync not reporting
+an error either.
+
+It is admittedly unlikely for the creation of the post-range ordered
+extent to succeed after the creation of the pre-range ordered extent
+failed, but it's not impossible.
+
+So fix this by making sure we only create the post-range ordered extent
+if there was no error creating the ordered extent for the pre-range.
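+
+The error pattern, condensed (hypothetical helper names):
+
+  if (pre)
+          ret = clone_pre_range();        /* may fail, e.g. with -ENOMEM      */
+  if (post)
+          ret = clone_post_range();       /* success here overwrote the error */
+
+and the fix only attempts the post-range when nothing has failed yet:
+
+  if (ret == 0 && post)
+          ret = clone_post_range();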
+
+Fixes: d22002fd37bd97 ("btrfs: zoned: split ordered extent when bio is sent")
+CC: stable@vger.kernel.org # 5.12+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ordered-data.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/ordered-data.c
++++ b/fs/btrfs/ordered-data.c
+@@ -995,7 +995,7 @@ int btrfs_split_ordered_extent(struct bt
+
+ if (pre)
+ ret = clone_ordered_extent(ordered, 0, pre);
+- if (post)
++ if (ret == 0 && post)
+ ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
+ post);
+
--- /dev/null
+From 784daf2b9628f2d0117f1f0b578cfe5ab6634919 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Fri, 30 Apr 2021 15:34:17 +0200
+Subject: btrfs: zoned: sanity check zone type
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 784daf2b9628f2d0117f1f0b578cfe5ab6634919 upstream.
+
+The fstests test case generic/475 creates a dm-linear device that gets
+changed to a dm-error device. This leads to errors in loading the block
+group's zone information when running on a zoned file system, ultimately
+resulting in a list corruption. When running on a kernel with list
+debugging enabled this leads to the following crash.
+
+ BTRFS: error (device dm-2) in cleanup_transaction:1953: errno=-5 IO failure
+ kernel BUG at lib/list_debug.c:54!
+ invalid opcode: 0000 [#1] SMP PTI
+ CPU: 1 PID: 2433 Comm: umount Tainted: G W 5.12.0+ #1018
+ RIP: 0010:__list_del_entry_valid.cold+0x1d/0x47
+ RSP: 0018:ffffc90001473df0 EFLAGS: 00010296
+ RAX: 0000000000000054 RBX: ffff8881038fd000 RCX: ffffc90001473c90
+ RDX: 0000000100001a31 RSI: 0000000000000003 RDI: 0000000000000003
+ RBP: ffff888308871108 R08: 0000000000000003 R09: 0000000000000001
+ R10: 3961373532383838 R11: 6666666620736177 R12: ffff888308871000
+ R13: ffff8881038fd088 R14: ffff8881038fdc78 R15: dead000000000100
+ FS: 00007f353c9b1540(0000) GS:ffff888627d00000(0000) knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 00007f353cc2c710 CR3: 000000018e13c000 CR4: 00000000000006a0
+ DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ Call Trace:
+ btrfs_free_block_groups+0xc9/0x310 [btrfs]
+ close_ctree+0x2ee/0x31a [btrfs]
+ ? call_rcu+0x8f/0x270
+ ? mutex_lock+0x1c/0x40
+ generic_shutdown_super+0x67/0x100
+ kill_anon_super+0x14/0x30
+ btrfs_kill_super+0x12/0x20 [btrfs]
+ deactivate_locked_super+0x31/0x90
+ cleanup_mnt+0x13e/0x1b0
+ task_work_run+0x63/0xb0
+ exit_to_user_mode_loop+0xd9/0xe0
+ exit_to_user_mode_prepare+0x3e/0x60
+ syscall_exit_to_user_mode+0x1d/0x50
+ entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+As dm-error has no support for zones, btrfs will run its zone emulation
+mode on this device. The zone emulation mode emulates conventional zones,
+so bail out if the zone bitmap that gets populated on mount sees the zone
+as sequential while we're thinking it's a conventional zone when creating
+a block group.
+
+Note: this scenario is unlikely in a real world application and can only
+happen by this (ab)use of device-mapper targets.
+
+CC: stable@vger.kernel.org # 5.12+
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/zoned.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -1126,6 +1126,11 @@ int btrfs_load_block_group_zone_info(str
+ goto out;
+ }
+
++ if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
++ ret = -EIO;
++ goto out;
++ }
++
+ switch (zone.cond) {
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
--- /dev/null
+From fe1c97d008f86f672f0e9265f180c22451ca3b9f Mon Sep 17 00:00:00 2001
+From: David Ward <david.ward@gatech.edu>
+Date: Mon, 10 May 2021 05:30:39 -0400
+Subject: drm/amd/display: Initialize attribute for hdcp_srm sysfs file
+
+From: David Ward <david.ward@gatech.edu>
+
+commit fe1c97d008f86f672f0e9265f180c22451ca3b9f upstream.
+
+It is stored in dynamically allocated memory, so sysfs_bin_attr_init() must
+be called to initialize it. (Note: "initialization" only sets the .attr.key
+member in this struct; it does not change the value of any other members.)
+
+Otherwise, when CONFIG_DEBUG_LOCK_ALLOC=y this message appears during boot:
+
+ BUG: key ffff9248900cd148 has not been registered!
+
+Fixes: 9037246bb2da ("drm/amd/display: Add sysfs interface for set/get srm")
+Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1586
+Reported-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
+Signed-off-by: David Ward <david.ward@gatech.edu>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
+@@ -644,6 +644,7 @@ struct hdcp_workqueue *hdcp_create_workq
+
+ /* File created at /sys/class/drm/card0/device/hdcp_srm*/
+ hdcp_work[0].attr = data_attr;
++ sysfs_bin_attr_init(&hdcp_work[0].attr);
+
+ if (sysfs_create_bin_file(&adev->dev->kobj, &hdcp_work[0].attr))
+ DRM_WARN("Failed to create device file hdcp_srm");
--- /dev/null
+From 4819d16d91145966ce03818a95169df1fd56b299 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
+Date: Wed, 21 Apr 2021 18:33:58 +0300
+Subject: drm/i915: Avoid div-by-zero on gen2
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Ville Syrjälä <ville.syrjala@linux.intel.com>
+
+commit 4819d16d91145966ce03818a95169df1fd56b299 upstream.
+
+Gen2 tiles are 2KiB in size so i915_gem_object_get_tile_row_size()
+can in fact return <4KiB, which leads to div-by-zero here.
+Avoid that.
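+
+The division comes from roundup(): roundup(x, y) rounds x up by computing
+((x + y - 1) / y) * y, and the divisor here is tile_row_pages(obj), i.e.
+roughly the tile row size in 4 KiB pages. A 2 KiB gen2 tile row therefore
+yields a divisor of 0. The "?: 1" in the fix below simply clamps the
+divisor to at least one page.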
+
+Not sure i915_gem_object_get_tile_row_size() is entirely
+sane anyway since it doesn't account for the different tile
+layouts on i8xx/i915...
+
+I'm not able to hit this before commit 6846895fde05 ("drm/i915:
+Replace PIN_NONFAULT with calls to PIN_NOEVICT") and it looks
+like I also need to run a recent version of Mesa. With those in
+place xonotic trips on this quite easily on my 85x.
+
+Cc: stable@vger.kernel.org
+Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
+Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20210421153401.13847-2-ville.syrjala@linux.intel.com
+(cherry picked from commit ed52c62d386f764194e0184fdb905d5f24194cae)
+Signed-off-by: Jani Nikula <jani.nikula@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/gem/i915_gem_mman.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
++++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+@@ -189,7 +189,7 @@ compute_partial_view(const struct drm_i9
+ struct i915_ggtt_view view;
+
+ if (i915_gem_object_is_tiled(obj))
+- chunk = roundup(chunk, tile_row_pages(obj));
++ chunk = roundup(chunk, tile_row_pages(obj) ?: 1);
+
+ view.type = I915_GGTT_VIEW_PARTIAL;
+ view.partial.offset = rounddown(page_offset, chunk);
--- /dev/null
+From a5c936add6a23c15c6ae538ab7a12f80751fdf0f Mon Sep 17 00:00:00 2001
+From: Kai-Heng Feng <kai.heng.feng@canonical.com>
+Date: Wed, 21 Apr 2021 13:20:31 +0800
+Subject: drm/i915/dp: Use slow and wide link training for everything
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Kai-Heng Feng <kai.heng.feng@canonical.com>
+
+commit a5c936add6a23c15c6ae538ab7a12f80751fdf0f upstream.
+
+Screen flickers on an Innolux eDP 1.3 panel when clock rate 540000 is in use.
+
+According to the panel vendor, though clock rate 540000 is advertised,
+the max clock rate it really supports is 270000.
+
+Ville Syrjälä mentioned that fast and narrow also breaks some eDP 1.4
+panels, so use slow and wide training for all panels to resolve the
+issue.
+
+User also confirmed that the new strategy doesn't introduce any
+regression on XPS 9380.
+
+v2:
+ - Use slow and wide for everything.
+
+Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/3384
+References: https://gitlab.freedesktop.org/drm/intel/-/issues/272
+Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
+Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20210421052054.1434718-1-kai.heng.feng@canonical.com
+(cherry picked from commit acca7762eb71bc05a8f28d29320d193150051f79)
+Fixes: 2bbd6dba84d4 ("drm/i915: Try to use fast+narrow link on eDP again and fall back to the old max strategy on failure")
+Cc: <stable@vger.kernel.org> # v5.12+
+Signed-off-by: Jani Nikula <jani.nikula@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/display/intel_dp.c | 59 ++------------------------------
+ 1 file changed, 5 insertions(+), 54 deletions(-)
+
+--- a/drivers/gpu/drm/i915/display/intel_dp.c
++++ b/drivers/gpu/drm/i915/display/intel_dp.c
+@@ -1174,44 +1174,6 @@ intel_dp_compute_link_config_wide(struct
+ return -EINVAL;
+ }
+
+-/* Optimize link config in order: max bpp, min lanes, min clock */
+-static int
+-intel_dp_compute_link_config_fast(struct intel_dp *intel_dp,
+- struct intel_crtc_state *pipe_config,
+- const struct link_config_limits *limits)
+-{
+- const struct drm_display_mode *adjusted_mode = &pipe_config->hw.adjusted_mode;
+- int bpp, clock, lane_count;
+- int mode_rate, link_clock, link_avail;
+-
+- for (bpp = limits->max_bpp; bpp >= limits->min_bpp; bpp -= 2 * 3) {
+- int output_bpp = intel_dp_output_bpp(pipe_config->output_format, bpp);
+-
+- mode_rate = intel_dp_link_required(adjusted_mode->crtc_clock,
+- output_bpp);
+-
+- for (lane_count = limits->min_lane_count;
+- lane_count <= limits->max_lane_count;
+- lane_count <<= 1) {
+- for (clock = limits->min_clock; clock <= limits->max_clock; clock++) {
+- link_clock = intel_dp->common_rates[clock];
+- link_avail = intel_dp_max_data_rate(link_clock,
+- lane_count);
+-
+- if (mode_rate <= link_avail) {
+- pipe_config->lane_count = lane_count;
+- pipe_config->pipe_bpp = bpp;
+- pipe_config->port_clock = link_clock;
+-
+- return 0;
+- }
+- }
+- }
+- }
+-
+- return -EINVAL;
+-}
+-
+ static int intel_dp_dsc_compute_bpp(struct intel_dp *intel_dp, u8 dsc_max_bpc)
+ {
+ int i, num_bpc;
+@@ -1461,22 +1423,11 @@ intel_dp_compute_link_config(struct inte
+ intel_dp_can_bigjoiner(intel_dp))
+ pipe_config->bigjoiner = true;
+
+- if (intel_dp_is_edp(intel_dp))
+- /*
+- * Optimize for fast and narrow. eDP 1.3 section 3.3 and eDP 1.4
+- * section A.1: "It is recommended that the minimum number of
+- * lanes be used, using the minimum link rate allowed for that
+- * lane configuration."
+- *
+- * Note that we fall back to the max clock and lane count for eDP
+- * panels that fail with the fast optimal settings (see
+- * intel_dp->use_max_params), in which case the fast vs. wide
+- * choice doesn't matter.
+- */
+- ret = intel_dp_compute_link_config_fast(intel_dp, pipe_config, &limits);
+- else
+- /* Optimize for slow and wide. */
+- ret = intel_dp_compute_link_config_wide(intel_dp, pipe_config, &limits);
++ /*
++ * Optimize for slow and wide for everything, because there are some
++ * eDP 1.3 and 1.4 panels don't work well with fast and narrow.
++ */
++ ret = intel_dp_compute_link_config_wide(intel_dp, pipe_config, &limits);
+
+ /* enable compression if the mode doesn't fit available BW */
+ drm_dbg_kms(&i915->drm, "Force DSC en = %d\n", intel_dp->force_dsc_en);
--- /dev/null
+From 227545b9a08c68778ddd89428f99c351fc9315ac Mon Sep 17 00:00:00 2001
+From: Kai-Heng Feng <kai.heng.feng@canonical.com>
+Date: Fri, 30 Apr 2021 12:56:56 +0800
+Subject: drm/radeon/dpm: Disable sclk switching on Oland when two 4K 60Hz monitors are connected
+
+From: Kai-Heng Feng <kai.heng.feng@canonical.com>
+
+commit 227545b9a08c68778ddd89428f99c351fc9315ac upstream.
+
+Screen flickers rapidly when two 4K 60Hz monitors are in use. This issue
+doesn't happen when one monitor is 4K 60Hz (pixelclock 594MHz) and
+another one is 4K 30Hz (pixelclock 297MHz).
+
+The issue is gone after setting "power_dpm_force_performance_level" to
+"high". Following the indication, we found that the issue occurs when
+sclk is too low.
+
+So resolve the issue by disabling sclk switching when there are two
+monitors requiring a high pixelclock (> 297MHz).
+
+v2:
+ - Only apply the fix to Oland.
+Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/radeon/radeon.h | 1 +
+ drivers/gpu/drm/radeon/radeon_pm.c | 8 ++++++++
+ drivers/gpu/drm/radeon/si_dpm.c | 3 +++
+ 3 files changed, 12 insertions(+)
+
+--- a/drivers/gpu/drm/radeon/radeon.h
++++ b/drivers/gpu/drm/radeon/radeon.h
+@@ -1558,6 +1558,7 @@ struct radeon_dpm {
+ void *priv;
+ u32 new_active_crtcs;
+ int new_active_crtc_count;
++ int high_pixelclock_count;
+ u32 current_active_crtcs;
+ int current_active_crtc_count;
+ bool single_display;
+--- a/drivers/gpu/drm/radeon/radeon_pm.c
++++ b/drivers/gpu/drm/radeon/radeon_pm.c
+@@ -1775,6 +1775,7 @@ static void radeon_pm_compute_clocks_dpm
+ struct drm_device *ddev = rdev->ddev;
+ struct drm_crtc *crtc;
+ struct radeon_crtc *radeon_crtc;
++ struct radeon_connector *radeon_connector;
+
+ if (!rdev->pm.dpm_enabled)
+ return;
+@@ -1784,6 +1785,7 @@ static void radeon_pm_compute_clocks_dpm
+ /* update active crtc counts */
+ rdev->pm.dpm.new_active_crtcs = 0;
+ rdev->pm.dpm.new_active_crtc_count = 0;
++ rdev->pm.dpm.high_pixelclock_count = 0;
+ if (rdev->num_crtc && rdev->mode_info.mode_config_initialized) {
+ list_for_each_entry(crtc,
+ &ddev->mode_config.crtc_list, head) {
+@@ -1791,6 +1793,12 @@ static void radeon_pm_compute_clocks_dpm
+ if (crtc->enabled) {
+ rdev->pm.dpm.new_active_crtcs |= (1 << radeon_crtc->crtc_id);
+ rdev->pm.dpm.new_active_crtc_count++;
++ if (!radeon_crtc->connector)
++ continue;
++
++ radeon_connector = to_radeon_connector(radeon_crtc->connector);
++ if (radeon_connector->pixelclock_for_modeset > 297000)
++ rdev->pm.dpm.high_pixelclock_count++;
+ }
+ }
+ }
+--- a/drivers/gpu/drm/radeon/si_dpm.c
++++ b/drivers/gpu/drm/radeon/si_dpm.c
+@@ -2979,6 +2979,9 @@ static void si_apply_state_adjust_rules(
+ (rdev->pdev->device == 0x6605)) {
+ max_sclk = 75000;
+ }
++
++ if (rdev->pm.dpm.high_pixelclock_count > 1)
++ disable_sclk_switching = true;
+ }
+
+ if (rps->vce_active) {
--- /dev/null
+From c3187cf32216313fb316084efac4dab3a8459b1d Mon Sep 17 00:00:00 2001
+From: Jouni Roivas <jouni.roivas@tuxera.com>
+Date: Fri, 14 May 2021 17:27:33 -0700
+Subject: hfsplus: prevent corruption in shrinking truncate
+
+From: Jouni Roivas <jouni.roivas@tuxera.com>
+
+commit c3187cf32216313fb316084efac4dab3a8459b1d upstream.
+
+I believe there are some issues introduced by commit 31651c607151
+("hfsplus: avoid deadlock on file truncation")
+
+HFS+ has extent records which always contain 8 extents. In case the
+first extent record in the catalog file gets full, new ones are
+allocated from the extents overflow file.
+
+In case a shrinking truncate happens to the middle of an extent record
+located in the extents overflow file, the logic in hfsplus_file_truncate()
+was changed so that the call to hfs_brec_remove() is not guarded any more.
+
+The right action would be to just free the extents that exceed the new
+size inside the extent record by calling hfsplus_free_extents(), and then
+check whether the whole extent record should be removed. However, since
+the guard (blk_cnt > start) is now after the call to hfs_brec_remove(),
+this has the unfortunate effect that the last matching extent record is
+removed unconditionally.
+
+To reproduce this issue, create a file which has at least 10 extents, and
+then perform a shrinking truncate into the middle of the last extent
+record, so that the number of remaining extents is not under or divisible
+by 8. This causes the last extent record (8 extents) to be removed
+entirely instead of being truncated into the middle of it. Thus this
+causes corruption and lost data.
+
+The fix for this is simply to check whether the new truncated end is
+below the start of this extent record, making it safe to remove the full
+extent record. However, the call to hfs_brec_remove() can't be moved to
+its previous place since we're dropping ->tree_lock, which can cause a
+race condition with the cached info being invalidated, possibly
+corrupting the node data.
+
+Another issue is related to this one. When entering into the block
+(blk_cnt > start) we are not holding the ->tree_lock. We break out from
+the loop not holding the lock, but hfs_find_exit() does unlock it. Not
+sure if it's possible for someone else to take the lock under our feet,
+but it can cause hard to debug errors and premature unlocking. Even if
+there's no real risk of it, the locking should still always be kept in
+balance. Thus taking the lock now just before the check.
+
+Link: https://lkml.kernel.org/r/20210429165139.3082828-1-jouni.roivas@tuxera.com
+Fixes: 31651c607151f ("hfsplus: avoid deadlock on file truncation")
+Signed-off-by: Jouni Roivas <jouni.roivas@tuxera.com>
+Reviewed-by: Anton Altaparmakov <anton@tuxera.com>
+Cc: Anatoly Trosinenko <anatoly.trosinenko@gmail.com>
+Cc: Viacheslav Dubeyko <slava@dubeyko.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/hfsplus/extents.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/fs/hfsplus/extents.c
++++ b/fs/hfsplus/extents.c
+@@ -598,13 +598,15 @@ void hfsplus_file_truncate(struct inode
+ res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
+ if (res)
+ break;
+- hfs_brec_remove(&fd);
+
+- mutex_unlock(&fd.tree->tree_lock);
+ start = hip->cached_start;
++ if (blk_cnt <= start)
++ hfs_brec_remove(&fd);
++ mutex_unlock(&fd.tree->tree_lock);
+ hfsplus_free_extents(sb, hip->cached_extents,
+ alloc_cnt - start, alloc_cnt - blk_cnt);
+ hfsplus_dump_extent(hip->cached_extents);
++ mutex_lock(&fd.tree->tree_lock);
+ if (blk_cnt > start) {
+ hip->extent_state |= HFSPLUS_EXT_DIRTY;
+ break;
+@@ -612,7 +614,6 @@ void hfsplus_file_truncate(struct inode
+ alloc_cnt = start;
+ hip->cached_start = hip->cached_blocks = 0;
+ hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
+- mutex_lock(&fd.tree->tree_lock);
+ }
+ hfs_find_exit(&fd);
+
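As a reference for the reproduction steps above, here is a minimal userspace
sketch. It assumes an HFS+ volume mounted at /mnt/hfsplus (hypothetical path),
that appending 256 MiB ends up spread over more than 8 extents (which depends
on how fragmented the volume already is), and that truncating to 200 MiB lands
in the middle of the last extent record; none of these values come from the
patch itself.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096] = { 0 };
	long i;
	int fd = open("/mnt/hfsplus/victim", O_CREAT | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Grow the file to ~256 MiB; on a suitably fragmented volume this
	 * should need more than 8 extents, pushing extent records into the
	 * extents overflow file. */
	for (i = 0; i < 64 * 1024; i++)
		if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf))
			break;

	/* Shrink into the middle of the last extent record; before the fix
	 * the whole record could be dropped, losing still-valid blocks. */
	if (ftruncate(fd, 200L * 1024 * 1024) < 0)
		perror("ftruncate");

	close(fd);
	return 0;
}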
--- /dev/null
+From f649dc0e0d7b509c75570ee403723660f5b72ec7 Mon Sep 17 00:00:00 2001
+From: Peter Collingbourne <pcc@google.com>
+Date: Fri, 14 May 2021 17:27:27 -0700
+Subject: kasan: fix unit tests with CONFIG_UBSAN_LOCAL_BOUNDS enabled
+
+From: Peter Collingbourne <pcc@google.com>
+
+commit f649dc0e0d7b509c75570ee403723660f5b72ec7 upstream.
+
+These tests deliberately access these arrays out of bounds, which will
+cause the dynamic local bounds checks inserted by
+CONFIG_UBSAN_LOCAL_BOUNDS to fail and panic the kernel. To avoid this
+problem, access the arrays via volatile pointers, which will prevent the
+compiler from being able to determine the array bounds.
+
+These accesses use volatile pointers to char (char *volatile) rather than
+the more conventional pointers to volatile char (volatile char *) because
+we want to prevent the compiler from making inferences about the pointer
+itself (i.e. its array bounds), not the data that it refers to.
+
+Link: https://lkml.kernel.org/r/20210507025915.1464056-1-pcc@google.com
+Link: https://linux-review.googlesource.com/id/I90b1713fbfa1bf68ff895aef099ea77b98a7c3b9
+Signed-off-by: Peter Collingbourne <pcc@google.com>
+Tested-by: Alexander Potapenko <glider@google.com>
+Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Peter Collingbourne <pcc@google.com>
+Cc: George Popescu <georgepope@android.com>
+Cc: Elena Petrova <lenaptr@google.com>
+Cc: Evgenii Stepanov <eugenis@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ lib/test_kasan.c | 29 +++++++++++++++++++++++------
+ 1 file changed, 23 insertions(+), 6 deletions(-)
+
+--- a/lib/test_kasan.c
++++ b/lib/test_kasan.c
+@@ -646,8 +646,20 @@ static char global_array[10];
+
+ static void kasan_global_oob(struct kunit *test)
+ {
+- volatile int i = 3;
+- char *p = &global_array[ARRAY_SIZE(global_array) + i];
++ /*
++ * Deliberate out-of-bounds access. To prevent CONFIG_UBSAN_LOCAL_BOUNDS
++ * from failing here and panicing the kernel, access the array via a
++ * volatile pointer, which will prevent the compiler from being able to
++ * determine the array bounds.
++ *
++ * This access uses a volatile pointer to char (char *volatile) rather
++ * than the more conventional pointer to volatile char (volatile char *)
++ * because we want to prevent the compiler from making inferences about
++ * the pointer itself (i.e. its array bounds), not the data that it
++ * refers to.
++ */
++ char *volatile array = global_array;
++ char *p = &array[ARRAY_SIZE(global_array) + 3];
+
+ /* Only generic mode instruments globals. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+@@ -695,8 +707,9 @@ static void ksize_uaf(struct kunit *test
+ static void kasan_stack_oob(struct kunit *test)
+ {
+ char stack_array[10];
+- volatile int i = OOB_TAG_OFF;
+- char *p = &stack_array[ARRAY_SIZE(stack_array) + i];
++ /* See comment in kasan_global_oob. */
++ char *volatile array = stack_array;
++ char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF];
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
+
+@@ -707,7 +720,9 @@ static void kasan_alloca_oob_left(struct
+ {
+ volatile int i = 10;
+ char alloca_array[i];
+- char *p = alloca_array - 1;
++ /* See comment in kasan_global_oob. */
++ char *volatile array = alloca_array;
++ char *p = array - 1;
+
+ /* Only generic mode instruments dynamic allocas. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+@@ -720,7 +735,9 @@ static void kasan_alloca_oob_right(struc
+ {
+ volatile int i = 10;
+ char alloca_array[i];
+- char *p = alloca_array + i;
++ /* See comment in kasan_global_oob. */
++ char *volatile array = alloca_array;
++ char *p = array + i;
+
+ /* Only generic mode instruments dynamic allocas. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
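For readers unfamiliar with the qualifier placement discussed above, a
standalone (non-kernel) C illustration of the difference; the identifiers are
made up:

static char a[10];

void qualifier_example(void)
{
	volatile char *data_is_volatile = a;	/* accesses through the pointer
						 * are treated as volatile */
	char *volatile ptr_is_volatile = a;	/* the pointer variable itself
						 * is volatile, so the compiler
						 * cannot assume it still points
						 * into 'a' or infer its bounds */

	(void)data_is_volatile;
	(void)ptr_is_volatile;
}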
--- /dev/null
+From 262de4102c7bb8e59f26a967a8ffe8cce85cc537 Mon Sep 17 00:00:00 2001
+From: Benjamin Segall <bsegall@google.com>
+Date: Thu, 29 Apr 2021 16:22:34 +0000
+Subject: kvm: exit halt polling on need_resched() as well
+
+From: Benjamin Segall <bsegall@google.com>
+
+commit 262de4102c7bb8e59f26a967a8ffe8cce85cc537 upstream.
+
+single_task_running() is usually more general than need_resched(),
+but CFS_BANDWIDTH throttling will use resched_task() when there is
+just one task, in order to get that task to block. This was causing
+long need_resched warnings and was likely allowing VMs to overrun
+their quota when halt polling.
+
+Signed-off-by: Ben Segall <bsegall@google.com>
+Signed-off-by: Venkatesh Srinivas <venkateshs@chromium.org>
+Message-Id: <20210429162233.116849-1-venkateshs@chromium.org>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ virt/kvm/kvm_main.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -2838,7 +2838,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcp
+ goto out;
+ }
+ poll_end = cur = ktime_get();
+- } while (single_task_running() && ktime_before(cur, stop));
++ } while (single_task_running() && !need_resched() &&
++ ktime_before(cur, stop));
+ }
+
+ prepare_to_rcuwait(&vcpu->wait);
--- /dev/null
+From 84894e1c42e9f25c17f2888e0c0e1505cb727538 Mon Sep 17 00:00:00 2001
+From: Peter Xu <peterx@redhat.com>
+Date: Fri, 14 May 2021 17:27:07 -0700
+Subject: mm/hugetlb: fix cow where page writtable in child
+
+From: Peter Xu <peterx@redhat.com>
+
+commit 84894e1c42e9f25c17f2888e0c0e1505cb727538 upstream.
+
+When reworking early cow of pinned hugetlb pages, we moved huge_ptep_get()
+up but overlooked a side effect: huge_ptep_get() used to fetch the pte
+after wr-protection. After moving it upwards, we need an explicit
+wr-protect of the child pte, or we will keep the write bit set in the
+child process, which could cause data corruption where the child can
+write to the original page directly.
+
+This issue can also be exposed by "memfd_test hugetlbfs" kselftest.
+
+Link: https://lkml.kernel.org/r/20210503234356.9097-3-peterx@redhat.com
+Fixes: 4eae4efa2c299 ("hugetlb: do early cow when page pinned on src mm")
+Signed-off-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -3905,6 +3905,7 @@ again:
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ huge_ptep_set_wrprotect(src, addr, src_pte);
++ entry = huge_pte_wrprotect(entry);
+ }
+
+ page_dup_rmap(ptepage, true);
--- /dev/null
+From 22247efd822e6d263f3c8bd327f3f769aea9b1d9 Mon Sep 17 00:00:00 2001
+From: Peter Xu <peterx@redhat.com>
+Date: Fri, 14 May 2021 17:27:04 -0700
+Subject: mm/hugetlb: fix F_SEAL_FUTURE_WRITE
+
+From: Peter Xu <peterx@redhat.com>
+
+commit 22247efd822e6d263f3c8bd327f3f769aea9b1d9 upstream.
+
+Patch series "mm/hugetlb: Fix issues on file sealing and fork", v2.
+
+Hugh reported an issue with F_SEAL_FUTURE_WRITE not being applied
+correctly to hugetlbfs, which I can easily verify using the memfd_test
+program; it seems the program is hardly ever run with hugetlbfs pages
+(shmem is the default).
+
+Meanwhile I found another, probably even more severe, issue: hugetlb
+fork won't wr-protect child cow pages, so the child can potentially
+write to parent private pages. Patch 2 addresses that.
+
+After this series is applied, "memfd_test hugetlbfs" should start to pass.
+
+This patch (of 2):
+
+F_SEAL_FUTURE_WRITE has been missing for hugetlb since the first day.
+There is a test program for it and it fails constantly.
+
+$ ./memfd_test hugetlbfs
+memfd-hugetlb: CREATE
+memfd-hugetlb: BASIC
+memfd-hugetlb: SEAL-WRITE
+memfd-hugetlb: SEAL-FUTURE-WRITE
+mmap() didn't fail as expected
+Aborted (core dumped)
+
+I think it's probably because no one is really running the hugetlbfs test.
+
+Fix it by also checking FUTURE_WRITE in hugetlbfs_file_mmap(), as we do
+in shmem_mmap(). Generalize a helper for that.
+
+Link: https://lkml.kernel.org/r/20210503234356.9097-1-peterx@redhat.com
+Link: https://lkml.kernel.org/r/20210503234356.9097-2-peterx@redhat.com
+Fixes: ab3948f58ff84 ("mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd")
+Signed-off-by: Peter Xu <peterx@redhat.com>
+Reported-by: Hugh Dickins <hughd@google.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/hugetlbfs/inode.c | 5 +++++
+ include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++
+ mm/shmem.c | 22 ++++------------------
+ 3 files changed, 41 insertions(+), 18 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -131,6 +131,7 @@ static void huge_pagevec_release(struct
+ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+ {
+ struct inode *inode = file_inode(file);
++ struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ loff_t len, vma_len;
+ int ret;
+ struct hstate *h = hstate_file(file);
+@@ -146,6 +147,10 @@ static int hugetlbfs_file_mmap(struct fi
+ vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+ vma->vm_ops = &hugetlb_vm_ops;
+
++ ret = seal_check_future_write(info->seals, vma);
++ if (ret)
++ return ret;
++
+ /*
+ * page based offset in vm_pgoff could be sufficiently large to
+ * overflow a loff_t when converted to byte offset. This can
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3170,5 +3170,37 @@ extern int sysctl_nr_trim_pages;
+
+ void mem_dump_obj(void *object);
+
++/**
++ * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
++ * @seals: the seals to check
++ * @vma: the vma to operate on
++ *
++ * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
++ * the vma flags. Return 0 if check pass, or <0 for errors.
++ */
++static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
++{
++ if (seals & F_SEAL_FUTURE_WRITE) {
++ /*
++ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
++ * "future write" seal active.
++ */
++ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
++ return -EPERM;
++
++ /*
++ * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
++ * MAP_SHARED and read-only, take care to not allow mprotect to
++ * revert protections on such mappings. Do this only for shared
++ * mappings. For private mappings, don't need to mask
++ * VM_MAYWRITE as we still want them to be COW-writable.
++ */
++ if (vma->vm_flags & VM_SHARED)
++ vma->vm_flags &= ~(VM_MAYWRITE);
++ }
++
++ return 0;
++}
++
+ #endif /* __KERNEL__ */
+ #endif /* _LINUX_MM_H */
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -2258,25 +2258,11 @@ out_nomem:
+ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+ {
+ struct shmem_inode_info *info = SHMEM_I(file_inode(file));
++ int ret;
+
+- if (info->seals & F_SEAL_FUTURE_WRITE) {
+- /*
+- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+- * "future write" seal active.
+- */
+- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+- return -EPERM;
+-
+- /*
+- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+- * MAP_SHARED and read-only, take care to not allow mprotect to
+- * revert protections on such mappings. Do this only for shared
+- * mappings. For private mappings, don't need to mask
+- * VM_MAYWRITE as we still want them to be COW-writable.
+- */
+- if (vma->vm_flags & VM_SHARED)
+- vma->vm_flags &= ~(VM_MAYWRITE);
+- }
++ ret = seal_check_future_write(info->seals, vma);
++ if (ret)
++ return ret;
+
+ /* arm64 - allow memory tagging on RAM-based files */
+ vma->vm_flags |= VM_MTE_ALLOWED;
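A minimal userspace sketch of the behaviour the fix enforces, modelled on the
memfd_test output quoted above. It assumes a glibc new enough to expose
memfd_create() and F_SEAL_FUTURE_WRITE, and that a free 2 MiB huge page is
available; those details are assumptions, not part of the patch.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2UL * 1024 * 1024;	/* one 2 MiB huge page, assumed */
	int fd = memfd_create("seal-test",
			      MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB);
	void *p;

	if (fd < 0 || ftruncate(fd, len) < 0)
		return 1;
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
		return 1;

	/* With the fix, this mmap() fails with EPERM, as it already did for
	 * shmem-backed memfds; before the fix it incorrectly succeeded. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		perror("mmap");	/* expected: Operation not permitted */

	close(fd);
	return 0;
}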
--- /dev/null
+From aec86b052df6541cc97c5fca44e5934cbea4963b Mon Sep 17 00:00:00 2001
+From: Michael Ellerman <mpe@ellerman.id.au>
+Date: Thu, 6 May 2021 14:49:59 +1000
+Subject: powerpc/64s: Fix crashes when toggling entry flush barrier
+
+From: Michael Ellerman <mpe@ellerman.id.au>
+
+commit aec86b052df6541cc97c5fca44e5934cbea4963b upstream.
+
+The entry flush mitigation can be enabled/disabled at runtime via a
+debugfs file (entry_flush), which causes the kernel to patch itself to
+enable/disable the relevant mitigations.
+
+However depending on which mitigation we're using, it may not be safe to
+do that patching while other CPUs are active. For example the following
+crash:
+
+ sleeper[15639]: segfault (11) at c000000000004c20 nip c000000000004c20 lr c000000000004c20
+
+Shows that we returned to userspace with a corrupted LR that points into
+the kernel, due to executing the partially patched call to the fallback
+entry flush (ie. we missed the LR restore).
+
+Fix it by doing the patching under stop machine. The CPUs that aren't
+doing the patching will be spinning in the core of the stop machine
+logic. That is currently sufficient for our purposes, because none of
+the patching we do is to that code or anywhere in the vicinity.
+
+Fixes: f79643787e0a ("powerpc/64s: flush L1D on kernel entry")
+Cc: stable@vger.kernel.org # v5.10+
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210506044959.1298123-2-mpe@ellerman.id.au
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/lib/feature-fixups.c | 16 +++++++++++++++-
+ 1 file changed, 15 insertions(+), 1 deletion(-)
+
+--- a/arch/powerpc/lib/feature-fixups.c
++++ b/arch/powerpc/lib/feature-fixups.c
+@@ -299,8 +299,9 @@ void do_uaccess_flush_fixups(enum l1d_fl
+ : "unknown");
+ }
+
+-void do_entry_flush_fixups(enum l1d_flush_type types)
++static int __do_entry_flush_fixups(void *data)
+ {
++ enum l1d_flush_type types = *(enum l1d_flush_type *)data;
+ unsigned int instrs[3], *dest;
+ long *start, *end;
+ int i;
+@@ -369,6 +370,19 @@ void do_entry_flush_fixups(enum l1d_flus
+ : "ori type" :
+ (types & L1D_FLUSH_MTTRIG) ? "mttrig type"
+ : "unknown");
++
++ return 0;
++}
++
++void do_entry_flush_fixups(enum l1d_flush_type types)
++{
++ /*
++ * The call to the fallback flush can not be safely patched in/out while
++ * other CPUs are executing it. So call __do_entry_flush_fixups() on one
++ * CPU while all other CPUs spin in the stop machine core with interrupts
++ * hard disabled.
++ */
++ stop_machine(__do_entry_flush_fixups, &types, NULL);
+ }
+
+ void do_rfi_flush_fixups(enum l1d_flush_type types)
--- /dev/null
+From 8ec7791bae1327b1c279c5cd6e929c3b12daaf0a Mon Sep 17 00:00:00 2001
+From: Michael Ellerman <mpe@ellerman.id.au>
+Date: Thu, 6 May 2021 14:49:58 +1000
+Subject: powerpc/64s: Fix crashes when toggling stf barrier
+
+From: Michael Ellerman <mpe@ellerman.id.au>
+
+commit 8ec7791bae1327b1c279c5cd6e929c3b12daaf0a upstream.
+
+The STF (store-to-load forwarding) barrier mitigation can be
+enabled/disabled at runtime via a debugfs file (stf_barrier), which
+causes the kernel to patch itself to enable/disable the relevant
+mitigations.
+
+However depending on which mitigation we're using, it may not be safe to
+do that patching while other CPUs are active. For example the following
+crash:
+
+ User access of kernel address (c00000003fff5af0) - exploit attempt? (uid: 0)
+ segfault (11) at c00000003fff5af0 nip 7fff8ad12198 lr 7fff8ad121f8 code 1
+ code: 40820128 e93c00d0 e9290058 7c292840 40810058 38600000 4bfd9a81 e8410018
+ code: 2c030006 41810154 3860ffb6 e9210098 <e94d8ff0> 7d295279 39400000 40820a3c
+
+Shows that we returned to userspace without restoring the user r13
+value, due to executing the partially patched STF exit code.
+
+Fix it by doing the patching under stop machine. The CPUs that aren't
+doing the patching will be spinning in the core of the stop machine
+logic. That is currently sufficient for our purposes, because none of
+the patching we do is to that code or anywhere in the vicinity.
+
+Fixes: a048a07d7f45 ("powerpc/64s: Add support for a store forwarding barrier at kernel entry/exit")
+Cc: stable@vger.kernel.org # v4.17+
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20210506044959.1298123-1-mpe@ellerman.id.au
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/lib/feature-fixups.c | 19 +++++++++++++++++--
+ 1 file changed, 17 insertions(+), 2 deletions(-)
+
+--- a/arch/powerpc/lib/feature-fixups.c
++++ b/arch/powerpc/lib/feature-fixups.c
+@@ -14,6 +14,7 @@
+ #include <linux/string.h>
+ #include <linux/init.h>
+ #include <linux/sched/mm.h>
++#include <linux/stop_machine.h>
+ #include <asm/cputable.h>
+ #include <asm/code-patching.h>
+ #include <asm/page.h>
+@@ -227,11 +228,25 @@ static void do_stf_exit_barrier_fixups(e
+ : "unknown");
+ }
+
++static int __do_stf_barrier_fixups(void *data)
++{
++ enum stf_barrier_type *types = data;
++
++ do_stf_entry_barrier_fixups(*types);
++ do_stf_exit_barrier_fixups(*types);
++
++ return 0;
++}
+
+ void do_stf_barrier_fixups(enum stf_barrier_type types)
+ {
+- do_stf_entry_barrier_fixups(types);
+- do_stf_exit_barrier_fixups(types);
++ /*
++ * The call to the fallback entry flush, and the fallback/sync-ori exit
++ * flush can not be safely patched in/out while other CPUs are executing
++ * them. So call __do_stf_barrier_fixups() on one CPU while all other CPUs
++ * spin in the stop machine core with interrupts hard disabled.
++ */
++ stop_machine(__do_stf_barrier_fixups, &types, NULL);
+ }
+
+ void do_uaccess_flush_fixups(enum l1d_flush_type types)
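A minimal sketch of the stop_machine() pattern that both powerpc fixes above
rely on: the callback runs on a single CPU while every other CPU spins in the
stopper with interrupts hard disabled, so the self-patching cannot race with
CPUs executing the code being patched. patch_my_code() and do_patch() are
hypothetical stand-ins for the real fixup routines, not kernel APIs.

#include <linux/stop_machine.h>
#include <linux/types.h>

static void patch_my_code(bool enable)
{
	/* the actual instruction patching would happen here */
}

static int __do_patch(void *data)
{
	bool *enable = data;

	patch_my_code(*enable);
	return 0;
}

void do_patch(bool enable)
{
	/* NULL cpumask: run the callback on one CPU, park all the others. */
	stop_machine(__do_patch, &enable, NULL);
}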
i40e-fix-phy-type-identifiers-for-2.5g-and-5g-adapte.patch
i40e-remove-lldp-frame-filters.patch
mptcp-fix-splat-when-closing-unaccepted-socket.patch
+arc-entry-fix-off-by-one-error-in-syscall-number-validation.patch
+arc-mm-pae-use-40-bit-physical-page-mask.patch
+arc-mm-use-max_high_pfn-as-a-highmem-zone-border.patch
+sh-remove-unused-variable.patch
+powerpc-64s-fix-crashes-when-toggling-stf-barrier.patch
+powerpc-64s-fix-crashes-when-toggling-entry-flush-barrier.patch
+hfsplus-prevent-corruption-in-shrinking-truncate.patch
+squashfs-fix-divide-error-in-calculate_skip.patch
+userfaultfd-release-page-in-error-path-to-avoid-bug_on.patch
+kasan-fix-unit-tests-with-config_ubsan_local_bounds-enabled.patch
+mm-hugetlb-fix-f_seal_future_write.patch
+mm-hugetlb-fix-cow-where-page-writtable-in-child.patch
+blk-iocost-fix-weight-updates-of-inner-active-iocgs.patch
+x86-sched-fix-the-amd-cppc-maximum-performance-value-on-certain-amd-ryzen-generations.patch
+arm64-mte-initialize-rgsr_el1.seed-in-__cpu_setup.patch
+arm64-fix-race-condition-on-pg_dcache_clean-in-__sync_icache_dcache.patch
+btrfs-fix-deadlock-when-cloning-inline-extents-and-using-qgroups.patch
+btrfs-zoned-fix-silent-data-loss-after-failure-splitting-ordered-extent.patch
+btrfs-fix-race-leading-to-unpersisted-data-and-metadata-on-fsync.patch
+btrfs-initialize-return-variable-in-cleanup_free_space_cache_v1.patch
+btrfs-zoned-sanity-check-zone-type.patch
+drm-radeon-dpm-disable-sclk-switching-on-oland-when-two-4k-60hz-monitors-are-connected.patch
+drm-amd-display-initialize-attribute-for-hdcp_srm-sysfs-file.patch
+drm-i915-avoid-div-by-zero-on-gen2.patch
+drm-i915-dp-use-slow-and-wide-link-training-for-everything.patch
+kvm-exit-halt-polling-on-need_resched-as-well.patch
--- /dev/null
+From 0d3ae948741ac6d80e39ab27b45297367ee477de Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 14 Apr 2021 10:05:17 -0700
+Subject: sh: Remove unused variable
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit 0d3ae948741ac6d80e39ab27b45297367ee477de upstream.
+
+Removes this annoying warning:
+
+arch/sh/kernel/traps.c: In function ‘nmi_trap_handler’:
+arch/sh/kernel/traps.c:183:15: warning: unused variable ‘cpu’ [-Wunused-variable]
+ 183 | unsigned int cpu = smp_processor_id();
+
+Fixes: fe3f1d5d7cd3 ("sh: Get rid of nmi_count()")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210414170517.1205430-1-eric.dumazet@gmail.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/sh/kernel/traps.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/arch/sh/kernel/traps.c
++++ b/arch/sh/kernel/traps.c
+@@ -180,7 +180,6 @@ static inline void arch_ftrace_nmi_exit(
+
+ BUILD_TRAP_HANDLER(nmi)
+ {
+- unsigned int cpu = smp_processor_id();
+ TRAP_HANDLER_DECL;
+
+ arch_ftrace_nmi_enter();
--- /dev/null
+From d6e621de1fceb3b098ebf435ef7ea91ec4838a1a Mon Sep 17 00:00:00 2001
+From: Phillip Lougher <phillip@squashfs.org.uk>
+Date: Fri, 14 May 2021 17:27:16 -0700
+Subject: squashfs: fix divide error in calculate_skip()
+
+From: Phillip Lougher <phillip@squashfs.org.uk>
+
+commit d6e621de1fceb3b098ebf435ef7ea91ec4838a1a upstream.
+
+Syzbot has reported a "divide error" which has been identified as being
+caused by a corrupted file_size value within the file inode. This value
+has been corrupted to a much larger value than expected.
+
+Calculate_skip() is passed i_size_read(inode) >> msblk->block_log. Due to
+the file_size value corruption this overflows the int argument/variable in
+that function, leading to the divide error.
+
+This patch changes the function to use u64. This will accommodate any
+unexpectedly large values due to corruption.
+
+The value returned from calculate_skip() is clamped to be never more than
+SQUASHFS_CACHED_BLKS - 1, or 7. So file_size corruption does not lead to
+an unexpectedly large return result here.
+
+Link: https://lkml.kernel.org/r/20210507152618.9447-1-phillip@squashfs.org.uk
+Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
+Reported-by: <syzbot+e8f781243ce16ac2f962@syzkaller.appspotmail.com>
+Reported-by: <syzbot+7b98870d4fec9447b951@syzkaller.appspotmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/squashfs/file.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/squashfs/file.c
++++ b/fs/squashfs/file.c
+@@ -211,11 +211,11 @@ failure:
+ * If the skip factor is limited in this way then the file will use multiple
+ * slots.
+ */
+-static inline int calculate_skip(int blocks)
++static inline int calculate_skip(u64 blocks)
+ {
+- int skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
++ u64 skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
+ * SQUASHFS_META_INDEXES);
+- return min(SQUASHFS_CACHED_BLKS - 1, skip + 1);
++ return min((u64) SQUASHFS_CACHED_BLKS - 1, skip + 1);
+ }
+
+
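A small standalone illustration (not squashfs code) of the truncation removed
by the patch above: i_size_read() >> block_log is a 64-bit quantity, and
passing it through an 'int' parameter discards the upper bits for corrupted,
very large file sizes. The corrupted size below is an arbitrary made-up value.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t corrupted_size = 0x0123456789abcdefULL;	/* hypothetical */
	unsigned int block_log = 17;				/* 128K blocks */
	uint64_t blocks = corrupted_size >> block_log;

	int as_int = (int)blocks;	/* what the old prototype received */
	uint64_t as_u64 = blocks;	/* what the fixed prototype receives */

	printf("as_int=%d as_u64=%llu\n",
	       as_int, (unsigned long long)as_u64);
	return 0;
}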
--- /dev/null
+From 7ed9d238c7dbb1fdb63ad96a6184985151b0171c Mon Sep 17 00:00:00 2001
+From: Axel Rasmussen <axelrasmussen@google.com>
+Date: Fri, 14 May 2021 17:27:19 -0700
+Subject: userfaultfd: release page in error path to avoid BUG_ON
+
+From: Axel Rasmussen <axelrasmussen@google.com>
+
+commit 7ed9d238c7dbb1fdb63ad96a6184985151b0171c upstream.
+
+Consider the following sequence of events:
+
+1. Userspace issues a UFFD ioctl, which ends up calling into
+ shmem_mfill_atomic_pte(). We successfully account the blocks, we
+ shmem_alloc_page(), but then the copy_from_user() fails. We return
+ -ENOENT. We don't release the page we allocated.
+2. Our caller detects this error code, tries the copy_from_user() after
+ dropping the mmap_lock, and retries, calling back into
+ shmem_mfill_atomic_pte().
+3. Meanwhile, let's say another process filled up the tmpfs being used.
+4. So shmem_mfill_atomic_pte() fails to account blocks this time, and
+ immediately returns - without releasing the page.
+
+This triggers a BUG_ON in our caller, which asserts that the page
+should always be consumed, unless -ENOENT is returned.
+
+To fix this, detect if we have such a "dangling" page when accounting
+fails, and if so, release it before returning.
+
+Link: https://lkml.kernel.org/r/20210428230858.348400-1-axelrasmussen@google.com
+Fixes: cb658a453b93 ("userfaultfd: shmem: avoid leaking blocks and used blocks in UFFDIO_COPY")
+Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
+Reported-by: Hugh Dickins <hughd@google.com>
+Acked-by: Hugh Dickins <hughd@google.com>
+Reviewed-by: Peter Xu <peterx@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/shmem.c | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -2375,8 +2375,18 @@ static int shmem_mfill_atomic_pte(struct
+ pgoff_t offset, max_off;
+
+ ret = -ENOMEM;
+- if (!shmem_inode_acct_block(inode, 1))
++ if (!shmem_inode_acct_block(inode, 1)) {
++ /*
++ * We may have got a page, returned -ENOENT triggering a retry,
++ * and now we find ourselves with -ENOMEM. Release the page, to
++ * avoid a BUG_ON in our caller.
++ */
++ if (unlikely(*pagep)) {
++ put_page(*pagep);
++ *pagep = NULL;
++ }
+ goto out;
++ }
+
+ if (!*pagep) {
+ page = shmem_alloc_page(gfp, info, pgoff);
--- /dev/null
+From 3743d55b289c203d8f77b7cd47c24926b9d186ae Mon Sep 17 00:00:00 2001
+From: Huang Rui <ray.huang@amd.com>
+Date: Sun, 25 Apr 2021 15:34:51 +0800
+Subject: x86, sched: Fix the AMD CPPC maximum performance value on certain AMD Ryzen generations
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Huang Rui <ray.huang@amd.com>
+
+commit 3743d55b289c203d8f77b7cd47c24926b9d186ae upstream.
+
+Some AMD Ryzen generations have a different calculation method for maximum
+performance. 255 does not apply to all ASICs; some specific generations
+should use 166 as the maximum performance. Otherwise, an incorrect frequency
+value is reported, like below:
+
+ ~ → lscpu | grep MHz
+ CPU MHz: 3400.000
+ CPU max MHz: 7228.3198
+ CPU min MHz: 2200.0000
+
+[ mingo: Tidied up whitespace use. ]
+[ Alexander Monakov <amonakov@ispras.ru>: fix 225 -> 255 typo. ]
+
+Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems")
+Fixes: 3c55e94c0ade ("cpufreq: ACPI: Extend frequency tables to cover boost frequencies")
+Reported-by: Jason Bagavatsingham <jason.bagavatsingham@gmail.com>
+Fixed-by: Alexander Monakov <amonakov@ispras.ru>
+Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Huang Rui <ray.huang@amd.com>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Tested-by: Jason Bagavatsingham <jason.bagavatsingham@gmail.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20210425073451.2557394-1-ray.huang@amd.com
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=211791
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/processor.h | 2 ++
+ arch/x86/kernel/cpu/amd.c | 16 ++++++++++++++++
+ arch/x86/kernel/smpboot.c | 2 +-
+ drivers/cpufreq/acpi-cpufreq.c | 6 +++++-
+ 4 files changed, 24 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -804,8 +804,10 @@ DECLARE_PER_CPU(u64, msr_misc_features_s
+
+ #ifdef CONFIG_CPU_SUP_AMD
+ extern u32 amd_get_nodes_per_socket(void);
++extern u32 amd_get_highest_perf(void);
+ #else
+ static inline u32 amd_get_nodes_per_socket(void) { return 0; }
++static inline u32 amd_get_highest_perf(void) { return 0; }
+ #endif
+
+ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
+--- a/arch/x86/kernel/cpu/amd.c
++++ b/arch/x86/kernel/cpu/amd.c
+@@ -1170,3 +1170,19 @@ void set_dr_addr_mask(unsigned long mask
+ break;
+ }
+ }
++
++u32 amd_get_highest_perf(void)
++{
++ struct cpuinfo_x86 *c = &boot_cpu_data;
++
++ if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
++ (c->x86_model >= 0x70 && c->x86_model < 0x80)))
++ return 166;
++
++ if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
++ (c->x86_model >= 0x40 && c->x86_model < 0x70)))
++ return 166;
++
++ return 255;
++}
++EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+--- a/arch/x86/kernel/smpboot.c
++++ b/arch/x86/kernel/smpboot.c
+@@ -2046,7 +2046,7 @@ static bool amd_set_max_freq_ratio(void)
+ return false;
+ }
+
+- highest_perf = perf_caps.highest_perf;
++ highest_perf = amd_get_highest_perf();
+ nominal_perf = perf_caps.nominal_perf;
+
+ if (!highest_perf || !nominal_perf) {
+--- a/drivers/cpufreq/acpi-cpufreq.c
++++ b/drivers/cpufreq/acpi-cpufreq.c
+@@ -646,7 +646,11 @@ static u64 get_max_boost_ratio(unsigned
+ return 0;
+ }
+
+- highest_perf = perf_caps.highest_perf;
++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
++ highest_perf = amd_get_highest_perf();
++ else
++ highest_perf = perf_caps.highest_perf;
++
+ nominal_perf = perf_caps.nominal_perf;
+
+ if (!highest_perf || !nominal_perf) {
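As a rough sanity check of the numbers quoted above (assuming the reported
maximum simply scales with highest_perf, as it does in
amd_set_max_freq_ratio() and get_max_boost_ratio()):

  7228.32 MHz * 166 / 255 ≈ 4705 MHz

so substituting 166 for the hard-coded 255 brings the bogus "CPU max MHz"
down to a plausible Ryzen boost clock.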