From: Greg Kroah-Hartman Date: Sat, 16 Sep 2023 12:20:29 +0000 (+0200) Subject: 6.5-stable patches X-Git-Tag: v5.10.195~43 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=fd6453bed330ab41f66cff556df88592c6a0f092;p=thirdparty%2Fkernel%2Fstable-queue.git 6.5-stable patches added patches: arc-atomics-add-compiler-barrier-to-atomic-operations.patch ata-ahci-add-elkhart-lake-ahci-controller.patch ata-pata_falcon-fix-io-base-selection-for-q40.patch ata-pata_ftide010-add-missing-module_description.patch ata-sata_gemini-add-missing-module_description.patch btrfs-don-t-start-transaction-when-joining-with-trans_join_nostart.patch btrfs-fix-start-transaction-qgroup-rsv-double-free.patch btrfs-free-qgroup-rsv-on-io-failure.patch btrfs-set-page-extent-mapped-after-read_folio-in-relocate_one_page.patch btrfs-zoned-do-not-zone-finish-data-relocation-block-group.patch clocksource-drivers-arm_arch_timer-disable-timer-before-programming-cval.patch dmaengine-sh-rz-dmac-fix-destination-and-source-data-size-setting.patch ext4-add-correct-group-descriptors-and-reserved-gdt-blocks-to-system-zone.patch ext4-drop-dio-overwrite-only-flag-and-associated-warning.patch ext4-fix-bug-in-ext4_mb_new_inode_pa-due-to-overflow.patch ext4-fix-memory-leaks-in-ext4_fname_-setup_filename-prepare_lookup.patch ext4-fix-slab-use-after-free-in-ext4_es_insert_extent.patch f2fs-avoid-false-alarm-of-circular-locking.patch f2fs-flush-inode-if-atomic-file-is-aborted.patch f2fs-get-out-of-a-repeat-loop-when-getting-a-locked-data-page.patch fuse-nlookup-missing-decrement-in-fuse_direntplus_link.patch hwspinlock-qcom-add-missing-regmap-config-for-sfpb-mmio-implementation.patch jbd2-check-jh-b_transaction-before-removing-it-from-checkpoint.patch jbd2-correct-the-end-of-the-journal-recovery-scan-range.patch jbd2-fix-checkpoint-cleanup-performance-regression.patch lib-test_scanf-add-explicit-type-cast-to-result-initialization-in-test_number_prefix.patch 
memcontrol-ensure-memcg-acquired-by-id-is-properly-set-up.patch misc-fastrpc-fix-incorrect-dma-mapping-unmap-request.patch misc-fastrpc-fix-remote-heap-allocation-request.patch --- diff --git a/queue-6.5/arc-atomics-add-compiler-barrier-to-atomic-operations.patch b/queue-6.5/arc-atomics-add-compiler-barrier-to-atomic-operations.patch new file mode 100644 index 00000000000..7dd9450eb75 --- /dev/null +++ b/queue-6.5/arc-atomics-add-compiler-barrier-to-atomic-operations.patch @@ -0,0 +1,100 @@ +From 42f51fb24fd39cc547c086ab3d8a314cc603a91c Mon Sep 17 00:00:00 2001 +From: Pavel Kozlov +Date: Tue, 15 Aug 2023 19:11:36 +0400 +Subject: ARC: atomics: Add compiler barrier to atomic operations... + +From: Pavel Kozlov + +commit 42f51fb24fd39cc547c086ab3d8a314cc603a91c upstream. + +... to avoid unwanted gcc optimizations + +SMP kernels fail to boot with commit 596ff4a09b89 +("cpumask: re-introduce constant-sized cpumask optimizations"). + +| +| percpu: BUG: failure at mm/percpu.c:2981/pcpu_build_alloc_info()! +| + +The write operation performed by the SCOND instruction in the atomic +inline asm code is not properly passed to the compiler. The compiler +cannot correctly optimize a nested loop that runs through the cpumask +in the pcpu_build_alloc_info() function. + +Fix this by adding a compiler barrier (memory clobber in inline asm). + +Apparently atomic ops used to have memory clobber implicitly via +surrounding smp_mb(). However commit b64be6836993c431e +("ARC: atomics: implement relaxed variants") removed the smp_mb() for +the relaxed variants, but failed to add the explicit compiler barrier. 
+ +Link: https://github.com/foss-for-synopsys-dwc-arc-processors/linux/issues/135 +Cc: # v6.3+ +Fixes: b64be6836993c43 ("ARC: atomics: implement relaxed variants") +Signed-off-by: Pavel Kozlov +Signed-off-by: Vineet Gupta +[vgupta: tweaked the changelog and added Fixes tag] +Signed-off-by: Greg Kroah-Hartman +--- + arch/arc/include/asm/atomic-llsc.h | 6 +++--- + arch/arc/include/asm/atomic64-arcv2.h | 6 +++--- + 2 files changed, 6 insertions(+), 6 deletions(-) + +--- a/arch/arc/include/asm/atomic-llsc.h ++++ b/arch/arc/include/asm/atomic-llsc.h +@@ -18,7 +18,7 @@ static inline void arch_atomic_##op(int + : [val] "=&r" (val) /* Early clobber to prevent reg reuse */ \ + : [ctr] "r" (&v->counter), /* Not "m": llock only supports reg direct addr mode */ \ + [i] "ir" (i) \ +- : "cc"); \ ++ : "cc", "memory"); \ + } \ + + #define ATOMIC_OP_RETURN(op, asm_op) \ +@@ -34,7 +34,7 @@ static inline int arch_atomic_##op##_ret + : [val] "=&r" (val) \ + : [ctr] "r" (&v->counter), \ + [i] "ir" (i) \ +- : "cc"); \ ++ : "cc", "memory"); \ + \ + return val; \ + } +@@ -56,7 +56,7 @@ static inline int arch_atomic_fetch_##op + [orig] "=&r" (orig) \ + : [ctr] "r" (&v->counter), \ + [i] "ir" (i) \ +- : "cc"); \ ++ : "cc", "memory"); \ + \ + return orig; \ + } +--- a/arch/arc/include/asm/atomic64-arcv2.h ++++ b/arch/arc/include/asm/atomic64-arcv2.h +@@ -60,7 +60,7 @@ static inline void arch_atomic64_##op(s6 + " bnz 1b \n" \ + : "=&r"(val) \ + : "r"(&v->counter), "ir"(a) \ +- : "cc"); \ ++ : "cc", "memory"); \ + } \ + + #define ATOMIC64_OP_RETURN(op, op1, op2) \ +@@ -77,7 +77,7 @@ static inline s64 arch_atomic64_##op##_r + " bnz 1b \n" \ + : [val] "=&r"(val) \ + : "r"(&v->counter), "ir"(a) \ +- : "cc"); /* memory clobber comes from smp_mb() */ \ ++ : "cc", "memory"); \ + \ + return val; \ + } +@@ -99,7 +99,7 @@ static inline s64 arch_atomic64_fetch_## + " bnz 1b \n" \ + : "=&r"(orig), "=&r"(val) \ + : "r"(&v->counter), "ir"(a) \ +- : "cc"); /* memory clobber comes from smp_mb() */ \ ++ : 
"cc", "memory"); \ + \ + return orig; \ + } diff --git a/queue-6.5/ata-ahci-add-elkhart-lake-ahci-controller.patch b/queue-6.5/ata-ahci-add-elkhart-lake-ahci-controller.patch new file mode 100644 index 00000000000..684332d92d3 --- /dev/null +++ b/queue-6.5/ata-ahci-add-elkhart-lake-ahci-controller.patch @@ -0,0 +1,61 @@ +From 2a2df98ec592667927b5c1351afa6493ea125c9f Mon Sep 17 00:00:00 2001 +From: Werner Fischer +Date: Tue, 29 Aug 2023 13:33:58 +0200 +Subject: ata: ahci: Add Elkhart Lake AHCI controller +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Werner Fischer + +commit 2a2df98ec592667927b5c1351afa6493ea125c9f upstream. + +Elkhart Lake is the successor of Apollo Lake and Gemini Lake. These +CPUs and their PCHs are used in mobile and embedded environments. + +With this patch I suggest that Elkhart Lake SATA controllers [1] should +use the default LPM policy for mobile chipsets. +The disadvantage of missing hot-plug support with this setting should +not be an issue, as those CPUs are used in embedded environments and +not in servers with hot-plug backplanes. + +We discovered that the Elkhart Lake SATA controllers have been missing +in ahci.c after a customer reported the throttling of his SATA SSD +after a short period of higher I/O. We determined the high temperature +of the SSD controller in idle mode as the root cause for that. + +Depending on the used SSD, we have seen up to 1.8 Watt lower system +idle power usage and up to 30°C lower SSD controller temperatures in +our tests, when we set med_power_with_dipm manually. I have provided a +table showing seven different SATA SSDs from ATP, Intel/Solidigm and +Samsung [2]. + +Intel lists a total of 3 SATA controller IDs (4B60, 4B62, 4B63) in [1] +for those mobile PCHs. +This commit just adds 0x4b63 as I do not have test systems with 0x4b60 +and 0x4b62 SATA controllers. +I have tested this patch with a system which uses 0x4b63 as SATA +controller. 
+ +[1] https://sata-io.org/product/8803 +[2] https://www.thomas-krenn.com/en/wiki/SATA_Link_Power_Management#Example_LES_v4 + +Signed-off-by: Werner Fischer +Cc: stable@vger.kernel.org +Signed-off-by: Damien Le Moal +Signed-off-by: Greg Kroah-Hartman +--- + drivers/ata/ahci.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/ata/ahci.c ++++ b/drivers/ata/ahci.c +@@ -421,6 +421,8 @@ static const struct pci_device_id ahci_p + { PCI_VDEVICE(INTEL, 0x34d3), board_ahci_low_power }, /* Ice Lake LP AHCI */ + { PCI_VDEVICE(INTEL, 0x02d3), board_ahci_low_power }, /* Comet Lake PCH-U AHCI */ + { PCI_VDEVICE(INTEL, 0x02d7), board_ahci_low_power }, /* Comet Lake PCH RAID */ ++ /* Elkhart Lake IDs 0x4b60 & 0x4b62 https://sata-io.org/product/8803 not tested yet */ ++ { PCI_VDEVICE(INTEL, 0x4b63), board_ahci_low_power }, /* Elkhart Lake AHCI */ + + /* JMicron 360/1/3/5/6, match class to avoid IDE function */ + { PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, diff --git a/queue-6.5/ata-pata_falcon-fix-io-base-selection-for-q40.patch b/queue-6.5/ata-pata_falcon-fix-io-base-selection-for-q40.patch new file mode 100644 index 00000000000..1c43b9de37e --- /dev/null +++ b/queue-6.5/ata-pata_falcon-fix-io-base-selection-for-q40.patch @@ -0,0 +1,123 @@ +From 8a1f00b753ecfdb117dc1a07e68c46d80e7923ea Mon Sep 17 00:00:00 2001 +From: Michael Schmitz +Date: Sun, 27 Aug 2023 16:13:47 +1200 +Subject: ata: pata_falcon: fix IO base selection for Q40 + +From: Michael Schmitz + +commit 8a1f00b753ecfdb117dc1a07e68c46d80e7923ea upstream. + +With commit 44b1fbc0f5f3 ("m68k/q40: Replace q40ide driver +with pata_falcon and falconide"), the Q40 IDE driver was +replaced by pata_falcon.c. + +Both IO and memory resources were defined for the Q40 IDE +platform device, but definition of the IDE register addresses +was modeled after the Falcon case, both in use of the memory +resources and in including register shift and byte vs. word +offset in the address. 
+ +This was correct for the Falcon case, which does not apply +any address translation to the register addresses. In the +Q40 case, all of device base address, byte access offset +and register shift is included in the platform specific +ISA access translation (in asm/mm_io.h). + +As a consequence, such address translation gets applied +twice, and register addresses are mangled. + +Use the device base address from the platform IO resource +for Q40 (the IO address translation will then add the correct +ISA window base address and byte access offset), with register +shift 1. Use MMIO base address and register shift 2 as before +for Falcon. + +Encode PIO_OFFSET into IO port addresses for all registers +for Q40 except the data transfer register. Encode the MMIO +offset there (pata_falcon_data_xfer() directly uses raw IO +with no address translation). + +Reported-by: William R Sowerbutts +Closes: https://lore.kernel.org/r/CAMuHMdUU62jjunJh9cqSqHT87B0H0A4udOOPs=WN7WZKpcagVA@mail.gmail.com +Link: https://lore.kernel.org/r/CAMuHMdUU62jjunJh9cqSqHT87B0H0A4udOOPs=WN7WZKpcagVA@mail.gmail.com +Fixes: 44b1fbc0f5f3 ("m68k/q40: Replace q40ide driver with pata_falcon and falconide") +Cc: stable@vger.kernel.org +Cc: Finn Thain +Cc: Geert Uytterhoeven +Tested-by: William R Sowerbutts +Signed-off-by: Michael Schmitz +Reviewed-by: Sergey Shtylyov +Reviewed-by: Geert Uytterhoeven +Signed-off-by: Damien Le Moal +Signed-off-by: Greg Kroah-Hartman +--- + drivers/ata/pata_falcon.c | 50 ++++++++++++++++++++++++++-------------------- + 1 file changed, 29 insertions(+), 21 deletions(-) + +--- a/drivers/ata/pata_falcon.c ++++ b/drivers/ata/pata_falcon.c +@@ -123,8 +123,8 @@ static int __init pata_falcon_init_one(s + struct resource *base_res, *ctl_res, *irq_res; + struct ata_host *host; + struct ata_port *ap; +- void __iomem *base; +- int irq = 0; ++ void __iomem *base, *ctl_base; ++ int irq = 0, io_offset = 1, reg_shift = 2; /* Falcon defaults */ + + dev_info(&pdev->dev, "Atari Falcon and 
Q40/Q60 PATA controller\n"); + +@@ -165,26 +165,34 @@ static int __init pata_falcon_init_one(s + ap->pio_mask = ATA_PIO4; + ap->flags |= ATA_FLAG_SLAVE_POSS | ATA_FLAG_NO_IORDY; + +- base = (void __iomem *)base_mem_res->start; + /* N.B. this assumes data_addr will be used for word-sized I/O only */ +- ap->ioaddr.data_addr = base + 0 + 0 * 4; +- ap->ioaddr.error_addr = base + 1 + 1 * 4; +- ap->ioaddr.feature_addr = base + 1 + 1 * 4; +- ap->ioaddr.nsect_addr = base + 1 + 2 * 4; +- ap->ioaddr.lbal_addr = base + 1 + 3 * 4; +- ap->ioaddr.lbam_addr = base + 1 + 4 * 4; +- ap->ioaddr.lbah_addr = base + 1 + 5 * 4; +- ap->ioaddr.device_addr = base + 1 + 6 * 4; +- ap->ioaddr.status_addr = base + 1 + 7 * 4; +- ap->ioaddr.command_addr = base + 1 + 7 * 4; +- +- base = (void __iomem *)ctl_mem_res->start; +- ap->ioaddr.altstatus_addr = base + 1; +- ap->ioaddr.ctl_addr = base + 1; +- +- ata_port_desc(ap, "cmd 0x%lx ctl 0x%lx", +- (unsigned long)base_mem_res->start, +- (unsigned long)ctl_mem_res->start); ++ ap->ioaddr.data_addr = (void __iomem *)base_mem_res->start; ++ ++ if (base_res) { /* only Q40 has IO resources */ ++ io_offset = 0x10000; ++ reg_shift = 0; ++ base = (void __iomem *)base_res->start; ++ ctl_base = (void __iomem *)ctl_res->start; ++ } else { ++ base = (void __iomem *)base_mem_res->start; ++ ctl_base = (void __iomem *)ctl_mem_res->start; ++ } ++ ++ ap->ioaddr.error_addr = base + io_offset + (1 << reg_shift); ++ ap->ioaddr.feature_addr = base + io_offset + (1 << reg_shift); ++ ap->ioaddr.nsect_addr = base + io_offset + (2 << reg_shift); ++ ap->ioaddr.lbal_addr = base + io_offset + (3 << reg_shift); ++ ap->ioaddr.lbam_addr = base + io_offset + (4 << reg_shift); ++ ap->ioaddr.lbah_addr = base + io_offset + (5 << reg_shift); ++ ap->ioaddr.device_addr = base + io_offset + (6 << reg_shift); ++ ap->ioaddr.status_addr = base + io_offset + (7 << reg_shift); ++ ap->ioaddr.command_addr = base + io_offset + (7 << reg_shift); ++ ++ ap->ioaddr.altstatus_addr = ctl_base + 
io_offset; ++ ap->ioaddr.ctl_addr = ctl_base + io_offset; ++ ++ ata_port_desc(ap, "cmd %px ctl %px data %px", ++ base, ctl_base, ap->ioaddr.data_addr); + + irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (irq_res && irq_res->start > 0) { diff --git a/queue-6.5/ata-pata_ftide010-add-missing-module_description.patch b/queue-6.5/ata-pata_ftide010-add-missing-module_description.patch new file mode 100644 index 00000000000..f89f6d5bda0 --- /dev/null +++ b/queue-6.5/ata-pata_ftide010-add-missing-module_description.patch @@ -0,0 +1,34 @@ +From 7274eef5729037300f29d14edeb334a47a098f65 Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Thu, 24 Aug 2023 07:41:59 +0900 +Subject: ata: pata_ftide010: Add missing MODULE_DESCRIPTION + +From: Damien Le Moal + +commit 7274eef5729037300f29d14edeb334a47a098f65 upstream. + +Add the missing MODULE_DESCRIPTION() to avoid warnings such as: + +WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/ata/pata_ftide010.o + +when compiling with W=1. 
+ +Fixes: be4e456ed3a5 ("ata: Add driver for Faraday Technology FTIDE010") +Cc: stable@vger.kernel.org +Signed-off-by: Damien Le Moal +Reviewed-by: Linus Walleij +Signed-off-by: Greg Kroah-Hartman +--- + drivers/ata/pata_ftide010.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/ata/pata_ftide010.c ++++ b/drivers/ata/pata_ftide010.c +@@ -567,6 +567,7 @@ static struct platform_driver pata_ftide + }; + module_platform_driver(pata_ftide010_driver); + ++MODULE_DESCRIPTION("low level driver for Faraday Technology FTIDE010"); + MODULE_AUTHOR("Linus Walleij "); + MODULE_LICENSE("GPL"); + MODULE_ALIAS("platform:" DRV_NAME); diff --git a/queue-6.5/ata-sata_gemini-add-missing-module_description.patch b/queue-6.5/ata-sata_gemini-add-missing-module_description.patch new file mode 100644 index 00000000000..4dc5ff86c16 --- /dev/null +++ b/queue-6.5/ata-sata_gemini-add-missing-module_description.patch @@ -0,0 +1,34 @@ +From 8566572bf3b4d6e416a4bf2110dbb4817d11ba59 Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Thu, 24 Aug 2023 07:43:18 +0900 +Subject: ata: sata_gemini: Add missing MODULE_DESCRIPTION + +From: Damien Le Moal + +commit 8566572bf3b4d6e416a4bf2110dbb4817d11ba59 upstream. + +Add the missing MODULE_DESCRIPTION() to avoid warnings such as: + +WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/ata/sata_gemini.o + +when compiling with W=1. 
+ +Fixes: be4e456ed3a5 ("ata: Add driver for Faraday Technology FTIDE010") +Cc: stable@vger.kernel.org +Signed-off-by: Damien Le Moal +Reviewed-by: Linus Walleij +Signed-off-by: Greg Kroah-Hartman +--- + drivers/ata/sata_gemini.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/ata/sata_gemini.c ++++ b/drivers/ata/sata_gemini.c +@@ -428,6 +428,7 @@ static struct platform_driver gemini_sat + }; + module_platform_driver(gemini_sata_driver); + ++MODULE_DESCRIPTION("low level driver for Cortina Systems Gemini SATA bridge"); + MODULE_AUTHOR("Linus Walleij "); + MODULE_LICENSE("GPL"); + MODULE_ALIAS("platform:" DRV_NAME); diff --git a/queue-6.5/btrfs-don-t-start-transaction-when-joining-with-trans_join_nostart.patch b/queue-6.5/btrfs-don-t-start-transaction-when-joining-with-trans_join_nostart.patch new file mode 100644 index 00000000000..29cd3b0b866 --- /dev/null +++ b/queue-6.5/btrfs-don-t-start-transaction-when-joining-with-trans_join_nostart.patch @@ -0,0 +1,43 @@ +From 4490e803e1fe9fab8db5025e44e23b55df54078b Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 26 Jul 2023 16:56:57 +0100 +Subject: btrfs: don't start transaction when joining with TRANS_JOIN_NOSTART + +From: Filipe Manana + +commit 4490e803e1fe9fab8db5025e44e23b55df54078b upstream. + +When joining a transaction with TRANS_JOIN_NOSTART, if we don't find a +running transaction we end up creating one. This goes against the purpose +of TRANS_JOIN_NOSTART which is to join a running transaction if its state +is at or below the state TRANS_STATE_COMMIT_START, otherwise return an +-ENOENT error and don't start a new transaction. So fix this to not create +a new transaction if there's no running transaction at or below that +state. 
+ +CC: stable@vger.kernel.org # 4.14+ +Fixes: a6d155d2e363 ("Btrfs: fix deadlock between fiemap and transaction commits") +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/transaction.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -292,10 +292,11 @@ loop: + spin_unlock(&fs_info->trans_lock); + + /* +- * If we are ATTACH, we just want to catch the current transaction, +- * and commit it. If there is no transaction, just return ENOENT. ++ * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the ++ * current transaction, and commit it. If there is no transaction, just ++ * return ENOENT. + */ +- if (type == TRANS_ATTACH) ++ if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART) + return -ENOENT; + + /* diff --git a/queue-6.5/btrfs-fix-start-transaction-qgroup-rsv-double-free.patch b/queue-6.5/btrfs-fix-start-transaction-qgroup-rsv-double-free.patch new file mode 100644 index 00000000000..2827787810d --- /dev/null +++ b/queue-6.5/btrfs-fix-start-transaction-qgroup-rsv-double-free.patch @@ -0,0 +1,97 @@ +From a6496849671a5bc9218ecec25a983253b34351b1 Mon Sep 17 00:00:00 2001 +From: Boris Burkov +Date: Fri, 21 Jul 2023 09:02:07 -0700 +Subject: btrfs: fix start transaction qgroup rsv double free + +From: Boris Burkov + +commit a6496849671a5bc9218ecec25a983253b34351b1 upstream. + +btrfs_start_transaction reserves metadata space of the PERTRANS type +before it identifies a transaction to start/join. This allows flushing +when reserving that space without a deadlock. However, it results in a +race which temporarily breaks qgroup rsv accounting. + +T1 T2 +start_transaction +do_stuff + start_transaction + qgroup_reserve_meta_pertrans +commit_transaction + qgroup_free_meta_all_pertrans + hit an error starting txn + goto reserve_fail + qgroup_free_meta_pertrans (already freed!) 
+ +The basic issue is that there is nothing preventing another commit from +committing before start_transaction finishes (in fact sometimes we +intentionally wait for it) so any error path that frees the reserve is +at risk of this race. + +While this exact space was getting freed anyway, and it's not a huge +deal to double free it (just a warning, the free code catches this), it +can result in incorrectly freeing some other pertrans reservation in +this same reservation, which could then lead to spuriously granting +reservations we might not have the space for. Therefore, I do believe it +is worth fixing. + +To fix it, use the existing prealloc->pertrans conversion mechanism. +When we first reserve the space, we reserve prealloc space and only when +we are sure we have a transaction do we convert it to pertrans. This way +any racing commits do not blow away our reservation, but we still get a +pertrans reservation that is freed when _this_ transaction gets committed. + +This issue can be reproduced by running generic/269 with either qgroups +or squotas enabled via mkfs on the scratch device. + +Reviewed-by: Josef Bacik +CC: stable@vger.kernel.org # 5.10+ +Signed-off-by: Boris Burkov +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/transaction.c | 19 ++++++++++++++++--- + 1 file changed, 16 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -591,8 +591,13 @@ start_transaction(struct btrfs_root *roo + u64 delayed_refs_bytes = 0; + + qgroup_reserved = num_items * fs_info->nodesize; +- ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, +- enforce_qgroups); ++ /* ++ * Use prealloc for now, as there might be a currently running ++ * transaction that could free this reserved space prematurely ++ * by committing. 
++ */ ++ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved, ++ enforce_qgroups, false); + if (ret) + return ERR_PTR(ret); + +@@ -705,6 +710,14 @@ again: + h->reloc_reserved = reloc_reserved; + } + ++ /* ++ * Now that we have found a transaction to be a part of, convert the ++ * qgroup reservation from prealloc to pertrans. A different transaction ++ * can't race in and free our pertrans out from under us. ++ */ ++ if (qgroup_reserved) ++ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); ++ + got_it: + if (!current->journal_info) + current->journal_info = h; +@@ -752,7 +765,7 @@ alloc_fail: + btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, + num_bytes, NULL); + reserve_fail: +- btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved); ++ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); + return ERR_PTR(ret); + } + diff --git a/queue-6.5/btrfs-free-qgroup-rsv-on-io-failure.patch b/queue-6.5/btrfs-free-qgroup-rsv-on-io-failure.patch new file mode 100644 index 00000000000..1e6fa7f7d4d --- /dev/null +++ b/queue-6.5/btrfs-free-qgroup-rsv-on-io-failure.patch @@ -0,0 +1,46 @@ +From e28b02118b94e42be3355458a2406c6861e2dd32 Mon Sep 17 00:00:00 2001 +From: Boris Burkov +Date: Fri, 21 Jul 2023 09:02:06 -0700 +Subject: btrfs: free qgroup rsv on io failure + +From: Boris Burkov + +commit e28b02118b94e42be3355458a2406c6861e2dd32 upstream. + +If we do a write whose bio suffers an error, we will never reclaim the +qgroup reserved space for it. We allocate the space in the write_iter +codepath, then release the reservation as we allocate the ordered +extent, but we only create a delayed ref if the ordered extent finishes. +If it has an error, we simply leak the rsv. This is apparent in running +any error injecting (dmerror) fstests like btrfs/146 or btrfs/160. Such +tests fail due to dmesg on umount complaining about the leaked qgroup +data space. + +When we clean up other aspects of space on failed ordered_extents, also +free the qgroup rsv. 
+ +Reviewed-by: Josef Bacik +CC: stable@vger.kernel.org # 5.10+ +Signed-off-by: Boris Burkov +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/inode.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -3359,6 +3359,13 @@ out: + btrfs_free_reserved_extent(fs_info, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, 1); ++ /* ++ * Actually free the qgroup rsv which was released when ++ * the ordered extent was created. ++ */ ++ btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, ++ ordered_extent->qgroup_rsv, ++ BTRFS_QGROUP_RSV_DATA); + } + } + diff --git a/queue-6.5/btrfs-set-page-extent-mapped-after-read_folio-in-relocate_one_page.patch b/queue-6.5/btrfs-set-page-extent-mapped-after-read_folio-in-relocate_one_page.patch new file mode 100644 index 00000000000..3e157059e4d --- /dev/null +++ b/queue-6.5/btrfs-set-page-extent-mapped-after-read_folio-in-relocate_one_page.patch @@ -0,0 +1,100 @@ +From e7f1326cc24e22b38afc3acd328480a1183f9e79 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Mon, 31 Jul 2023 11:13:00 -0400 +Subject: btrfs: set page extent mapped after read_folio in relocate_one_page + +From: Josef Bacik + +commit e7f1326cc24e22b38afc3acd328480a1183f9e79 upstream. + +One of the CI runs triggered the following panic + + assertion failed: PagePrivate(page) && page->private, in fs/btrfs/subpage.c:229 + ------------[ cut here ]------------ + kernel BUG at fs/btrfs/subpage.c:229! 
+ Internal error: Oops - BUG: 00000000f2000800 [#1] SMP + CPU: 0 PID: 923660 Comm: btrfs Not tainted 6.5.0-rc3+ #1 + pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) + pc : btrfs_subpage_assert+0xbc/0xf0 + lr : btrfs_subpage_assert+0xbc/0xf0 + sp : ffff800093213720 + x29: ffff800093213720 x28: ffff8000932138b4 x27: 000000000c280000 + x26: 00000001b5d00000 x25: 000000000c281000 x24: 000000000c281fff + x23: 0000000000001000 x22: 0000000000000000 x21: ffffff42b95bf880 + x20: ffff42b9528e0000 x19: 0000000000001000 x18: ffffffffffffffff + x17: 667274622f736620 x16: 6e69202c65746176 x15: 0000000000000028 + x14: 0000000000000003 x13: 00000000002672d7 x12: 0000000000000000 + x11: ffffcd3f0ccd9204 x10: ffffcd3f0554ae50 x9 : ffffcd3f0379528c + x8 : ffff800093213428 x7 : 0000000000000000 x6 : ffffcd3f091771e8 + x5 : ffff42b97f333948 x4 : 0000000000000000 x3 : 0000000000000000 + x2 : 0000000000000000 x1 : ffff42b9556cde80 x0 : 000000000000004f + Call trace: + btrfs_subpage_assert+0xbc/0xf0 + btrfs_subpage_set_dirty+0x38/0xa0 + btrfs_page_set_dirty+0x58/0x88 + relocate_one_page+0x204/0x5f0 + relocate_file_extent_cluster+0x11c/0x180 + relocate_data_extent+0xd0/0xf8 + relocate_block_group+0x3d0/0x4e8 + btrfs_relocate_block_group+0x2d8/0x490 + btrfs_relocate_chunk+0x54/0x1a8 + btrfs_balance+0x7f4/0x1150 + btrfs_ioctl+0x10f0/0x20b8 + __arm64_sys_ioctl+0x120/0x11d8 + invoke_syscall.constprop.0+0x80/0xd8 + do_el0_svc+0x6c/0x158 + el0_svc+0x50/0x1b0 + el0t_64_sync_handler+0x120/0x130 + el0t_64_sync+0x194/0x198 + Code: 91098021 b0007fa0 91346000 97e9c6d2 (d4210000) + +This is the same problem outlined in 17b17fcd6d44 ("btrfs: +set_page_extent_mapped after read_folio in btrfs_cont_expand") , and the +fix is the same. 
I originally looked for the same pattern elsewhere in +our code, but mistakenly skipped over this code because I saw the page +cache readahead before we set_page_extent_mapped, not realizing that +this was only in the !page case, that we can still end up with a +!uptodate page and then do the btrfs_read_folio further down. + +The fix here is the same as the above mentioned patch, move the +set_page_extent_mapped call to after the btrfs_read_folio() block to +make sure that we have the subpage blocksize stuff setup properly before +using the page. + +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/relocation.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -3006,9 +3006,6 @@ static int relocate_one_page(struct inod + if (!page) + return -ENOMEM; + } +- ret = set_page_extent_mapped(page); +- if (ret < 0) +- goto release_page; + + if (PageReadahead(page)) + page_cache_async_readahead(inode->i_mapping, ra, NULL, +@@ -3024,6 +3021,15 @@ static int relocate_one_page(struct inod + } + } + ++ /* ++ * We could have lost page private when we dropped the lock to read the ++ * page above, make sure we set_page_extent_mapped here so we have any ++ * of the subpage blocksize stuff we need in place. 
++ */ ++ ret = set_page_extent_mapped(page); ++ if (ret < 0) ++ goto release_page; ++ + page_start = page_offset(page); + page_end = page_start + PAGE_SIZE - 1; + diff --git a/queue-6.5/btrfs-zoned-do-not-zone-finish-data-relocation-block-group.patch b/queue-6.5/btrfs-zoned-do-not-zone-finish-data-relocation-block-group.patch new file mode 100644 index 00000000000..746aef8d0f5 --- /dev/null +++ b/queue-6.5/btrfs-zoned-do-not-zone-finish-data-relocation-block-group.patch @@ -0,0 +1,166 @@ +From 332581bde2a419d5f12a93a1cdc2856af649a3cc Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Fri, 21 Jul 2023 16:42:14 +0900 +Subject: btrfs: zoned: do not zone finish data relocation block group + +From: Naohiro Aota + +commit 332581bde2a419d5f12a93a1cdc2856af649a3cc upstream. + +When multiple writes happen at once, we may need to sacrifice a currently +active block group to be zone finished for a new allocation. We choose a +block group with the least free space left, and zone finish it. + +To do the finishing, we need to send IOs for the already allocated region +and wait for them and for on-going IOs. Otherwise, these IOs fail because the +zone is already finished by the time the IOs reach the device. + +However, if a block group dedicated to the data relocation is zone +finished, there is a chance that it is finished before an ongoing write IO +reaches the device. That is because there is a timing gap between when an +allocation is done (block_group->reservations == 0, as pre-allocation is +done) and when an ordered extent is created as the relocation IO starts. +Thus, if we finish the zone between them, we can fail the IOs. + +We cannot simply use "fs_info->data_reloc_bg == block_group->start" to +avoid the zone finishing, because the data_reloc_bg may already have switched to +a new block group, while there are still ongoing write IOs to the old +data_reloc_bg. 
+ +So, this patch reworks the BLOCK_GROUP_FLAG_ZONED_DATA_RELOC bit to +indicate there is a data relocation allocation and/or ongoing write to the +block group. The bit is set on allocation and cleared in end_io function of +the last IO for the currently allocated region. + +To change the timing of the bit setting also solves the issue that the bit +being left even after there is no IO going on. With the current code, if +the data_reloc_bg switches after the last IO to the current data_reloc_bg, +the bit is set at this timing and there is no one clearing that bit. As a +result, that block group is kept unallocatable for anything. + +Fixes: 343d8a30851c ("btrfs: zoned: prevent allocation from previous data relocation BG") +Fixes: 74e91b12b115 ("btrfs: zoned: zone finish unused block group") +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Christoph Hellwig +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent-tree.c | 43 +++++++++++++++++++++++-------------------- + fs/btrfs/zoned.c | 16 +++++++++++++--- + 2 files changed, 36 insertions(+), 23 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3709,7 +3709,8 @@ static int do_allocation_zoned(struct bt + fs_info->data_reloc_bg == 0); + + if (block_group->ro || +- test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { ++ (!ffe_ctl->for_data_reloc && ++ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) { + ret = 1; + goto out; + } +@@ -3752,8 +3753,26 @@ static int do_allocation_zoned(struct bt + if (ffe_ctl->for_treelog && !fs_info->treelog_bg) + fs_info->treelog_bg = block_group->start; + +- if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) +- fs_info->data_reloc_bg = block_group->start; ++ if (ffe_ctl->for_data_reloc) { ++ if (!fs_info->data_reloc_bg) ++ fs_info->data_reloc_bg = block_group->start; ++ /* ++ * Do not allow allocations from this 
block group, unless it is ++ * for data relocation. Compared to increasing the ->ro, setting ++ * the ->zoned_data_reloc_ongoing flag still allows nocow ++ * writers to come in. See btrfs_inc_nocow_writers(). ++ * ++ * We need to disable an allocation to avoid an allocation of ++ * regular (non-relocation data) extent. With mix of relocation ++ * extents and regular extents, we can dispatch WRITE commands ++ * (for relocation extents) and ZONE APPEND commands (for ++ * regular extents) at the same time to the same zone, which ++ * easily break the write pointer. ++ * ++ * Also, this flag avoids this block group to be zone finished. ++ */ ++ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); ++ } + + ffe_ctl->found_offset = start + block_group->alloc_offset; + block_group->alloc_offset += num_bytes; +@@ -3771,24 +3790,8 @@ static int do_allocation_zoned(struct bt + out: + if (ret && ffe_ctl->for_treelog) + fs_info->treelog_bg = 0; +- if (ret && ffe_ctl->for_data_reloc && +- fs_info->data_reloc_bg == block_group->start) { +- /* +- * Do not allow further allocations from this block group. +- * Compared to increasing the ->ro, setting the +- * ->zoned_data_reloc_ongoing flag still allows nocow +- * writers to come in. See btrfs_inc_nocow_writers(). +- * +- * We need to disable an allocation to avoid an allocation of +- * regular (non-relocation data) extent. With mix of relocation +- * extents and regular extents, we can dispatch WRITE commands +- * (for relocation extents) and ZONE APPEND commands (for +- * regular extents) at the same time to the same zone, which +- * easily break the write pointer. 
+- */ +- set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); ++ if (ret && ffe_ctl->for_data_reloc) + fs_info->data_reloc_bg = 0; +- } + spin_unlock(&fs_info->relocation_bg_lock); + spin_unlock(&fs_info->treelog_bg_lock); + spin_unlock(&block_group->lock); +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2017,6 +2017,10 @@ static int do_zone_finish(struct btrfs_b + * and block_group->meta_write_pointer for metadata. + */ + if (!fully_written) { ++ if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { ++ spin_unlock(&block_group->lock); ++ return -EAGAIN; ++ } + spin_unlock(&block_group->lock); + + ret = btrfs_inc_block_group_ro(block_group, false); +@@ -2045,7 +2049,9 @@ static int do_zone_finish(struct btrfs_b + return 0; + } + +- if (block_group->reserved) { ++ if (block_group->reserved || ++ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, ++ &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return -EAGAIN; +@@ -2276,7 +2282,10 @@ void btrfs_zoned_release_data_reloc_bg(s + + /* All relocation extents are written. */ + if (block_group->start + block_group->alloc_offset == logical + length) { +- /* Now, release this block group for further allocations. */ ++ /* ++ * Now, release this block group for further allocations and ++ * zone finish. 
++ */ + clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + &block_group->runtime_flags); + } +@@ -2300,7 +2309,8 @@ int btrfs_zone_finish_one_bg(struct btrf + + spin_lock(&block_group->lock); + if (block_group->reserved || block_group->alloc_offset == 0 || +- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { ++ (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) || ++ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + continue; + } diff --git a/queue-6.5/btrfs-zoned-re-enable-metadata-over-commit-for-zoned-mode.patch b/queue-6.5/btrfs-zoned-re-enable-metadata-over-commit-for-zoned-mode.patch new file mode 100644 index 00000000000..d09419edd75 --- /dev/null +++ b/queue-6.5/btrfs-zoned-re-enable-metadata-over-commit-for-zoned-mode.patch @@ -0,0 +1,47 @@ +From 5b135b382a360f4c87cf8896d1465b0b07f10cb0 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Tue, 8 Aug 2023 01:12:40 +0900 +Subject: btrfs: zoned: re-enable metadata over-commit for zoned mode + +From: Naohiro Aota + +commit 5b135b382a360f4c87cf8896d1465b0b07f10cb0 upstream. + +Now we can re-enable metadata over-commit. As we moved the activation +from the reservation time to the write time, we no longer need to ensure +all the reserved bytes are properly activated. + +Without the metadata over-commit, zoned mode suffers from lower performance because +it needs to flush the delalloc items more often and allocate more block +groups. Re-enabling metadata over-commit will solve the issue. 
+ +Fixes: 79417d040f4f ("btrfs: zoned: disable metadata overcommit for zoned") +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/space-info.c | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index 356638f54fef..d7e8cd4f140c 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -389,11 +389,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + return 0; + + used = btrfs_space_info_used(space_info, true); +- if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) && +- (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) +- avail = 0; +- else +- avail = calc_available_free_space(fs_info, space_info, flush); ++ avail = calc_available_free_space(fs_info, space_info, flush); + + if (used + bytes < space_info->total_bytes + avail) + return 1; +-- +2.42.0 + diff --git a/queue-6.5/clocksource-drivers-arm_arch_timer-disable-timer-before-programming-cval.patch b/queue-6.5/clocksource-drivers-arm_arch_timer-disable-timer-before-programming-cval.patch new file mode 100644 index 00000000000..b5ef0fa14b1 --- /dev/null +++ b/queue-6.5/clocksource-drivers-arm_arch_timer-disable-timer-before-programming-cval.patch @@ -0,0 +1,58 @@ +From e7d65e40ab5a5940785c5922f317602d0268caaf Mon Sep 17 00:00:00 2001 +From: Walter Chang +Date: Mon, 17 Jul 2023 17:07:34 +0800 +Subject: clocksource/drivers/arm_arch_timer: Disable timer before programming CVAL + +From: Walter Chang + +commit e7d65e40ab5a5940785c5922f317602d0268caaf upstream. + +Due to the fact that the use of `writeq_relaxed()` to program CVAL is +not guaranteed to be atomic, it is necessary to disable the timer before +programming CVAL. 
+ +However, if the MMIO timer is already enabled and has not yet expired, +there is a possibility of unexpected behavior occurring: when the CPU +enters the idle state during this period, and if the CPU's local event +is earlier than the broadcast event, the following process occurs: + +tick_broadcast_enter() + tick_broadcast_oneshot_control(TICK_BROADCAST_ENTER) + __tick_broadcast_oneshot_control() + ___tick_broadcast_oneshot_control() + tick_broadcast_set_event() + clockevents_program_event() + set_next_event_mem() + +During this process, the MMIO timer remains enabled while programming +CVAL. To prevent such behavior, disable timer explicitly prior to +programming CVAL. + +Fixes: 8b82c4f883a7 ("clocksource/drivers/arm_arch_timer: Move MMIO timer programming over to CVAL") +Cc: stable@vger.kernel.org +Signed-off-by: Walter Chang +Acked-by: Marc Zyngier +Reviewed-by: AngeloGioacchino Del Regno +Signed-off-by: Daniel Lezcano +Link: https://lore.kernel.org/r/20230717090735.19370-1-walter.chang@mediatek.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/clocksource/arm_arch_timer.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/drivers/clocksource/arm_arch_timer.c ++++ b/drivers/clocksource/arm_arch_timer.c +@@ -792,6 +792,13 @@ static __always_inline void set_next_eve + u64 cnt; + + ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); ++ ++ /* Timer must be disabled before programming CVAL */ ++ if (ctrl & ARCH_TIMER_CTRL_ENABLE) { ++ ctrl &= ~ARCH_TIMER_CTRL_ENABLE; ++ arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); ++ } ++ + ctrl |= ARCH_TIMER_CTRL_ENABLE; + ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; + diff --git a/queue-6.5/dmaengine-sh-rz-dmac-fix-destination-and-source-data-size-setting.patch b/queue-6.5/dmaengine-sh-rz-dmac-fix-destination-and-source-data-size-setting.patch new file mode 100644 index 00000000000..d366ca3c580 --- /dev/null +++ b/queue-6.5/dmaengine-sh-rz-dmac-fix-destination-and-source-data-size-setting.patch @@ -0,0 
+1,64 @@ +From c6ec8c83a29fb3aec3efa6fabbf5344498f57c7f Mon Sep 17 00:00:00 2001 +From: Hien Huynh +Date: Thu, 6 Jul 2023 12:21:50 +0100 +Subject: dmaengine: sh: rz-dmac: Fix destination and source data size setting + +From: Hien Huynh + +commit c6ec8c83a29fb3aec3efa6fabbf5344498f57c7f upstream. + +Before setting DDS and SDS values, we need to clear their values first; +otherwise, we get incorrect results when we change/update the DMA bus +width several times due to the 'OR' expression. + +Fixes: 5000d37042a6 ("dmaengine: sh: Add DMAC driver for RZ/G2L SoC") +Cc: stable@kernel.org +Signed-off-by: Hien Huynh +Signed-off-by: Biju Das +Reviewed-by: Geert Uytterhoeven +Link: https://lore.kernel.org/r/20230706112150.198941-3-biju.das.jz@bp.renesas.com +Signed-off-by: Vinod Koul +Signed-off-by: Greg Kroah-Hartman +--- + drivers/dma/sh/rz-dmac.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +--- a/drivers/dma/sh/rz-dmac.c ++++ b/drivers/dma/sh/rz-dmac.c +@@ -9,6 +9,7 @@ + * Copyright 2012 Javier Martin, Vista Silicon + */ + ++#include + #include + #include + #include +@@ -145,8 +146,8 @@ struct rz_dmac { + #define CHCFG_REQD BIT(3) + #define CHCFG_SEL(bits) ((bits) & 0x07) + #define CHCFG_MEM_COPY (0x80400008) +-#define CHCFG_FILL_DDS(a) (((a) << 16) & GENMASK(19, 16)) +-#define CHCFG_FILL_SDS(a) (((a) << 12) & GENMASK(15, 12)) ++#define CHCFG_FILL_DDS_MASK GENMASK(19, 16) ++#define CHCFG_FILL_SDS_MASK GENMASK(15, 12) + #define CHCFG_FILL_TM(a) (((a) & BIT(5)) << 22) + #define CHCFG_FILL_AM(a) (((a) & GENMASK(4, 2)) << 6) + #define CHCFG_FILL_LVL(a) (((a) & BIT(1)) << 5) +@@ -607,13 +608,15 @@ static int rz_dmac_config(struct dma_cha + if (val == CHCFG_DS_INVALID) + return -EINVAL; + +- channel->chcfg |= CHCFG_FILL_DDS(val); ++ channel->chcfg &= ~CHCFG_FILL_DDS_MASK; ++ channel->chcfg |= FIELD_PREP(CHCFG_FILL_DDS_MASK, val); + + val = rz_dmac_ds_to_val_mapping(config->src_addr_width); + if (val == CHCFG_DS_INVALID) + return -EINVAL; + +- channel->chcfg 
|= CHCFG_FILL_SDS(val); ++ channel->chcfg &= ~CHCFG_FILL_SDS_MASK; ++ channel->chcfg |= FIELD_PREP(CHCFG_FILL_SDS_MASK, val); + + return 0; + } diff --git a/queue-6.5/ext4-add-correct-group-descriptors-and-reserved-gdt-blocks-to-system-zone.patch b/queue-6.5/ext4-add-correct-group-descriptors-and-reserved-gdt-blocks-to-system-zone.patch new file mode 100644 index 00000000000..2933e577af9 --- /dev/null +++ b/queue-6.5/ext4-add-correct-group-descriptors-and-reserved-gdt-blocks-to-system-zone.patch @@ -0,0 +1,101 @@ +From 68228da51c9a436872a4ef4b5a7692e29f7e5bc7 Mon Sep 17 00:00:00 2001 +From: Wang Jianjian +Date: Thu, 3 Aug 2023 00:28:39 +0800 +Subject: ext4: add correct group descriptors and reserved GDT blocks to system zone + +From: Wang Jianjian + +commit 68228da51c9a436872a4ef4b5a7692e29f7e5bc7 upstream. + +When setup_system_zone, flex_bg is not initialized so it is always 1. +Use a new helper function, ext4_num_base_meta_blocks() which does not +depend on sbi->s_log_groups_per_flex being initialized. + +[ Squashed two patches in the Link URL's below together into a single + commit, which is simpler to review/understand. Also fix checkpatch + warnings. --TYT ] + +Cc: stable@kernel.org +Signed-off-by: Wang Jianjian +Link: https://lore.kernel.org/r/tencent_21AF0D446A9916ED5C51492CC6C9A0A77B05@qq.com +Link: https://lore.kernel.org/r/tencent_D744D1450CC169AEA77FCF0A64719909ED05@qq.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/balloc.c | 15 +++++++++++---- + fs/ext4/block_validity.c | 8 ++++---- + fs/ext4/ext4.h | 2 ++ + 3 files changed, 17 insertions(+), 8 deletions(-) + +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -913,11 +913,11 @@ unsigned long ext4_bg_num_gdb(struct sup + } + + /* +- * This function returns the number of file system metadata clusters at ++ * This function returns the number of file system metadata blocks at + * the beginning of a block group, including the reserved gdt blocks. 
+ */ +-static unsigned ext4_num_base_meta_clusters(struct super_block *sb, +- ext4_group_t block_group) ++unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ++ ext4_group_t block_group) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned num; +@@ -935,8 +935,15 @@ static unsigned ext4_num_base_meta_clust + } else { /* For META_BG_BLOCK_GROUPS */ + num += ext4_bg_num_gdb_meta(sb, block_group); + } +- return EXT4_NUM_B2C(sbi, num); ++ return num; + } ++ ++static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, ++ ext4_group_t block_group) ++{ ++ return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); ++} ++ + /** + * ext4_inode_to_goal_block - return a hint for block allocation + * @inode: inode for block allocation +--- a/fs/ext4/block_validity.c ++++ b/fs/ext4/block_validity.c +@@ -215,7 +215,6 @@ int ext4_setup_system_zone(struct super_ + struct ext4_system_blocks *system_blks; + struct ext4_group_desc *gdp; + ext4_group_t i; +- int flex_size = ext4_flex_bg_size(sbi); + int ret; + + system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); +@@ -223,12 +222,13 @@ int ext4_setup_system_zone(struct super_ + return -ENOMEM; + + for (i=0; i < ngroups; i++) { ++ unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i); ++ + cond_resched(); +- if (ext4_bg_has_super(sb, i) && +- ((i < 5) || ((i % flex_size) == 0))) { ++ if (meta_blks != 0) { + ret = add_system_zone(system_blks, + ext4_group_first_block_no(sb, i), +- ext4_bg_num_gdb(sb, i) + 1, 0); ++ meta_blks, 0); + if (ret) + goto err; + } +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3084,6 +3084,8 @@ extern const char *ext4_decode_error(str + extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); ++extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ++ ext4_group_t block_group); + + extern __printf(7, 8) + void __ext4_error(struct super_block *, const char *, unsigned int, bool, 
diff --git a/queue-6.5/ext4-drop-dio-overwrite-only-flag-and-associated-warning.patch b/queue-6.5/ext4-drop-dio-overwrite-only-flag-and-associated-warning.patch new file mode 100644 index 00000000000..93d3faceb74 --- /dev/null +++ b/queue-6.5/ext4-drop-dio-overwrite-only-flag-and-associated-warning.patch @@ -0,0 +1,110 @@ +From 194505b55dd7899da114a4d47825204eefc0fff5 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Thu, 10 Aug 2023 12:55:59 -0400 +Subject: ext4: drop dio overwrite only flag and associated warning + +From: Brian Foster + +commit 194505b55dd7899da114a4d47825204eefc0fff5 upstream. + +The commit referenced below opened up concurrent unaligned dio under +shared locking for pure overwrites. In doing so, it enabled use of +the IOMAP_DIO_OVERWRITE_ONLY flag and added a warning on unexpected +-EAGAIN returns as an extra precaution, since ext4 does not retry +writes in such cases. The flag itself is advisory in this case since +ext4 checks for unaligned I/Os and uses appropriate locking up +front, rather than on a retry in response to -EAGAIN. + +As it turns out, the warning check is susceptible to false positives +because there are scenarios where -EAGAIN can be expected from lower +layers without necessarily having IOCB_NOWAIT set on the iocb. For +example, one instance of the warning has been seen where io_uring +sets IOCB_HIPRI, which in turn results in REQ_POLLED|REQ_NOWAIT on +the bio. This results in -EAGAIN if the block layer is unable to +allocate a request, etc. [Note that there is an outstanding patch to +untangle REQ_POLLED and REQ_NOWAIT such that the latter relies on +IOCB_NOWAIT, which would also address this instance of the warning.] + +Another instance of the warning has been reproduced by syzbot. A dio +write is interrupted down in __get_user_pages_locked() waiting on +the mm lock and returns -EAGAIN up the stack. 
If the iomap dio +iteration layer has made no progress on the write to this point, +-EAGAIN returns up to the filesystem and triggers the warning. + +This use of the overwrite flag in ext4 is precautionary and +half-baked. I.e., ext4 doesn't actually implement overwrite checking +in the iomap callbacks when the flag is set, so the only extra +verification it provides are i_size checks in the generic iomap dio +layer. Combined with the tendency for false positives, the added +verification is not worth the extra trouble. Remove the flag, +associated warning, and update the comments to document when +concurrent unaligned dio writes are allowed and why said flag is not +used. + +Cc: stable@kernel.org +Reported-by: syzbot+5050ad0fb47527b1808a@syzkaller.appspotmail.com +Reported-by: Pengfei Xu +Fixes: 310ee0902b8d ("ext4: allow concurrent unaligned dio overwrites") +Signed-off-by: Brian Foster +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/20230810165559.946222-1-bfoster@redhat.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/file.c | 25 ++++++++++--------------- + 1 file changed, 10 insertions(+), 15 deletions(-) + +diff --git a/fs/ext4/file.c b/fs/ext4/file.c +index 2071b1e4322c..e99cc17b6bd2 100644 +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -476,6 +476,11 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + * required to change security info in file_modified(), for extending + * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten + * extents (as partial block zeroing may be required). ++ * ++ * Note that unaligned writes are allowed under shared lock so long as ++ * they are pure overwrites. Otherwise, concurrent unaligned writes risk ++ * data corruption due to partial block zeroing in the dio layer, and so ++ * the I/O must occur exclusively. 
+ */ + if (*ilock_shared && + ((!IS_NOSEC(inode) || *extend || !overwrite || +@@ -492,21 +497,12 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + + /* + * Now that locking is settled, determine dio flags and exclusivity +- * requirements. Unaligned writes are allowed under shared lock so long +- * as they are pure overwrites. Set the iomap overwrite only flag as an +- * added precaution in this case. Even though this is unnecessary, we +- * can detect and warn on unexpected -EAGAIN if an unsafe unaligned +- * write is ever submitted. +- * +- * Otherwise, concurrent unaligned writes risk data corruption due to +- * partial block zeroing in the dio layer, and so the I/O must occur +- * exclusively. The inode lock is already held exclusive if the write is +- * non-overwrite or extending, so drain all outstanding dio and set the +- * force wait dio flag. ++ * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce ++ * behavior already. The inode lock is already held exclusive if the ++ * write is non-overwrite or extending, so drain all outstanding dio and ++ * set the force wait dio flag. 
+ */ +- if (*ilock_shared && unaligned_io) { +- *dio_flags = IOMAP_DIO_OVERWRITE_ONLY; +- } else if (!*ilock_shared && (unaligned_io || *extend)) { ++ if (!*ilock_shared && (unaligned_io || *extend)) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; +@@ -608,7 +604,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) + iomap_ops = &ext4_iomap_overwrite_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, + dio_flags, NULL, 0); +- WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)); + if (ret == -ENOTBLK) + ret = 0; + +-- +2.42.0 + diff --git a/queue-6.5/ext4-fix-bug-in-ext4_mb_new_inode_pa-due-to-overflow.patch b/queue-6.5/ext4-fix-bug-in-ext4_mb_new_inode_pa-due-to-overflow.patch new file mode 100644 index 00000000000..a637e9e58cb --- /dev/null +++ b/queue-6.5/ext4-fix-bug-in-ext4_mb_new_inode_pa-due-to-overflow.patch @@ -0,0 +1,117 @@ +From bc056e7163ac7db945366de219745cf94f32a3e6 Mon Sep 17 00:00:00 2001 +From: Baokun Li +Date: Mon, 24 Jul 2023 20:10:58 +0800 +Subject: ext4: fix BUG in ext4_mb_new_inode_pa() due to overflow + +From: Baokun Li + +commit bc056e7163ac7db945366de219745cf94f32a3e6 upstream. + +When we calculate the end position of ext4_free_extent, this position may +be exactly where ext4_lblk_t (i.e. uint) overflows. For example, if +ac_g_ex.fe_logical is 4294965248 and ac_orig_goal_len is 2048, then the +computed end is 0x100000000, which is 0. If ac->ac_o_ex.fe_logical is not +the first case of adjusting the best extent, that is, new_bex_end > 0, the +following BUG_ON will be triggered: + +========================================================= +kernel BUG at fs/ext4/mballoc.c:5116! 
+invalid opcode: 0000 [#1] PREEMPT SMP PTI +CPU: 3 PID: 673 Comm: xfs_io Tainted: G E 6.5.0-rc1+ #279 +RIP: 0010:ext4_mb_new_inode_pa+0xc5/0x430 +Call Trace: + + ext4_mb_use_best_found+0x203/0x2f0 + ext4_mb_try_best_found+0x163/0x240 + ext4_mb_regular_allocator+0x158/0x1550 + ext4_mb_new_blocks+0x86a/0xe10 + ext4_ext_map_blocks+0xb0c/0x13a0 + ext4_map_blocks+0x2cd/0x8f0 + ext4_iomap_begin+0x27b/0x400 + iomap_iter+0x222/0x3d0 + __iomap_dio_rw+0x243/0xcb0 + iomap_dio_rw+0x16/0x80 +========================================================= + +A simple reproducer demonstrating the problem: + + mkfs.ext4 -F /dev/sda -b 4096 100M + mount /dev/sda /tmp/test + fallocate -l1M /tmp/test/tmp + fallocate -l10M /tmp/test/file + fallocate -i -o 1M -l16777203M /tmp/test/file + fsstress -d /tmp/test -l 0 -n 100000 -p 8 & + sleep 10 && killall -9 fsstress + rm -f /tmp/test/tmp + xfs_io -c "open -ad /tmp/test/file" -c "pwrite -S 0xff 0 8192" + +We simply refactor the logic for adjusting the best extent by adding +a temporary ext4_free_extent ex and use extent_logical_end() to avoid +overflow, which also simplifies the code. 
+ +Cc: stable@kernel.org # 6.4 +Fixes: 93cdf49f6eca ("ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa()") +Signed-off-by: Baokun Li +Reviewed-by: Ritesh Harjani (IBM) +Link: https://lore.kernel.org/r/20230724121059.11834-3-libaokun1@huawei.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 31 ++++++++++++++----------------- + 1 file changed, 14 insertions(+), 17 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -5181,8 +5181,11 @@ ext4_mb_new_inode_pa(struct ext4_allocat + pa = ac->ac_pa; + + if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { +- int new_bex_start; +- int new_bex_end; ++ struct ext4_free_extent ex = { ++ .fe_logical = ac->ac_g_ex.fe_logical, ++ .fe_len = ac->ac_orig_goal_len, ++ }; ++ loff_t orig_goal_end = extent_logical_end(sbi, &ex); + + /* we can't allocate as much as normalizer wants. + * so, found space must get proper lstart +@@ -5201,29 +5204,23 @@ ext4_mb_new_inode_pa(struct ext4_allocat + * still cover original start + * 3. Else, keep the best ex at start of original request. 
+ */ +- new_bex_end = ac->ac_g_ex.fe_logical + +- EXT4_C2B(sbi, ac->ac_orig_goal_len); +- new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); +- if (ac->ac_o_ex.fe_logical >= new_bex_start) +- goto adjust_bex; ++ ex.fe_len = ac->ac_b_ex.fe_len; + +- new_bex_start = ac->ac_g_ex.fe_logical; +- new_bex_end = +- new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); +- if (ac->ac_o_ex.fe_logical < new_bex_end) ++ ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); ++ if (ac->ac_o_ex.fe_logical >= ex.fe_logical) + goto adjust_bex; + +- new_bex_start = ac->ac_o_ex.fe_logical; +- new_bex_end = +- new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); ++ ex.fe_logical = ac->ac_g_ex.fe_logical; ++ if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex)) ++ goto adjust_bex; + ++ ex.fe_logical = ac->ac_o_ex.fe_logical; + adjust_bex: +- ac->ac_b_ex.fe_logical = new_bex_start; ++ ac->ac_b_ex.fe_logical = ex.fe_logical; + + BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); + BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); +- BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + +- EXT4_C2B(sbi, ac->ac_orig_goal_len))); ++ BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); + } + + pa->pa_lstart = ac->ac_b_ex.fe_logical; diff --git a/queue-6.5/ext4-fix-memory-leaks-in-ext4_fname_-setup_filename-prepare_lookup.patch b/queue-6.5/ext4-fix-memory-leaks-in-ext4_fname_-setup_filename-prepare_lookup.patch new file mode 100644 index 00000000000..b2ec8caed53 --- /dev/null +++ b/queue-6.5/ext4-fix-memory-leaks-in-ext4_fname_-setup_filename-prepare_lookup.patch @@ -0,0 +1,49 @@ +From 7ca4b085f430f3774c3838b3da569ceccd6a0177 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= +Date: Thu, 3 Aug 2023 10:17:13 +0100 +Subject: ext4: fix memory leaks in ext4_fname_{setup_filename,prepare_lookup} +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Luís Henriques + +commit 7ca4b085f430f3774c3838b3da569ceccd6a0177 
upstream. + +If the filename casefolding fails, we'll be leaking memory from the +fscrypt_name struct, namely from the 'crypto_buf.name' member. + +Make sure we free it in the error path on both ext4_fname_setup_filename() +and ext4_fname_prepare_lookup() functions. + +Cc: stable@kernel.org +Fixes: 1ae98e295fa2 ("ext4: optimize match for casefolded encrypted dirs") +Signed-off-by: Luís Henriques +Reviewed-by: Eric Biggers +Link: https://lore.kernel.org/r/20230803091713.13239-1-lhenriques@suse.de +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/crypto.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/fs/ext4/crypto.c ++++ b/fs/ext4/crypto.c +@@ -33,6 +33,8 @@ int ext4_fname_setup_filename(struct ino + + #if IS_ENABLED(CONFIG_UNICODE) + err = ext4_fname_setup_ci_filename(dir, iname, fname); ++ if (err) ++ ext4_fname_free_filename(fname); + #endif + return err; + } +@@ -51,6 +53,8 @@ int ext4_fname_prepare_lookup(struct ino + + #if IS_ENABLED(CONFIG_UNICODE) + err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); ++ if (err) ++ ext4_fname_free_filename(fname); + #endif + return err; + } diff --git a/queue-6.5/ext4-fix-slab-use-after-free-in-ext4_es_insert_extent.patch b/queue-6.5/ext4-fix-slab-use-after-free-in-ext4_es_insert_extent.patch new file mode 100644 index 00000000000..051a5529389 --- /dev/null +++ b/queue-6.5/ext4-fix-slab-use-after-free-in-ext4_es_insert_extent.patch @@ -0,0 +1,181 @@ +From 768d612f79822d30a1e7d132a4d4b05337ce42ec Mon Sep 17 00:00:00 2001 +From: Baokun Li +Date: Tue, 15 Aug 2023 15:08:08 +0800 +Subject: ext4: fix slab-use-after-free in ext4_es_insert_extent() + +From: Baokun Li + +commit 768d612f79822d30a1e7d132a4d4b05337ce42ec upstream. 
+ +Yikebaer reported an issue: +================================================================== +BUG: KASAN: slab-use-after-free in ext4_es_insert_extent+0xc68/0xcb0 +fs/ext4/extents_status.c:894 +Read of size 4 at addr ffff888112ecc1a4 by task syz-executor/8438 + +CPU: 1 PID: 8438 Comm: syz-executor Not tainted 6.5.0-rc5 #1 +Call Trace: + [...] + kasan_report+0xba/0xf0 mm/kasan/report.c:588 + ext4_es_insert_extent+0xc68/0xcb0 fs/ext4/extents_status.c:894 + ext4_map_blocks+0x92a/0x16f0 fs/ext4/inode.c:680 + ext4_alloc_file_blocks.isra.0+0x2df/0xb70 fs/ext4/extents.c:4462 + ext4_zero_range fs/ext4/extents.c:4622 [inline] + ext4_fallocate+0x251c/0x3ce0 fs/ext4/extents.c:4721 + [...] + +Allocated by task 8438: + [...] + kmem_cache_zalloc include/linux/slab.h:693 [inline] + __es_alloc_extent fs/ext4/extents_status.c:469 [inline] + ext4_es_insert_extent+0x672/0xcb0 fs/ext4/extents_status.c:873 + ext4_map_blocks+0x92a/0x16f0 fs/ext4/inode.c:680 + ext4_alloc_file_blocks.isra.0+0x2df/0xb70 fs/ext4/extents.c:4462 + ext4_zero_range fs/ext4/extents.c:4622 [inline] + ext4_fallocate+0x251c/0x3ce0 fs/ext4/extents.c:4721 + [...] + +Freed by task 8438: + [...] + kmem_cache_free+0xec/0x490 mm/slub.c:3823 + ext4_es_try_to_merge_right fs/ext4/extents_status.c:593 [inline] + __es_insert_extent+0x9f4/0x1440 fs/ext4/extents_status.c:802 + ext4_es_insert_extent+0x2ca/0xcb0 fs/ext4/extents_status.c:882 + ext4_map_blocks+0x92a/0x16f0 fs/ext4/inode.c:680 + ext4_alloc_file_blocks.isra.0+0x2df/0xb70 fs/ext4/extents.c:4462 + ext4_zero_range fs/ext4/extents.c:4622 [inline] + ext4_fallocate+0x251c/0x3ce0 fs/ext4/extents.c:4721 + [...] +================================================================== + +The flow of issue triggering is as follows: +1. remove es + raw es es removed es1 +|-------------------| -> |----|.......|------| + +2. 
insert es + es insert es1 merge with es es1 merge with es and free es1 +|----|.......|------| -> |------------|------| -> |-------------------| + +es merges with newes, then merges with es1, frees es1, then determines +if es1->es_len is 0 and triggers a UAF. + +The code flow is as follows: +ext4_es_insert_extent + es1 = __es_alloc_extent(true); + es2 = __es_alloc_extent(true); + __es_remove_extent(inode, lblk, end, NULL, es1) + __es_insert_extent(inode, &newes, es1) ---> insert es1 to es tree + __es_insert_extent(inode, &newes, es2) + ext4_es_try_to_merge_right + ext4_es_free_extent(inode, es1) ---> es1 is freed + if (es1 && !es1->es_len) + // Trigger UAF by determining if es1 is used. + +We determine whether es1 or es2 is used immediately after calling +__es_remove_extent() or __es_insert_extent() to avoid triggering a +UAF if es1 or es2 is freed. + +Reported-by: Yikebaer Aizezi +Closes: https://lore.kernel.org/lkml/CALcu4raD4h9coiyEBL4Bm0zjDwxC2CyPiTwsP3zFuhot6y9Beg@mail.gmail.com +Fixes: 2a69c450083d ("ext4: using nofail preallocation in ext4_es_insert_extent()") +Cc: stable@kernel.org +Signed-off-by: Baokun Li +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/20230815070808.3377171-1-libaokun1@huawei.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents_status.c | 44 +++++++++++++++++++++++++++------------- + 1 file changed, 30 insertions(+), 14 deletions(-) + +diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c +index 9b5b8951afb4..6f7de14c0fa8 100644 +--- a/fs/ext4/extents_status.c ++++ b/fs/ext4/extents_status.c +@@ -878,23 +878,29 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + err1 = __es_remove_extent(inode, lblk, end, NULL, es1); + if (err1 != 0) + goto error; ++ /* Free preallocated extent if it didn't get used. 
*/ ++ if (es1) { ++ if (!es1->es_len) ++ __es_free_extent(es1); ++ es1 = NULL; ++ } + + err2 = __es_insert_extent(inode, &newes, es2); + if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) + err2 = 0; + if (err2 != 0) + goto error; ++ /* Free preallocated extent if it didn't get used. */ ++ if (es2) { ++ if (!es2->es_len) ++ __es_free_extent(es2); ++ es2 = NULL; ++ } + + if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && + (status & EXTENT_STATUS_WRITTEN || + status & EXTENT_STATUS_UNWRITTEN)) + __revise_pending(inode, lblk, len); +- +- /* es is pre-allocated but not used, free it. */ +- if (es1 && !es1->es_len) +- __es_free_extent(es1); +- if (es2 && !es2->es_len) +- __es_free_extent(es2); + error: + write_unlock(&EXT4_I(inode)->i_es_lock); + if (err1 || err2) +@@ -1491,8 +1497,12 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + */ + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(inode, lblk, end, &reserved, es); +- if (es && !es->es_len) +- __es_free_extent(es); ++ /* Free preallocated extent if it didn't get used. */ ++ if (es) { ++ if (!es->es_len) ++ __es_free_extent(es); ++ es = NULL; ++ } + write_unlock(&EXT4_I(inode)->i_es_lock); + if (err) + goto retry; +@@ -2047,19 +2057,25 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); + if (err1 != 0) + goto error; ++ /* Free preallocated extent if it didn't get used. */ ++ if (es1) { ++ if (!es1->es_len) ++ __es_free_extent(es1); ++ es1 = NULL; ++ } + + err2 = __es_insert_extent(inode, &newes, es2); + if (err2 != 0) + goto error; ++ /* Free preallocated extent if it didn't get used. */ ++ if (es2) { ++ if (!es2->es_len) ++ __es_free_extent(es2); ++ es2 = NULL; ++ } + + if (allocated) + __insert_pending(inode, lblk); +- +- /* es is pre-allocated but not used, free it. 
*/ +- if (es1 && !es1->es_len) +- __es_free_extent(es1); +- if (es2 && !es2->es_len) +- __es_free_extent(es2); + error: + write_unlock(&EXT4_I(inode)->i_es_lock); + if (err1 || err2) +-- +2.42.0 + diff --git a/queue-6.5/f2fs-avoid-false-alarm-of-circular-locking.patch b/queue-6.5/f2fs-avoid-false-alarm-of-circular-locking.patch new file mode 100644 index 00000000000..817d9a77080 --- /dev/null +++ b/queue-6.5/f2fs-avoid-false-alarm-of-circular-locking.patch @@ -0,0 +1,155 @@ +From 5c13e2388bf3426fd69a89eb46e50469e9624e56 Mon Sep 17 00:00:00 2001 +From: Jaegeuk Kim +Date: Fri, 18 Aug 2023 11:34:32 -0700 +Subject: f2fs: avoid false alarm of circular locking + +From: Jaegeuk Kim + +commit 5c13e2388bf3426fd69a89eb46e50469e9624e56 upstream. + +====================================================== +WARNING: possible circular locking dependency detected +6.5.0-rc5-syzkaller-00353-gae545c3283dc #0 Not tainted +------------------------------------------------------ +syz-executor273/5027 is trying to acquire lock: +ffff888077fe1fb0 (&fi->i_sem){+.+.}-{3:3}, at: f2fs_down_write fs/f2fs/f2fs.h:2133 [inline] +ffff888077fe1fb0 (&fi->i_sem){+.+.}-{3:3}, at: f2fs_add_inline_entry+0x300/0x6f0 fs/f2fs/inline.c:644 + +but task is already holding lock: +ffff888077fe07c8 (&fi->i_xattr_sem){.+.+}-{3:3}, at: f2fs_down_read fs/f2fs/f2fs.h:2108 [inline] +ffff888077fe07c8 (&fi->i_xattr_sem){.+.+}-{3:3}, at: f2fs_add_dentry+0x92/0x230 fs/f2fs/dir.c:783 + +which lock already depends on the new lock. 
+ +the existing dependency chain (in reverse order) is: + +-> #1 (&fi->i_xattr_sem){.+.+}-{3:3}: + down_read+0x9c/0x470 kernel/locking/rwsem.c:1520 + f2fs_down_read fs/f2fs/f2fs.h:2108 [inline] + f2fs_getxattr+0xb1e/0x12c0 fs/f2fs/xattr.c:532 + __f2fs_get_acl+0x5a/0x900 fs/f2fs/acl.c:179 + f2fs_acl_create fs/f2fs/acl.c:377 [inline] + f2fs_init_acl+0x15c/0xb30 fs/f2fs/acl.c:420 + f2fs_init_inode_metadata+0x159/0x1290 fs/f2fs/dir.c:558 + f2fs_add_regular_entry+0x79e/0xb90 fs/f2fs/dir.c:740 + f2fs_add_dentry+0x1de/0x230 fs/f2fs/dir.c:788 + f2fs_do_add_link+0x190/0x280 fs/f2fs/dir.c:827 + f2fs_add_link fs/f2fs/f2fs.h:3554 [inline] + f2fs_mkdir+0x377/0x620 fs/f2fs/namei.c:781 + vfs_mkdir+0x532/0x7e0 fs/namei.c:4117 + do_mkdirat+0x2a9/0x330 fs/namei.c:4140 + __do_sys_mkdir fs/namei.c:4160 [inline] + __se_sys_mkdir fs/namei.c:4158 [inline] + __x64_sys_mkdir+0xf2/0x140 fs/namei.c:4158 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +-> #0 (&fi->i_sem){+.+.}-{3:3}: + check_prev_add kernel/locking/lockdep.c:3142 [inline] + check_prevs_add kernel/locking/lockdep.c:3261 [inline] + validate_chain kernel/locking/lockdep.c:3876 [inline] + __lock_acquire+0x2e3d/0x5de0 kernel/locking/lockdep.c:5144 + lock_acquire kernel/locking/lockdep.c:5761 [inline] + lock_acquire+0x1ae/0x510 kernel/locking/lockdep.c:5726 + down_write+0x93/0x200 kernel/locking/rwsem.c:1573 + f2fs_down_write fs/f2fs/f2fs.h:2133 [inline] + f2fs_add_inline_entry+0x300/0x6f0 fs/f2fs/inline.c:644 + f2fs_add_dentry+0xa6/0x230 fs/f2fs/dir.c:784 + f2fs_do_add_link+0x190/0x280 fs/f2fs/dir.c:827 + f2fs_add_link fs/f2fs/f2fs.h:3554 [inline] + f2fs_mkdir+0x377/0x620 fs/f2fs/namei.c:781 + vfs_mkdir+0x532/0x7e0 fs/namei.c:4117 + ovl_do_mkdir fs/overlayfs/overlayfs.h:196 [inline] + ovl_mkdir_real+0xb5/0x370 fs/overlayfs/dir.c:146 + ovl_workdir_create+0x3de/0x820 fs/overlayfs/super.c:309 + ovl_make_workdir 
fs/overlayfs/super.c:711 [inline] + ovl_get_workdir fs/overlayfs/super.c:864 [inline] + ovl_fill_super+0xdab/0x6180 fs/overlayfs/super.c:1400 + vfs_get_super+0xf9/0x290 fs/super.c:1152 + vfs_get_tree+0x88/0x350 fs/super.c:1519 + do_new_mount fs/namespace.c:3335 [inline] + path_mount+0x1492/0x1ed0 fs/namespace.c:3662 + do_mount fs/namespace.c:3675 [inline] + __do_sys_mount fs/namespace.c:3884 [inline] + __se_sys_mount fs/namespace.c:3861 [inline] + __x64_sys_mount+0x293/0x310 fs/namespace.c:3861 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +other info that might help us debug this: + + Possible unsafe locking scenario: + + CPU0 CPU1 + ---- ---- + rlock(&fi->i_xattr_sem); + lock(&fi->i_sem); + lock(&fi->i_xattr_sem); + lock(&fi->i_sem); + +Cc: +Reported-and-tested-by: syzbot+e5600587fa9cbf8e3826@syzkaller.appspotmail.com +Fixes: 5eda1ad1aaff "f2fs: fix deadlock in i_xattr_sem and inode page lock" +Tested-by: Guenter Roeck +Reviewed-by: Chao Yu +Signed-off-by: Jaegeuk Kim +Signed-off-by: Greg Kroah-Hartman +--- + fs/f2fs/f2fs.h | 24 +++++++++++++++--------- + fs/f2fs/inline.c | 3 ++- + 2 files changed, 17 insertions(+), 10 deletions(-) + +--- a/fs/f2fs/f2fs.h ++++ b/fs/f2fs/f2fs.h +@@ -2122,15 +2122,6 @@ static inline int f2fs_down_read_trylock + return down_read_trylock(&sem->internal_rwsem); + } + +-#ifdef CONFIG_DEBUG_LOCK_ALLOC +-static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) +-{ +- down_read_nested(&sem->internal_rwsem, subclass); +-} +-#else +-#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) +-#endif +- + static inline void f2fs_up_read(struct f2fs_rwsem *sem) + { + up_read(&sem->internal_rwsem); +@@ -2141,6 +2132,21 @@ static inline void f2fs_down_write(struc + down_write(&sem->internal_rwsem); + } + ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int 
subclass) ++{ ++ down_read_nested(&sem->internal_rwsem, subclass); ++} ++ ++static inline void f2fs_down_write_nested(struct f2fs_rwsem *sem, int subclass) ++{ ++ down_write_nested(&sem->internal_rwsem, subclass); ++} ++#else ++#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) ++#define f2fs_down_write_nested(sem, subclass) f2fs_down_write(sem) ++#endif ++ + static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) + { + return down_write_trylock(&sem->internal_rwsem); +--- a/fs/f2fs/inline.c ++++ b/fs/f2fs/inline.c +@@ -641,7 +641,8 @@ int f2fs_add_inline_entry(struct inode * + } + + if (inode) { +- f2fs_down_write(&F2FS_I(inode)->i_sem); ++ f2fs_down_write_nested(&F2FS_I(inode)->i_sem, ++ SINGLE_DEPTH_NESTING); + page = f2fs_init_inode_metadata(inode, dir, fname, ipage); + if (IS_ERR(page)) { + err = PTR_ERR(page); diff --git a/queue-6.5/f2fs-flush-inode-if-atomic-file-is-aborted.patch b/queue-6.5/f2fs-flush-inode-if-atomic-file-is-aborted.patch new file mode 100644 index 00000000000..cff64fe7c75 --- /dev/null +++ b/queue-6.5/f2fs-flush-inode-if-atomic-file-is-aborted.patch @@ -0,0 +1,118 @@ +From a3ab55746612247ce3dcaac6de66f5ffc055b9df Mon Sep 17 00:00:00 2001 +From: Jaegeuk Kim +Date: Fri, 7 Jul 2023 07:03:13 -0700 +Subject: f2fs: flush inode if atomic file is aborted + +From: Jaegeuk Kim + +commit a3ab55746612247ce3dcaac6de66f5ffc055b9df upstream. + +Let's flush the inode being aborted atomic operation to avoid stale dirty +inode during eviction in this call stack: + + f2fs_mark_inode_dirty_sync+0x22/0x40 [f2fs] + f2fs_abort_atomic_write+0xc4/0xf0 [f2fs] + f2fs_evict_inode+0x3f/0x690 [f2fs] + ? 
sugov_start+0x140/0x140 + evict+0xc3/0x1c0 + evict_inodes+0x17b/0x210 + generic_shutdown_super+0x32/0x120 + kill_block_super+0x21/0x50 + deactivate_locked_super+0x31/0x90 + cleanup_mnt+0x100/0x160 + task_work_run+0x59/0x90 + do_exit+0x33b/0xa50 + do_group_exit+0x2d/0x80 + __x64_sys_exit_group+0x14/0x20 + do_syscall_64+0x3b/0x90 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +This triggers f2fs_bug_on() in f2fs_evict_inode: + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + +This fixes the syzbot report: + +loop0: detected capacity change from 0 to 131072 +F2FS-fs (loop0): invalid crc value +F2FS-fs (loop0): Found nat_bits in checkpoint +F2FS-fs (loop0): Mounted with checkpoint version = 48b305e4 +------------[ cut here ]------------ +kernel BUG at fs/f2fs/inode.c:869! +invalid opcode: 0000 [#1] PREEMPT SMP KASAN +CPU: 0 PID: 5014 Comm: syz-executor220 Not tainted 6.4.0-syzkaller-11479-g6cd06ab12d1a #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/27/2023 +RIP: 0010:f2fs_evict_inode+0x172d/0x1e00 fs/f2fs/inode.c:869 +Code: ff df 48 c1 ea 03 80 3c 02 00 0f 85 6a 06 00 00 8b 75 40 ba 01 00 00 00 4c 89 e7 e8 6d ce 06 00 e9 aa fc ff ff e8 63 22 e2 fd <0f> 0b e8 5c 22 e2 fd 48 c7 c0 a8 3a 18 8d 48 ba 00 00 00 00 00 fc +RSP: 0018:ffffc90003a6fa00 EFLAGS: 00010293 +RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 +RDX: ffff8880273b8000 RSI: ffffffff83a2bd0d RDI: 0000000000000007 +RBP: ffff888077db91b0 R08: 0000000000000007 R09: 0000000000000000 +R10: 0000000000000001 R11: 0000000000000001 R12: ffff888029a3c000 +R13: ffff888077db9660 R14: ffff888029a3c0b8 R15: ffff888077db9c50 +FS: 0000000000000000(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f1909bb9000 CR3: 00000000276a9000 CR4: 0000000000350ef0 +Call Trace: + + evict+0x2ed/0x6b0 fs/inode.c:665 + dispose_list+0x117/0x1e0 fs/inode.c:698 + evict_inodes+0x345/0x440 fs/inode.c:748 + 
generic_shutdown_super+0xaf/0x480 fs/super.c:478 + kill_block_super+0x64/0xb0 fs/super.c:1417 + kill_f2fs_super+0x2af/0x3c0 fs/f2fs/super.c:4704 + deactivate_locked_super+0x98/0x160 fs/super.c:330 + deactivate_super+0xb1/0xd0 fs/super.c:361 + cleanup_mnt+0x2ae/0x3d0 fs/namespace.c:1254 + task_work_run+0x16f/0x270 kernel/task_work.c:179 + exit_task_work include/linux/task_work.h:38 [inline] + do_exit+0xa9a/0x29a0 kernel/exit.c:874 + do_group_exit+0xd4/0x2a0 kernel/exit.c:1024 + __do_sys_exit_group kernel/exit.c:1035 [inline] + __se_sys_exit_group kernel/exit.c:1033 [inline] + __x64_sys_exit_group+0x3e/0x50 kernel/exit.c:1033 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd +RIP: 0033:0x7f309be71a09 +Code: Unable to access opcode bytes at 0x7f309be719df. +RSP: 002b:00007fff171df518 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7 +RAX: ffffffffffffffda RBX: 00007f309bef7330 RCX: 00007f309be71a09 +RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000001 +RBP: 0000000000000001 R08: ffffffffffffffc0 R09: 00007f309bef1e40 +R10: 0000000000010600 R11: 0000000000000246 R12: 00007f309bef7330 +R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000001 + +Modules linked in: +---[ end trace 0000000000000000 ]--- +RIP: 0010:f2fs_evict_inode+0x172d/0x1e00 fs/f2fs/inode.c:869 +Code: ff df 48 c1 ea 03 80 3c 02 00 0f 85 6a 06 00 00 8b 75 40 ba 01 00 00 00 4c 89 e7 e8 6d ce 06 00 e9 aa fc ff ff e8 63 22 e2 fd <0f> 0b e8 5c 22 e2 fd 48 c7 c0 a8 3a 18 8d 48 ba 00 00 00 00 00 fc +RSP: 0018:ffffc90003a6fa00 EFLAGS: 00010293 +RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 +RDX: ffff8880273b8000 RSI: ffffffff83a2bd0d RDI: 0000000000000007 +RBP: ffff888077db91b0 R08: 0000000000000007 R09: 0000000000000000 +R10: 0000000000000001 R11: 0000000000000001 R12: ffff888029a3c000 +R13: ffff888077db9660 R14: ffff888029a3c0b8 R15: ffff888077db9c50 +FS: 
0000000000000000(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f1909bb9000 CR3: 00000000276a9000 CR4: 0000000000350ef0 + +Cc: +Reported-and-tested-by: syzbot+e1246909d526a9d470fa@syzkaller.appspotmail.com +Reviewed-by: Chao Yu +Signed-off-by: Jaegeuk Kim +Signed-off-by: Greg Kroah-Hartman +--- + fs/f2fs/segment.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/f2fs/segment.c ++++ b/fs/f2fs/segment.c +@@ -205,6 +205,8 @@ void f2fs_abort_atomic_write(struct inod + f2fs_i_size_write(inode, fi->original_i_size); + fi->original_i_size = 0; + } ++ /* avoid stale dirty inode during eviction */ ++ sync_inode_metadata(inode, 0); + } + + static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, diff --git a/queue-6.5/f2fs-get-out-of-a-repeat-loop-when-getting-a-locked-data-page.patch b/queue-6.5/f2fs-get-out-of-a-repeat-loop-when-getting-a-locked-data-page.patch new file mode 100644 index 00000000000..a6bd06b4d14 --- /dev/null +++ b/queue-6.5/f2fs-get-out-of-a-repeat-loop-when-getting-a-locked-data-page.patch @@ -0,0 +1,44 @@ +From d2d9bb3b6d2fbccb5b33d3a85a2830971625a4ea Mon Sep 17 00:00:00 2001 +From: Jaegeuk Kim +Date: Thu, 19 Jan 2023 10:47:00 -0800 +Subject: f2fs: get out of a repeat loop when getting a locked data page + +From: Jaegeuk Kim + +commit d2d9bb3b6d2fbccb5b33d3a85a2830971625a4ea upstream. + +https://bugzilla.kernel.org/show_bug.cgi?id=216050 + +Somehow we're getting a page which has a different mapping. +Let's avoid the infinite loop. 
+ +Cc: +Signed-off-by: Jaegeuk Kim +Signed-off-by: Greg Kroah-Hartman +--- + fs/f2fs/data.c | 8 ++------ + 1 file changed, 2 insertions(+), 6 deletions(-) + +--- a/fs/f2fs/data.c ++++ b/fs/f2fs/data.c +@@ -1389,18 +1389,14 @@ struct page *f2fs_get_lock_data_page(str + { + struct address_space *mapping = inode->i_mapping; + struct page *page; +-repeat: ++ + page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); + if (IS_ERR(page)) + return page; + + /* wait for read completion */ + lock_page(page); +- if (unlikely(page->mapping != mapping)) { +- f2fs_put_page(page, 1); +- goto repeat; +- } +- if (unlikely(!PageUptodate(page))) { ++ if (unlikely(page->mapping != mapping || !PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } diff --git a/queue-6.5/fuse-nlookup-missing-decrement-in-fuse_direntplus_link.patch b/queue-6.5/fuse-nlookup-missing-decrement-in-fuse_direntplus_link.patch new file mode 100644 index 00000000000..e533c7c7d0b --- /dev/null +++ b/queue-6.5/fuse-nlookup-missing-decrement-in-fuse_direntplus_link.patch @@ -0,0 +1,47 @@ +From b8bd342d50cbf606666488488f9fea374aceb2d5 Mon Sep 17 00:00:00 2001 +From: ruanmeisi +Date: Tue, 25 Apr 2023 19:13:54 +0800 +Subject: fuse: nlookup missing decrement in fuse_direntplus_link + +From: ruanmeisi + +commit b8bd342d50cbf606666488488f9fea374aceb2d5 upstream. + +During our debugging of glusterfs, we found an Assertion failed error: +inode_lookup >= nlookup, which was caused by the nlookup value in the +kernel being greater than that in the FUSE file system. 
+ +The issue was introduced by fuse_direntplus_link, where in the function, +fuse_iget increments nlookup, and if d_splice_alias returns failure, +fuse_direntplus_link returns failure without decrementing nlookup +https://github.com/gluster/glusterfs/pull/4081 + +Signed-off-by: ruanmeisi +Fixes: 0b05b18381ee ("fuse: implement NFS-like readdirplus support") +Cc: # v3.9 +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman +--- + fs/fuse/readdir.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/fs/fuse/readdir.c ++++ b/fs/fuse/readdir.c +@@ -243,8 +243,16 @@ retry: + dput(dentry); + dentry = alias; + } +- if (IS_ERR(dentry)) ++ if (IS_ERR(dentry)) { ++ if (!IS_ERR(inode)) { ++ struct fuse_inode *fi = get_fuse_inode(inode); ++ ++ spin_lock(&fi->lock); ++ fi->nlookup--; ++ spin_unlock(&fi->lock); ++ } + return PTR_ERR(dentry); ++ } + } + if (fc->readdirplus_auto) + set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); diff --git a/queue-6.5/hwspinlock-qcom-add-missing-regmap-config-for-sfpb-mmio-implementation.patch b/queue-6.5/hwspinlock-qcom-add-missing-regmap-config-for-sfpb-mmio-implementation.patch new file mode 100644 index 00000000000..782de43b35b --- /dev/null +++ b/queue-6.5/hwspinlock-qcom-add-missing-regmap-config-for-sfpb-mmio-implementation.patch @@ -0,0 +1,53 @@ +From 23316be8a9d450f33a21f1efe7d89570becbec58 Mon Sep 17 00:00:00 2001 +From: Christian Marangi +Date: Sun, 16 Jul 2023 04:28:04 +0200 +Subject: hwspinlock: qcom: add missing regmap config for SFPB MMIO implementation + +From: Christian Marangi + +commit 23316be8a9d450f33a21f1efe7d89570becbec58 upstream. + +Commit 5d4753f741d8 ("hwspinlock: qcom: add support for MMIO on older +SoCs") introduced and made regmap_config mandatory in the of_data struct +but didn't add the regmap_config for sfpb based devices. 
+ +SFPB based devices can both use the legacy syscon way to probe or the +new MMIO way and currently devices that use the MMIO way are broken as +they lack the definition of the now required regmap_config and always +return -EINVAL (which indirectly makes probing fail for everything that +depends on it: smem, nandc with smem-parser...) + +Fix this by correctly adding the missing regmap_config and restoring the +function of hwspinlock on SFPB based devices with MMIO implementation. + +Cc: stable@vger.kernel.org +Fixes: 5d4753f741d8 ("hwspinlock: qcom: add support for MMIO on older SoCs") +Signed-off-by: Christian Marangi +Link: https://lore.kernel.org/r/20230716022804.21239-1-ansuelsmth@gmail.com +Signed-off-by: Bjorn Andersson +Signed-off-by: Greg Kroah-Hartman +--- + drivers/hwspinlock/qcom_hwspinlock.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/drivers/hwspinlock/qcom_hwspinlock.c ++++ b/drivers/hwspinlock/qcom_hwspinlock.c +@@ -69,9 +69,18 @@ static const struct hwspinlock_ops qcom_ + .unlock = qcom_hwspinlock_unlock, + }; + ++static const struct regmap_config sfpb_mutex_config = { ++ .reg_bits = 32, ++ .reg_stride = 4, ++ .val_bits = 32, ++ .max_register = 0x100, ++ .fast_io = true, ++}; ++ + static const struct qcom_hwspinlock_of_data of_sfpb_mutex = { + .offset = 0x4, + .stride = 0x4, ++ .regmap_config = &sfpb_mutex_config, + }; + + static const struct regmap_config tcsr_msm8226_mutex_config = { diff --git a/queue-6.5/jbd2-check-jh-b_transaction-before-removing-it-from-checkpoint.patch b/queue-6.5/jbd2-check-jh-b_transaction-before-removing-it-from-checkpoint.patch new file mode 100644 index 00000000000..2b851f5efaa --- /dev/null +++ b/queue-6.5/jbd2-check-jh-b_transaction-before-removing-it-from-checkpoint.patch @@ -0,0 +1,67 @@ +From 590a809ff743e7bd890ba5fb36bc38e20a36de53 Mon Sep 17 00:00:00 2001 +From: Zhihao Cheng +Date: Fri, 14 Jul 2023 10:55:27 +0800 +Subject: jbd2: check 'jh->b_transaction' before removing it from checkpoint +MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Zhihao Cheng + +commit 590a809ff743e7bd890ba5fb36bc38e20a36de53 upstream. + +The following process will corrupt an ext4 image: +Step 1: +jbd2_journal_commit_transaction + __jbd2_journal_insert_checkpoint(jh, commit_transaction) + // Put jh into trans1->t_checkpoint_list + journal->j_checkpoint_transactions = commit_transaction + // Put trans1 into journal->j_checkpoint_transactions + +Step 2: +do_get_write_access + test_clear_buffer_dirty(bh) // clear buffer dirty, set jbd dirty + __jbd2_journal_file_buffer(jh, transaction) // jh belongs to trans2 + +Step 3: +drop_cache + journal_shrink_one_cp_list + jbd2_journal_try_remove_checkpoint + if (!trylock_buffer(bh)) // lock bh, true + if (buffer_dirty(bh)) // buffer is not dirty + __jbd2_journal_remove_checkpoint(jh) + // remove jh from trans1->t_checkpoint_list + +Step 4: +jbd2_log_do_checkpoint + trans1 = journal->j_checkpoint_transactions + // jh is not in trans1->t_checkpoint_list + jbd2_cleanup_journal_tail(journal) // trans1 is done + +Step 5: Power cut, trans2 is not committed, jh is lost on the next mount. + +Fix it by checking 'jh->b_transaction' before removing it from checkpoint.
+ +Cc: stable@kernel.org +Fixes: 46f881b5b175 ("jbd2: fix a race when checking checkpoint buffer busy") +Signed-off-by: Zhihao Cheng +Signed-off-by: Zhang Yi +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/20230714025528.564988-3-yi.zhang@huaweicloud.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/jbd2/checkpoint.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/jbd2/checkpoint.c ++++ b/fs/jbd2/checkpoint.c +@@ -639,6 +639,8 @@ int jbd2_journal_try_remove_checkpoint(s + { + struct buffer_head *bh = jh2bh(jh); + ++ if (jh->b_transaction) ++ return -EBUSY; + if (!trylock_buffer(bh)) + return -EBUSY; + if (buffer_dirty(bh)) { diff --git a/queue-6.5/jbd2-correct-the-end-of-the-journal-recovery-scan-range.patch b/queue-6.5/jbd2-correct-the-end-of-the-journal-recovery-scan-range.patch new file mode 100644 index 00000000000..f56333132ca --- /dev/null +++ b/queue-6.5/jbd2-correct-the-end-of-the-journal-recovery-scan-range.patch @@ -0,0 +1,76 @@ +From 2dfba3bb40ad8536b9fa802364f2d40da31aa88e Mon Sep 17 00:00:00 2001 +From: Zhang Yi +Date: Mon, 26 Jun 2023 15:33:22 +0800 +Subject: jbd2: correct the end of the journal recovery scan range + +From: Zhang Yi + +commit 2dfba3bb40ad8536b9fa802364f2d40da31aa88e upstream. + +We got a filesystem inconsistency issue below while running generic/475 +I/O failure pressure test with fast_commit feature enabled. + + Symlink /p3/d3/d1c/d6c/dd6/dce/l101 (inode #132605) is invalid. + +If fast_commit feature is enabled, a special fast_commit journal area is +appended to the end of the normal journal area. The journal->j_last +point to the first unused block behind the normal journal area instead +of the whole log area, and the journal->j_fc_last point to the first +unused block behind the fast_commit journal area. 
While doing journal +recovery, do_one_pass(PASS_SCAN) should first scan the normal journal +area and turn around to the first block once it meets journal->j_last, +but the wrap() macro misuses journal->j_fc_last, so recovery +could not read the next magic block (commit block perhaps) and would end +early by mistake, missing tN and every transaction after it in the +following example. Finally, it could lead to filesystem inconsistency. + + | normal journal area | fast commit area | + +-------------------------------------------------+------------------+ + | tN(rear) | tN+1 |~| tN-x |...| tN-1 | tN(front) | .... | + +-------------------------------------------------+------------------+ + / / / + start journal->j_last journal->j_fc_last + +This patch fixes it by using the correct end, journal->j_last. + +Fixes: 5b849b5f96b4 ("jbd2: fast commit recovery path") +Cc: stable@kernel.org +Reported-by: Theodore Ts'o +Link: https://lore.kernel.org/linux-ext4/20230613043120.GB1584772@mit.edu/ +Signed-off-by: Zhang Yi +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/20230626073322.3956567-1-yi.zhang@huaweicloud.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/jbd2/recovery.c | 12 +++--------- + 1 file changed, 3 insertions(+), 9 deletions(-) + +--- a/fs/jbd2/recovery.c ++++ b/fs/jbd2/recovery.c +@@ -230,12 +230,8 @@ static int count_tags(journal_t *journal + /* Make sure we wrap around the log correctly! */ + #define wrap(journal, var) \ + do { \ +- unsigned long _wrap_last = \ +- jbd2_has_feature_fast_commit(journal) ?
\ +- (journal)->j_fc_last : (journal)->j_last; \ +- \ +- if (var >= _wrap_last) \ +- var -= (_wrap_last - (journal)->j_first); \ ++ if (var >= (journal)->j_last) \ ++ var -= ((journal)->j_last - (journal)->j_first); \ + } while (0) + + static int fc_do_one_pass(journal_t *journal, +@@ -524,9 +520,7 @@ static int do_one_pass(journal_t *journa + break; + + jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", +- next_commit_ID, next_log_block, +- jbd2_has_feature_fast_commit(journal) ? +- journal->j_fc_last : journal->j_last); ++ next_commit_ID, next_log_block, journal->j_last); + + /* Skip over each chunk of the transaction looking + * either the next descriptor block or the final commit diff --git a/queue-6.5/jbd2-fix-checkpoint-cleanup-performance-regression.patch b/queue-6.5/jbd2-fix-checkpoint-cleanup-performance-regression.patch new file mode 100644 index 00000000000..422c4224341 --- /dev/null +++ b/queue-6.5/jbd2-fix-checkpoint-cleanup-performance-regression.patch @@ -0,0 +1,123 @@ +From 373ac521799d9e97061515aca6ec6621789036bb Mon Sep 17 00:00:00 2001 +From: Zhang Yi +Date: Fri, 14 Jul 2023 10:55:26 +0800 +Subject: jbd2: fix checkpoint cleanup performance regression + +From: Zhang Yi + +commit 373ac521799d9e97061515aca6ec6621789036bb upstream. + +journal_clean_one_cp_list() has been merged into +journal_shrink_one_cp_list(), but do chekpoint buffer cleanup from the +committing process is just a best effort, it should stop scan once it +meet a busy buffer, or else it will cause a lot of invalid buffer scan +and checks. We catch a performance regression when doing fs_mark tests +below. 
+ +Test cmd: + ./fs_mark -d scratch -s 1024 -n 10000 -t 1 -D 100 -N 100 + +Before merging checkpoint buffer cleanup: + FSUse% Count Size Files/sec App Overhead + 95 10000 1024 8304.9 49033 + +After merging checkpoint buffer cleanup: + FSUse% Count Size Files/sec App Overhead + 95 10000 1024 7649.0 50012 + FSUse% Count Size Files/sec App Overhead + 95 10000 1024 2107.1 50871 + +After merging checkpoint buffer cleanup, the total loop count in +journal_shrink_one_cp_list() could be up to 6,261,600+ (50,000+ ~ +100,000+ in general), most of them are invalid. This patch fix it +through passing 'shrink_type' into journal_shrink_one_cp_list() and add +a new 'SHRINK_BUSY_STOP' to indicate it should stop once meet a busy +buffer. After fix, the loop count descending back to 10,000+. + +After this fix: + FSUse% Count Size Files/sec App Overhead + 95 10000 1024 8558.4 49109 + +Cc: stable@kernel.org +Fixes: b98dba273a0e ("jbd2: remove journal_clean_one_cp_list()") +Signed-off-by: Zhang Yi +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/20230714025528.564988-2-yi.zhang@huaweicloud.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/jbd2/checkpoint.c | 20 ++++++++++++++------ + 1 file changed, 14 insertions(+), 6 deletions(-) + +--- a/fs/jbd2/checkpoint.c ++++ b/fs/jbd2/checkpoint.c +@@ -349,6 +349,8 @@ int jbd2_cleanup_journal_tail(journal_t + + /* Checkpoint list management */ + ++enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; ++ + /* + * journal_shrink_one_cp_list + * +@@ -360,7 +362,8 @@ int jbd2_cleanup_journal_tail(journal_t + * Called with j_list_lock held. 
+ */ + static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, +- bool destroy, bool *released) ++ enum shrink_type type, ++ bool *released) + { + struct journal_head *last_jh; + struct journal_head *next_jh = jh; +@@ -376,12 +379,15 @@ static unsigned long journal_shrink_one_ + jh = next_jh; + next_jh = jh->b_cpnext; + +- if (destroy) { ++ if (type == SHRINK_DESTROY) { + ret = __jbd2_journal_remove_checkpoint(jh); + } else { + ret = jbd2_journal_try_remove_checkpoint(jh); +- if (ret < 0) +- continue; ++ if (ret < 0) { ++ if (type == SHRINK_BUSY_SKIP) ++ continue; ++ break; ++ } + } + + nr_freed++; +@@ -445,7 +451,7 @@ again: + tid = transaction->t_tid; + + freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, +- false, &released); ++ SHRINK_BUSY_SKIP, &released); + nr_freed += freed; + (*nr_to_scan) -= min(*nr_to_scan, freed); + if (*nr_to_scan == 0) +@@ -485,19 +491,21 @@ out: + void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) + { + transaction_t *transaction, *last_transaction, *next_transaction; ++ enum shrink_type type; + bool released; + + transaction = journal->j_checkpoint_transactions; + if (!transaction) + return; + ++ type = destroy ? SHRINK_DESTROY : SHRINK_BUSY_STOP; + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + journal_shrink_one_cp_list(transaction->t_checkpoint_list, +- destroy, &released); ++ type, &released); + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. 
Bail out if diff --git a/queue-6.5/lib-test_scanf-add-explicit-type-cast-to-result-initialization-in-test_number_prefix.patch b/queue-6.5/lib-test_scanf-add-explicit-type-cast-to-result-initialization-in-test_number_prefix.patch new file mode 100644 index 00000000000..8031f4628ae --- /dev/null +++ b/queue-6.5/lib-test_scanf-add-explicit-type-cast-to-result-initialization-in-test_number_prefix.patch @@ -0,0 +1,53 @@ +From 92382d744176f230101d54f5c017bccd62770f01 Mon Sep 17 00:00:00 2001 +From: Nathan Chancellor +Date: Mon, 7 Aug 2023 08:36:28 -0700 +Subject: lib: test_scanf: Add explicit type cast to result initialization in test_number_prefix() + +From: Nathan Chancellor + +commit 92382d744176f230101d54f5c017bccd62770f01 upstream. + +A recent change in clang allows it to consider more expressions as +compile time constants, which causes it to point out an implicit +conversion in the scanf tests: + + lib/test_scanf.c:661:2: warning: implicit conversion from 'int' to 'unsigned char' changes value from -168 to 88 [-Wconstant-conversion] + 661 | test_number_prefix(unsigned char, "0xA7", "%2hhx%hhx", 0, 0xa7, 2, check_uchar); + | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + lib/test_scanf.c:609:29: note: expanded from macro 'test_number_prefix' + 609 | T result[2] = {~expect[0], ~expect[1]}; \ + | ~ ^~~~~~~~~~ + 1 warning generated. + +The result of the bitwise negation is the type of the operand after +going through the integer promotion rules, so this truncation is +expected but harmless, as the initial values in the result array get +overwritten by _test() anyways. Add an explicit cast to the expected +type in test_number_prefix() to silence the warning. There is no +functional change, as all the tests still pass with GCC 13.1.0 and clang +18.0.0. 
+ +Cc: stable@vger.kernel.org +Link: https://github.com/ClangBuiltLinux/linux/issues/1899 +Link: https://github.com/llvm/llvm-project/commit/610ec954e1f81c0e8fcadedcd25afe643f5a094e +Suggested-by: Nick Desaulniers +Signed-off-by: Nathan Chancellor +Reviewed-by: Petr Mladek +Signed-off-by: Petr Mladek +Link: https://lore.kernel.org/r/20230807-test_scanf-wconstant-conversion-v2-1-839ca39083e1@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + lib/test_scanf.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/lib/test_scanf.c ++++ b/lib/test_scanf.c +@@ -606,7 +606,7 @@ static void __init numbers_slice(void) + #define test_number_prefix(T, str, scan_fmt, expect0, expect1, n_args, fn) \ + do { \ + const T expect[2] = { expect0, expect1 }; \ +- T result[2] = {~expect[0], ~expect[1]}; \ ++ T result[2] = { (T)~expect[0], (T)~expect[1] }; \ + \ + _test(fn, &expect, str, scan_fmt, n_args, &result[0], &result[1]); \ + } while (0) diff --git a/queue-6.5/memcontrol-ensure-memcg-acquired-by-id-is-properly-set-up.patch b/queue-6.5/memcontrol-ensure-memcg-acquired-by-id-is-properly-set-up.patch new file mode 100644 index 00000000000..bbf48fd5de9 --- /dev/null +++ b/queue-6.5/memcontrol-ensure-memcg-acquired-by-id-is-properly-set-up.patch @@ -0,0 +1,131 @@ +From 6f0df8e16eb543167f2929cb756e695709a3551d Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 23 Aug 2023 15:54:30 -0700 +Subject: memcontrol: ensure memcg acquired by id is properly set up + +From: Johannes Weiner + +commit 6f0df8e16eb543167f2929cb756e695709a3551d upstream. + +In the eviction recency check, we attempt to retrieve the memcg to which +the folio belonged when it was evicted, by the memcg id stored in the +shadow entry. However, there is a chance that the retrieved memcg is not +the original memcg that has been killed, but a new one which happens to +have the same id. + +This is a somewhat unfortunate, but acceptable and rare inaccuracy in the +heuristics.
However, if we retrieve this new memcg between its allocation +and when it is properly attached to the memcg hierarchy, we could run into +the following NULL pointer exception during the memcg hierarchy traversal +done in mem_cgroup_get_nr_swap_pages(): + +[ 155757.793456] BUG: kernel NULL pointer dereference, address: 00000000000000c0 +[ 155757.807568] #PF: supervisor read access in kernel mode +[ 155757.818024] #PF: error_code(0x0000) - not-present page +[ 155757.828482] PGD 401f77067 P4D 401f77067 PUD 401f76067 PMD 0 +[ 155757.839985] Oops: 0000 [#1] SMP +[ 155757.887870] RIP: 0010:mem_cgroup_get_nr_swap_pages+0x3d/0xb0 +[ 155757.899377] Code: 29 19 4a 02 48 39 f9 74 63 48 8b 97 c0 00 00 00 48 8b b7 58 02 00 00 48 2b b7 c0 01 00 00 48 39 f0 48 0f 4d c6 48 39 d1 74 42 <48> 8b b2 c0 00 00 00 48 8b ba 58 02 00 00 48 2b ba c0 01 00 00 48 +[ 155757.937125] RSP: 0018:ffffc9002ecdfbc8 EFLAGS: 00010286 +[ 155757.947755] RAX: 00000000003a3b1c RBX: 000007ffffffffff RCX: ffff888280183000 +[ 155757.962202] RDX: 0000000000000000 RSI: 0007ffffffffffff RDI: ffff888bbc2d1000 +[ 155757.976648] RBP: 0000000000000001 R08: 000000000000000b R09: ffff888ad9cedba0 +[ 155757.991094] R10: ffffea0039c07900 R11: 0000000000000010 R12: ffff888b23a7b000 +[ 155758.005540] R13: 0000000000000000 R14: ffff888bbc2d1000 R15: 000007ffffc71354 +[ 155758.019991] FS: 00007f6234c68640(0000) GS:ffff88903f9c0000(0000) knlGS:0000000000000000 +[ 155758.036356] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 155758.048023] CR2: 00000000000000c0 CR3: 0000000a83eb8004 CR4: 00000000007706e0 +[ 155758.062473] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 155758.076924] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[ 155758.091376] PKRU: 55555554 +[ 155758.096957] Call Trace: +[ 155758.102016] +[ 155758.106502] ? __die+0x78/0xc0 +[ 155758.112793] ? page_fault_oops+0x286/0x380 +[ 155758.121175] ? exc_page_fault+0x5d/0x110 +[ 155758.129209] ? 
asm_exc_page_fault+0x22/0x30 +[ 155758.137763] ? mem_cgroup_get_nr_swap_pages+0x3d/0xb0 +[ 155758.148060] workingset_test_recent+0xda/0x1b0 +[ 155758.157133] workingset_refault+0xca/0x1e0 +[ 155758.165508] filemap_add_folio+0x4d/0x70 +[ 155758.173538] page_cache_ra_unbounded+0xed/0x190 +[ 155758.182919] page_cache_sync_ra+0xd6/0x1e0 +[ 155758.191738] filemap_read+0x68d/0xdf0 +[ 155758.199495] ? mlx5e_napi_poll+0x123/0x940 +[ 155758.207981] ? __napi_schedule+0x55/0x90 +[ 155758.216095] __x64_sys_pread64+0x1d6/0x2c0 +[ 155758.224601] do_syscall_64+0x3d/0x80 +[ 155758.232058] entry_SYSCALL_64_after_hwframe+0x46/0xb0 +[ 155758.242473] RIP: 0033:0x7f62c29153b5 +[ 155758.249938] Code: e8 48 89 75 f0 89 7d f8 48 89 4d e0 e8 b4 e6 f7 ff 41 89 c0 4c 8b 55 e0 48 8b 55 e8 48 8b 75 f0 8b 7d f8 b8 11 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 45 f8 e8 e7 e6 f7 ff 48 8b +[ 155758.288005] RSP: 002b:00007f6234c5ffd0 EFLAGS: 00000293 ORIG_RAX: 0000000000000011 +[ 155758.303474] RAX: ffffffffffffffda RBX: 00007f628c4e70c0 RCX: 00007f62c29153b5 +[ 155758.318075] RDX: 000000000003c041 RSI: 00007f61d2986000 RDI: 0000000000000076 +[ 155758.332678] RBP: 00007f6234c5fff0 R08: 0000000000000000 R09: 0000000064d5230c +[ 155758.347452] R10: 000000000027d450 R11: 0000000000000293 R12: 000000000003c041 +[ 155758.362044] R13: 00007f61d2986000 R14: 00007f629e11b060 R15: 000000000027d450 +[ 155758.376661] + +This patch fixes the issue by moving the memcg's id publication from the +alloc stage to online stage, ensuring that any memcg acquired via id must +be connected to the memcg tree. 
+ +Link: https://lkml.kernel.org/r/20230823225430.166925-1-nphamcs@gmail.com +Fixes: f78dfc7b77d5 ("workingset: fix confusion around eviction vs refault container") +Signed-off-by: Johannes Weiner +Co-developed-by: Nhat Pham +Signed-off-by: Nhat Pham +Acked-by: Shakeel Butt +Cc: Yosry Ahmed +Cc: Michal Hocko +Cc: Roman Gushchin +Cc: Muchun Song +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/memcontrol.c | 22 +++++++++++++++++----- + 1 file changed, 17 insertions(+), 5 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5329,7 +5329,6 @@ static struct mem_cgroup *mem_cgroup_all + INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); + memcg->deferred_split_queue.split_queue_len = 0; + #endif +- idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); + lru_gen_init_memcg(memcg); + return memcg; + fail: +@@ -5401,14 +5400,27 @@ static int mem_cgroup_css_online(struct + if (alloc_shrinker_info(memcg)) + goto offline_kmem; + +- /* Online state pins memcg ID, memcg ID pins CSS */ +- refcount_set(&memcg->id.ref, 1); +- css_get(css); +- + if (unlikely(mem_cgroup_is_root(memcg))) + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + FLUSH_TIME); + lru_gen_online_memcg(memcg); ++ ++ /* Online state pins memcg ID, memcg ID pins CSS */ ++ refcount_set(&memcg->id.ref, 1); ++ css_get(css); ++ ++ /* ++ * Ensure mem_cgroup_from_id() works once we're fully online. ++ * ++ * We could do this earlier and require callers to filter with ++ * css_tryget_online(). But right now there are no users that ++ * need earlier access, and the workingset code relies on the ++ * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So ++ * publish it here at the end of onlining. This matches the ++ * regular ID destruction during offlining. 
++ */ ++ idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); ++ + return 0; + offline_kmem: + memcg_offline_kmem(memcg); diff --git a/queue-6.5/misc-fastrpc-fix-incorrect-dma-mapping-unmap-request.patch b/queue-6.5/misc-fastrpc-fix-incorrect-dma-mapping-unmap-request.patch new file mode 100644 index 00000000000..3fa4e7307f1 --- /dev/null +++ b/queue-6.5/misc-fastrpc-fix-incorrect-dma-mapping-unmap-request.patch @@ -0,0 +1,54 @@ +From a2cb9cd6a3949a3804ad9fd7da234892ce6719ec Mon Sep 17 00:00:00 2001 +From: Ekansh Gupta +Date: Fri, 11 Aug 2023 12:56:42 +0100 +Subject: misc: fastrpc: Fix incorrect DMA mapping unmap request + +From: Ekansh Gupta + +commit a2cb9cd6a3949a3804ad9fd7da234892ce6719ec upstream. + +Scatterlist table is obtained during map create request and the same +table is used for DMA mapping unmap. In case there is any failure +while getting the sg_table, ERR_PTR is returned instead of sg_table. + +When the map is getting freed, there is only a non-NULL check of +sg_table which will also be true in case failure was returned instead +of sg_table. This would result in improper unmap request. Add proper +check before setting map table to avoid bad unmap request. 
+ +Fixes: c68cfb718c8f ("misc: fastrpc: Add support for context Invoke method") +Cc: stable +Signed-off-by: Ekansh Gupta +Signed-off-by: Srinivas Kandagatla +Link: https://lore.kernel.org/r/20230811115643.38578-3-srinivas.kandagatla@linaro.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/misc/fastrpc.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +--- a/drivers/misc/fastrpc.c ++++ b/drivers/misc/fastrpc.c +@@ -756,6 +756,7 @@ static int fastrpc_map_create(struct fas + { + struct fastrpc_session_ctx *sess = fl->sctx; + struct fastrpc_map *map = NULL; ++ struct sg_table *table; + int err = 0; + + if (!fastrpc_map_lookup(fl, fd, ppmap, true)) +@@ -783,11 +784,12 @@ static int fastrpc_map_create(struct fas + goto attach_err; + } + +- map->table = dma_buf_map_attachment_unlocked(map->attach, DMA_BIDIRECTIONAL); +- if (IS_ERR(map->table)) { +- err = PTR_ERR(map->table); ++ table = dma_buf_map_attachment_unlocked(map->attach, DMA_BIDIRECTIONAL); ++ if (IS_ERR(table)) { ++ err = PTR_ERR(table); + goto map_err; + } ++ map->table = table; + + if (attr & FASTRPC_ATTR_SECUREMAP) { + map->phys = sg_phys(map->table->sgl); diff --git a/queue-6.5/misc-fastrpc-fix-remote-heap-allocation-request.patch b/queue-6.5/misc-fastrpc-fix-remote-heap-allocation-request.patch new file mode 100644 index 00000000000..a62438ebbdc --- /dev/null +++ b/queue-6.5/misc-fastrpc-fix-remote-heap-allocation-request.patch @@ -0,0 +1,60 @@ +From ada6c2d99aedd1eac2f633d03c652e070bc2ea74 Mon Sep 17 00:00:00 2001 +From: Ekansh Gupta +Date: Fri, 11 Aug 2023 12:56:41 +0100 +Subject: misc: fastrpc: Fix remote heap allocation request + +From: Ekansh Gupta + +commit ada6c2d99aedd1eac2f633d03c652e070bc2ea74 upstream. + +Remote heap is used by DSP audioPD on a need basis. This memory is +allocated from reserved CMA memory region and is then shared with +audioPD to use it for its functionality.
+ +Current implementation of remote heap is not allocating the memory +from CMA region; instead it is allocating the memory from SMMU +context bank. The arguments passed to scm call for the reassignment +of ownership are also not correct. Added changes to allocate CMA +memory and have a proper ownership reassignment. + +Fixes: 532ad70c6d44 ("misc: fastrpc: Add mmap request assigning for static PD pool") +Cc: stable +Tested-by: Ekansh Gupta +Signed-off-by: Ekansh Gupta +Signed-off-by: Srinivas Kandagatla +Link: https://lore.kernel.org/r/20230811115643.38578-2-srinivas.kandagatla@linaro.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/misc/fastrpc.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +--- a/drivers/misc/fastrpc.c ++++ b/drivers/misc/fastrpc.c +@@ -1871,7 +1871,11 @@ static int fastrpc_req_mmap(struct fastr + return -EINVAL; + } + +- err = fastrpc_buf_alloc(fl, fl->sctx->dev, req.size, &buf); ++ if (req.flags == ADSP_MMAP_REMOTE_HEAP_ADDR) ++ err = fastrpc_remote_heap_alloc(fl, dev, req.size, &buf); ++ else ++ err = fastrpc_buf_alloc(fl, dev, req.size, &buf); ++ + if (err) { + dev_err(dev, "failed to allocate buffer\n"); + return err; +@@ -1910,12 +1914,8 @@ static int fastrpc_req_mmap(struct fastr + + /* Add memory to static PD pool, protection thru hypervisor */ + if (req.flags == ADSP_MMAP_REMOTE_HEAP_ADDR && fl->cctx->vmcount) { +- struct qcom_scm_vmperm perm; +- +- perm.vmid = QCOM_SCM_VMID_HLOS; +- perm.perm = QCOM_SCM_PERM_RWX; +- err = qcom_scm_assign_mem(buf->phys, buf->size, +- &fl->cctx->perms, &perm, 1); ++ err = qcom_scm_assign_mem(buf->phys, (u64)buf->size, ++ &fl->cctx->perms, fl->cctx->vmperms, fl->cctx->vmcount); + if (err) { + dev_err(fl->sctx->dev, "Failed to assign memory phys 0x%llx size 0x%llx err %d", + buf->phys, buf->size, err); diff --git a/queue-6.5/series b/queue-6.5/series index cbe9c3d1ca1..85b135a7ae6 100644 --- a/queue-6.5/series +++ b/queue-6.5/series @@ -176,3 +176,33 @@
sh-push-switch-reorder-cleanup-operations-to-avoid-u.patch linux-export-fix-reference-to-exported-functions-for-parisc64.patch watchdog-advantech_ec_wdt-fix-kconfig-dependencies.patch drm-amd-display-temporary-disable-mst-dp-colorspace-property.patch +arc-atomics-add-compiler-barrier-to-atomic-operations.patch +clocksource-drivers-arm_arch_timer-disable-timer-before-programming-cval.patch +dmaengine-sh-rz-dmac-fix-destination-and-source-data-size-setting.patch +misc-fastrpc-fix-remote-heap-allocation-request.patch +misc-fastrpc-fix-incorrect-dma-mapping-unmap-request.patch +jbd2-fix-checkpoint-cleanup-performance-regression.patch +jbd2-check-jh-b_transaction-before-removing-it-from-checkpoint.patch +jbd2-correct-the-end-of-the-journal-recovery-scan-range.patch +ext4-fix-slab-use-after-free-in-ext4_es_insert_extent.patch +ext4-add-correct-group-descriptors-and-reserved-gdt-blocks-to-system-zone.patch +ext4-fix-memory-leaks-in-ext4_fname_-setup_filename-prepare_lookup.patch +ext4-drop-dio-overwrite-only-flag-and-associated-warning.patch +ext4-fix-bug-in-ext4_mb_new_inode_pa-due-to-overflow.patch +f2fs-get-out-of-a-repeat-loop-when-getting-a-locked-data-page.patch +f2fs-flush-inode-if-atomic-file-is-aborted.patch +f2fs-avoid-false-alarm-of-circular-locking.patch +lib-test_scanf-add-explicit-type-cast-to-result-initialization-in-test_number_prefix.patch +hwspinlock-qcom-add-missing-regmap-config-for-sfpb-mmio-implementation.patch +memcontrol-ensure-memcg-acquired-by-id-is-properly-set-up.patch +ata-ahci-add-elkhart-lake-ahci-controller.patch +ata-pata_falcon-fix-io-base-selection-for-q40.patch +ata-sata_gemini-add-missing-module_description.patch +ata-pata_ftide010-add-missing-module_description.patch +fuse-nlookup-missing-decrement-in-fuse_direntplus_link.patch +btrfs-zoned-do-not-zone-finish-data-relocation-block-group.patch +btrfs-fix-start-transaction-qgroup-rsv-double-free.patch +btrfs-free-qgroup-rsv-on-io-failure.patch 
+btrfs-don-t-start-transaction-when-joining-with-trans_join_nostart.patch +btrfs-set-page-extent-mapped-after-read_folio-in-relocate_one_page.patch +btrfs-zoned-re-enable-metadata-over-commit-for-zoned-mode.patch