From 871d5c6f1eb7f5ff128420aaf987caac6133d02d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 26 Sep 2022 08:34:38 +0200 Subject: [PATCH] 5.19-stable patches added patches: devdax-fix-soft-reservation-memory-description.patch ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch --- ...-soft-reservation-memory-description.patch | 58 ++++++++++++ ...ing-when-eh_entries-0-and-eh_depth-0.patch | 85 ++++++++++++++++++ ...ter-discarding-preallocations-blocks.patch | 74 +++++++++++++++ ...oup-first-even-with-mb_optimize_scan.patch | 89 +++++++++++++++++++ queue-5.19/series | 4 + 5 files changed, 310 insertions(+) create mode 100644 queue-5.19/devdax-fix-soft-reservation-memory-description.patch create mode 100644 queue-5.19/ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch create mode 100644 queue-5.19/ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch create mode 100644 queue-5.19/ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch diff --git a/queue-5.19/devdax-fix-soft-reservation-memory-description.patch b/queue-5.19/devdax-fix-soft-reservation-memory-description.patch new file mode 100644 index 00000000000..ca5964fdcd3 --- /dev/null +++ b/queue-5.19/devdax-fix-soft-reservation-memory-description.patch @@ -0,0 +1,58 @@ +From 67feaba413ec68daf4124e9870878899b4ed9a0e Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Fri, 23 Sep 2022 15:05:56 -0700 +Subject: devdax: Fix soft-reservation memory description + +From: Dan Williams + +commit 67feaba413ec68daf4124e9870878899b4ed9a0e upstream. 
+ +The "hmem" platform-devices that are created to represent the +platform-advertised "Soft Reserved" memory ranges end up inserting a +resource that causes the iomem_resource tree to look like this: + +340000000-43fffffff : hmem.0 + 340000000-43fffffff : Soft Reserved + 340000000-43fffffff : dax0.0 + +This is because insert_resource() reparents ranges when they completely +intersect an existing range. + +This matters because code that uses region_intersects() to scan for a +given IORES_DESC will only check that top-level 'hmem.0' resource and +not the 'Soft Reserved' descendant. + +So, to support EINJ (via einj_error_inject()) to inject errors into +memory hosted by a dax-device, be sure to describe the memory as +IORES_DESC_SOFT_RESERVED. This is a follow-on to: + +commit b13a3e5fd40b ("ACPI: APEI: Fix _EINJ vs EFI_MEMORY_SP") + +...that fixed EINJ support for "Soft Reserved" ranges in the first +instance. + +Fixes: 262b45ae3ab4 ("x86/efi: EFI soft reservation to E820 enumeration") +Reported-by: Ricardo Sandoval Torres +Tested-by: Ricardo Sandoval Torres +Cc: +Cc: Tony Luck +Cc: Omar Avelar +Cc: Rafael J. 
Wysocki +Cc: Mark Gross +Link: https://lore.kernel.org/r/166397075670.389916.7435722208896316387.stgit@dwillia2-xfh.jf.intel.com +Signed-off-by: Dan Williams +Signed-off-by: Greg Kroah-Hartman +--- + drivers/dax/hmem/device.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/dax/hmem/device.c ++++ b/drivers/dax/hmem/device.c +@@ -15,6 +15,7 @@ void hmem_register_device(int target_nid + .start = r->start, + .end = r->end, + .flags = IORESOURCE_MEM, ++ .desc = IORES_DESC_SOFT_RESERVED, + }; + struct platform_device *pdev; + struct memregion_info info; diff --git a/queue-5.19/ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch b/queue-5.19/ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch new file mode 100644 index 00000000000..2436dc19acc --- /dev/null +++ b/queue-5.19/ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch @@ -0,0 +1,85 @@ +From 29a5b8a137ac8eb410cc823653a29ac0e7b7e1b0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= +Date: Mon, 22 Aug 2022 10:42:35 +0100 +Subject: ext4: fix bug in extents parsing when eh_entries == 0 and eh_depth > 0 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Luís Henriques + +commit 29a5b8a137ac8eb410cc823653a29ac0e7b7e1b0 upstream. + +When walking through an inode extents, the ext4_ext_binsearch_idx() function +assumes that the extent header has been previously validated. However, there +are no checks that verify that the number of entries (eh->eh_entries) is +non-zero when depth is > 0. And this will lead to problems because the +EXT_FIRST_INDEX() and EXT_LAST_INDEX() will return garbage and result in this: + +[ 135.245946] ------------[ cut here ]------------ +[ 135.247579] kernel BUG at fs/ext4/extents.c:2258! 
+[ 135.249045] invalid opcode: 0000 [#1] PREEMPT SMP +[ 135.250320] CPU: 2 PID: 238 Comm: tmp118 Not tainted 5.19.0-rc8+ #4 +[ 135.252067] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b-rebuilt.opensuse.org 04/01/2014 +[ 135.255065] RIP: 0010:ext4_ext_map_blocks+0xc20/0xcb0 +[ 135.256475] Code: +[ 135.261433] RSP: 0018:ffffc900005939f8 EFLAGS: 00010246 +[ 135.262847] RAX: 0000000000000024 RBX: ffffc90000593b70 RCX: 0000000000000023 +[ 135.264765] RDX: ffff8880038e5f10 RSI: 0000000000000003 RDI: ffff8880046e922c +[ 135.266670] RBP: ffff8880046e9348 R08: 0000000000000001 R09: ffff888002ca580c +[ 135.268576] R10: 0000000000002602 R11: 0000000000000000 R12: 0000000000000024 +[ 135.270477] R13: 0000000000000000 R14: 0000000000000024 R15: 0000000000000000 +[ 135.272394] FS: 00007fdabdc56740(0000) GS:ffff88807dd00000(0000) knlGS:0000000000000000 +[ 135.274510] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 135.276075] CR2: 00007ffc26bd4f00 CR3: 0000000006261004 CR4: 0000000000170ea0 +[ 135.277952] Call Trace: +[ 135.278635] +[ 135.279247] ? preempt_count_add+0x6d/0xa0 +[ 135.280358] ? percpu_counter_add_batch+0x55/0xb0 +[ 135.281612] ? _raw_read_unlock+0x18/0x30 +[ 135.282704] ext4_map_blocks+0x294/0x5a0 +[ 135.283745] ? xa_load+0x6f/0xa0 +[ 135.284562] ext4_mpage_readpages+0x3d6/0x770 +[ 135.285646] read_pages+0x67/0x1d0 +[ 135.286492] ? folio_add_lru+0x51/0x80 +[ 135.287441] page_cache_ra_unbounded+0x124/0x170 +[ 135.288510] filemap_get_pages+0x23d/0x5a0 +[ 135.289457] ? path_openat+0xa72/0xdd0 +[ 135.290332] filemap_read+0xbf/0x300 +[ 135.291158] ? _raw_spin_lock_irqsave+0x17/0x40 +[ 135.292192] new_sync_read+0x103/0x170 +[ 135.293014] vfs_read+0x15d/0x180 +[ 135.293745] ksys_read+0xa1/0xe0 +[ 135.294461] do_syscall_64+0x3c/0x80 +[ 135.295284] entry_SYSCALL_64_after_hwframe+0x46/0xb0 + +This patch simply adds an extra check in __ext4_ext_check(), verifying that +eh_entries is not 0 when eh_depth is > 0. 
+ +Link: https://bugzilla.kernel.org/show_bug.cgi?id=215941 +Link: https://bugzilla.kernel.org/show_bug.cgi?id=216283 +Cc: Baokun Li +Cc: stable@kernel.org +Signed-off-by: Luís Henriques +Reviewed-by: Jan Kara +Reviewed-by: Baokun Li +Link: https://lore.kernel.org/r/20220822094235.2690-1-lhenriques@suse.de +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -460,6 +460,10 @@ static int __ext4_ext_check(const char * + error_msg = "invalid eh_entries"; + goto corrupted; + } ++ if (unlikely((eh->eh_entries == 0) && (depth > 0))) { ++ error_msg = "eh_entries is 0 but eh_depth is > 0"; ++ goto corrupted; ++ } + if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { + error_msg = "invalid extent entries"; + goto corrupted; diff --git a/queue-5.19/ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch b/queue-5.19/ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch new file mode 100644 index 00000000000..c32b787ee74 --- /dev/null +++ b/queue-5.19/ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch @@ -0,0 +1,74 @@ +From 80fa46d6b9e7b1527bfd2197d75431fd9c382161 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Thu, 1 Sep 2022 18:03:14 -0400 +Subject: ext4: limit the number of retries after discarding preallocations blocks + +From: Theodore Ts'o + +commit 80fa46d6b9e7b1527bfd2197d75431fd9c382161 upstream. + +This patch avoids threads live-locking for hours when a large number of +threads are competing over the last few free extents as blocks +get added to and removed from preallocation pools. From our bug +reporter: + + A reliable way for triggering this has multiple writers + continuously write() to files when the filesystem is full, while + small amounts of space are freed (e.g. by truncating a large file + -1MiB at a time). 
In the local filesystem, this can be done by + simply not checking the return code of write (0) and/or the error + (ENOSPACE) that is set. Over NFS with an async mount, even clients + with proper error checking will behave this way since the linux NFS + client implementation will not propagate the server errors [the + write syscalls immediately return success] until the file handle is + closed. This leads to a situation where NFS clients send a + continuous stream of WRITE rpcs which result in ERRNOSPACE -- but + since the client isn't seeing this, the stream of writes continues + at maximum network speed. + + When some space does appear, multiple writers will all attempt to + claim it for their current write. For NFS, we may see dozens to + hundreds of threads that do this. + + The real-world scenario of this is database backup tooling (in + particular, github.com/mdkent/percona-xtrabackup) which may write + large files (>1TiB) to NFS for safe keeping. Some temporary files + are written, rewound, and read back -- all before closing the file + handle (the temp file is actually unlinked, to trigger automatic + deletion on close/crash.) An application like this operating on an + async NFS mount will not see an error code until TiB have been + written/read. + + The lockup was observed when running this database backup on large + filesystems (64 TiB in this case) with a high number of block + groups and no free space. Fragmentation is generally not a factor + in this filesystem (~thousands of large files, mostly contiguous + except for the parts written while the filesystem is at capacity.) 
+ +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -5559,6 +5559,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t + ext4_fsblk_t block = 0; + unsigned int inquota = 0; + unsigned int reserv_clstrs = 0; ++ int retries = 0; + u64 seq; + + might_sleep(); +@@ -5661,7 +5662,8 @@ repeat: + ar->len = ac->ac_b_ex.fe_len; + } + } else { +- if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) ++ if (++retries < 3 && ++ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) + goto repeat; + /* + * If block allocation fails then the pa allocated above diff --git a/queue-5.19/ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch b/queue-5.19/ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch new file mode 100644 index 00000000000..ebf9b2e4970 --- /dev/null +++ b/queue-5.19/ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch @@ -0,0 +1,89 @@ +From 4fca50d440cc5d4dc570ad5484cc0b70b381bc2a Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 8 Sep 2022 11:21:24 +0200 +Subject: ext4: make mballoc try target group first even with mb_optimize_scan + +From: Jan Kara + +commit 4fca50d440cc5d4dc570ad5484cc0b70b381bc2a upstream. + +One of the side-effects of mb_optimize_scan was that the optimized +functions to select next group to try were called even before we tried +the goal group. As a result we no longer allocate files close to +corresponding inodes as well as we don't try to expand currently +allocated extent in the same group. 
This results in reaim regression +with workfile.disk workload of up to 8% with many clients on my test +machine: + + baseline mb_optimize_scan +Hmean disk-1 2114.16 ( 0.00%) 2099.37 ( -0.70%) +Hmean disk-41 87794.43 ( 0.00%) 83787.47 * -4.56%* +Hmean disk-81 148170.73 ( 0.00%) 135527.05 * -8.53%* +Hmean disk-121 177506.11 ( 0.00%) 166284.93 * -6.32%* +Hmean disk-161 220951.51 ( 0.00%) 207563.39 * -6.06%* +Hmean disk-201 208722.74 ( 0.00%) 203235.59 ( -2.63%) +Hmean disk-241 222051.60 ( 0.00%) 217705.51 ( -1.96%) +Hmean disk-281 252244.17 ( 0.00%) 241132.72 * -4.41%* +Hmean disk-321 255844.84 ( 0.00%) 245412.84 * -4.08%* + +Also this is causing a huge regression (time increased by a factor of 5 or +so) when untarring an archive with lots of small files on some eMMC storage +cards. + +Fix the problem by making sure we try goal group first. + +Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") +CC: stable@kernel.org +Reported-and-tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Reviewed-by: Ritesh Harjani (IBM) +Link: https://lore.kernel.org/all/20220727105123.ckwrhbilzrxqpt24@quack3/ +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/r/20220908092136.11770-1-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1049,8 +1049,10 @@ static void ext4_mb_choose_next_group(st + { + *new_cr = ac->ac_criteria; + +- if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) ++ if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { ++ *group = next_linear_group(ac, *group, ngroups); + return; ++ } + + if (*new_cr == 0) { + ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); +@@ -2630,7 +2632,7 @@ static noinline_for_stack int + ext4_mb_regular_allocator(struct ext4_allocation_context 
*ac) + { + ext4_group_t prefetch_grp = 0, ngroups, group, i; +- int cr = -1; ++ int cr = -1, new_cr; + int err = 0, first_err = 0; + unsigned int nr = 0, prefetch_ios = 0; + struct ext4_sb_info *sbi; +@@ -2705,13 +2707,11 @@ repeat: + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; + prefetch_grp = group; + +- for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), +- i++) { +- int ret = 0, new_cr; ++ for (i = 0, new_cr = cr; i < ngroups; i++, ++ ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { ++ int ret = 0; + + cond_resched(); +- +- ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); + if (new_cr != cr) { + cr = new_cr; + goto repeat; diff --git a/queue-5.19/series b/queue-5.19/series index a8cffcf43ba..1ad4f6e0af8 100644 --- a/queue-5.19/series +++ b/queue-5.19/series @@ -196,3 +196,7 @@ drm-amdgpu-don-t-register-a-dirty-callback-for-non-a.patch certs-make-system-keyring-depend-on-built-in-x509-pa.patch makefile.debug-set-g-unconditional-on-config_debug_i.patch makefile.debug-re-enable-debug-info-for-.s-files.patch +devdax-fix-soft-reservation-memory-description.patch +ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch +ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch +ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch -- 2.47.3