git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.19-stable patches
author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 26 Sep 2022 06:34:38 +0000 (08:34 +0200)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 26 Sep 2022 06:34:38 +0000 (08:34 +0200)
added patches:
devdax-fix-soft-reservation-memory-description.patch
ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch
ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch
ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch

queue-5.19/devdax-fix-soft-reservation-memory-description.patch [new file with mode: 0644]
queue-5.19/ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch [new file with mode: 0644]
queue-5.19/ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch [new file with mode: 0644]
queue-5.19/ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch [new file with mode: 0644]
queue-5.19/series

diff --git a/queue-5.19/devdax-fix-soft-reservation-memory-description.patch b/queue-5.19/devdax-fix-soft-reservation-memory-description.patch
new file mode 100644 (file)
index 0000000..ca5964f
--- /dev/null
@@ -0,0 +1,58 @@
+From 67feaba413ec68daf4124e9870878899b4ed9a0e Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Fri, 23 Sep 2022 15:05:56 -0700
+Subject: devdax: Fix soft-reservation memory description
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 67feaba413ec68daf4124e9870878899b4ed9a0e upstream.
+
+The "hmem" platform-devices that are created to represent the
+platform-advertised "Soft Reserved" memory ranges end up inserting a
+resource that causes the iomem_resource tree to look like this:
+
+340000000-43fffffff : hmem.0
+  340000000-43fffffff : Soft Reserved
+    340000000-43fffffff : dax0.0
+
+This is because insert_resource() reparents ranges when they completely
+intersect an existing range.
+
+This matters because code that uses region_intersects() to scan for a
+given IORES_DESC will only check that top-level 'hmem.0' resource and
+not the 'Soft Reserved' descendant.
+
+So, to support EINJ (via einj_error_inject()) to inject errors into
+memory hosted by a dax-device, be sure to describe the memory as
+IORES_DESC_SOFT_RESERVED. This is a follow-on to:
+
+commit b13a3e5fd40b ("ACPI: APEI: Fix _EINJ vs EFI_MEMORY_SP")
+
+...that fixed EINJ support for "Soft Reserved" ranges in the first
+instance.
+
+Fixes: 262b45ae3ab4 ("x86/efi: EFI soft reservation to E820 enumeration")
+Reported-by: Ricardo Sandoval Torres <ricardo.sandoval.torres@intel.com>
+Tested-by: Ricardo Sandoval Torres <ricardo.sandoval.torres@intel.com>
+Cc: <stable@vger.kernel.org>
+Cc: Tony Luck <tony.luck@intel.com>
+Cc: Omar Avelar <omar.avelar@intel.com>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Mark Gross <markgross@kernel.org>
+Link: https://lore.kernel.org/r/166397075670.389916.7435722208896316387.stgit@dwillia2-xfh.jf.intel.com
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dax/hmem/device.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/dax/hmem/device.c
++++ b/drivers/dax/hmem/device.c
+@@ -15,6 +15,7 @@ void hmem_register_device(int target_nid
+               .start = r->start,
+               .end = r->end,
+               .flags = IORESOURCE_MEM,
++              .desc = IORES_DESC_SOFT_RESERVED,
+       };
+       struct platform_device *pdev;
+       struct memregion_info info;
diff --git a/queue-5.19/ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch b/queue-5.19/ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch
new file mode 100644 (file)
index 0000000..2436dc1
--- /dev/null
@@ -0,0 +1,85 @@
+From 29a5b8a137ac8eb410cc823653a29ac0e7b7e1b0 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= <lhenriques@suse.de>
+Date: Mon, 22 Aug 2022 10:42:35 +0100
+Subject: ext4: fix bug in extents parsing when eh_entries == 0 and eh_depth > 0
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Luís Henriques <lhenriques@suse.de>
+
+commit 29a5b8a137ac8eb410cc823653a29ac0e7b7e1b0 upstream.
+
+When walking through an inode's extents, the ext4_ext_binsearch_idx() function
+assumes that the extent header has been previously validated.  However, there
+are no checks that verify that the number of entries (eh->eh_entries) is
+non-zero when depth is > 0.  And this will lead to problems because the
+EXT_FIRST_INDEX() and EXT_LAST_INDEX() will return garbage and result in this:
+
+[  135.245946] ------------[ cut here ]------------
+[  135.247579] kernel BUG at fs/ext4/extents.c:2258!
+[  135.249045] invalid opcode: 0000 [#1] PREEMPT SMP
+[  135.250320] CPU: 2 PID: 238 Comm: tmp118 Not tainted 5.19.0-rc8+ #4
+[  135.252067] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b-rebuilt.opensuse.org 04/01/2014
+[  135.255065] RIP: 0010:ext4_ext_map_blocks+0xc20/0xcb0
+[  135.256475] Code:
+[  135.261433] RSP: 0018:ffffc900005939f8 EFLAGS: 00010246
+[  135.262847] RAX: 0000000000000024 RBX: ffffc90000593b70 RCX: 0000000000000023
+[  135.264765] RDX: ffff8880038e5f10 RSI: 0000000000000003 RDI: ffff8880046e922c
+[  135.266670] RBP: ffff8880046e9348 R08: 0000000000000001 R09: ffff888002ca580c
+[  135.268576] R10: 0000000000002602 R11: 0000000000000000 R12: 0000000000000024
+[  135.270477] R13: 0000000000000000 R14: 0000000000000024 R15: 0000000000000000
+[  135.272394] FS:  00007fdabdc56740(0000) GS:ffff88807dd00000(0000) knlGS:0000000000000000
+[  135.274510] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[  135.276075] CR2: 00007ffc26bd4f00 CR3: 0000000006261004 CR4: 0000000000170ea0
+[  135.277952] Call Trace:
+[  135.278635]  <TASK>
+[  135.279247]  ? preempt_count_add+0x6d/0xa0
+[  135.280358]  ? percpu_counter_add_batch+0x55/0xb0
+[  135.281612]  ? _raw_read_unlock+0x18/0x30
+[  135.282704]  ext4_map_blocks+0x294/0x5a0
+[  135.283745]  ? xa_load+0x6f/0xa0
+[  135.284562]  ext4_mpage_readpages+0x3d6/0x770
+[  135.285646]  read_pages+0x67/0x1d0
+[  135.286492]  ? folio_add_lru+0x51/0x80
+[  135.287441]  page_cache_ra_unbounded+0x124/0x170
+[  135.288510]  filemap_get_pages+0x23d/0x5a0
+[  135.289457]  ? path_openat+0xa72/0xdd0
+[  135.290332]  filemap_read+0xbf/0x300
+[  135.291158]  ? _raw_spin_lock_irqsave+0x17/0x40
+[  135.292192]  new_sync_read+0x103/0x170
+[  135.293014]  vfs_read+0x15d/0x180
+[  135.293745]  ksys_read+0xa1/0xe0
+[  135.294461]  do_syscall_64+0x3c/0x80
+[  135.295284]  entry_SYSCALL_64_after_hwframe+0x46/0xb0
+
+This patch simply adds an extra check in __ext4_ext_check(), verifying that
+eh_entries is not 0 when eh_depth is > 0.
+
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=215941
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=216283
+Cc: Baokun Li <libaokun1@huawei.com>
+Cc: stable@kernel.org
+Signed-off-by: Luís Henriques <lhenriques@suse.de>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Reviewed-by: Baokun Li <libaokun1@huawei.com>
+Link: https://lore.kernel.org/r/20220822094235.2690-1-lhenriques@suse.de
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/extents.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -460,6 +460,10 @@ static int __ext4_ext_check(const char *
+               error_msg = "invalid eh_entries";
+               goto corrupted;
+       }
++      if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
++              error_msg = "eh_entries is 0 but eh_depth is > 0";
++              goto corrupted;
++      }
+       if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
+               error_msg = "invalid extent entries";
+               goto corrupted;
diff --git a/queue-5.19/ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch b/queue-5.19/ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch
new file mode 100644 (file)
index 0000000..c32b787
--- /dev/null
@@ -0,0 +1,74 @@
+From 80fa46d6b9e7b1527bfd2197d75431fd9c382161 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Thu, 1 Sep 2022 18:03:14 -0400
+Subject: ext4: limit the number of retries after discarding preallocations blocks
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 80fa46d6b9e7b1527bfd2197d75431fd9c382161 upstream.
+
+This patch avoids threads live-locking for hours when a large number of
+threads are competing over the last few free extents as blocks are
+getting added and removed from preallocation pools.  From our bug
+reporter:
+
+   A reliable way for triggering this has multiple writers
+   continuously write() to files when the filesystem is full, while
+   small amounts of space are freed (e.g. by truncating a large file
+   -1MiB at a time). In the local filesystem, this can be done by
+   simply not checking the return code of write (0) and/or the error
+   (ENOSPC) that is set. Over NFS with an async mount, even clients
+   with proper error checking will behave this way since the linux NFS
+   client implementation will not propagate the server errors [the
+   write syscalls immediately return success] until the file handle is
+   closed. This leads to a situation where NFS clients send a
+   continuous stream of WRITE rpcs which result in ENOSPC -- but
+   since the client isn't seeing this, the stream of writes continues
+   at maximum network speed.
+
+   When some space does appear, multiple writers will all attempt to
+   claim it for their current write. For NFS, we may see dozens to
+   hundreds of threads that do this.
+
+   The real-world scenario of this is database backup tooling (in
+   particular, github.com/mdkent/percona-xtrabackup) which may write
+   large files (>1TiB) to NFS for safe keeping. Some temporary files
+   are written, rewound, and read back -- all before closing the file
+   handle (the temp file is actually unlinked, to trigger automatic
+   deletion on close/crash.) An application like this operating on an
+   async NFS mount will not see an error code until TiB have been
+   written/read.
+
+   The lockup was observed when running this database backup on large
+   filesystems (64 TiB in this case) with a high number of block
+   groups and no free space. Fragmentation is generally not a factor
+   in this filesystem (~thousands of large files, mostly contiguous
+   except for the parts written while the filesystem is at capacity.)
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/mballoc.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -5559,6 +5559,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t
+       ext4_fsblk_t block = 0;
+       unsigned int inquota = 0;
+       unsigned int reserv_clstrs = 0;
++      int retries = 0;
+       u64 seq;
+       might_sleep();
+@@ -5661,7 +5662,8 @@ repeat:
+                       ar->len = ac->ac_b_ex.fe_len;
+               }
+       } else {
+-              if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
++              if (++retries < 3 &&
++                  ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
+                       goto repeat;
+               /*
+                * If block allocation fails then the pa allocated above
diff --git a/queue-5.19/ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch b/queue-5.19/ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch
new file mode 100644 (file)
index 0000000..ebf9b2e
--- /dev/null
@@ -0,0 +1,89 @@
+From 4fca50d440cc5d4dc570ad5484cc0b70b381bc2a Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 8 Sep 2022 11:21:24 +0200
+Subject: ext4: make mballoc try target group first even with mb_optimize_scan
+
+From: Jan Kara <jack@suse.cz>
+
+commit 4fca50d440cc5d4dc570ad5484cc0b70b381bc2a upstream.
+
+One of the side-effects of mb_optimize_scan was that the optimized
+functions to select the next group to try were called even before we tried
+the goal group. As a result we no longer allocate files close to their
+corresponding inodes, nor do we try to expand the currently
+allocated extent in the same group. This results in a reaim regression
+with the workfile.disk workload of up to 8% with many clients on my test
+machine:
+
+                     baseline               mb_optimize_scan
+Hmean     disk-1       2114.16 (   0.00%)     2099.37 (  -0.70%)
+Hmean     disk-41     87794.43 (   0.00%)    83787.47 *  -4.56%*
+Hmean     disk-81    148170.73 (   0.00%)   135527.05 *  -8.53%*
+Hmean     disk-121   177506.11 (   0.00%)   166284.93 *  -6.32%*
+Hmean     disk-161   220951.51 (   0.00%)   207563.39 *  -6.06%*
+Hmean     disk-201   208722.74 (   0.00%)   203235.59 (  -2.63%)
+Hmean     disk-241   222051.60 (   0.00%)   217705.51 (  -1.96%)
+Hmean     disk-281   252244.17 (   0.00%)   241132.72 *  -4.41%*
+Hmean     disk-321   255844.84 (   0.00%)   245412.84 *  -4.08%*
+
+Also, this is causing a huge regression (time increased by a factor of 5 or
+so) when untarring an archive with lots of small files on some eMMC storage
+cards.
+
+Fix the problem by making sure we try goal group first.
+
+Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning")
+CC: stable@kernel.org
+Reported-and-tested-by: Stefan Wahren <stefan.wahren@i2se.com>
+Tested-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
+Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
+Link: https://lore.kernel.org/all/20220727105123.ckwrhbilzrxqpt24@quack3/
+Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/
+Signed-off-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/20220908092136.11770-1-jack@suse.cz
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/mballoc.c |   14 +++++++-------
+ 1 file changed, 7 insertions(+), 7 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -1049,8 +1049,10 @@ static void ext4_mb_choose_next_group(st
+ {
+       *new_cr = ac->ac_criteria;
+-      if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining)
++      if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
++              *group = next_linear_group(ac, *group, ngroups);
+               return;
++      }
+       if (*new_cr == 0) {
+               ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
+@@ -2630,7 +2632,7 @@ static noinline_for_stack int
+ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+ {
+       ext4_group_t prefetch_grp = 0, ngroups, group, i;
+-      int cr = -1;
++      int cr = -1, new_cr;
+       int err = 0, first_err = 0;
+       unsigned int nr = 0, prefetch_ios = 0;
+       struct ext4_sb_info *sbi;
+@@ -2705,13 +2707,11 @@ repeat:
+               ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
+               prefetch_grp = group;
+-              for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups),
+-                           i++) {
+-                      int ret = 0, new_cr;
++              for (i = 0, new_cr = cr; i < ngroups; i++,
++                   ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
++                      int ret = 0;
+                       cond_resched();
+-
+-                      ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups);
+                       if (new_cr != cr) {
+                               cr = new_cr;
+                               goto repeat;
index a8cffcf43ba3d802badcd0c6299f6255fb2cb531..1ad4f6e0af8819f30766805c711beed03d921673 100644 (file)
@@ -196,3 +196,7 @@ drm-amdgpu-don-t-register-a-dirty-callback-for-non-a.patch
 certs-make-system-keyring-depend-on-built-in-x509-pa.patch
 makefile.debug-set-g-unconditional-on-config_debug_i.patch
 makefile.debug-re-enable-debug-info-for-.s-files.patch
+devdax-fix-soft-reservation-memory-description.patch
+ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch
+ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch
+ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch