]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.15-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 30 Jun 2022 11:22:31 +0000 (13:22 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 30 Jun 2022 11:22:31 +0000 (13:22 +0200)
added patches:
bcache-memset-on-stack-variables-in-bch_btree_check-and-bch_sectors_dirty_init.patch
xfs-check-sb_meta_uuid-for-dabuf-buffer-recovery.patch
xfs-fix-the-free-logic-of-state-in-xfs_attr_node_hasname.patch
xfs-only-bother-with-sync_filesystem-during-readonly-remount.patch
xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch
xfs-punch-out-data-fork-delalloc-blocks-on-cow-writeback-failure.patch
xfs-remove-all-cow-fork-extents-when-remounting-readonly.patch
xfs-use-kmem_cache_free-for-kmem_cache-objects.patch

queue-5.15/bcache-memset-on-stack-variables-in-bch_btree_check-and-bch_sectors_dirty_init.patch [new file with mode: 0644]
queue-5.15/series
queue-5.15/xfs-check-sb_meta_uuid-for-dabuf-buffer-recovery.patch [new file with mode: 0644]
queue-5.15/xfs-fix-the-free-logic-of-state-in-xfs_attr_node_hasname.patch [new file with mode: 0644]
queue-5.15/xfs-only-bother-with-sync_filesystem-during-readonly-remount.patch [new file with mode: 0644]
queue-5.15/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch [new file with mode: 0644]
queue-5.15/xfs-punch-out-data-fork-delalloc-blocks-on-cow-writeback-failure.patch [new file with mode: 0644]
queue-5.15/xfs-remove-all-cow-fork-extents-when-remounting-readonly.patch [new file with mode: 0644]
queue-5.15/xfs-use-kmem_cache_free-for-kmem_cache-objects.patch [new file with mode: 0644]

diff --git a/queue-5.15/bcache-memset-on-stack-variables-in-bch_btree_check-and-bch_sectors_dirty_init.patch b/queue-5.15/bcache-memset-on-stack-variables-in-bch_btree_check-and-bch_sectors_dirty_init.patch
new file mode 100644 (file)
index 0000000..510a4bc
--- /dev/null
@@ -0,0 +1,42 @@
+From 7d6b902ea0e02b2a25c480edf471cbaa4ebe6b3c Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Fri, 27 May 2022 23:28:16 +0800
+Subject: bcache: memset on stack variables in bch_btree_check() and bch_sectors_dirty_init()
+
+From: Coly Li <colyli@suse.de>
+
+commit 7d6b902ea0e02b2a25c480edf471cbaa4ebe6b3c upstream.
+
+The local variables check_state (in bch_btree_check()) and state (in
+bch_sectors_dirty_init()) should be fully filled by 0, because before
+allocating them on stack, they were dynamically allocated by kzalloc().
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Link: https://lore.kernel.org/r/20220527152818.27545-2-colyli@suse.de
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/bcache/btree.c     |    1 +
+ drivers/md/bcache/writeback.c |    1 +
+ 2 files changed, 2 insertions(+)
+
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -2017,6 +2017,7 @@ int bch_btree_check(struct cache_set *c)
+       if (c->root->level == 0)
+               return 0;
++      memset(&check_state, 0, sizeof(struct btree_check_state));
+       check_state.c = c;
+       check_state.total_threads = bch_btree_chkthread_nr();
+       check_state.key_idx = 0;
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -947,6 +947,7 @@ void bch_sectors_dirty_init(struct bcach
+               return;
+       }
++      memset(&state, 0, sizeof(struct bch_dirty_init_state));
+       state.c = c;
+       state.d = d;
+       state.total_threads = bch_btre_dirty_init_thread_nr();
index db6a4b5b72951a5f47ff016c952bdda050a2b53f..7f3afc2d7f9a0267abfa7657edd988ae8307af02 100644 (file)
@@ -1,3 +1,11 @@
 tick-nohz-unexport-__init-annotated-tick_nohz_full_setup.patch
 clocksource-drivers-ixp4xx-remove-__init-from-ixp4xx_timer_setup.patch
 x86-kvm-use-proper-asm-macros-for-kvm_vcpu_is_preempted.patch
+bcache-memset-on-stack-variables-in-bch_btree_check-and-bch_sectors_dirty_init.patch
+xfs-use-kmem_cache_free-for-kmem_cache-objects.patch
+xfs-punch-out-data-fork-delalloc-blocks-on-cow-writeback-failure.patch
+xfs-fix-the-free-logic-of-state-in-xfs_attr_node_hasname.patch
+xfs-remove-all-cow-fork-extents-when-remounting-readonly.patch
+xfs-check-sb_meta_uuid-for-dabuf-buffer-recovery.patch
+xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch
+xfs-only-bother-with-sync_filesystem-during-readonly-remount.patch
diff --git a/queue-5.15/xfs-check-sb_meta_uuid-for-dabuf-buffer-recovery.patch b/queue-5.15/xfs-check-sb_meta_uuid-for-dabuf-buffer-recovery.patch
new file mode 100644 (file)
index 0000000..d052e72
--- /dev/null
@@ -0,0 +1,85 @@
+From foo@baz Thu Jun 30 01:21:44 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 28 Jun 2022 11:39:49 -0700
+Subject: xfs: check sb_meta_uuid for dabuf buffer recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220628183951.3425528-6-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 09654ed8a18cfd45027a67d6cbca45c9ea54feab ]
+
+Got a report that a repeated crash test of a container host would
+eventually fail with a log recovery error preventing the system from
+mounting the root filesystem. It manifested as a directory leaf node
+corruption on writeback like so:
+
+ XFS (loop0): Mounting V5 Filesystem
+ XFS (loop0): Starting recovery (logdev: internal)
+ XFS (loop0): Metadata corruption detected at xfs_dir3_leaf_check_int+0x99/0xf0, xfs_dir3_leaf1 block 0x12faa158
+ XFS (loop0): Unmount and run xfs_repair
+ XFS (loop0): First 128 bytes of corrupted metadata buffer:
+ 00000000: 00 00 00 00 00 00 00 00 3d f1 00 00 e1 9e d5 8b  ........=.......
+ 00000010: 00 00 00 00 12 fa a1 58 00 00 00 29 00 00 1b cc  .......X...)....
+ 00000020: 91 06 78 ff f7 7e 4a 7d 8d 53 86 f2 ac 47 a8 23  ..x..~J}.S...G.#
+ 00000030: 00 00 00 00 17 e0 00 80 00 43 00 00 00 00 00 00  .........C......
+ 00000040: 00 00 00 2e 00 00 00 08 00 00 17 2e 00 00 00 0a  ................
+ 00000050: 02 35 79 83 00 00 00 30 04 d3 b4 80 00 00 01 50  .5y....0.......P
+ 00000060: 08 40 95 7f 00 00 02 98 08 41 fe b7 00 00 02 d4  .@.......A......
+ 00000070: 0d 62 ef a7 00 00 01 f2 14 50 21 41 00 00 00 0c  .b.......P!A....
+ XFS (loop0): Corruption of in-memory data (0x8) detected at xfs_do_force_shutdown+0x1a/0x20 (fs/xfs/xfs_buf.c:1514).  Shutting down.
+ XFS (loop0): Please unmount the filesystem and rectify the problem(s)
+ XFS (loop0): log mount/recovery failed: error -117
+ XFS (loop0): log mount failed
+
+Tracing indicated that we were recovering changes from a transaction
+at LSN 0x29/0x1c16 into a buffer that had an LSN of 0x29/0x1d57.
+That is, log recovery was overwriting a buffer with newer changes on
+disk than was in the transaction. Tracing indicated that we were
+hitting the "recovery immediately" case in
+xfs_buf_log_recovery_lsn(), and hence it was ignoring the LSN in the
+buffer.
+
+The code was extracting the LSN correctly, then ignoring it because
+the UUID in the buffer did not match the superblock UUID. The
+problem arises because the UUID check uses the wrong UUID - it
+should be checking the sb_meta_uuid, not sb_uuid. This filesystem
+has sb_uuid != sb_meta_uuid (which is fine), and the buffer has the
+correct matching sb_meta_uuid in it, it's just the code checked it
+against the wrong superblock uuid.
+
+There is no corruption in the filesystem, and failing to recover the
+buffer due to a write verifier failure means the recovery bug did
+not propagate the corruption to disk. Hence there is no corruption
+before or after this bug has manifested, the impact is limited
+simply to an unmountable filesystem....
+
+This was missed back in 2015 during an audit of incorrect sb_uuid
+usage that resulted in commit fcfbe2c4ef42 ("xfs: log recovery needs
+to validate against sb_meta_uuid") that fixed the magic32 buffers to
+validate against sb_meta_uuid instead of sb_uuid. It missed the
+magicda buffers....
+
+Fixes: ce748eaa65f2 ("xfs: create new metadata UUID field and incompat flag")
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item_recover.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_buf_item_recover.c
++++ b/fs/xfs/xfs_buf_item_recover.c
+@@ -816,7 +816,7 @@ xlog_recover_get_buf_lsn(
+       }
+       if (lsn != (xfs_lsn_t)-1) {
+-              if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
++              if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
+                       goto recover_immediately;
+               return lsn;
+       }
diff --git a/queue-5.15/xfs-fix-the-free-logic-of-state-in-xfs_attr_node_hasname.patch b/queue-5.15/xfs-fix-the-free-logic-of-state-in-xfs_attr_node_hasname.patch
new file mode 100644 (file)
index 0000000..1c3a5f3
--- /dev/null
@@ -0,0 +1,129 @@
+From foo@baz Thu Jun 30 01:21:44 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 28 Jun 2022 11:39:47 -0700
+Subject: xfs: Fix the free logic of state in xfs_attr_node_hasname
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, Yang Xu <xuyang2018.jy@fujitsu.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220628183951.3425528-4-leah.rumancik@gmail.com>
+
+From: Yang Xu <xuyang2018.jy@fujitsu.com>
+
+[ Upstream commit a1de97fe296c52eafc6590a3506f4bbd44ecb19a ]
+
+When running xfstests xfs/126 on the latest upstream kernel, it hangs on some machines.
+By adding a getxattr operation after the xattr is corrupted, I can reproduce it 100%.
+
+The deadlock is as below:
+[983.923403] task:setfattr        state:D stack:    0 pid:17639 ppid: 14687 flags:0x00000080
+[  983.923405] Call Trace:
+[  983.923410]  __schedule+0x2c4/0x700
+[  983.923412]  schedule+0x37/0xa0
+[  983.923414]  schedule_timeout+0x274/0x300
+[  983.923416]  __down+0x9b/0xf0
+[  983.923451]  ? xfs_buf_find.isra.29+0x3c8/0x5f0 [xfs]
+[  983.923453]  down+0x3b/0x50
+[  983.923471]  xfs_buf_lock+0x33/0xf0 [xfs]
+[  983.923490]  xfs_buf_find.isra.29+0x3c8/0x5f0 [xfs]
+[  983.923508]  xfs_buf_get_map+0x4c/0x320 [xfs]
+[  983.923525]  xfs_buf_read_map+0x53/0x310 [xfs]
+[  983.923541]  ? xfs_da_read_buf+0xcf/0x120 [xfs]
+[  983.923560]  xfs_trans_read_buf_map+0x1cf/0x360 [xfs]
+[  983.923575]  ? xfs_da_read_buf+0xcf/0x120 [xfs]
+[  983.923590]  xfs_da_read_buf+0xcf/0x120 [xfs]
+[  983.923606]  xfs_da3_node_read+0x1f/0x40 [xfs]
+[  983.923621]  xfs_da3_node_lookup_int+0x69/0x4a0 [xfs]
+[  983.923624]  ? kmem_cache_alloc+0x12e/0x270
+[  983.923637]  xfs_attr_node_hasname+0x6e/0xa0 [xfs]
+[  983.923651]  xfs_has_attr+0x6e/0xd0 [xfs]
+[  983.923664]  xfs_attr_set+0x273/0x320 [xfs]
+[  983.923683]  xfs_xattr_set+0x87/0xd0 [xfs]
+[  983.923686]  __vfs_removexattr+0x4d/0x60
+[  983.923688]  __vfs_removexattr_locked+0xac/0x130
+[  983.923689]  vfs_removexattr+0x4e/0xf0
+[  983.923690]  removexattr+0x4d/0x80
+[  983.923693]  ? __check_object_size+0xa8/0x16b
+[  983.923695]  ? strncpy_from_user+0x47/0x1a0
+[  983.923696]  ? getname_flags+0x6a/0x1e0
+[  983.923697]  ? _cond_resched+0x15/0x30
+[  983.923699]  ? __sb_start_write+0x1e/0x70
+[  983.923700]  ? mnt_want_write+0x28/0x50
+[  983.923701]  path_removexattr+0x9b/0xb0
+[  983.923702]  __x64_sys_removexattr+0x17/0x20
+[  983.923704]  do_syscall_64+0x5b/0x1a0
+[  983.923705]  entry_SYSCALL_64_after_hwframe+0x65/0xca
+[  983.923707] RIP: 0033:0x7f080f10ee1b
+
+When getxattr calls the xfs_attr_node_get function, xfs_da3_node_lookup_int fails with EFSCORRUPTED in
+xfs_attr_node_hasname because we have used blocktrash to randomize it in xfs/126. So it
+frees state internally and xfs_attr_node_get doesn't do the xfs_buf_trans release job.
+
+Then subsequent removexattr will hang because of it.
+
+This bug was introduced by kernel commit 07120f1abdff ("xfs: Add xfs_has_attr and subroutines").
+It adds xfs_attr_node_hasname helper and said caller will be responsible for freeing the state
+in this case. But xfs_attr_node_hasname will free state itself instead of caller if
+xfs_da3_node_lookup_int fails.
+
+Fix this bug by moving the step of freeing state into the caller.
+
+Also, use "goto error/out" instead of returning error directly in xfs_attr_node_addname_find_attr and
+xfs_attr_node_removename_setup function because we should free state ourselves.
+
+Fixes: 07120f1abdff ("xfs: Add xfs_has_attr and subroutines")
+Signed-off-by: Yang Xu <xuyang2018.jy@fujitsu.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_attr.c |   17 +++++++----------
+ 1 file changed, 7 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_attr.c
++++ b/fs/xfs/libxfs/xfs_attr.c
+@@ -1077,21 +1077,18 @@ xfs_attr_node_hasname(
+       state = xfs_da_state_alloc(args);
+       if (statep != NULL)
+-              *statep = NULL;
++              *statep = state;
+       /*
+        * Search to see if name exists, and get back a pointer to it.
+        */
+       error = xfs_da3_node_lookup_int(state, &retval);
+-      if (error) {
+-              xfs_da_state_free(state);
+-              return error;
+-      }
++      if (error)
++              retval = error;
+-      if (statep != NULL)
+-              *statep = state;
+-      else
++      if (!statep)
+               xfs_da_state_free(state);
++
+       return retval;
+ }
+@@ -1112,7 +1109,7 @@ xfs_attr_node_addname_find_attr(
+        */
+       retval = xfs_attr_node_hasname(args, &dac->da_state);
+       if (retval != -ENOATTR && retval != -EEXIST)
+-              return retval;
++              goto error;
+       if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
+               goto error;
+@@ -1337,7 +1334,7 @@ int xfs_attr_node_removename_setup(
+       error = xfs_attr_node_hasname(args, state);
+       if (error != -EEXIST)
+-              return error;
++              goto out;
+       error = 0;
+       ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
diff --git a/queue-5.15/xfs-only-bother-with-sync_filesystem-during-readonly-remount.patch b/queue-5.15/xfs-only-bother-with-sync_filesystem-during-readonly-remount.patch
new file mode 100644 (file)
index 0000000..ccb5c5f
--- /dev/null
@@ -0,0 +1,53 @@
+From foo@baz Thu Jun 30 01:21:44 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 28 Jun 2022 11:39:51 -0700
+Subject: xfs: only bother with sync_filesystem during readonly remount
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220628183951.3425528-8-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit b97cca3ba9098522e5a1c3388764ead42640c1a5 ]
+
+In commit 02b9984d6408, we pushed a sync_filesystem() call from the VFS
+into xfs_fs_remount.  The only time that we ever need to push dirty file
+data or metadata to disk for a remount is if we're remounting the
+filesystem read only, so this really could be moved to xfs_remount_ro.
+
+Once we've moved the call site, actually check the return value from
+sync_filesystem.
+
+Fixes: 02b9984d6408 ("fs: push sync_filesystem() down to the file system's remount_fs()")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_super.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1773,6 +1773,11 @@ xfs_remount_ro(
+       };
+       int                     error;
++      /* Flush all the dirty data to disk. */
++      error = sync_filesystem(mp->m_super);
++      if (error)
++              return error;
++
+       /*
+        * Cancel background eofb scanning so it cannot race with the final
+        * log force+buftarg wait and deadlock the remount.
+@@ -1851,8 +1856,6 @@ xfs_fs_reconfigure(
+       if (error)
+               return error;
+-      sync_filesystem(mp->m_super);
+-
+       /* inode32 -> inode64 */
+       if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
+               mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
diff --git a/queue-5.15/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch b/queue-5.15/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch
new file mode 100644 (file)
index 0000000..62c67f2
--- /dev/null
@@ -0,0 +1,155 @@
+From foo@baz Thu Jun 30 01:21:44 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 28 Jun 2022 11:39:50 -0700
+Subject: xfs: prevent UAF in xfs_log_item_in_current_chkpt
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220628183951.3425528-7-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit f8d92a66e810acbef6ddbc0bd0cbd9b117ce8acd ]
+
+While I was running with KASAN and lockdep enabled, I stumbled upon a
+KASAN report about a UAF to a freed CIL checkpoint.  Looking at the
+comment for xfs_log_item_in_current_chkpt, it seems pretty obvious to me
+that the original patch to xfs_defer_finish_noroll should have done
+something to lock the CIL to prevent it from switching the CIL contexts
+while the predicate runs.
+
+For upper level code that needs to know if a given log item is new
+enough not to need relogging, add a new wrapper that takes the CIL
+context lock long enough to sample the current CIL context.  This is
+kind of racy in that the CIL can switch the contexts immediately after
+sampling, but that's ok because the consequence is that the defer ops
+code is a little slow to relog items.
+
+ ==================================================================
+ BUG: KASAN: use-after-free in xfs_log_item_in_current_chkpt+0x139/0x160 [xfs]
+ Read of size 8 at addr ffff88804ea5f608 by task fsstress/527999
+
+ CPU: 1 PID: 527999 Comm: fsstress Tainted: G      D      5.16.0-rc4-xfsx #rc4
+ Call Trace:
+  <TASK>
+  dump_stack_lvl+0x45/0x59
+  print_address_description.constprop.0+0x1f/0x140
+  kasan_report.cold+0x83/0xdf
+  xfs_log_item_in_current_chkpt+0x139/0x160
+  xfs_defer_finish_noroll+0x3bb/0x1e30
+  __xfs_trans_commit+0x6c8/0xcf0
+  xfs_reflink_remap_extent+0x66f/0x10e0
+  xfs_reflink_remap_blocks+0x2dd/0xa90
+  xfs_file_remap_range+0x27b/0xc30
+  vfs_dedupe_file_range_one+0x368/0x420
+  vfs_dedupe_file_range+0x37c/0x5d0
+  do_vfs_ioctl+0x308/0x1260
+  __x64_sys_ioctl+0xa1/0x170
+  do_syscall_64+0x35/0x80
+  entry_SYSCALL_64_after_hwframe+0x44/0xae
+ RIP: 0033:0x7f2c71a2950b
+ Code: 0f 1e fa 48 8b 05 85 39 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff
+ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01
+f0 ff ff 73 01 c3 48 8b 0d 55 39 0d 00 f7 d8 64 89 01 48
+ RSP: 002b:00007ffe8c0e03c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+ RAX: ffffffffffffffda RBX: 00005600862a8740 RCX: 00007f2c71a2950b
+ RDX: 00005600862a7be0 RSI: 00000000c0189436 RDI: 0000000000000004
+ RBP: 000000000000000b R08: 0000000000000027 R09: 0000000000000003
+ R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000005a
+ R13: 00005600862804a8 R14: 0000000000016000 R15: 00005600862a8a20
+  </TASK>
+
+ Allocated by task 464064:
+  kasan_save_stack+0x1e/0x50
+  __kasan_kmalloc+0x81/0xa0
+  kmem_alloc+0xcd/0x2c0 [xfs]
+  xlog_cil_ctx_alloc+0x17/0x1e0 [xfs]
+  xlog_cil_push_work+0x141/0x13d0 [xfs]
+  process_one_work+0x7f6/0x1380
+  worker_thread+0x59d/0x1040
+  kthread+0x3b0/0x490
+  ret_from_fork+0x1f/0x30
+
+ Freed by task 51:
+  kasan_save_stack+0x1e/0x50
+  kasan_set_track+0x21/0x30
+  kasan_set_free_info+0x20/0x30
+  __kasan_slab_free+0xed/0x130
+  slab_free_freelist_hook+0x7f/0x160
+  kfree+0xde/0x340
+  xlog_cil_committed+0xbfd/0xfe0 [xfs]
+  xlog_cil_process_committed+0x103/0x1c0 [xfs]
+  xlog_state_do_callback+0x45d/0xbd0 [xfs]
+  xlog_ioend_work+0x116/0x1c0 [xfs]
+  process_one_work+0x7f6/0x1380
+  worker_thread+0x59d/0x1040
+  kthread+0x3b0/0x490
+  ret_from_fork+0x1f/0x30
+
+ Last potentially related work creation:
+  kasan_save_stack+0x1e/0x50
+  __kasan_record_aux_stack+0xb7/0xc0
+  insert_work+0x48/0x2e0
+  __queue_work+0x4e7/0xda0
+  queue_work_on+0x69/0x80
+  xlog_cil_push_now.isra.0+0x16b/0x210 [xfs]
+  xlog_cil_force_seq+0x1b7/0x850 [xfs]
+  xfs_log_force_seq+0x1c7/0x670 [xfs]
+  xfs_file_fsync+0x7c1/0xa60 [xfs]
+  __x64_sys_fsync+0x52/0x80
+  do_syscall_64+0x35/0x80
+  entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+ The buggy address belongs to the object at ffff88804ea5f600
+  which belongs to the cache kmalloc-256 of size 256
+ The buggy address is located 8 bytes inside of
+  256-byte region [ffff88804ea5f600, ffff88804ea5f700)
+ The buggy address belongs to the page:
+ page:ffffea00013a9780 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88804ea5ea00 pfn:0x4ea5e
+ head:ffffea00013a9780 order:1 compound_mapcount:0
+ flags: 0x4fff80000010200(slab|head|node=1|zone=1|lastcpupid=0xfff)
+ raw: 04fff80000010200 ffffea0001245908 ffffea00011bd388 ffff888004c42b40
+ raw: ffff88804ea5ea00 0000000000100009 00000001ffffffff 0000000000000000
+ page dumped because: kasan: bad access detected
+
+ Memory state around the buggy address:
+  ffff88804ea5f500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+  ffff88804ea5f580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ >ffff88804ea5f600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+                       ^
+  ffff88804ea5f680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+  ffff88804ea5f700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ==================================================================
+
+Fixes: 4e919af7827a ("xfs: periodically relog deferred intent items")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_cil.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_log_cil.c
++++ b/fs/xfs/xfs_log_cil.c
+@@ -1442,9 +1442,9 @@ out_shutdown:
+  */
+ bool
+ xfs_log_item_in_current_chkpt(
+-      struct xfs_log_item *lip)
++      struct xfs_log_item     *lip)
+ {
+-      struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
++      struct xfs_cil          *cil = lip->li_mountp->m_log->l_cilp;
+       if (list_empty(&lip->li_cil))
+               return false;
+@@ -1454,7 +1454,7 @@ xfs_log_item_in_current_chkpt(
+        * first checkpoint it is written to. Hence if it is different to the
+        * current sequence, we're in a new checkpoint.
+        */
+-      return lip->li_seq == ctx->sequence;
++      return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
+ }
+ /*
diff --git a/queue-5.15/xfs-punch-out-data-fork-delalloc-blocks-on-cow-writeback-failure.patch b/queue-5.15/xfs-punch-out-data-fork-delalloc-blocks-on-cow-writeback-failure.patch
new file mode 100644 (file)
index 0000000..8fe290a
--- /dev/null
@@ -0,0 +1,91 @@
+From foo@baz Thu Jun 30 01:21:44 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 28 Jun 2022 11:39:46 -0700
+Subject: xfs: punch out data fork delalloc blocks on COW writeback failure
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220628183951.3425528-3-leah.rumancik@gmail.com>
+
+From: Brian Foster <bfoster@redhat.com>
+
+[ Upstream commit 5ca5916b6bc93577c360c06cb7cdf71adb9b5faf ]
+
+If writeback I/O to a COW extent fails, the COW fork blocks are
+punched out and the data fork blocks left alone. It is possible for
+COW fork blocks to overlap non-shared data fork blocks (due to
+cowextsz hint prealloc), however, and writeback unconditionally maps
+to the COW fork whenever blocks exist at the corresponding offset of
+the page undergoing writeback. This means it's quite possible for a
+COW fork extent to overlap delalloc data fork blocks, writeback to
+convert and map to the COW fork blocks, writeback to fail, and
+finally for ioend completion to cancel the COW fork blocks and leave
+stale data fork delalloc blocks around in the inode. The blocks are
+effectively stale because writeback failure also discards dirty page
+state.
+
+If this occurs, it is likely to trigger assert failures, free space
+accounting corruption and failures in unrelated file operations. For
+example, a subsequent reflink attempt of the affected file to a new
+target file will trip over the stale delalloc in the source file and
+fail. Several of these issues are occasionally reproduced by
+generic/648, but are reproducible on demand with the right sequence
+of operations and timely I/O error injection.
+
+To fix this problem, update the ioend failure path to also punch out
+underlying data fork delalloc blocks on I/O error. This is analogous
+to the writeback submission failure path in xfs_discard_page() where
+we might fail to map data fork delalloc blocks and consistent with
+the successful COW writeback completion path, which is responsible
+for unmapping from the data fork and remapping in COW fork blocks.
+
+Fixes: 787eb485509f ("xfs: fix and streamline error handling in xfs_end_io")
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_aops.c |   15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -82,6 +82,7 @@ xfs_end_ioend(
+       struct iomap_ioend      *ioend)
+ {
+       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
++      struct xfs_mount        *mp = ip->i_mount;
+       xfs_off_t               offset = ioend->io_offset;
+       size_t                  size = ioend->io_size;
+       unsigned int            nofs_flag;
+@@ -97,18 +98,26 @@ xfs_end_ioend(
+       /*
+        * Just clean up the in-memory structures if the fs has been shut down.
+        */
+-      if (xfs_is_shutdown(ip->i_mount)) {
++      if (xfs_is_shutdown(mp)) {
+               error = -EIO;
+               goto done;
+       }
+       /*
+-       * Clean up any COW blocks on an I/O error.
++       * Clean up all COW blocks and underlying data fork delalloc blocks on
++       * I/O error. The delalloc punch is required because this ioend was
++       * mapped to blocks in the COW fork and the associated pages are no
++       * longer dirty. If we don't remove delalloc blocks here, they become
++       * stale and can corrupt free space accounting on unmount.
+        */
+       error = blk_status_to_errno(ioend->io_bio->bi_status);
+       if (unlikely(error)) {
+-              if (ioend->io_flags & IOMAP_F_SHARED)
++              if (ioend->io_flags & IOMAP_F_SHARED) {
+                       xfs_reflink_cancel_cow_range(ip, offset, size, true);
++                      xfs_bmap_punch_delalloc_range(ip,
++                                                    XFS_B_TO_FSBT(mp, offset),
++                                                    XFS_B_TO_FSB(mp, size));
++              }
+               goto done;
+       }
diff --git a/queue-5.15/xfs-remove-all-cow-fork-extents-when-remounting-readonly.patch b/queue-5.15/xfs-remove-all-cow-fork-extents-when-remounting-readonly.patch
new file mode 100644 (file)
index 0000000..01ff126
--- /dev/null
@@ -0,0 +1,89 @@
+From foo@baz Thu Jun 30 01:21:44 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 28 Jun 2022 11:39:48 -0700
+Subject: xfs: remove all COW fork extents when remounting readonly
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Chandan Babu R <chandan.babu@oracle.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220628183951.3425528-5-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 089558bc7ba785c03815a49c89e28ad9b8de51f9 ]
+
+As part of multiple customer escalations due to file data corruption
+after copy on write operations, I wrote some fstests that use fsstress
+to hammer on COW to shake things loose.  Regrettably, I caught some
+filesystem shutdowns due to incorrect rmap operations with the following
+loop:
+
+mount <filesystem>                             # (0)
+fsstress <run only readonly ops> &             # (1)
+while true; do
+       fsstress <run all ops>
+       mount -o remount,ro                     # (2)
+       fsstress <run only readonly ops>
+       mount -o remount,rw                     # (3)
+done
+
+When (2) happens, notice that (1) is still running.  xfs_remount_ro will
+call xfs_blockgc_stop to walk the inode cache to free all the COW
+extents, but the blockgc mechanism races with (1)'s reader threads to
+take IOLOCKs and loses, which means that it doesn't clean them all out.
+Call such a file (A).
+
+When (3) happens, xfs_remount_rw calls xfs_reflink_recover_cow, which
+walks the ondisk refcount btree and frees any COW extent that it finds.
+This function does not check the inode cache, which means that incore
+COW forks of inode (A) are now inconsistent with the ondisk metadata.  If
+one of those former COW extents is allocated and mapped into another
+file (B) and someone triggers a COW to the stale reservation in (A), A's
+dirty data will be written into (B) and once that's done, those blocks
+will be transferred to (A)'s data fork without bumping the refcount.
+
+The results are catastrophic -- file (B) and the refcount btree are now
+corrupt.  Solve this race by forcing the xfs_blockgc_free_space to run
+synchronously, which causes xfs_icwalk to return to inodes that were
+skipped because the blockgc code couldn't take the IOLOCK.  This is safe
+to do here because the VFS has already prohibited new writer threads.
+
+Fixes: 10ddf64e420f ("xfs: remove leftover CoW reservations when remounting ro")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Chandan Babu R <chandan.babu@oracle.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_super.c |   14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1768,7 +1768,10 @@ static int
+ xfs_remount_ro(
+       struct xfs_mount        *mp)
+ {
+-      int error;
++      struct xfs_icwalk       icw = {
++              .icw_flags      = XFS_ICWALK_FLAG_SYNC,
++      };
++      int                     error;
+       /*
+        * Cancel background eofb scanning so it cannot race with the final
+@@ -1776,8 +1779,13 @@ xfs_remount_ro(
+        */
+       xfs_blockgc_stop(mp);
+-      /* Get rid of any leftover CoW reservations... */
+-      error = xfs_blockgc_free_space(mp, NULL);
++      /*
++       * Clear out all remaining COW staging extents and speculative post-EOF
++       * preallocations so that we don't leave inodes requiring inactivation
++       * cleanups during reclaim on a read-only mount.  We must process every
++       * cached inode, so this requires a synchronous cache scan.
++       */
++      error = xfs_blockgc_free_space(mp, &icw);
+       if (error) {
+               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+               return error;
diff --git a/queue-5.15/xfs-use-kmem_cache_free-for-kmem_cache-objects.patch b/queue-5.15/xfs-use-kmem_cache_free-for-kmem_cache-objects.patch
new file mode 100644 (file)
index 0000000..a77d8c4
--- /dev/null
@@ -0,0 +1,65 @@
+From foo@baz Thu Jun 30 01:21:44 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 28 Jun 2022 11:39:45 -0700
+Subject: xfs: use kmem_cache_free() for kmem_cache objects
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, Rustam Kovhaev <rkovhaev@gmail.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220628183951.3425528-2-leah.rumancik@gmail.com>
+
+From: Rustam Kovhaev <rkovhaev@gmail.com>
+
+[ Upstream commit c30a0cbd07ecc0eec7b3cd568f7b1c7bb7913f93 ]
+
+For kmalloc() allocations SLOB prepends the blocks with a 4-byte header,
+and it puts the size of the allocated blocks in that header.
+Blocks allocated with kmem_cache_alloc() allocations do not have that
+header.
+
+SLOB explodes when you allocate memory with kmem_cache_alloc() and then
+try to free it with kfree() instead of kmem_cache_free().
+SLOB will assume that there is a header when there is none, read some
+garbage to size variable and corrupt the adjacent objects, which
+eventually leads to hang or panic.
+
+Let's make XFS work with SLOB by using proper free function.
+
+Fixes: 9749fee83f38 ("xfs: enable the xfs_defer mechanism to process extents to free")
+Signed-off-by: Rustam Kovhaev <rkovhaev@gmail.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_extfree_item.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_extfree_item.c
++++ b/fs/xfs/xfs_extfree_item.c
+@@ -482,7 +482,7 @@ xfs_extent_free_finish_item(
+                       free->xefi_startblock,
+                       free->xefi_blockcount,
+                       &free->xefi_oinfo, free->xefi_skip_discard);
+-      kmem_free(free);
++      kmem_cache_free(xfs_bmap_free_item_zone, free);
+       return error;
+ }
+@@ -502,7 +502,7 @@ xfs_extent_free_cancel_item(
+       struct xfs_extent_free_item     *free;
+       free = container_of(item, struct xfs_extent_free_item, xefi_list);
+-      kmem_free(free);
++      kmem_cache_free(xfs_bmap_free_item_zone, free);
+ }
+ const struct xfs_defer_op_type xfs_extent_free_defer_type = {
+@@ -564,7 +564,7 @@ xfs_agfl_free_finish_item(
+       extp->ext_len = free->xefi_blockcount;
+       efdp->efd_next_extent++;
+-      kmem_free(free);
++      kmem_cache_free(xfs_bmap_free_item_zone, free);
+       return error;
+ }