]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
xfs: catch stale AGF/AGF metadata
authorDave Chinner <dchinner@redhat.com>
Wed, 25 Jun 2025 22:48:55 +0000 (08:48 +1000)
committerCarlos Maiolino <cem@kernel.org>
Fri, 27 Jun 2025 12:13:34 +0000 (14:13 +0200)
There is a race condition that can trigger in dmflakey fstests that
can result in asserts in xfs_ialloc_read_agi() and
xfs_alloc_read_agf() firing. The asserts look like this:

 XFS: Assertion failed: pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks), file: fs/xfs/libxfs/xfs_alloc.c, line: 3440
.....
 Call Trace:
  <TASK>
  xfs_alloc_read_agf+0x2ad/0x3a0
  xfs_alloc_fix_freelist+0x280/0x720
  xfs_alloc_vextent_prepare_ag+0x42/0x120
  xfs_alloc_vextent_iterate_ags+0x67/0x260
  xfs_alloc_vextent_start_ag+0xe4/0x1c0
  xfs_bmapi_allocate+0x6fe/0xc90
  xfs_bmapi_convert_delalloc+0x338/0x560
  xfs_map_blocks+0x354/0x580
  iomap_writepages+0x52b/0xa70
  xfs_vm_writepages+0xd7/0x100
  do_writepages+0xe1/0x2c0
  __writeback_single_inode+0x44/0x340
  writeback_sb_inodes+0x2d0/0x570
  __writeback_inodes_wb+0x9c/0xf0
  wb_writeback+0x139/0x2d0
  wb_workfn+0x23e/0x4c0
  process_scheduled_works+0x1d4/0x400
  worker_thread+0x234/0x2e0
  kthread+0x147/0x170
  ret_from_fork+0x3e/0x50
  ret_from_fork_asm+0x1a/0x30

I've seen the AGI variant from scrub running on the filesysetm
after unmount failed due to systemd interference:

 XFS: Assertion failed: pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || xfs_is_shutdown(pag->pag_mount), file: fs/xfs/libxfs/xfs_ialloc.c, line: 2804
.....
 Call Trace:
  <TASK>
  xfs_ialloc_read_agi+0xee/0x150
  xchk_perag_drain_and_lock+0x7d/0x240
  xchk_ag_init+0x34/0x90
  xchk_inode_xref+0x7b/0x220
  xchk_inode+0x14d/0x180
  xfs_scrub_metadata+0x2e2/0x510
  xfs_ioc_scrub_metadata+0x62/0xb0
  xfs_file_ioctl+0x446/0xbf0
  __se_sys_ioctl+0x6f/0xc0
  __x64_sys_ioctl+0x1d/0x30
  x64_sys_call+0x1879/0x2ee0
  do_syscall_64+0x68/0x130
  ? exc_page_fault+0x62/0xc0
  entry_SYSCALL_64_after_hwframe+0x76/0x7e

Essentially, it is the same problem. When _flakey_drop_and_remount()
loads the drop-writes table, it makes all writes silently fail. Writes
are reported to the fs as completed successfully, but they are not
issued to the backing store. The filesystem sees the successful
write completion and marks the metadata buffer clean and removes it
from the AIL.

If this happens at the same time as memory pressure is occuring,
the now-clean AGF and/or AGI buffers can be reclaimed from memory.

Shortly afterwards, but before _flakey_drop_and_remount() runs
unmount, background writeback is kicked and it tries to allocate
blocks for the dirty pages in memory. This then tries to access the
AGF buffer we just turfed out of memory. It's not found, so it gets
read in from disk.

This is all fine, except for the fact that the last writeback of the
AGF did not actually reach disk. The AGF on disk is stale compared
to the in-memory state held by the perag, and so they don't match
and the assert fires.

Then other operations on that inode hang because the task was killed
whilst holding inode locks. e.g:

 Workqueue: xfs-conv/dm-12 xfs_end_io
 Call Trace:
  <TASK>
  __schedule+0x650/0xb10
  schedule+0x6d/0xf0
  schedule_preempt_disabled+0x15/0x30
  rwsem_down_write_slowpath+0x31a/0x5f0
  down_write+0x43/0x60
  xfs_ilock+0x1a8/0x210
  xfs_trans_alloc_inode+0x9c/0x240
  xfs_iomap_write_unwritten+0xe3/0x300
  xfs_end_ioend+0x90/0x130
  xfs_end_io+0xce/0x100
  process_scheduled_works+0x1d4/0x400
  worker_thread+0x234/0x2e0
  kthread+0x147/0x170
  ret_from_fork+0x3e/0x50
  ret_from_fork_asm+0x1a/0x30
  </TASK>

and it's all down hill from there.

Memory pressure is one way to trigger this, another is to run "echo
3 > /proc/sys/vm/drop_caches" randomly while tests are running.

Regardless of how it is triggered, this effectively takes down the
system once umount hangs because it's holding a sb->s_umount lock
exclusive and now every sync(1) call gets stuck on it.

Fix this by replacing the asserts with a corruption detection check
and a shutdown.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_ialloc.c

index 7839efe050bfa056d35933ec5c5f8c871bdc7b21..000cc7f4a3ce5085ab73fef86fb202ebafe59174 100644 (file)
@@ -3444,16 +3444,41 @@ xfs_alloc_read_agf(
 
                set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
        }
+
 #ifdef DEBUG
-       else if (!xfs_is_shutdown(mp)) {
-               ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
-               ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
-               ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
-               ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
-               ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
-               ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
+       /*
+        * It's possible for the AGF to be out of sync if the block device is
+        * silently dropping writes. This can happen in fstests with dmflakey
+        * enabled, which allows the buffer to be cleaned and reclaimed by
+        * memory pressure and then re-read from disk here. We will get a
+        * stale version of the AGF from disk, and nothing good can happen from
+        * here. Hence if we detect this situation, immediately shut down the
+        * filesystem.
+        *
+        * This can also happen if we are already in the middle of a forced
+        * shutdown, so don't bother checking if we are already shut down.
+        */
+       if (!xfs_is_shutdown(pag_mount(pag))) {
+               bool    ok = true;
+
+               ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
+               ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
+               ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks);
+               ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount);
+               ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest);
+               ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level);
+               ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level);
+
+               if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
+                       xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
+                       xfs_trans_brelse(tp, agfbp);
+                       xfs_force_shutdown(pag_mount(pag),
+                                       SHUTDOWN_CORRUPT_ONDISK);
+                       return -EFSCORRUPTED;
+               }
        }
-#endif
+#endif /* DEBUG */
+
        if (agfbpp)
                *agfbpp = agfbp;
        else
index 0c47b5c6ca7d99c4067ceb8ba22f2b4ddc89ad0f..750111634d9f7b82b953f606e13aa7169b0ce7f6 100644 (file)
@@ -2801,12 +2801,35 @@ xfs_ialloc_read_agi(
                set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
        }
 
+#ifdef DEBUG
        /*
-        * It's possible for these to be out of sync if
-        * we are in the middle of a forced shutdown.
+        * It's possible for the AGF to be out of sync if the block device is
+        * silently dropping writes. This can happen in fstests with dmflakey
+        * enabled, which allows the buffer to be cleaned and reclaimed by
+        * memory pressure and then re-read from disk here. We will get a
+        * stale version of the AGF from disk, and nothing good can happen from
+        * here. Hence if we detect this situation, immediately shut down the
+        * filesystem.
+        *
+        * This can also happen if we are already in the middle of a forced
+        * shutdown, so don't bother checking if we are already shut down.
         */
-       ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
-               xfs_is_shutdown(pag_mount(pag)));
+       if (!xfs_is_shutdown(pag_mount(pag))) {
+               bool    ok = true;
+
+               ok &= pag->pagi_freecount == be32_to_cpu(agi->agi_freecount);
+               ok &= pag->pagi_count == be32_to_cpu(agi->agi_count);
+
+               if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
+                       xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+                       xfs_trans_brelse(tp, agibp);
+                       xfs_force_shutdown(pag_mount(pag),
+                                       SHUTDOWN_CORRUPT_ONDISK);
+                       return -EFSCORRUPTED;
+               }
+       }
+#endif /* DEBUG */
+
        if (agibpp)
                *agibpp = agibp;
        else