From: Greg Kroah-Hartman Date: Mon, 1 Aug 2022 08:41:30 +0000 (+0200) Subject: 5.10-stable patches X-Git-Tag: v5.4.209~13 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=7ba39eba26f385bd41d168b4c6fda0bf8d1d33a0;p=thirdparty%2Fkernel%2Fstable-queue.git 5.10-stable patches added patches: docs-kernel-parameters-update-descriptions-for-mitigations-param-with-retbleed.patch edac-ghes-set-the-dimm-label-unconditionally.patch xfs-enforce-attr3-buffer-recovery-order.patch xfs-fix-log-intent-recovery-enospc-shutdowns-when-inactivating-inodes.patch xfs-force-the-log-offline-when-log-intent-item-recovery-fails.patch xfs-hold-buffer-across-unpin-and-potential-shutdown-processing.patch xfs-logging-the-on-disk-inode-lsn-can-make-it-go-backwards.patch xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch xfs-refactor-xfs_file_fsync.patch xfs-remove-dead-stale-buf-unpin-handling-code.patch xfs-xfs_log_force_lsn-isn-t-passed-a-lsn.patch --- diff --git a/queue-5.10/docs-kernel-parameters-update-descriptions-for-mitigations-param-with-retbleed.patch b/queue-5.10/docs-kernel-parameters-update-descriptions-for-mitigations-param-with-retbleed.patch new file mode 100644 index 00000000000..548dccc488f --- /dev/null +++ b/queue-5.10/docs-kernel-parameters-update-descriptions-for-mitigations-param-with-retbleed.patch @@ -0,0 +1,39 @@ +From ea304a8b89fd0d6cf94ee30cb139dc23d9f1a62f Mon Sep 17 00:00:00 2001 +From: Eiichi Tsukata +Date: Thu, 28 Jul 2022 04:39:07 +0000 +Subject: docs/kernel-parameters: Update descriptions for "mitigations=" param with retbleed + +From: Eiichi Tsukata + +commit ea304a8b89fd0d6cf94ee30cb139dc23d9f1a62f upstream. + +Updates descriptions for "mitigations=off" and "mitigations=auto,nosmt" +with the respective retbleed= settings. + +Signed-off-by: Eiichi Tsukata +Signed-off-by: Borislav Petkov +Cc: corbet@lwn.net +Link: https://lore.kernel.org/r/20220728043907.165688-1-eiichi.tsukata@nutanix.com +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/admin-guide/kernel-parameters.txt | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -2873,6 +2873,7 @@ + no_entry_flush [PPC] + no_uaccess_flush [PPC] + mmio_stale_data=off [X86] ++ retbleed=off [X86] + + Exceptions: + This does not have any effect on +@@ -2895,6 +2896,7 @@ + mds=full,nosmt [X86] + tsx_async_abort=full,nosmt [X86] + mmio_stale_data=full,nosmt [X86] ++ retbleed=auto,nosmt [X86] + + mminit_loglevel= + [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this diff --git a/queue-5.10/edac-ghes-set-the-dimm-label-unconditionally.patch b/queue-5.10/edac-ghes-set-the-dimm-label-unconditionally.patch new file mode 100644 index 00000000000..45ad9e43fc3 --- /dev/null +++ b/queue-5.10/edac-ghes-set-the-dimm-label-unconditionally.patch @@ -0,0 +1,108 @@ +From 5e2805d5379619c4a2e3ae4994e73b36439f4bad Mon Sep 17 00:00:00 2001 +From: Toshi Kani +Date: Thu, 21 Jul 2022 12:05:03 -0600 +Subject: EDAC/ghes: Set the DIMM label unconditionally + +From: Toshi Kani + +commit 5e2805d5379619c4a2e3ae4994e73b36439f4bad upstream. + +The commit + + cb51a371d08e ("EDAC/ghes: Setup DIMM label from DMI and use it in error reports") + +enforced that both the bank and device strings passed to +dimm_setup_label() are not NULL. + +However, there are BIOSes, for example on a + + HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/15/2019 + +which don't populate both strings: + + Handle 0x0020, DMI type 17, 84 bytes + Memory Device + Array Handle: 0x0013 + Error Information Handle: Not Provided + Total Width: 72 bits + Data Width: 64 bits + Size: 32 GB + Form Factor: DIMM + Set: None + Locator: PROC 1 DIMM 1 <===== device + Bank Locator: Not Specified <===== bank + +This results in a buffer overflow because ghes_edac_register() calls +strlen() on an uninitialized label, which had non-zero values left over +from krealloc_array(): + + detected buffer overflow in __fortify_strlen + ------------[ cut here ]------------ + kernel BUG at lib/string_helpers.c:983! + invalid opcode: 0000 [#1] PREEMPT SMP NOPTI + CPU: 1 PID: 1 Comm: swapper/0 Tainted: G I 5.18.6-200.fc36.x86_64 #1 + Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/15/2019 + RIP: 0010:fortify_panic + ... + Call Trace: + + ghes_edac_register.cold + ghes_probe + platform_probe + really_probe + __driver_probe_device + driver_probe_device + __driver_attach + ? __device_attach_driver + bus_for_each_dev + bus_add_driver + driver_register + acpi_ghes_init + acpi_init + ? acpi_sleep_proc_init + do_one_initcall + +The label contains garbage because the commit in Fixes reallocs the +DIMMs array while scanning the system but doesn't clear the newly +allocated memory. + +Change dimm_setup_label() to always initialize the label to fix the +issue. Set it to the empty string in case BIOS does not provide both +bank and device so that ghes_edac_register() can keep the default label +given by edac_mc_alloc_dimms(). + + [ bp: Rewrite commit message. ] + +Fixes: b9cae27728d1f ("EDAC/ghes: Scan the system once on driver init") +Co-developed-by: Robert Richter +Signed-off-by: Robert Richter +Signed-off-by: Toshi Kani +Signed-off-by: Borislav Petkov +Tested-by: Robert Elliott +Cc: +Link: https://lore.kernel.org/r/20220719220124.760359-1-toshi.kani@hpe.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/edac/ghes_edac.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/drivers/edac/ghes_edac.c ++++ b/drivers/edac/ghes_edac.c +@@ -101,9 +101,14 @@ static void dimm_setup_label(struct dimm + + dmi_memdev_name(handle, &bank, &device); + +- /* both strings must be non-zero */ +- if (bank && *bank && device && *device) +- snprintf(dimm->label, sizeof(dimm->label), "%s %s", bank, device); ++ /* ++ * Set to a NULL string when both bank and device are zero. In this case, ++ * the label assigned by default will be preserved. ++ */ ++ snprintf(dimm->label, sizeof(dimm->label), "%s%s%s", ++ (bank && *bank) ? bank : "", ++ (bank && *bank && device && *device) ? " " : "", ++ (device && *device) ? device : ""); + } + + static void assign_dmi_dimm_info(struct dimm_info *dimm, struct memdev_dmi_entry *entry) diff --git a/queue-5.10/series b/queue-5.10/series index fd90c7c4453..3c068f2020f 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -48,3 +48,14 @@ arm-crypto-comment-out-gcc-warning-that-breaks-clang-builds.patch page_alloc-fix-invalid-watermark-check-on-a-negative-value.patch mt7601u-add-usb-device-id-for-some-versions-of-xiaodu-wifi-dongle.patch arm-9216-1-fix-max_dma_address-overflow.patch +edac-ghes-set-the-dimm-label-unconditionally.patch +docs-kernel-parameters-update-descriptions-for-mitigations-param-with-retbleed.patch +xfs-refactor-xfs_file_fsync.patch +xfs-xfs_log_force_lsn-isn-t-passed-a-lsn.patch +xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch +xfs-fix-log-intent-recovery-enospc-shutdowns-when-inactivating-inodes.patch +xfs-force-the-log-offline-when-log-intent-item-recovery-fails.patch +xfs-hold-buffer-across-unpin-and-potential-shutdown-processing.patch +xfs-remove-dead-stale-buf-unpin-handling-code.patch +xfs-logging-the-on-disk-inode-lsn-can-make-it-go-backwards.patch +xfs-enforce-attr3-buffer-recovery-order.patch diff --git a/queue-5.10/xfs-enforce-attr3-buffer-recovery-order.patch b/queue-5.10/xfs-enforce-attr3-buffer-recovery-order.patch new file mode 100644 index 00000000000..06d903f84ed --- /dev/null +++ b/queue-5.10/xfs-enforce-attr3-buffer-recovery-order.patch @@ -0,0 +1,102 @@ +From foo@baz Mon Aug 1 10:39:45 AM CEST 2022 +From: Amir Goldstein +Date: Fri, 29 Jul 2022 18:16:09 +0200 +Subject: xfs: Enforce attr3 buffer recovery order +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Luis Chamberlain , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner , Christoph Hellwig +Message-ID: <20220729161609.4071252-10-amir73il@gmail.com> + +From: Dave Chinner + +commit d8f4c2d0398fa1d92cacf854daf80d21a46bfefc upstream. + +>From the department of "WTAF? How did we miss that!?"... + +When we are recovering a buffer, the first thing we do is check the +buffer magic number and extract the LSN from the buffer. If the LSN +is older than the current LSN, we replay the modification to it. If +the metadata on disk is newer than the transaction in the log, we +skip it. This is a fundamental v5 filesystem metadata recovery +behaviour. + +generic/482 failed with an attribute writeback failure during log +recovery. The write verifier caught the corruption before it got +written to disk, and the attr buffer dump looked like: + +XFS (dm-3): Metadata corruption detected at xfs_attr3_leaf_verify+0x275/0x2e0, xfs_attr3_leaf block 0x19be8 +XFS (dm-3): Unmount and run xfs_repair +XFS (dm-3): First 128 bytes of corrupted metadata buffer: +00000000: 00 00 00 00 00 00 00 00 3b ee 00 00 4d 2a 01 e1 ........;...M*.. +00000010: 00 00 00 00 00 01 9b e8 00 00 00 01 00 00 05 38 ...............8 + ^^^^^^^^^^^^^^^^^^^^^^^ +00000020: df 39 5e 51 58 ac 44 b6 8d c5 e7 10 44 09 bc 17 .9^QX.D.....D... +00000030: 00 00 00 00 00 02 00 83 00 03 00 cc 0f 24 01 00 .............$.. +00000040: 00 68 0e bc 0f c8 00 10 00 00 00 00 00 00 00 00 .h.............. +00000050: 00 00 3c 31 0f 24 01 00 00 00 3c 32 0f 88 01 00 ..<1.$....<2.... +00000060: 00 00 3c 33 0f d8 01 00 00 00 00 00 00 00 00 00 ..<3............ +00000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +..... + +The highlighted bytes are the LSN that was replayed into the +buffer: 0x100000538. This is cycle 1, block 0x538. Prior to replay, +that block on disk looks like this: + +$ sudo xfs_db -c "fsb 0x417d" -c "type attr3" -c p /dev/mapper/thin-vol +hdr.info.hdr.forw = 0 +hdr.info.hdr.back = 0 +hdr.info.hdr.magic = 0x3bee +hdr.info.crc = 0xb5af0bc6 (correct) +hdr.info.bno = 105448 +hdr.info.lsn = 0x100000900 + ^^^^^^^^^^^ +hdr.info.uuid = df395e51-58ac-44b6-8dc5-e7104409bc17 +hdr.info.owner = 131203 +hdr.count = 2 +hdr.usedbytes = 120 +hdr.firstused = 3796 +hdr.holes = 1 +hdr.freemap[0-2] = [base,size] + +Note the LSN stamped into the buffer on disk: 1/0x900. The version +on disk is much newer than the log transaction that was being +replayed. That's a bug, and should -never- happen. + +So I immediately went to look at xlog_recover_get_buf_lsn() to check +that we handled the LSN correctly. I was wondering if there was a +similar "two commits with the same start LSN skips the second +replay" problem with buffers. I didn't get that far, because I found +a much more basic, rudimentary bug: xlog_recover_get_buf_lsn() +doesn't recognise buffers with XFS_ATTR3_LEAF_MAGIC set in them!!! + +IOWs, attr3 leaf buffers fall through the magic number checks +unrecognised, so trigger the "recover immediately" behaviour instead +of undergoing an LSN check. IOWs, we incorrectly replay ATTR3 leaf +buffers and that causes silent on disk corruption of inode attribute +forks and potentially other things.... + +Git history shows this is *another* zero day bug, this time +introduced in commit 50d5c8d8e938 ("xfs: check LSN ordering for v5 +superblocks during recovery") which failed to handle the attr3 leaf +buffers in recovery. And we've failed to handle them ever since... + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item_recover.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/xfs/xfs_buf_item_recover.c ++++ b/fs/xfs/xfs_buf_item_recover.c +@@ -796,6 +796,7 @@ xlog_recover_get_buf_lsn( + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: ++ case XFS_ATTR3_LEAF_MAGIC: + case XFS_DA3_NODE_MAGIC: + lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; diff --git a/queue-5.10/xfs-fix-log-intent-recovery-enospc-shutdowns-when-inactivating-inodes.patch b/queue-5.10/xfs-fix-log-intent-recovery-enospc-shutdowns-when-inactivating-inodes.patch new file mode 100644 index 00000000000..6351969de23 --- /dev/null +++ b/queue-5.10/xfs-fix-log-intent-recovery-enospc-shutdowns-when-inactivating-inodes.patch @@ -0,0 +1,60 @@ +From foo@baz Mon Aug 1 10:39:45 AM CEST 2022 +From: Amir Goldstein +Date: Fri, 29 Jul 2022 18:16:04 +0200 +Subject: xfs: fix log intent recovery ENOSPC shutdowns when inactivating inodes +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Luis Chamberlain , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Christoph Hellwig +Message-ID: <20220729161609.4071252-5-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit 81ed94751b1513fcc5978dcc06eb1f5b4e55a785 upstream. + +During regular operation, the xfs_inactive operations create +transactions with zero block reservation because in general we're +freeing space, not asking for more. The per-AG space reservations +created at mount time enable us to handle expansions of the refcount +btree without needing to reserve blocks to the transaction. + +Unfortunately, log recovery doesn't create the per-AG space reservations +when intent items are being recovered. This isn't an issue for intent +item recovery itself because they explicitly request blocks, but any +inode inactivation that can happen during log recovery uses the same +xfs_inactive paths as regular runtime. If a refcount btree expansion +happens, the transaction will fail due to blk_res_used > blk_res, and we +shut down the filesystem unnecessarily. + +Fix this problem by making per-AG reservations temporarily so that we +can handle the inactivations, and releasing them at the end. This +brings the recovery environment closer to the runtime environment. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_mount.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -968,9 +968,17 @@ xfs_mountfs( + /* + * Finish recovering the file system. This part needed to be delayed + * until after the root and real-time bitmap inodes were consistently +- * read in. ++ * read in. Temporarily create per-AG space reservations for metadata ++ * btree shape changes because space freeing transactions (for inode ++ * inactivation) require the per-AG reservation in lieu of reserving ++ * blocks. + */ ++ error = xfs_fs_reserve_ag_blocks(mp); ++ if (error && error == -ENOSPC) ++ xfs_warn(mp, ++ "ENOSPC reserving per-AG metadata pool, log recovery may fail."); + error = xfs_log_mount_finish(mp); ++ xfs_fs_unreserve_ag_blocks(mp); + if (error) { + xfs_warn(mp, "log mount finish failed"); + goto out_rtunmount; diff --git a/queue-5.10/xfs-force-the-log-offline-when-log-intent-item-recovery-fails.patch b/queue-5.10/xfs-force-the-log-offline-when-log-intent-item-recovery-fails.patch new file mode 100644 index 00000000000..09b03c8ed41 --- /dev/null +++ b/queue-5.10/xfs-force-the-log-offline-when-log-intent-item-recovery-fails.patch @@ -0,0 +1,68 @@ +From foo@baz Mon Aug 1 10:39:45 AM CEST 2022 +From: Amir Goldstein +Date: Fri, 29 Jul 2022 18:16:05 +0200 +Subject: xfs: force the log offline when log intent item recovery fails +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Luis Chamberlain , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Christoph Hellwig +Message-ID: <20220729161609.4071252-6-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit 4e6b8270c820c8c57a73f869799a0af2b56eff3e upstream. + +If any part of log intent item recovery fails, we should shut down the +log immediately to stop the log from writing a clean unmount record to +disk, because the metadata is not consistent. The inability to cancel a +dirty transaction catches most of these cases, but there are a few +things that have slipped through the cracks, such as ENOSPC from a +transaction allocation, or runtime errors that result in cancellation of +a non-dirty transaction. + +This solves some weird behaviors reported by customers where a system +goes down, the first mount fails, the second succeeds, but then the fs +goes down later because of inconsistent metadata. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log.c | 3 +++ + fs/xfs/xfs_log_recover.c | 5 ++++- + 2 files changed, 7 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -765,6 +765,9 @@ xfs_log_mount_finish( + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; + ++ /* Make sure the log is dead if we're returning failure. */ ++ ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR)); ++ + return error; + } + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -2457,8 +2457,10 @@ xlog_finish_defer_ops( + + error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, + dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); +- if (error) ++ if (error) { ++ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + return error; ++ } + + /* + * Transfer to this new transaction all the dfops we captured +@@ -3454,6 +3456,7 @@ xlog_recover_finish( + * this) before we get around to xfs_log_mount_cancel. + */ + xlog_recover_cancel_intents(log); ++ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xfs_alert(log->l_mp, "Failed to recover intents"); + return error; + } diff --git a/queue-5.10/xfs-hold-buffer-across-unpin-and-potential-shutdown-processing.patch b/queue-5.10/xfs-hold-buffer-across-unpin-and-potential-shutdown-processing.patch new file mode 100644 index 00000000000..2ed0baa5920 --- /dev/null +++ b/queue-5.10/xfs-hold-buffer-across-unpin-and-potential-shutdown-processing.patch @@ -0,0 +1,123 @@ +From foo@baz Mon Aug 1 10:39:45 AM CEST 2022 +From: Amir Goldstein +Date: Fri, 29 Jul 2022 18:16:06 +0200 +Subject: xfs: hold buffer across unpin and potential shutdown processing +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Luis Chamberlain , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Brian Foster +Message-ID: <20220729161609.4071252-7-amir73il@gmail.com> + +From: Brian Foster + +commit 84d8949e770745b16a7e8a68dcb1d0f3687bdee9 upstream. + +The special processing used to simulate a buffer I/O failure on fs +shutdown has a difficult to reproduce race that can result in a use +after free of the associated buffer. Consider a buffer that has been +committed to the on-disk log and thus is AIL resident. The buffer +lands on the writeback delwri queue, but is subsequently locked, +committed and pinned by another transaction before submitted for +I/O. At this point, the buffer is stuck on the delwri queue as it +cannot be submitted for I/O until it is unpinned. A log checkpoint +I/O failure occurs sometime later, which aborts the bli. The unpin +handler is called with the aborted log item, drops the bli reference +count, the pin count, and falls into the I/O failure simulation +path. + +The potential problem here is that once the pin count falls to zero +in ->iop_unpin(), xfsaild is free to retry delwri submission of the +buffer at any time, before the unpin handler even completes. If +delwri queue submission wins the race to the buffer lock, it +observes the shutdown state and simulates the I/O failure itself. +This releases both the bli and delwri queue holds and frees the +buffer while xfs_buf_item_unpin() sits on xfs_buf_lock() waiting to +run through the same failure sequence. This problem is rare and +requires many iterations of fstest generic/019 (which simulates disk +I/O failures) to reproduce. + +To avoid this problem, grab a hold on the buffer before the log item +is unpinned if the associated item has been aborted and will require +a simulated I/O failure. The hold is already required for the +simulated I/O failure, so the ordering simply guarantees the unpin +handler access to the buffer before it is unpinned and thus +processed by the AIL. This particular ordering is required so long +as the AIL does not acquire a reference on the bli, which is the +long term solution to this problem. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item.c | 37 +++++++++++++++++++++---------------- + 1 file changed, 21 insertions(+), 16 deletions(-) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -393,17 +393,8 @@ xfs_buf_item_pin( + } + + /* +- * This is called to unpin the buffer associated with the buf log +- * item which was previously pinned with a call to xfs_buf_item_pin(). +- * +- * Also drop the reference to the buf item for the current transaction. +- * If the XFS_BLI_STALE flag is set and we are the last reference, +- * then free up the buf log item and unlock the buffer. +- * +- * If the remove flag is set we are called from uncommit in the +- * forced-shutdown path. If that is true and the reference count on +- * the log item is going to drop to zero we need to free the item's +- * descriptor in the transaction. ++ * This is called to unpin the buffer associated with the buf log item which ++ * was previously pinned with a call to xfs_buf_item_pin(). + */ + STATIC void + xfs_buf_item_unpin( +@@ -420,12 +411,26 @@ xfs_buf_item_unpin( + + trace_xfs_buf_item_unpin(bip); + ++ /* ++ * Drop the bli ref associated with the pin and grab the hold required ++ * for the I/O simulation failure in the abort case. We have to do this ++ * before the pin count drops because the AIL doesn't acquire a bli ++ * reference. Therefore if the refcount drops to zero, the bli could ++ * still be AIL resident and the buffer submitted for I/O (and freed on ++ * completion) at any point before we return. This can be removed once ++ * the AIL properly holds a reference on the bli. ++ */ + freed = atomic_dec_and_test(&bip->bli_refcount); +- ++ if (freed && !stale && remove) ++ xfs_buf_hold(bp); + if (atomic_dec_and_test(&bp->b_pin_count)) + wake_up_all(&bp->b_waiters); + +- if (freed && stale) { ++ /* nothing to do but drop the pin count if the bli is active */ ++ if (!freed) ++ return; ++ ++ if (stale) { + ASSERT(bip->bli_flags & XFS_BLI_STALE); + ASSERT(xfs_buf_islocked(bp)); + ASSERT(bp->b_flags & XBF_STALE); +@@ -468,13 +473,13 @@ xfs_buf_item_unpin( + ASSERT(bp->b_log_item == NULL); + } + xfs_buf_relse(bp); +- } else if (freed && remove) { ++ } else if (remove) { + /* + * The buffer must be locked and held by the caller to simulate +- * an async I/O failure. ++ * an async I/O failure. We acquired the hold for this case ++ * before the buffer was unpinned. + */ + xfs_buf_lock(bp); +- xfs_buf_hold(bp); + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioend_fail(bp); + } diff --git a/queue-5.10/xfs-logging-the-on-disk-inode-lsn-can-make-it-go-backwards.patch b/queue-5.10/xfs-logging-the-on-disk-inode-lsn-can-make-it-go-backwards.patch new file mode 100644 index 00000000000..1f0d8d4c564 --- /dev/null +++ b/queue-5.10/xfs-logging-the-on-disk-inode-lsn-can-make-it-go-backwards.patch @@ -0,0 +1,242 @@ +From foo@baz Mon Aug 1 10:39:45 AM CEST 2022 +From: Amir Goldstein +Date: Fri, 29 Jul 2022 18:16:08 +0200 +Subject: xfs: logging the on disk inode LSN can make it go backwards +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Luis Chamberlain , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner +Message-ID: <20220729161609.4071252-9-amir73il@gmail.com> + +From: Dave Chinner + +commit 32baa63d82ee3f5ab3bd51bae6bf7d1c15aed8c7 upstream. + +When we log an inode, we format the "log inode" core and set an LSN +in that inode core. We do that via xfs_inode_item_format_core(), +which calls: + + xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn); + +to format the log inode. It writes the LSN from the inode item into +the log inode, and if recovery decides the inode item needs to be +replayed, it recovers the log inode LSN field and writes it into the +on disk inode LSN field. + +Now this might seem like a reasonable thing to do, but it is wrong +on multiple levels. Firstly, if the item is not yet in the AIL, +item->li_lsn is zero. i.e. the first time the inode it is logged and +formatted, the LSN we write into the log inode will be zero. If we +only log it once, recovery will run and can write this zero LSN into +the inode. + +This means that the next time the inode is logged and log recovery +runs, it will *always* replay changes to the inode regardless of +whether the inode is newer on disk than the version in the log and +that violates the entire purpose of recording the LSN in the inode +at writeback time (i.e. to stop it going backwards in time on disk +during recovery). + +Secondly, if we commit the CIL to the journal so the inode item +moves to the AIL, and then relog the inode, the LSN that gets +stamped into the log inode will be the LSN of the inode's current +location in the AIL, not it's age on disk. And it's not the LSN that +will be associated with the current change. That means when log +recovery replays this inode item, the LSN that ends up on disk is +the LSN for the previous changes in the log, not the current +changes being replayed. IOWs, after recovery the LSN on disk is not +in sync with the LSN of the modifications that were replayed into +the inode. This, again, violates the recovery ordering semantics +that on-disk writeback LSNs provide. + +Hence the inode LSN in the log dinode is -always- invalid. + +Thirdly, recovery actually has the LSN of the log transaction it is +replaying right at hand - it uses it to determine if it should +replay the inode by comparing it to the on-disk inode's LSN. But it +doesn't use that LSN to stamp the LSN into the inode which will be +written back when the transaction is fully replayed. It uses the one +in the log dinode, which we know is always going to be incorrect. + +Looking back at the change history, the inode logging was broken by +commit 93f958f9c41f ("xfs: cull unnecessary icdinode fields") way +back in 2016 by a stupid idiot who thought he knew how this code +worked. i.e. me. That commit replaced an in memory di_lsn field that +was updated only at inode writeback time from the inode item.li_lsn +value - and hence always contained the same LSN that appeared in the +on-disk inode - with a read of the inode item LSN at inode format +time. CLearly these are not the same thing. + +Before 93f958f9c41f, the log recovery behaviour was irrelevant, +because the LSN in the log inode always matched the on-disk LSN at +the time the inode was logged, hence recovery of the transaction +would never make the on-disk LSN in the inode go backwards or get +out of sync. + +A symptom of the problem is this, caught from a failure of +generic/482. Before log recovery, the inode has been allocated but +never used: + +xfs_db> inode 393388 +xfs_db> p +core.magic = 0x494e +core.mode = 0 +.... +v3.crc = 0x99126961 (correct) +v3.change_count = 0 +v3.lsn = 0 +v3.flags2 = 0 +v3.cowextsize = 0 +v3.crtime.sec = Thu Jan 1 10:00:00 1970 +v3.crtime.nsec = 0 + +After log recovery: + +xfs_db> p +core.magic = 0x494e +core.mode = 020444 +.... +v3.crc = 0x23e68f23 (correct) +v3.change_count = 2 +v3.lsn = 0 +v3.flags2 = 0 +v3.cowextsize = 0 +v3.crtime.sec = Thu Jul 22 17:03:03 2021 +v3.crtime.nsec = 751000000 +... + +You can see that the LSN of the on-disk inode is 0, even though it +clearly has been written to disk. I point out this inode, because +the generic/482 failure occurred because several adjacent inodes in +this specific inode cluster were not replayed correctly and still +appeared to be zero on disk when all the other metadata (inobt, +finobt, directories, etc) indicated they should be allocated and +written back. + +The fix for this is two-fold. The first is that we need to either +revert the LSN changes in 93f958f9c41f or stop logging the inode LSN +altogether. If we do the former, log recovery does not need to +change but we add 8 bytes of memory per inode to store what is +largely a write-only inode field. If we do the latter, log recovery +needs to stamp the on-disk inode in the same manner that inode +writeback does. + +I prefer the latter, because we shouldn't really be trying to log +and replay changes to the on disk LSN as the on-disk value is the +canonical source of the on-disk version of the inode. It also +matches the way we recover buffer items - we create a buf_log_item +that carries the current recovery transaction LSN that gets stamped +into the buffer by the write verifier when it gets written back +when the transaction is fully recovered. + +However, this might break log recovery on older kernels even more, +so I'm going to simply ignore the logged value in recovery and stamp +the on-disk inode with the LSN of the transaction being recovered +that will trigger writeback on transaction recovery completion. This +will ensure that the on-disk inode LSN always reflects the LSN of +the last change that was written to disk, regardless of whether it +comes from log recovery or runtime writeback. + +Fixes: 93f958f9c41f ("xfs: cull unnecessary icdinode fields") +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_log_format.h | 11 ++++++++++- + fs/xfs/xfs_inode_item_recover.c | 39 +++++++++++++++++++++++++++++---------- + 2 files changed, 39 insertions(+), 11 deletions(-) + +--- a/fs/xfs/libxfs/xfs_log_format.h ++++ b/fs/xfs/libxfs/xfs_log_format.h +@@ -414,7 +414,16 @@ struct xfs_log_dinode { + /* start of the extended dinode, writable fields */ + uint32_t di_crc; /* CRC of the inode */ + uint64_t di_changecount; /* number of attribute changes */ +- xfs_lsn_t di_lsn; /* flush sequence */ ++ ++ /* ++ * The LSN we write to this field during formatting is not a reflection ++ * of the current on-disk LSN. It should never be used for recovery ++ * sequencing, nor should it be recovered into the on-disk inode at all. ++ * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk() ++ * for details. ++ */ ++ xfs_lsn_t di_lsn; ++ + uint64_t di_flags2; /* more random flags */ + uint32_t di_cowextsize; /* basic cow extent size for file */ + uint8_t di_pad2[12]; /* more padding for future expansion */ +--- a/fs/xfs/xfs_inode_item_recover.c ++++ b/fs/xfs/xfs_inode_item_recover.c +@@ -145,7 +145,8 @@ xfs_log_dinode_to_disk_ts( + STATIC void + xfs_log_dinode_to_disk( + struct xfs_log_dinode *from, +- struct xfs_dinode *to) ++ struct xfs_dinode *to, ++ xfs_lsn_t lsn) + { + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); +@@ -182,7 +183,7 @@ xfs_log_dinode_to_disk( + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); + to->di_ino = cpu_to_be64(from->di_ino); +- to->di_lsn = cpu_to_be64(from->di_lsn); ++ to->di_lsn = cpu_to_be64(lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; +@@ -261,16 +262,25 @@ xlog_recover_inode_commit_pass2( + } + + /* +- * If the inode has an LSN in it, recover the inode only if it's less +- * than the lsn of the transaction we are replaying. Note: we still +- * need to replay an owner change even though the inode is more recent +- * than the transaction as there is no guarantee that all the btree +- * blocks are more recent than this transaction, too. ++ * If the inode has an LSN in it, recover the inode only if the on-disk ++ * inode's LSN is older than the lsn of the transaction we are ++ * replaying. We can have multiple checkpoints with the same start LSN, ++ * so the current LSN being equal to the on-disk LSN doesn't necessarily ++ * mean that the on-disk inode is more recent than the change being ++ * replayed. ++ * ++ * We must check the current_lsn against the on-disk inode ++ * here because the we can't trust the log dinode to contain a valid LSN ++ * (see comment below before replaying the log dinode for details). ++ * ++ * Note: we still need to replay an owner change even though the inode ++ * is more recent than the transaction as there is no guarantee that all ++ * the btree blocks are more recent than this transaction, too. + */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + +- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { ++ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_owner_change; +@@ -368,8 +378,17 @@ xlog_recover_inode_commit_pass2( + goto out_release; + } + +- /* recover the log dinode inode into the on disk inode */ +- xfs_log_dinode_to_disk(ldip, dip); ++ /* ++ * Recover the log dinode inode into the on disk inode. ++ * ++ * The LSN in the log dinode is garbage - it can be zero or reflect ++ * stale in-memory runtime state that isn't coherent with the changes ++ * logged in this transaction or the changes written to the on-disk ++ * inode. Hence we write the current lSN into the inode because that ++ * matches what xfs_iflush() would write inode the inode when flushing ++ * the changes in this transaction. ++ */ ++ xfs_log_dinode_to_disk(ldip, dip, current_lsn); + + fields = in_f->ilf_fields; + if (fields & XFS_ILOG_DEV) diff --git a/queue-5.10/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch b/queue-5.10/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch new file mode 100644 index 00000000000..c645ea6f1e8 --- /dev/null +++ b/queue-5.10/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch @@ -0,0 +1,155 @@ +From foo@baz Mon Aug 1 10:39:45 AM CEST 2022 +From: Amir Goldstein +Date: Fri, 29 Jul 2022 18:16:03 +0200 +Subject: xfs: prevent UAF in xfs_log_item_in_current_chkpt +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Luis Chamberlain , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner +Message-ID: <20220729161609.4071252-4-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit f8d92a66e810acbef6ddbc0bd0cbd9b117ce8acd upstream. + +While I was running with KASAN and lockdep enabled, I stumbled upon an +KASAN report about a UAF to a freed CIL checkpoint. Looking at the +comment for xfs_log_item_in_current_chkpt, it seems pretty obvious to me +that the original patch to xfs_defer_finish_noroll should have done +something to lock the CIL to prevent it from switching the CIL contexts +while the predicate runs. + +For upper level code that needs to know if a given log item is new +enough not to need relogging, add a new wrapper that takes the CIL +context lock long enough to sample the current CIL context. This is +kind of racy in that the CIL can switch the contexts immediately after +sampling, but that's ok because the consequence is that the defer ops +code is a little slow to relog items. + + ================================================================== + BUG: KASAN: use-after-free in xfs_log_item_in_current_chkpt+0x139/0x160 [xfs] + Read of size 8 at addr ffff88804ea5f608 by task fsstress/527999 + + CPU: 1 PID: 527999 Comm: fsstress Tainted: G D 5.16.0-rc4-xfsx #rc4 + Call Trace: + + dump_stack_lvl+0x45/0x59 + print_address_description.constprop.0+0x1f/0x140 + kasan_report.cold+0x83/0xdf + xfs_log_item_in_current_chkpt+0x139/0x160 + xfs_defer_finish_noroll+0x3bb/0x1e30 + __xfs_trans_commit+0x6c8/0xcf0 + xfs_reflink_remap_extent+0x66f/0x10e0 + xfs_reflink_remap_blocks+0x2dd/0xa90 + xfs_file_remap_range+0x27b/0xc30 + vfs_dedupe_file_range_one+0x368/0x420 + vfs_dedupe_file_range+0x37c/0x5d0 + do_vfs_ioctl+0x308/0x1260 + __x64_sys_ioctl+0xa1/0x170 + do_syscall_64+0x35/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + RIP: 0033:0x7f2c71a2950b + Code: 0f 1e fa 48 8b 05 85 39 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff +ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 +f0 ff ff 73 01 c3 48 8b 0d 55 39 0d 00 f7 d8 64 89 01 48 + RSP: 002b:00007ffe8c0e03c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 + RAX: ffffffffffffffda RBX: 00005600862a8740 RCX: 00007f2c71a2950b + RDX: 00005600862a7be0 RSI: 00000000c0189436 RDI: 0000000000000004 + RBP: 000000000000000b R08: 0000000000000027 R09: 0000000000000003 + R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000005a + R13: 00005600862804a8 R14: 0000000000016000 R15: 00005600862a8a20 + + + Allocated by task 464064: + kasan_save_stack+0x1e/0x50 + __kasan_kmalloc+0x81/0xa0 + kmem_alloc+0xcd/0x2c0 [xfs] + xlog_cil_ctx_alloc+0x17/0x1e0 [xfs] + xlog_cil_push_work+0x141/0x13d0 [xfs] + process_one_work+0x7f6/0x1380 + worker_thread+0x59d/0x1040 + kthread+0x3b0/0x490 + ret_from_fork+0x1f/0x30 + + Freed by task 51: + kasan_save_stack+0x1e/0x50 + kasan_set_track+0x21/0x30 + kasan_set_free_info+0x20/0x30 + __kasan_slab_free+0xed/0x130 + slab_free_freelist_hook+0x7f/0x160 + kfree+0xde/0x340 + xlog_cil_committed+0xbfd/0xfe0 [xfs] + xlog_cil_process_committed+0x103/0x1c0 [xfs] + xlog_state_do_callback+0x45d/0xbd0 [xfs] + xlog_ioend_work+0x116/0x1c0 [xfs] + process_one_work+0x7f6/0x1380 + worker_thread+0x59d/0x1040 + kthread+0x3b0/0x490 + ret_from_fork+0x1f/0x30 + + Last potentially related work creation: + kasan_save_stack+0x1e/0x50 + __kasan_record_aux_stack+0xb7/0xc0 + insert_work+0x48/0x2e0 + __queue_work+0x4e7/0xda0 + queue_work_on+0x69/0x80 + xlog_cil_push_now.isra.0+0x16b/0x210 [xfs] + xlog_cil_force_seq+0x1b7/0x850 [xfs] + xfs_log_force_seq+0x1c7/0x670 [xfs] + xfs_file_fsync+0x7c1/0xa60 [xfs] + __x64_sys_fsync+0x52/0x80 + do_syscall_64+0x35/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + + The buggy address belongs to the object at ffff88804ea5f600 + which belongs to the cache kmalloc-256 of size 256 + The buggy address is located 8 bytes inside of + 256-byte region [ffff88804ea5f600, ffff88804ea5f700) + The buggy address belongs to the page: + page:ffffea00013a9780 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88804ea5ea00 pfn:0x4ea5e + head:ffffea00013a9780 order:1 compound_mapcount:0 + flags: 0x4fff80000010200(slab|head|node=1|zone=1|lastcpupid=0xfff) + raw: 04fff80000010200 ffffea0001245908 ffffea00011bd388 ffff888004c42b40 + raw: ffff88804ea5ea00 0000000000100009 00000001ffffffff 0000000000000000 + page dumped because: kasan: bad access detected + + Memory state around the buggy address: + ffff88804ea5f500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff88804ea5f580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + >ffff88804ea5f600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ^ + ffff88804ea5f680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ffff88804ea5f700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ================================================================== + +Fixes: 4e919af7827a ("xfs: periodically relog deferred intent items") +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_cil.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_log_cil.c ++++ b/fs/xfs/xfs_log_cil.c +@@ -1179,9 +1179,9 @@ out_shutdown: + */ + bool + xfs_log_item_in_current_chkpt( +- struct xfs_log_item *lip) ++ struct xfs_log_item *lip) + { +- struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; ++ struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp; + + if (list_empty(&lip->li_cil)) + return false; +@@ -1191,7 +1191,7 @@ xfs_log_item_in_current_chkpt( + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ +- return lip->li_seq == ctx->sequence; ++ return lip->li_seq == READ_ONCE(cil->xc_current_sequence); + } + + /* diff --git a/queue-5.10/xfs-refactor-xfs_file_fsync.patch b/queue-5.10/xfs-refactor-xfs_file_fsync.patch new file mode 100644 index 00000000000..1829738ab3b --- /dev/null +++ b/queue-5.10/xfs-refactor-xfs_file_fsync.patch @@ -0,0 +1,136 @@ +From foo@baz Mon Aug 1 10:39:45 AM CEST 2022 +From: Amir Goldstein +Date: Fri, 29 Jul 2022 18:16:01 +0200 +Subject: xfs: refactor xfs_file_fsync +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Luis Chamberlain , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Christoph Hellwig , Brian Foster , Dave Chinner +Message-ID: <20220729161609.4071252-2-amir73il@gmail.com> + +From: Christoph Hellwig + +commit f22c7f87777361f94aa17f746fbadfa499248dc8 upstream. + +[backported for dependency] + +Factor out the log syncing logic into two helpers to make the code easier +to read and more maintainable. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_file.c | 81 +++++++++++++++++++++++++++++++++--------------------- + 1 file changed, 50 insertions(+), 31 deletions(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -118,6 +118,54 @@ xfs_dir_fsync( + return xfs_log_force_inode(ip); + } + ++static xfs_lsn_t ++xfs_fsync_lsn( ++ struct xfs_inode *ip, ++ bool datasync) ++{ ++ if (!xfs_ipincount(ip)) ++ return 0; ++ if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) ++ return 0; ++ return ip->i_itemp->ili_last_lsn; ++} ++ ++/* ++ * All metadata updates are logged, which means that we just have to flush the ++ * log up to the latest LSN that touched the inode. ++ * ++ * If we have concurrent fsync/fdatasync() calls, we need them to all block on ++ * the log force before we clear the ili_fsync_fields field. This ensures that ++ * we don't get a racing sync operation that does not wait for the metadata to ++ * hit the journal before returning. If we race with clearing ili_fsync_fields, ++ * then all that will happen is the log force will do nothing as the lsn will ++ * already be on disk. We can't race with setting ili_fsync_fields because that ++ * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock ++ * shared until after the ili_fsync_fields is cleared. ++ */ ++static int ++xfs_fsync_flush_log( ++ struct xfs_inode *ip, ++ bool datasync, ++ int *log_flushed) ++{ ++ int error = 0; ++ xfs_lsn_t lsn; ++ ++ xfs_ilock(ip, XFS_ILOCK_SHARED); ++ lsn = xfs_fsync_lsn(ip, datasync); ++ if (lsn) { ++ error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, ++ log_flushed); ++ ++ spin_lock(&ip->i_itemp->ili_lock); ++ ip->i_itemp->ili_fsync_fields = 0; ++ spin_unlock(&ip->i_itemp->ili_lock); ++ } ++ xfs_iunlock(ip, XFS_ILOCK_SHARED); ++ return error; ++} ++ + STATIC int + xfs_file_fsync( + struct file *file, +@@ -125,13 +173,10 @@ xfs_file_fsync( + loff_t end, + int datasync) + { +- struct inode *inode = file->f_mapping->host; +- struct xfs_inode *ip = XFS_I(inode); +- struct xfs_inode_log_item *iip = ip->i_itemp; ++ struct xfs_inode *ip = XFS_I(file->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + int error = 0; + int log_flushed = 0; +- xfs_lsn_t lsn = 0; + + trace_xfs_file_fsync(ip); + +@@ -155,33 +200,7 @@ xfs_file_fsync( + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + +- /* +- * All metadata updates are logged, which means that we just have to +- * flush the log up to the latest LSN that touched the inode. If we have +- * concurrent fsync/fdatasync() calls, we need them to all block on the +- * log force before we clear the ili_fsync_fields field. This ensures +- * that we don't get a racing sync operation that does not wait for the +- * metadata to hit the journal before returning. If we race with +- * clearing the ili_fsync_fields, then all that will happen is the log +- * force will do nothing as the lsn will already be on disk. We can't +- * race with setting ili_fsync_fields because that is done under +- * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared +- * until after the ili_fsync_fields is cleared. +- */ +- xfs_ilock(ip, XFS_ILOCK_SHARED); +- if (xfs_ipincount(ip)) { +- if (!datasync || +- (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) +- lsn = iip->ili_last_lsn; +- } +- +- if (lsn) { +- error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); +- spin_lock(&iip->ili_lock); +- iip->ili_fsync_fields = 0; +- spin_unlock(&iip->ili_lock); +- } +- xfs_iunlock(ip, XFS_ILOCK_SHARED); ++ error = xfs_fsync_flush_log(ip, datasync, &log_flushed); + + /* + * If we only have a single device, and the log force about was diff --git a/queue-5.10/xfs-remove-dead-stale-buf-unpin-handling-code.patch b/queue-5.10/xfs-remove-dead-stale-buf-unpin-handling-code.patch new file mode 100644 index 00000000000..a9e75d8d684 --- /dev/null +++ b/queue-5.10/xfs-remove-dead-stale-buf-unpin-handling-code.patch @@ -0,0 +1,74 @@ +From foo@baz Mon Aug 1 10:39:45 AM CEST 2022 +From: Amir Goldstein +Date: Fri, 29 Jul 2022 18:16:07 +0200 +Subject: xfs: remove dead stale buf unpin handling code +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Luis Chamberlain , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Brian Foster +Message-ID: <20220729161609.4071252-8-amir73il@gmail.com> + +From: Brian Foster + +commit e53d3aa0b605c49d780e1b2fd0b49dba4154f32b upstream. + +This code goes back to a time when transaction commits wrote +directly to iclogs. The associated log items were pinned, written to +the log, and then "uncommitted" if some part of the log write had +failed. This uncommit sequence called an ->iop_unpin_remove() +handler that was eventually folded into ->iop_unpin() via the remove +parameter. The log subsystem has since changed significantly in that +transactions commit to the CIL instead of direct to iclogs, though +log items must still be aborted in the event of an eventual log I/O +error. However, the context for a log item abort is now asynchronous +from transaction commit, which means the committing transaction has +been freed by this point in time and the transaction uncommit +sequence of events is no longer relevant. + +Further, since stale buffers remain locked at transaction commit +through unpin, we can be certain that the buffer is not associated +with any transaction when the unpin callback executes. Remove this +unused hunk of code and replace it with an assertion that the buffer +is disassociated from transaction context. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item.c | 21 ++------------------- + 1 file changed, 2 insertions(+), 19 deletions(-) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -435,28 +435,11 @@ xfs_buf_item_unpin( + ASSERT(xfs_buf_islocked(bp)); + ASSERT(bp->b_flags & XBF_STALE); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ++ ASSERT(list_empty(&lip->li_trans)); ++ ASSERT(!bp->b_transp); + + trace_xfs_buf_item_unpin_stale(bip); + +- if (remove) { +- /* +- * If we are in a transaction context, we have to +- * remove the log item from the transaction as we are +- * about to release our reference to the buffer. If we +- * don't, the unlock that occurs later in +- * xfs_trans_uncommit() will try to reference the +- * buffer which we no longer have a hold on. +- */ +- if (!list_empty(&lip->li_trans)) +- xfs_trans_del_item(lip); +- +- /* +- * Since the transaction no longer refers to the buffer, +- * the buffer should no longer refer to the transaction. +- */ +- bp->b_transp = NULL; +- } +- + /* + * If we get called here because of an IO error, we may or may + * not have the item on the AIL. xfs_trans_ail_delete() will diff --git a/queue-5.10/xfs-xfs_log_force_lsn-isn-t-passed-a-lsn.patch b/queue-5.10/xfs-xfs_log_force_lsn-isn-t-passed-a-lsn.patch new file mode 100644 index 00000000000..9eaa03990f4 --- /dev/null +++ b/queue-5.10/xfs-xfs_log_force_lsn-isn-t-passed-a-lsn.patch @@ -0,0 +1,472 @@ +From foo@baz Mon Aug 1 10:39:45 AM CEST 2022 +From: Amir Goldstein +Date: Fri, 29 Jul 2022 18:16:02 +0200 +Subject: xfs: xfs_log_force_lsn isn't passed a LSN +To: Greg Kroah-Hartman +Cc: Sasha Levin , "Darrick J . Wong" , Leah Rumancik , Chandan Babu R , Luis Chamberlain , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner , Brian Foster , Allison Henderson +Message-ID: <20220729161609.4071252-3-amir73il@gmail.com> + +From: Dave Chinner + +commit 5f9b4b0de8dc2fb8eb655463b438001c111570fe upstream. + +[backported from CIL scalability series for dependency] + +In doing an investigation into AIL push stalls, I was looking at the +log force code to see if an async CIL push could be done instead. +This lead me to xfs_log_force_lsn() and looking at how it works. + +xfs_log_force_lsn() is only called from inode synchronisation +contexts such as fsync(), and it takes the ip->i_itemp->ili_last_lsn +value as the LSN to sync the log to. This gets passed to +xlog_cil_force_lsn() via xfs_log_force_lsn() to flush the CIL to the +journal, and then used by xfs_log_force_lsn() to flush the iclogs to +the journal. + +The problem is that ip->i_itemp->ili_last_lsn does not store a +log sequence number. What it stores is passed to it from the +->iop_committing method, which is called by xfs_log_commit_cil(). +The value this passes to the iop_committing method is the CIL +context sequence number that the item was committed to. + +As it turns out, xlog_cil_force_lsn() converts the sequence to an +actual commit LSN for the related context and returns that to +xfs_log_force_lsn(). xfs_log_force_lsn() overwrites it's "lsn" +variable that contained a sequence with an actual LSN and then uses +that to sync the iclogs. + +This caused me some confusion for a while, even though I originally +wrote all this code a decade ago. ->iop_committing is only used by +a couple of log item types, and only inode items use the sequence +number it is passed. + +Let's clean up the API, CIL structures and inode log item to call it +a sequence number, and make it clear that the high level code is +using CIL sequence numbers and not on-disk LSNs for integrity +synchronisation purposes. + +Signed-off-by: Dave Chinner +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Reviewed-by: Allison Henderson +Signed-off-by: Darrick J. Wong +Signed-off-by: Amir Goldstein +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_types.h | 1 + + fs/xfs/xfs_buf_item.c | 2 +- + fs/xfs/xfs_dquot_item.c | 2 +- + fs/xfs/xfs_file.c | 14 +++++++------- + fs/xfs/xfs_inode.c | 10 +++++----- + fs/xfs/xfs_inode_item.c | 4 ++-- + fs/xfs/xfs_inode_item.h | 2 +- + fs/xfs/xfs_log.c | 27 ++++++++++++++------------- + fs/xfs/xfs_log.h | 4 +--- + fs/xfs/xfs_log_cil.c | 30 +++++++++++------------------- + fs/xfs/xfs_log_priv.h | 15 +++++++-------- + fs/xfs/xfs_trans.c | 6 +++--- + fs/xfs/xfs_trans.h | 4 ++-- + 13 files changed, 56 insertions(+), 65 deletions(-) + +--- a/fs/xfs/libxfs/xfs_types.h ++++ b/fs/xfs/libxfs/xfs_types.h +@@ -21,6 +21,7 @@ typedef int32_t xfs_suminfo_t; /* type + typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ + + typedef int64_t xfs_lsn_t; /* log sequence number */ ++typedef int64_t xfs_csn_t; /* CIL sequence number */ + + typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ + typedef uint32_t xfs_dahash_t; /* dir/attr hash value */ +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -632,7 +632,7 @@ xfs_buf_item_release( + STATIC void + xfs_buf_item_committing( + struct xfs_log_item *lip, +- xfs_lsn_t commit_lsn) ++ xfs_csn_t seq) + { + return xfs_buf_item_release(lip); + } +--- a/fs/xfs/xfs_dquot_item.c ++++ b/fs/xfs/xfs_dquot_item.c +@@ -188,7 +188,7 @@ xfs_qm_dquot_logitem_release( + STATIC void + xfs_qm_dquot_logitem_committing( + struct xfs_log_item *lip, +- xfs_lsn_t commit_lsn) ++ xfs_csn_t seq) + { + return xfs_qm_dquot_logitem_release(lip); + } +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -118,8 +118,8 @@ xfs_dir_fsync( + return xfs_log_force_inode(ip); + } + +-static xfs_lsn_t +-xfs_fsync_lsn( ++static xfs_csn_t ++xfs_fsync_seq( + struct xfs_inode *ip, + bool datasync) + { +@@ -127,7 +127,7 @@ xfs_fsync_lsn( + return 0; + if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + return 0; +- return ip->i_itemp->ili_last_lsn; ++ return ip->i_itemp->ili_commit_seq; + } + + /* +@@ -150,12 +150,12 @@ xfs_fsync_flush_log( + int *log_flushed) + { + int error = 0; +- xfs_lsn_t lsn; ++ xfs_csn_t seq; + + xfs_ilock(ip, XFS_ILOCK_SHARED); +- lsn = xfs_fsync_lsn(ip, datasync); +- if (lsn) { +- error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, ++ seq = xfs_fsync_seq(ip, datasync); ++ if (seq) { ++ error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, + log_flushed); + + spin_lock(&ip->i_itemp->ili_lock); +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -2754,7 +2754,7 @@ xfs_iunpin( + trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + + /* Give the log a push to start the unpinning I/O */ +- xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL); ++ xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); + + } + +@@ -3716,16 +3716,16 @@ int + xfs_log_force_inode( + struct xfs_inode *ip) + { +- xfs_lsn_t lsn = 0; ++ xfs_csn_t seq = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) +- lsn = ip->i_itemp->ili_last_lsn; ++ seq = ip->i_itemp->ili_commit_seq; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + +- if (!lsn) ++ if (!seq) + return 0; +- return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); ++ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL); + } + + /* +--- a/fs/xfs/xfs_inode_item.c ++++ b/fs/xfs/xfs_inode_item.c +@@ -617,9 +617,9 @@ xfs_inode_item_committed( + STATIC void + xfs_inode_item_committing( + struct xfs_log_item *lip, +- xfs_lsn_t commit_lsn) ++ xfs_csn_t seq) + { +- INODE_ITEM(lip)->ili_last_lsn = commit_lsn; ++ INODE_ITEM(lip)->ili_commit_seq = seq; + return xfs_inode_item_release(lip); + } + +--- a/fs/xfs/xfs_inode_item.h ++++ b/fs/xfs/xfs_inode_item.h +@@ -33,7 +33,7 @@ struct xfs_inode_log_item { + unsigned int ili_fields; /* fields to be logged */ + unsigned int ili_fsync_fields; /* logged since last fsync */ + xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ +- xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ ++ xfs_csn_t ili_commit_seq; /* last transaction commit */ + }; + + static inline int xfs_inode_clean(struct xfs_inode *ip) +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -3210,14 +3210,13 @@ out_error: + } + + static int +-__xfs_log_force_lsn( +- struct xfs_mount *mp, ++xlog_force_lsn( ++ struct xlog *log, + xfs_lsn_t lsn, + uint flags, + int *log_flushed, + bool already_slept) + { +- struct xlog *log = mp->m_log; + struct xlog_in_core *iclog; + + spin_lock(&log->l_icloglock); +@@ -3250,8 +3249,6 @@ __xfs_log_force_lsn( + if (!already_slept && + (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) { +- XFS_STATS_INC(mp, xs_log_force_sleep); +- + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); + return -EAGAIN; +@@ -3289,25 +3286,29 @@ out_error: + * to disk, that thread will wake up all threads waiting on the queue. + */ + int +-xfs_log_force_lsn( ++xfs_log_force_seq( + struct xfs_mount *mp, +- xfs_lsn_t lsn, ++ xfs_csn_t seq, + uint flags, + int *log_flushed) + { ++ struct xlog *log = mp->m_log; ++ xfs_lsn_t lsn; + int ret; +- ASSERT(lsn != 0); ++ ASSERT(seq != 0); + + XFS_STATS_INC(mp, xs_log_force); +- trace_xfs_log_force(mp, lsn, _RET_IP_); ++ trace_xfs_log_force(mp, seq, _RET_IP_); + +- lsn = xlog_cil_force_lsn(mp->m_log, lsn); ++ lsn = xlog_cil_force_seq(log, seq); + if (lsn == NULLCOMMITLSN) + return 0; + +- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); +- if (ret == -EAGAIN) +- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); ++ ret = xlog_force_lsn(log, lsn, flags, log_flushed, false); ++ if (ret == -EAGAIN) { ++ XFS_STATS_INC(mp, xs_log_force_sleep); ++ ret = xlog_force_lsn(log, lsn, flags, log_flushed, true); ++ } + return ret; + } + +--- a/fs/xfs/xfs_log.h ++++ b/fs/xfs/xfs_log.h +@@ -106,7 +106,7 @@ struct xfs_item_ops; + struct xfs_trans; + + int xfs_log_force(struct xfs_mount *mp, uint flags); +-int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, ++int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags, + int *log_forced); + int xfs_log_mount(struct xfs_mount *mp, + struct xfs_buftarg *log_target, +@@ -132,8 +132,6 @@ bool xfs_log_writable(struct xfs_mount * + struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); + void xfs_log_ticket_put(struct xlog_ticket *ticket); + +-void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, +- xfs_lsn_t *commit_lsn, bool regrant); + void xlog_cil_process_committed(struct list_head *list); + bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); + +--- a/fs/xfs/xfs_log_cil.c ++++ b/fs/xfs/xfs_log_cil.c +@@ -777,7 +777,7 @@ xlog_cil_push_work( + * that higher sequences will wait for us to write out a commit record + * before they do. + * +- * xfs_log_force_lsn requires us to mirror the new sequence into the cil ++ * xfs_log_force_seq requires us to mirror the new sequence into the cil + * structure atomically with the addition of this sequence to the + * committing list. This also ensures that we can do unlocked checks + * against the current sequence in log forces without risking +@@ -1020,16 +1020,14 @@ xlog_cil_empty( + * allowed again. + */ + void +-xfs_log_commit_cil( +- struct xfs_mount *mp, ++xlog_cil_commit( ++ struct xlog *log, + struct xfs_trans *tp, +- xfs_lsn_t *commit_lsn, ++ xfs_csn_t *commit_seq, + bool regrant) + { +- struct xlog *log = mp->m_log; + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_item *lip, *next; +- xfs_lsn_t xc_commit_lsn; + + /* + * Do all necessary memory allocation before we lock the CIL. +@@ -1043,10 +1041,6 @@ xfs_log_commit_cil( + + xlog_cil_insert_items(log, tp); + +- xc_commit_lsn = cil->xc_ctx->sequence; +- if (commit_lsn) +- *commit_lsn = xc_commit_lsn; +- + if (regrant && !XLOG_FORCED_SHUTDOWN(log)) + xfs_log_ticket_regrant(log, tp->t_ticket); + else +@@ -1069,8 +1063,10 @@ xfs_log_commit_cil( + list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { + xfs_trans_del_item(lip); + if (lip->li_ops->iop_committing) +- lip->li_ops->iop_committing(lip, xc_commit_lsn); ++ lip->li_ops->iop_committing(lip, cil->xc_ctx->sequence); + } ++ if (commit_seq) ++ *commit_seq = cil->xc_ctx->sequence; + + /* xlog_cil_push_background() releases cil->xc_ctx_lock */ + xlog_cil_push_background(log); +@@ -1087,9 +1083,9 @@ xfs_log_commit_cil( + * iclog flush is necessary following this call. + */ + xfs_lsn_t +-xlog_cil_force_lsn( ++xlog_cil_force_seq( + struct xlog *log, +- xfs_lsn_t sequence) ++ xfs_csn_t sequence) + { + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx; +@@ -1185,21 +1181,17 @@ bool + xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) + { +- struct xfs_cil_ctx *ctx; ++ struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + + if (list_empty(&lip->li_cil)) + return false; + +- ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; +- + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ +- if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) +- return false; +- return true; ++ return lip->li_seq == ctx->sequence; + } + + /* +--- a/fs/xfs/xfs_log_priv.h ++++ b/fs/xfs/xfs_log_priv.h +@@ -230,7 +230,7 @@ struct xfs_cil; + + struct xfs_cil_ctx { + struct xfs_cil *cil; +- xfs_lsn_t sequence; /* chkpt sequence # */ ++ xfs_csn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ +@@ -268,10 +268,10 @@ struct xfs_cil { + struct xfs_cil_ctx *xc_ctx; + + spinlock_t xc_push_lock ____cacheline_aligned_in_smp; +- xfs_lsn_t xc_push_seq; ++ xfs_csn_t xc_push_seq; + struct list_head xc_committing; + wait_queue_head_t xc_commit_wait; +- xfs_lsn_t xc_current_sequence; ++ xfs_csn_t xc_current_sequence; + struct work_struct xc_push_work; + wait_queue_head_t xc_push_wait; /* background push throttle */ + } ____cacheline_aligned_in_smp; +@@ -547,19 +547,18 @@ int xlog_cil_init(struct xlog *log); + void xlog_cil_init_post_recovery(struct xlog *log); + void xlog_cil_destroy(struct xlog *log); + bool xlog_cil_empty(struct xlog *log); ++void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp, ++ xfs_csn_t *commit_seq, bool regrant); + + /* + * CIL force routines + */ +-xfs_lsn_t +-xlog_cil_force_lsn( +- struct xlog *log, +- xfs_lsn_t sequence); ++xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence); + + static inline void + xlog_cil_force(struct xlog *log) + { +- xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); ++ xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence); + } + + /* +--- a/fs/xfs/xfs_trans.c ++++ b/fs/xfs/xfs_trans.c +@@ -832,7 +832,7 @@ __xfs_trans_commit( + bool regrant) + { + struct xfs_mount *mp = tp->t_mountp; +- xfs_lsn_t commit_lsn = -1; ++ xfs_csn_t commit_seq = 0; + int error = 0; + int sync = tp->t_flags & XFS_TRANS_SYNC; + +@@ -874,7 +874,7 @@ __xfs_trans_commit( + xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); + +- xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); ++ xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant); + + xfs_trans_free(tp); + +@@ -883,7 +883,7 @@ __xfs_trans_commit( + * log out now and wait for it. + */ + if (sync) { +- error = xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); ++ error = xfs_log_force_seq(mp, commit_seq, XFS_LOG_SYNC, NULL); + XFS_STATS_INC(mp, xs_trans_sync); + } else { + XFS_STATS_INC(mp, xs_trans_async); +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -43,7 +43,7 @@ struct xfs_log_item { + struct list_head li_cil; /* CIL pointers */ + struct xfs_log_vec *li_lv; /* active log vector */ + struct xfs_log_vec *li_lv_shadow; /* standby vector */ +- xfs_lsn_t li_seq; /* CIL commit seq */ ++ xfs_csn_t li_seq; /* CIL commit seq */ + }; + + /* +@@ -69,7 +69,7 @@ struct xfs_item_ops { + void (*iop_pin)(struct xfs_log_item *); + void (*iop_unpin)(struct xfs_log_item *, int remove); + uint (*iop_push)(struct xfs_log_item *, struct list_head *); +- void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); ++ void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq); + void (*iop_release)(struct xfs_log_item *); + xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); + int (*iop_recover)(struct xfs_log_item *lip,