git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.10-stable patches
author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 1 Aug 2022 08:41:30 +0000 (10:41 +0200)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 1 Aug 2022 08:41:30 +0000 (10:41 +0200)
added patches:
docs-kernel-parameters-update-descriptions-for-mitigations-param-with-retbleed.patch
edac-ghes-set-the-dimm-label-unconditionally.patch
xfs-enforce-attr3-buffer-recovery-order.patch
xfs-fix-log-intent-recovery-enospc-shutdowns-when-inactivating-inodes.patch
xfs-force-the-log-offline-when-log-intent-item-recovery-fails.patch
xfs-hold-buffer-across-unpin-and-potential-shutdown-processing.patch
xfs-logging-the-on-disk-inode-lsn-can-make-it-go-backwards.patch
xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch
xfs-refactor-xfs_file_fsync.patch
xfs-remove-dead-stale-buf-unpin-handling-code.patch
xfs-xfs_log_force_lsn-isn-t-passed-a-lsn.patch

12 files changed:
queue-5.10/docs-kernel-parameters-update-descriptions-for-mitigations-param-with-retbleed.patch [new file with mode: 0644]
queue-5.10/edac-ghes-set-the-dimm-label-unconditionally.patch [new file with mode: 0644]
queue-5.10/series
queue-5.10/xfs-enforce-attr3-buffer-recovery-order.patch [new file with mode: 0644]
queue-5.10/xfs-fix-log-intent-recovery-enospc-shutdowns-when-inactivating-inodes.patch [new file with mode: 0644]
queue-5.10/xfs-force-the-log-offline-when-log-intent-item-recovery-fails.patch [new file with mode: 0644]
queue-5.10/xfs-hold-buffer-across-unpin-and-potential-shutdown-processing.patch [new file with mode: 0644]
queue-5.10/xfs-logging-the-on-disk-inode-lsn-can-make-it-go-backwards.patch [new file with mode: 0644]
queue-5.10/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch [new file with mode: 0644]
queue-5.10/xfs-refactor-xfs_file_fsync.patch [new file with mode: 0644]
queue-5.10/xfs-remove-dead-stale-buf-unpin-handling-code.patch [new file with mode: 0644]
queue-5.10/xfs-xfs_log_force_lsn-isn-t-passed-a-lsn.patch [new file with mode: 0644]

diff --git a/queue-5.10/docs-kernel-parameters-update-descriptions-for-mitigations-param-with-retbleed.patch b/queue-5.10/docs-kernel-parameters-update-descriptions-for-mitigations-param-with-retbleed.patch
new file mode 100644 (file)
index 0000000..548dccc
--- /dev/null
@@ -0,0 +1,39 @@
+From ea304a8b89fd0d6cf94ee30cb139dc23d9f1a62f Mon Sep 17 00:00:00 2001
+From: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
+Date: Thu, 28 Jul 2022 04:39:07 +0000
+Subject: docs/kernel-parameters: Update descriptions for "mitigations=" param with retbleed
+
+From: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
+
+commit ea304a8b89fd0d6cf94ee30cb139dc23d9f1a62f upstream.
+
+Updates descriptions for "mitigations=off" and "mitigations=auto,nosmt"
+with the respective retbleed= settings.
+
+Signed-off-by: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Cc: corbet@lwn.net
+Link: https://lore.kernel.org/r/20220728043907.165688-1-eiichi.tsukata@nutanix.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/admin-guide/kernel-parameters.txt |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/Documentation/admin-guide/kernel-parameters.txt
++++ b/Documentation/admin-guide/kernel-parameters.txt
+@@ -2873,6 +2873,7 @@
+                                              no_entry_flush [PPC]
+                                              no_uaccess_flush [PPC]
+                                              mmio_stale_data=off [X86]
++                                             retbleed=off [X86]
+                               Exceptions:
+                                              This does not have any effect on
+@@ -2895,6 +2896,7 @@
+                                              mds=full,nosmt [X86]
+                                              tsx_async_abort=full,nosmt [X86]
+                                              mmio_stale_data=full,nosmt [X86]
++                                             retbleed=auto,nosmt [X86]
+       mminit_loglevel=
+                       [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
diff --git a/queue-5.10/edac-ghes-set-the-dimm-label-unconditionally.patch b/queue-5.10/edac-ghes-set-the-dimm-label-unconditionally.patch
new file mode 100644 (file)
index 0000000..45ad9e4
--- /dev/null
@@ -0,0 +1,108 @@
+From 5e2805d5379619c4a2e3ae4994e73b36439f4bad Mon Sep 17 00:00:00 2001
+From: Toshi Kani <toshi.kani@hpe.com>
+Date: Thu, 21 Jul 2022 12:05:03 -0600
+Subject: EDAC/ghes: Set the DIMM label unconditionally
+
+From: Toshi Kani <toshi.kani@hpe.com>
+
+commit 5e2805d5379619c4a2e3ae4994e73b36439f4bad upstream.
+
+The commit
+
+  cb51a371d08e ("EDAC/ghes: Setup DIMM label from DMI and use it in error reports")
+
+enforced that both the bank and device strings passed to
+dimm_setup_label() are not NULL.
+
+However, there are BIOSes, for example on a
+
+  HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/15/2019
+
+which don't populate both strings:
+
+  Handle 0x0020, DMI type 17, 84 bytes
+  Memory Device
+          Array Handle: 0x0013
+          Error Information Handle: Not Provided
+          Total Width: 72 bits
+          Data Width: 64 bits
+          Size: 32 GB
+          Form Factor: DIMM
+          Set: None
+          Locator: PROC 1 DIMM 1        <===== device
+          Bank Locator: Not Specified   <===== bank
+
+This results in a buffer overflow because ghes_edac_register() calls
+strlen() on an uninitialized label, which had non-zero values left over
+from krealloc_array():
+
+  detected buffer overflow in __fortify_strlen
+   ------------[ cut here ]------------
+   kernel BUG at lib/string_helpers.c:983!
+   invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
+   CPU: 1 PID: 1 Comm: swapper/0 Tainted: G          I       5.18.6-200.fc36.x86_64 #1
+   Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/15/2019
+   RIP: 0010:fortify_panic
+   ...
+   Call Trace:
+    <TASK>
+    ghes_edac_register.cold
+    ghes_probe
+    platform_probe
+    really_probe
+    __driver_probe_device
+    driver_probe_device
+    __driver_attach
+    ? __device_attach_driver
+    bus_for_each_dev
+    bus_add_driver
+    driver_register
+    acpi_ghes_init
+    acpi_init
+    ? acpi_sleep_proc_init
+    do_one_initcall
+
+The label contains garbage because the commit in Fixes reallocs the
+DIMMs array while scanning the system but doesn't clear the newly
+allocated memory.
+
+Change dimm_setup_label() to always initialize the label to fix the
+issue. Set it to the empty string in case BIOS does not provide both
+bank and device so that ghes_edac_register() can keep the default label
+given by edac_mc_alloc_dimms().
+
+  [ bp: Rewrite commit message. ]
+
+Fixes: b9cae27728d1f ("EDAC/ghes: Scan the system once on driver init")
+Co-developed-by: Robert Richter <rric@kernel.org>
+Signed-off-by: Robert Richter <rric@kernel.org>
+Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Tested-by: Robert Elliott <elliott@hpe.com>
+Cc: <stable@vger.kernel.org>
+Link: https://lore.kernel.org/r/20220719220124.760359-1-toshi.kani@hpe.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/edac/ghes_edac.c |   11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/drivers/edac/ghes_edac.c
++++ b/drivers/edac/ghes_edac.c
+@@ -101,9 +101,14 @@ static void dimm_setup_label(struct dimm
+       dmi_memdev_name(handle, &bank, &device);
+-      /* both strings must be non-zero */
+-      if (bank && *bank && device && *device)
+-              snprintf(dimm->label, sizeof(dimm->label), "%s %s", bank, device);
++      /*
+       * Set to an empty string when both bank and device are empty. In this case,
++       * the label assigned by default will be preserved.
++       */
++      snprintf(dimm->label, sizeof(dimm->label), "%s%s%s",
++               (bank && *bank) ? bank : "",
++               (bank && *bank && device && *device) ? " " : "",
++               (device && *device) ? device : "");
+ }
+ static void assign_dmi_dimm_info(struct dimm_info *dimm, struct memdev_dmi_entry *entry)
index fd90c7c4453473debead4cab6989838d387fbf63..3c068f2020f1328d1b406f88603914dd07f79fc0 100644 (file)
@@ -48,3 +48,14 @@ arm-crypto-comment-out-gcc-warning-that-breaks-clang-builds.patch
 page_alloc-fix-invalid-watermark-check-on-a-negative-value.patch
 mt7601u-add-usb-device-id-for-some-versions-of-xiaodu-wifi-dongle.patch
 arm-9216-1-fix-max_dma_address-overflow.patch
+edac-ghes-set-the-dimm-label-unconditionally.patch
+docs-kernel-parameters-update-descriptions-for-mitigations-param-with-retbleed.patch
+xfs-refactor-xfs_file_fsync.patch
+xfs-xfs_log_force_lsn-isn-t-passed-a-lsn.patch
+xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch
+xfs-fix-log-intent-recovery-enospc-shutdowns-when-inactivating-inodes.patch
+xfs-force-the-log-offline-when-log-intent-item-recovery-fails.patch
+xfs-hold-buffer-across-unpin-and-potential-shutdown-processing.patch
+xfs-remove-dead-stale-buf-unpin-handling-code.patch
+xfs-logging-the-on-disk-inode-lsn-can-make-it-go-backwards.patch
+xfs-enforce-attr3-buffer-recovery-order.patch
diff --git a/queue-5.10/xfs-enforce-attr3-buffer-recovery-order.patch b/queue-5.10/xfs-enforce-attr3-buffer-recovery-order.patch
new file mode 100644 (file)
index 0000000..06d903f
--- /dev/null
@@ -0,0 +1,102 @@
+From foo@baz Mon Aug  1 10:39:45 AM CEST 2022
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Fri, 29 Jul 2022 18:16:09 +0200
+Subject: xfs: Enforce attr3 buffer recovery order
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Luis Chamberlain <mcgrof@kernel.org>, Adam Manzanares <a.manzanares@samsung.com>, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Christoph Hellwig <hch@lst.de>
+Message-ID: <20220729161609.4071252-10-amir73il@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit d8f4c2d0398fa1d92cacf854daf80d21a46bfefc upstream.
+
+From the department of "WTAF? How did we miss that!?"...
+
+When we are recovering a buffer, the first thing we do is check the
+buffer magic number and extract the LSN from the buffer. If the LSN
+is older than the current LSN, we replay the modification to it. If
+the metadata on disk is newer than the transaction in the log, we
+skip it. This is a fundamental v5 filesystem metadata recovery
+behaviour.
+
+generic/482 failed with an attribute writeback failure during log
+recovery. The write verifier caught the corruption before it got
+written to disk, and the attr buffer dump looked like:
+
+XFS (dm-3): Metadata corruption detected at xfs_attr3_leaf_verify+0x275/0x2e0, xfs_attr3_leaf block 0x19be8
+XFS (dm-3): Unmount and run xfs_repair
+XFS (dm-3): First 128 bytes of corrupted metadata buffer:
+00000000: 00 00 00 00 00 00 00 00 3b ee 00 00 4d 2a 01 e1  ........;...M*..
+00000010: 00 00 00 00 00 01 9b e8 00 00 00 01 00 00 05 38  ...............8
+                                  ^^^^^^^^^^^^^^^^^^^^^^^
+00000020: df 39 5e 51 58 ac 44 b6 8d c5 e7 10 44 09 bc 17  .9^QX.D.....D...
+00000030: 00 00 00 00 00 02 00 83 00 03 00 cc 0f 24 01 00  .............$..
+00000040: 00 68 0e bc 0f c8 00 10 00 00 00 00 00 00 00 00  .h..............
+00000050: 00 00 3c 31 0f 24 01 00 00 00 3c 32 0f 88 01 00  ..<1.$....<2....
+00000060: 00 00 3c 33 0f d8 01 00 00 00 00 00 00 00 00 00  ..<3............
+00000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+.....
+
+The highlighted bytes are the LSN that was replayed into the
+buffer: 0x100000538. This is cycle 1, block 0x538. Prior to replay,
+that block on disk looks like this:
+
+$ sudo xfs_db -c "fsb 0x417d" -c "type attr3" -c p /dev/mapper/thin-vol
+hdr.info.hdr.forw = 0
+hdr.info.hdr.back = 0
+hdr.info.hdr.magic = 0x3bee
+hdr.info.crc = 0xb5af0bc6 (correct)
+hdr.info.bno = 105448
+hdr.info.lsn = 0x100000900
+               ^^^^^^^^^^^
+hdr.info.uuid = df395e51-58ac-44b6-8dc5-e7104409bc17
+hdr.info.owner = 131203
+hdr.count = 2
+hdr.usedbytes = 120
+hdr.firstused = 3796
+hdr.holes = 1
+hdr.freemap[0-2] = [base,size]
+
+Note the LSN stamped into the buffer on disk: 1/0x900. The version
+on disk is much newer than the log transaction that was being
+replayed. That's a bug, and should -never- happen.
+
+So I immediately went to look at xlog_recover_get_buf_lsn() to check
+that we handled the LSN correctly. I was wondering if there was a
+similar "two commits with the same start LSN skips the second
+replay" problem with buffers. I didn't get that far, because I found
+a much more basic, rudimentary bug: xlog_recover_get_buf_lsn()
+doesn't recognise buffers with XFS_ATTR3_LEAF_MAGIC set in them!!!
+
+IOWs, attr3 leaf buffers fall through the magic number checks
+unrecognised, so trigger the "recover immediately" behaviour instead
+of undergoing an LSN check. IOWs, we incorrectly replay ATTR3 leaf
+buffers and that causes silent on disk corruption of inode attribute
+forks and potentially other things....
+
+Git history shows this is *another* zero day bug, this time
+introduced in commit 50d5c8d8e938 ("xfs: check LSN ordering for v5
+superblocks during recovery") which failed to handle the attr3 leaf
+buffers in recovery. And we've failed to handle them ever since...
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item_recover.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/xfs/xfs_buf_item_recover.c
++++ b/fs/xfs/xfs_buf_item_recover.c
+@@ -796,6 +796,7 @@ xlog_recover_get_buf_lsn(
+       switch (magicda) {
+       case XFS_DIR3_LEAF1_MAGIC:
+       case XFS_DIR3_LEAFN_MAGIC:
++      case XFS_ATTR3_LEAF_MAGIC:
+       case XFS_DA3_NODE_MAGIC:
+               lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+               uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
diff --git a/queue-5.10/xfs-fix-log-intent-recovery-enospc-shutdowns-when-inactivating-inodes.patch b/queue-5.10/xfs-fix-log-intent-recovery-enospc-shutdowns-when-inactivating-inodes.patch
new file mode 100644 (file)
index 0000000..6351969
--- /dev/null
@@ -0,0 +1,60 @@
+From foo@baz Mon Aug  1 10:39:45 AM CEST 2022
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Fri, 29 Jul 2022 18:16:04 +0200
+Subject: xfs: fix log intent recovery ENOSPC shutdowns when inactivating inodes
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Luis Chamberlain <mcgrof@kernel.org>, Adam Manzanares <a.manzanares@samsung.com>, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Christoph Hellwig <hch@lst.de>
+Message-ID: <20220729161609.4071252-5-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit 81ed94751b1513fcc5978dcc06eb1f5b4e55a785 upstream.
+
+During regular operation, the xfs_inactive operations create
+transactions with zero block reservation because in general we're
+freeing space, not asking for more.  The per-AG space reservations
+created at mount time enable us to handle expansions of the refcount
+btree without needing to reserve blocks to the transaction.
+
+Unfortunately, log recovery doesn't create the per-AG space reservations
+when intent items are being recovered.  This isn't an issue for intent
+item recovery itself because they explicitly request blocks, but any
+inode inactivation that can happen during log recovery uses the same
+xfs_inactive paths as regular runtime.  If a refcount btree expansion
+happens, the transaction will fail due to blk_res_used > blk_res, and we
+shut down the filesystem unnecessarily.
+
+Fix this problem by making per-AG reservations temporarily so that we
+can handle the inactivations, and releasing them at the end.  This
+brings the recovery environment closer to the runtime environment.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_mount.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -968,9 +968,17 @@ xfs_mountfs(
+       /*
+        * Finish recovering the file system.  This part needed to be delayed
+        * until after the root and real-time bitmap inodes were consistently
+-       * read in.
++       * read in.  Temporarily create per-AG space reservations for metadata
++       * btree shape changes because space freeing transactions (for inode
++       * inactivation) require the per-AG reservation in lieu of reserving
++       * blocks.
+        */
++      error = xfs_fs_reserve_ag_blocks(mp);
++      if (error && error == -ENOSPC)
++              xfs_warn(mp,
++      "ENOSPC reserving per-AG metadata pool, log recovery may fail.");
+       error = xfs_log_mount_finish(mp);
++      xfs_fs_unreserve_ag_blocks(mp);
+       if (error) {
+               xfs_warn(mp, "log mount finish failed");
+               goto out_rtunmount;
diff --git a/queue-5.10/xfs-force-the-log-offline-when-log-intent-item-recovery-fails.patch b/queue-5.10/xfs-force-the-log-offline-when-log-intent-item-recovery-fails.patch
new file mode 100644 (file)
index 0000000..09b03c8
--- /dev/null
@@ -0,0 +1,68 @@
+From foo@baz Mon Aug  1 10:39:45 AM CEST 2022
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Fri, 29 Jul 2022 18:16:05 +0200
+Subject: xfs: force the log offline when log intent item recovery fails
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Luis Chamberlain <mcgrof@kernel.org>, Adam Manzanares <a.manzanares@samsung.com>, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Christoph Hellwig <hch@lst.de>
+Message-ID: <20220729161609.4071252-6-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit 4e6b8270c820c8c57a73f869799a0af2b56eff3e upstream.
+
+If any part of log intent item recovery fails, we should shut down the
+log immediately to stop the log from writing a clean unmount record to
+disk, because the metadata is not consistent.  The inability to cancel a
+dirty transaction catches most of these cases, but there are a few
+things that have slipped through the cracks, such as ENOSPC from a
+transaction allocation, or runtime errors that result in cancellation of
+a non-dirty transaction.
+
+This solves some weird behaviors reported by customers where a system
+goes down, the first mount fails, the second succeeds, but then the fs
+goes down later because of inconsistent metadata.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c         |    3 +++
+ fs/xfs/xfs_log_recover.c |    5 ++++-
+ 2 files changed, 7 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -765,6 +765,9 @@ xfs_log_mount_finish(
+       if (readonly)
+               mp->m_flags |= XFS_MOUNT_RDONLY;
++      /* Make sure the log is dead if we're returning failure. */
++      ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR));
++
+       return error;
+ }
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -2457,8 +2457,10 @@ xlog_finish_defer_ops(
+               error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
+                               dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
+-              if (error)
++              if (error) {
++                      xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+                       return error;
++              }
+               /*
+                * Transfer to this new transaction all the dfops we captured
+@@ -3454,6 +3456,7 @@ xlog_recover_finish(
+                        * this) before we get around to xfs_log_mount_cancel.
+                        */
+                       xlog_recover_cancel_intents(log);
++                      xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+                       xfs_alert(log->l_mp, "Failed to recover intents");
+                       return error;
+               }
diff --git a/queue-5.10/xfs-hold-buffer-across-unpin-and-potential-shutdown-processing.patch b/queue-5.10/xfs-hold-buffer-across-unpin-and-potential-shutdown-processing.patch
new file mode 100644 (file)
index 0000000..2ed0baa
--- /dev/null
@@ -0,0 +1,123 @@
+From foo@baz Mon Aug  1 10:39:45 AM CEST 2022
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Fri, 29 Jul 2022 18:16:06 +0200
+Subject: xfs: hold buffer across unpin and potential shutdown processing
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Luis Chamberlain <mcgrof@kernel.org>, Adam Manzanares <a.manzanares@samsung.com>, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Brian Foster <bfoster@redhat.com>
+Message-ID: <20220729161609.4071252-7-amir73il@gmail.com>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 84d8949e770745b16a7e8a68dcb1d0f3687bdee9 upstream.
+
+The special processing used to simulate a buffer I/O failure on fs
+shutdown has a difficult to reproduce race that can result in a use
+after free of the associated buffer. Consider a buffer that has been
+committed to the on-disk log and thus is AIL resident. The buffer
+lands on the writeback delwri queue, but is subsequently locked,
+committed and pinned by another transaction before being submitted for
+I/O. At this point, the buffer is stuck on the delwri queue as it
+cannot be submitted for I/O until it is unpinned. A log checkpoint
+I/O failure occurs sometime later, which aborts the bli. The unpin
+handler is called with the aborted log item, drops the bli reference
+count, the pin count, and falls into the I/O failure simulation
+path.
+
+The potential problem here is that once the pin count falls to zero
+in ->iop_unpin(), xfsaild is free to retry delwri submission of the
+buffer at any time, before the unpin handler even completes. If
+delwri queue submission wins the race to the buffer lock, it
+observes the shutdown state and simulates the I/O failure itself.
+This releases both the bli and delwri queue holds and frees the
+buffer while xfs_buf_item_unpin() sits on xfs_buf_lock() waiting to
+run through the same failure sequence. This problem is rare and
+requires many iterations of fstest generic/019 (which simulates disk
+I/O failures) to reproduce.
+
+To avoid this problem, grab a hold on the buffer before the log item
+is unpinned if the associated item has been aborted and will require
+a simulated I/O failure. The hold is already required for the
+simulated I/O failure, so the ordering simply guarantees the unpin
+handler access to the buffer before it is unpinned and thus
+processed by the AIL. This particular ordering is required so long
+as the AIL does not acquire a reference on the bli, which is the
+long term solution to this problem.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c |   37 +++++++++++++++++++++----------------
+ 1 file changed, 21 insertions(+), 16 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -393,17 +393,8 @@ xfs_buf_item_pin(
+ }
+ /*
+- * This is called to unpin the buffer associated with the buf log
+- * item which was previously pinned with a call to xfs_buf_item_pin().
+- *
+- * Also drop the reference to the buf item for the current transaction.
+- * If the XFS_BLI_STALE flag is set and we are the last reference,
+- * then free up the buf log item and unlock the buffer.
+- *
+- * If the remove flag is set we are called from uncommit in the
+- * forced-shutdown path.  If that is true and the reference count on
+- * the log item is going to drop to zero we need to free the item's
+- * descriptor in the transaction.
++ * This is called to unpin the buffer associated with the buf log item which
++ * was previously pinned with a call to xfs_buf_item_pin().
+  */
+ STATIC void
+ xfs_buf_item_unpin(
+@@ -420,12 +411,26 @@ xfs_buf_item_unpin(
+       trace_xfs_buf_item_unpin(bip);
++      /*
++       * Drop the bli ref associated with the pin and grab the hold required
++       * for the I/O simulation failure in the abort case. We have to do this
++       * before the pin count drops because the AIL doesn't acquire a bli
++       * reference. Therefore if the refcount drops to zero, the bli could
++       * still be AIL resident and the buffer submitted for I/O (and freed on
++       * completion) at any point before we return. This can be removed once
++       * the AIL properly holds a reference on the bli.
++       */
+       freed = atomic_dec_and_test(&bip->bli_refcount);
+-
++      if (freed && !stale && remove)
++              xfs_buf_hold(bp);
+       if (atomic_dec_and_test(&bp->b_pin_count))
+               wake_up_all(&bp->b_waiters);
+-      if (freed && stale) {
++       /* nothing to do but drop the pin count if the bli is active */
++      if (!freed)
++              return;
++
++      if (stale) {
+               ASSERT(bip->bli_flags & XFS_BLI_STALE);
+               ASSERT(xfs_buf_islocked(bp));
+               ASSERT(bp->b_flags & XBF_STALE);
+@@ -468,13 +473,13 @@ xfs_buf_item_unpin(
+                       ASSERT(bp->b_log_item == NULL);
+               }
+               xfs_buf_relse(bp);
+-      } else if (freed && remove) {
++      } else if (remove) {
+               /*
+                * The buffer must be locked and held by the caller to simulate
+-               * an async I/O failure.
++               * an async I/O failure. We acquired the hold for this case
++               * before the buffer was unpinned.
+                */
+               xfs_buf_lock(bp);
+-              xfs_buf_hold(bp);
+               bp->b_flags |= XBF_ASYNC;
+               xfs_buf_ioend_fail(bp);
+       }
diff --git a/queue-5.10/xfs-logging-the-on-disk-inode-lsn-can-make-it-go-backwards.patch b/queue-5.10/xfs-logging-the-on-disk-inode-lsn-can-make-it-go-backwards.patch
new file mode 100644 (file)
index 0000000..1f0d8d4
--- /dev/null
@@ -0,0 +1,242 @@
+From foo@baz Mon Aug  1 10:39:45 AM CEST 2022
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Fri, 29 Jul 2022 18:16:08 +0200
+Subject: xfs: logging the on disk inode LSN can make it go backwards
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Luis Chamberlain <mcgrof@kernel.org>, Adam Manzanares <a.manzanares@samsung.com>, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>
+Message-ID: <20220729161609.4071252-9-amir73il@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 32baa63d82ee3f5ab3bd51bae6bf7d1c15aed8c7 upstream.
+
+When we log an inode, we format the "log inode" core and set an LSN
+in that inode core. We do that via xfs_inode_item_format_core(),
+which calls:
+
+       xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
+
+to format the log inode. It writes the LSN from the inode item into
+the log inode, and if recovery decides the inode item needs to be
+replayed, it recovers the log inode LSN field and writes it into the
+on disk inode LSN field.
+
+Now this might seem like a reasonable thing to do, but it is wrong
+on multiple levels. Firstly, if the item is not yet in the AIL,
+item->li_lsn is zero. i.e. the first time the inode is logged and
+formatted, the LSN we write into the log inode will be zero. If we
+only log it once, recovery will run and can write this zero LSN into
+the inode.
+
+This means that the next time the inode is logged and log recovery
+runs, it will *always* replay changes to the inode regardless of
+whether the inode is newer on disk than the version in the log and
+that violates the entire purpose of recording the LSN in the inode
+at writeback time (i.e. to stop it going backwards in time on disk
+during recovery).
+
+Secondly, if we commit the CIL to the journal so the inode item
+moves to the AIL, and then relog the inode, the LSN that gets
+stamped into the log inode will be the LSN of the inode's current
+location in the AIL, not its age on disk. And it's not the LSN that
+will be associated with the current change. That means when log
+recovery replays this inode item, the LSN that ends up on disk is
+the LSN for the previous changes in the log, not the current
+changes being replayed. IOWs, after recovery the LSN on disk is not
+in sync with the LSN of the modifications that were replayed into
+the inode. This, again, violates the recovery ordering semantics
+that on-disk writeback LSNs provide.
+
+Hence the inode LSN in the log dinode is -always- invalid.
+
+Thirdly, recovery actually has the LSN of the log transaction it is
+replaying right at hand - it uses it to determine if it should
+replay the inode by comparing it to the on-disk inode's LSN. But it
+doesn't use that LSN to stamp the LSN into the inode which will be
+written back when the transaction is fully replayed. It uses the one
+in the log dinode, which we know is always going to be incorrect.
+
+Looking back at the change history, the inode logging was broken by
+commit 93f958f9c41f ("xfs: cull unnecessary icdinode fields") way
+back in 2016 by a stupid idiot who thought he knew how this code
+worked. i.e. me. That commit replaced an in memory di_lsn field that
+was updated only at inode writeback time from the inode item.li_lsn
+value - and hence always contained the same LSN that appeared in the
+on-disk inode - with a read of the inode item LSN at inode format
+time. Clearly these are not the same thing.
+
+Before 93f958f9c41f, the log recovery behaviour was irrelevant,
+because the LSN in the log inode always matched the on-disk LSN at
+the time the inode was logged, hence recovery of the transaction
+would never make the on-disk LSN in the inode go backwards or get
+out of sync.
+
+A symptom of the problem is this, caught from a failure of
+generic/482. Before log recovery, the inode has been allocated but
+never used:
+
+xfs_db> inode 393388
+xfs_db> p
+core.magic = 0x494e
+core.mode = 0
+....
+v3.crc = 0x99126961 (correct)
+v3.change_count = 0
+v3.lsn = 0
+v3.flags2 = 0
+v3.cowextsize = 0
+v3.crtime.sec = Thu Jan  1 10:00:00 1970
+v3.crtime.nsec = 0
+
+After log recovery:
+
+xfs_db> p
+core.magic = 0x494e
+core.mode = 020444
+....
+v3.crc = 0x23e68f23 (correct)
+v3.change_count = 2
+v3.lsn = 0
+v3.flags2 = 0
+v3.cowextsize = 0
+v3.crtime.sec = Thu Jul 22 17:03:03 2021
+v3.crtime.nsec = 751000000
+...
+
+You can see that the LSN of the on-disk inode is 0, even though it
+clearly has been written to disk. I point out this inode, because
+the generic/482 failure occurred because several adjacent inodes in
+this specific inode cluster were not replayed correctly and still
+appeared to be zero on disk when all the other metadata (inobt,
+finobt, directories, etc) indicated they should be allocated and
+written back.
+
+The fix for this is two-fold. The first is that we need to either
+revert the LSN changes in 93f958f9c41f or stop logging the inode LSN
+altogether. If we do the former, log recovery does not need to
+change but we add 8 bytes of memory per inode to store what is
+largely a write-only inode field. If we do the latter, log recovery
+needs to stamp the on-disk inode in the same manner that inode
+writeback does.
+
+I prefer the latter, because we shouldn't really be trying to log
+and replay changes to the on disk LSN as the on-disk value is the
+canonical source of the on-disk version of the inode. It also
+matches the way we recover buffer items - we create a buf_log_item
+that carries the current recovery transaction LSN that gets stamped
+into the buffer by the write verifier when it gets written back
+when the transaction is fully recovered.
+
+However, this might break log recovery on older kernels even more,
+so I'm going to simply ignore the logged value in recovery and stamp
+the on-disk inode with the LSN of the transaction being recovered
+that will trigger writeback on transaction recovery completion. This
+will ensure that the on-disk inode LSN always reflects the LSN of
+the last change that was written to disk, regardless of whether it
+comes from log recovery or runtime writeback.
+
+Fixes: 93f958f9c41f ("xfs: cull unnecessary icdinode fields")
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_log_format.h  |   11 ++++++++++-
+ fs/xfs/xfs_inode_item_recover.c |   39 +++++++++++++++++++++++++++++----------
+ 2 files changed, 39 insertions(+), 11 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_log_format.h
++++ b/fs/xfs/libxfs/xfs_log_format.h
+@@ -414,7 +414,16 @@ struct xfs_log_dinode {
+       /* start of the extended dinode, writable fields */
+       uint32_t        di_crc;         /* CRC of the inode */
+       uint64_t        di_changecount; /* number of attribute changes */
+-      xfs_lsn_t       di_lsn;         /* flush sequence */
++
++      /*
++       * The LSN we write to this field during formatting is not a reflection
++       * of the current on-disk LSN. It should never be used for recovery
++       * sequencing, nor should it be recovered into the on-disk inode at all.
++       * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk()
++       * for details.
++       */
++      xfs_lsn_t       di_lsn;
++
+       uint64_t        di_flags2;      /* more random flags */
+       uint32_t        di_cowextsize;  /* basic cow extent size for file */
+       uint8_t         di_pad2[12];    /* more padding for future expansion */
+--- a/fs/xfs/xfs_inode_item_recover.c
++++ b/fs/xfs/xfs_inode_item_recover.c
+@@ -145,7 +145,8 @@ xfs_log_dinode_to_disk_ts(
+ STATIC void
+ xfs_log_dinode_to_disk(
+       struct xfs_log_dinode   *from,
+-      struct xfs_dinode       *to)
++      struct xfs_dinode       *to,
++      xfs_lsn_t               lsn)
+ {
+       to->di_magic = cpu_to_be16(from->di_magic);
+       to->di_mode = cpu_to_be16(from->di_mode);
+@@ -182,7 +183,7 @@ xfs_log_dinode_to_disk(
+               to->di_flags2 = cpu_to_be64(from->di_flags2);
+               to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
+               to->di_ino = cpu_to_be64(from->di_ino);
+-              to->di_lsn = cpu_to_be64(from->di_lsn);
++              to->di_lsn = cpu_to_be64(lsn);
+               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+               uuid_copy(&to->di_uuid, &from->di_uuid);
+               to->di_flushiter = 0;
+@@ -261,16 +262,25 @@ xlog_recover_inode_commit_pass2(
+       }
+       /*
+-       * If the inode has an LSN in it, recover the inode only if it's less
+-       * than the lsn of the transaction we are replaying. Note: we still
+-       * need to replay an owner change even though the inode is more recent
+-       * than the transaction as there is no guarantee that all the btree
+-       * blocks are more recent than this transaction, too.
++       * If the inode has an LSN in it, recover the inode only if the on-disk
++       * inode's LSN is older than the lsn of the transaction we are
++       * replaying. We can have multiple checkpoints with the same start LSN,
++       * so the current LSN being equal to the on-disk LSN doesn't necessarily
++       * mean that the on-disk inode is more recent than the change being
++       * replayed.
++       *
++       * We must check the current_lsn against the on-disk inode
++       * here because we can't trust the log dinode to contain a valid LSN
++       * (see comment below before replaying the log dinode for details).
++       *
++       * Note: we still need to replay an owner change even though the inode
++       * is more recent than the transaction as there is no guarantee that all
++       * the btree blocks are more recent than this transaction, too.
+        */
+       if (dip->di_version >= 3) {
+               xfs_lsn_t       lsn = be64_to_cpu(dip->di_lsn);
+-              if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
++              if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) {
+                       trace_xfs_log_recover_inode_skip(log, in_f);
+                       error = 0;
+                       goto out_owner_change;
+@@ -368,8 +378,17 @@ xlog_recover_inode_commit_pass2(
+               goto out_release;
+       }
+-      /* recover the log dinode inode into the on disk inode */
+-      xfs_log_dinode_to_disk(ldip, dip);
++      /*
++       * Recover the log dinode inode into the on disk inode.
++       *
++       * The LSN in the log dinode is garbage - it can be zero or reflect
++       * stale in-memory runtime state that isn't coherent with the changes
++       * logged in this transaction or the changes written to the on-disk
++       * inode.  Hence we write the current LSN into the inode because that
++       * matches what xfs_iflush() would write into the inode when flushing
++       * the changes in this transaction.
++       */
++      xfs_log_dinode_to_disk(ldip, dip, current_lsn);
+       fields = in_f->ilf_fields;
+       if (fields & XFS_ILOG_DEV)
diff --git a/queue-5.10/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch b/queue-5.10/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch
new file mode 100644 (file)
index 0000000..c645ea6
--- /dev/null
@@ -0,0 +1,155 @@
+From foo@baz Mon Aug  1 10:39:45 AM CEST 2022
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Fri, 29 Jul 2022 18:16:03 +0200
+Subject: xfs: prevent UAF in xfs_log_item_in_current_chkpt
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Luis Chamberlain <mcgrof@kernel.org>, Adam Manzanares <a.manzanares@samsung.com>, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>
+Message-ID: <20220729161609.4071252-4-amir73il@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit f8d92a66e810acbef6ddbc0bd0cbd9b117ce8acd upstream.
+
+While I was running with KASAN and lockdep enabled, I stumbled upon a
+KASAN report about a UAF to a freed CIL checkpoint.  Looking at the
+comment for xfs_log_item_in_current_chkpt, it seems pretty obvious to me
+that the original patch to xfs_defer_finish_noroll should have done
+something to lock the CIL to prevent it from switching the CIL contexts
+while the predicate runs.
+
+For upper level code that needs to know if a given log item is new
+enough not to need relogging, add a new wrapper that takes the CIL
+context lock long enough to sample the current CIL context.  This is
+kind of racy in that the CIL can switch the contexts immediately after
+sampling, but that's ok because the consequence is that the defer ops
+code is a little slow to relog items.
+
+ ==================================================================
+ BUG: KASAN: use-after-free in xfs_log_item_in_current_chkpt+0x139/0x160 [xfs]
+ Read of size 8 at addr ffff88804ea5f608 by task fsstress/527999
+
+ CPU: 1 PID: 527999 Comm: fsstress Tainted: G      D      5.16.0-rc4-xfsx #rc4
+ Call Trace:
+  <TASK>
+  dump_stack_lvl+0x45/0x59
+  print_address_description.constprop.0+0x1f/0x140
+  kasan_report.cold+0x83/0xdf
+  xfs_log_item_in_current_chkpt+0x139/0x160
+  xfs_defer_finish_noroll+0x3bb/0x1e30
+  __xfs_trans_commit+0x6c8/0xcf0
+  xfs_reflink_remap_extent+0x66f/0x10e0
+  xfs_reflink_remap_blocks+0x2dd/0xa90
+  xfs_file_remap_range+0x27b/0xc30
+  vfs_dedupe_file_range_one+0x368/0x420
+  vfs_dedupe_file_range+0x37c/0x5d0
+  do_vfs_ioctl+0x308/0x1260
+  __x64_sys_ioctl+0xa1/0x170
+  do_syscall_64+0x35/0x80
+  entry_SYSCALL_64_after_hwframe+0x44/0xae
+ RIP: 0033:0x7f2c71a2950b
+ Code: 0f 1e fa 48 8b 05 85 39 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff
+ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01
+f0 ff ff 73 01 c3 48 8b 0d 55 39 0d 00 f7 d8 64 89 01 48
+ RSP: 002b:00007ffe8c0e03c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+ RAX: ffffffffffffffda RBX: 00005600862a8740 RCX: 00007f2c71a2950b
+ RDX: 00005600862a7be0 RSI: 00000000c0189436 RDI: 0000000000000004
+ RBP: 000000000000000b R08: 0000000000000027 R09: 0000000000000003
+ R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000005a
+ R13: 00005600862804a8 R14: 0000000000016000 R15: 00005600862a8a20
+  </TASK>
+
+ Allocated by task 464064:
+  kasan_save_stack+0x1e/0x50
+  __kasan_kmalloc+0x81/0xa0
+  kmem_alloc+0xcd/0x2c0 [xfs]
+  xlog_cil_ctx_alloc+0x17/0x1e0 [xfs]
+  xlog_cil_push_work+0x141/0x13d0 [xfs]
+  process_one_work+0x7f6/0x1380
+  worker_thread+0x59d/0x1040
+  kthread+0x3b0/0x490
+  ret_from_fork+0x1f/0x30
+
+ Freed by task 51:
+  kasan_save_stack+0x1e/0x50
+  kasan_set_track+0x21/0x30
+  kasan_set_free_info+0x20/0x30
+  __kasan_slab_free+0xed/0x130
+  slab_free_freelist_hook+0x7f/0x160
+  kfree+0xde/0x340
+  xlog_cil_committed+0xbfd/0xfe0 [xfs]
+  xlog_cil_process_committed+0x103/0x1c0 [xfs]
+  xlog_state_do_callback+0x45d/0xbd0 [xfs]
+  xlog_ioend_work+0x116/0x1c0 [xfs]
+  process_one_work+0x7f6/0x1380
+  worker_thread+0x59d/0x1040
+  kthread+0x3b0/0x490
+  ret_from_fork+0x1f/0x30
+
+ Last potentially related work creation:
+  kasan_save_stack+0x1e/0x50
+  __kasan_record_aux_stack+0xb7/0xc0
+  insert_work+0x48/0x2e0
+  __queue_work+0x4e7/0xda0
+  queue_work_on+0x69/0x80
+  xlog_cil_push_now.isra.0+0x16b/0x210 [xfs]
+  xlog_cil_force_seq+0x1b7/0x850 [xfs]
+  xfs_log_force_seq+0x1c7/0x670 [xfs]
+  xfs_file_fsync+0x7c1/0xa60 [xfs]
+  __x64_sys_fsync+0x52/0x80
+  do_syscall_64+0x35/0x80
+  entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+ The buggy address belongs to the object at ffff88804ea5f600
+  which belongs to the cache kmalloc-256 of size 256
+ The buggy address is located 8 bytes inside of
+  256-byte region [ffff88804ea5f600, ffff88804ea5f700)
+ The buggy address belongs to the page:
+ page:ffffea00013a9780 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88804ea5ea00 pfn:0x4ea5e
+ head:ffffea00013a9780 order:1 compound_mapcount:0
+ flags: 0x4fff80000010200(slab|head|node=1|zone=1|lastcpupid=0xfff)
+ raw: 04fff80000010200 ffffea0001245908 ffffea00011bd388 ffff888004c42b40
+ raw: ffff88804ea5ea00 0000000000100009 00000001ffffffff 0000000000000000
+ page dumped because: kasan: bad access detected
+
+ Memory state around the buggy address:
+  ffff88804ea5f500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+  ffff88804ea5f580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ >ffff88804ea5f600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+                       ^
+  ffff88804ea5f680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+  ffff88804ea5f700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ==================================================================
+
+Fixes: 4e919af7827a ("xfs: periodically relog deferred intent items")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_cil.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_log_cil.c
++++ b/fs/xfs/xfs_log_cil.c
+@@ -1179,9 +1179,9 @@ out_shutdown:
+  */
+ bool
+ xfs_log_item_in_current_chkpt(
+-      struct xfs_log_item *lip)
++      struct xfs_log_item     *lip)
+ {
+-      struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
++      struct xfs_cil          *cil = lip->li_mountp->m_log->l_cilp;
+       if (list_empty(&lip->li_cil))
+               return false;
+@@ -1191,7 +1191,7 @@ xfs_log_item_in_current_chkpt(
+        * first checkpoint it is written to. Hence if it is different to the
+        * current sequence, we're in a new checkpoint.
+        */
+-      return lip->li_seq == ctx->sequence;
++      return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
+ }
+ /*
diff --git a/queue-5.10/xfs-refactor-xfs_file_fsync.patch b/queue-5.10/xfs-refactor-xfs_file_fsync.patch
new file mode 100644 (file)
index 0000000..1829738
--- /dev/null
@@ -0,0 +1,136 @@
+From foo@baz Mon Aug  1 10:39:45 AM CEST 2022
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Fri, 29 Jul 2022 18:16:01 +0200
+Subject: xfs: refactor xfs_file_fsync
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Luis Chamberlain <mcgrof@kernel.org>, Adam Manzanares <a.manzanares@samsung.com>, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Christoph Hellwig <hch@lst.de>, Brian Foster <bfoster@redhat.com>, Dave Chinner <dchinner@redhat.com>
+Message-ID: <20220729161609.4071252-2-amir73il@gmail.com>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit f22c7f87777361f94aa17f746fbadfa499248dc8 upstream.
+
+[backported for dependency]
+
+Factor out the log syncing logic into two helpers to make the code easier
+to read and more maintainable.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_file.c |   81 +++++++++++++++++++++++++++++++++---------------------
+ 1 file changed, 50 insertions(+), 31 deletions(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -118,6 +118,54 @@ xfs_dir_fsync(
+       return xfs_log_force_inode(ip);
+ }
++static xfs_lsn_t
++xfs_fsync_lsn(
++      struct xfs_inode        *ip,
++      bool                    datasync)
++{
++      if (!xfs_ipincount(ip))
++              return 0;
++      if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
++              return 0;
++      return ip->i_itemp->ili_last_lsn;
++}
++
++/*
++ * All metadata updates are logged, which means that we just have to flush the
++ * log up to the latest LSN that touched the inode.
++ *
++ * If we have concurrent fsync/fdatasync() calls, we need them to all block on
++ * the log force before we clear the ili_fsync_fields field. This ensures that
++ * we don't get a racing sync operation that does not wait for the metadata to
++ * hit the journal before returning.  If we race with clearing ili_fsync_fields,
++ * then all that will happen is the log force will do nothing as the lsn will
++ * already be on disk.  We can't race with setting ili_fsync_fields because that
++ * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
++ * shared until after the ili_fsync_fields is cleared.
++ */
++static  int
++xfs_fsync_flush_log(
++      struct xfs_inode        *ip,
++      bool                    datasync,
++      int                     *log_flushed)
++{
++      int                     error = 0;
++      xfs_lsn_t               lsn;
++
++      xfs_ilock(ip, XFS_ILOCK_SHARED);
++      lsn = xfs_fsync_lsn(ip, datasync);
++      if (lsn) {
++              error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC,
++                                        log_flushed);
++
++              spin_lock(&ip->i_itemp->ili_lock);
++              ip->i_itemp->ili_fsync_fields = 0;
++              spin_unlock(&ip->i_itemp->ili_lock);
++      }
++      xfs_iunlock(ip, XFS_ILOCK_SHARED);
++      return error;
++}
++
+ STATIC int
+ xfs_file_fsync(
+       struct file             *file,
+@@ -125,13 +173,10 @@ xfs_file_fsync(
+       loff_t                  end,
+       int                     datasync)
+ {
+-      struct inode            *inode = file->f_mapping->host;
+-      struct xfs_inode        *ip = XFS_I(inode);
+-      struct xfs_inode_log_item *iip = ip->i_itemp;
++      struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     error = 0;
+       int                     log_flushed = 0;
+-      xfs_lsn_t               lsn = 0;
+       trace_xfs_file_fsync(ip);
+@@ -155,33 +200,7 @@ xfs_file_fsync(
+       else if (mp->m_logdev_targp != mp->m_ddev_targp)
+               xfs_blkdev_issue_flush(mp->m_ddev_targp);
+-      /*
+-       * All metadata updates are logged, which means that we just have to
+-       * flush the log up to the latest LSN that touched the inode. If we have
+-       * concurrent fsync/fdatasync() calls, we need them to all block on the
+-       * log force before we clear the ili_fsync_fields field. This ensures
+-       * that we don't get a racing sync operation that does not wait for the
+-       * metadata to hit the journal before returning. If we race with
+-       * clearing the ili_fsync_fields, then all that will happen is the log
+-       * force will do nothing as the lsn will already be on disk. We can't
+-       * race with setting ili_fsync_fields because that is done under
+-       * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+-       * until after the ili_fsync_fields is cleared.
+-       */
+-      xfs_ilock(ip, XFS_ILOCK_SHARED);
+-      if (xfs_ipincount(ip)) {
+-              if (!datasync ||
+-                  (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+-                      lsn = iip->ili_last_lsn;
+-      }
+-
+-      if (lsn) {
+-              error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+-              spin_lock(&iip->ili_lock);
+-              iip->ili_fsync_fields = 0;
+-              spin_unlock(&iip->ili_lock);
+-      }
+-      xfs_iunlock(ip, XFS_ILOCK_SHARED);
++      error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
+       /*
+        * If we only have a single device, and the log force about was
diff --git a/queue-5.10/xfs-remove-dead-stale-buf-unpin-handling-code.patch b/queue-5.10/xfs-remove-dead-stale-buf-unpin-handling-code.patch
new file mode 100644 (file)
index 0000000..a9e75d8
--- /dev/null
@@ -0,0 +1,74 @@
+From foo@baz Mon Aug  1 10:39:45 AM CEST 2022
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Fri, 29 Jul 2022 18:16:07 +0200
+Subject: xfs: remove dead stale buf unpin handling code
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Luis Chamberlain <mcgrof@kernel.org>, Adam Manzanares <a.manzanares@samsung.com>, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Brian Foster <bfoster@redhat.com>
+Message-ID: <20220729161609.4071252-8-amir73il@gmail.com>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit e53d3aa0b605c49d780e1b2fd0b49dba4154f32b upstream.
+
+This code goes back to a time when transaction commits wrote
+directly to iclogs. The associated log items were pinned, written to
+the log, and then "uncommitted" if some part of the log write had
+failed. This uncommit sequence called an ->iop_unpin_remove()
+handler that was eventually folded into ->iop_unpin() via the remove
+parameter. The log subsystem has since changed significantly in that
+transactions commit to the CIL instead of direct to iclogs, though
+log items must still be aborted in the event of an eventual log I/O
+error. However, the context for a log item abort is now asynchronous
+from transaction commit, which means the committing transaction has
+been freed by this point in time and the transaction uncommit
+sequence of events is no longer relevant.
+
+Further, since stale buffers remain locked at transaction commit
+through unpin, we can be certain that the buffer is not associated
+with any transaction when the unpin callback executes. Remove this
+unused hunk of code and replace it with an assertion that the buffer
+is disassociated from transaction context.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c |   21 ++-------------------
+ 1 file changed, 2 insertions(+), 19 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -435,28 +435,11 @@ xfs_buf_item_unpin(
+               ASSERT(xfs_buf_islocked(bp));
+               ASSERT(bp->b_flags & XBF_STALE);
+               ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
++              ASSERT(list_empty(&lip->li_trans));
++              ASSERT(!bp->b_transp);
+               trace_xfs_buf_item_unpin_stale(bip);
+-              if (remove) {
+-                      /*
+-                       * If we are in a transaction context, we have to
+-                       * remove the log item from the transaction as we are
+-                       * about to release our reference to the buffer.  If we
+-                       * don't, the unlock that occurs later in
+-                       * xfs_trans_uncommit() will try to reference the
+-                       * buffer which we no longer have a hold on.
+-                       */
+-                      if (!list_empty(&lip->li_trans))
+-                              xfs_trans_del_item(lip);
+-
+-                      /*
+-                       * Since the transaction no longer refers to the buffer,
+-                       * the buffer should no longer refer to the transaction.
+-                       */
+-                      bp->b_transp = NULL;
+-              }
+-
+               /*
+                * If we get called here because of an IO error, we may or may
+                * not have the item on the AIL. xfs_trans_ail_delete() will
diff --git a/queue-5.10/xfs-xfs_log_force_lsn-isn-t-passed-a-lsn.patch b/queue-5.10/xfs-xfs_log_force_lsn-isn-t-passed-a-lsn.patch
new file mode 100644 (file)
index 0000000..9eaa039
--- /dev/null
@@ -0,0 +1,472 @@
+From foo@baz Mon Aug  1 10:39:45 AM CEST 2022
+From: Amir Goldstein <amir73il@gmail.com>
+Date: Fri, 29 Jul 2022 18:16:02 +0200
+Subject: xfs: xfs_log_force_lsn isn't passed a LSN
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Sasha Levin <sashal@kernel.org>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandan.babu@oracle.com>, Luis Chamberlain <mcgrof@kernel.org>, Adam Manzanares <a.manzanares@samsung.com>, linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner <dchinner@redhat.com>, Brian Foster <bfoster@redhat.com>, Allison Henderson <allison.henderson@oracle.com>
+Message-ID: <20220729161609.4071252-3-amir73il@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 5f9b4b0de8dc2fb8eb655463b438001c111570fe upstream.
+
+[backported from CIL scalability series for dependency]
+
+In doing an investigation into AIL push stalls, I was looking at the
+log force code to see if an async CIL push could be done instead.
+This led me to xfs_log_force_lsn() and looking at how it works.
+
+xfs_log_force_lsn() is only called from inode synchronisation
+contexts such as fsync(), and it takes the ip->i_itemp->ili_last_lsn
+value as the LSN to sync the log to. This gets passed to
+xlog_cil_force_lsn() via xfs_log_force_lsn() to flush the CIL to the
+journal, and then used by xfs_log_force_lsn() to flush the iclogs to
+the journal.
+
+The problem is that ip->i_itemp->ili_last_lsn does not store a
+log sequence number. What it stores is passed to it from the
+->iop_committing method, which is called by xfs_log_commit_cil().
+The value this passes to the iop_committing method is the CIL
+context sequence number that the item was committed to.
+
+As it turns out, xlog_cil_force_lsn() converts the sequence to an
+actual commit LSN for the related context and returns that to
+xfs_log_force_lsn(). xfs_log_force_lsn() overwrites its "lsn"
+variable that contained a sequence with an actual LSN and then uses
+that to sync the iclogs.
+
+This caused me some confusion for a while, even though I originally
+wrote all this code a decade ago. ->iop_committing is only used by
+a couple of log item types, and only inode items use the sequence
+number it is passed.
+
+Let's clean up the API, CIL structures and inode log item to call it
+a sequence number, and make it clear that the high level code is
+using CIL sequence numbers and not on-disk LSNs for integrity
+synchronisation purposes.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_types.h |    1 +
+ fs/xfs/xfs_buf_item.c     |    2 +-
+ fs/xfs/xfs_dquot_item.c   |    2 +-
+ fs/xfs/xfs_file.c         |   14 +++++++-------
+ fs/xfs/xfs_inode.c        |   10 +++++-----
+ fs/xfs/xfs_inode_item.c   |    4 ++--
+ fs/xfs/xfs_inode_item.h   |    2 +-
+ fs/xfs/xfs_log.c          |   27 ++++++++++++++-------------
+ fs/xfs/xfs_log.h          |    4 +---
+ fs/xfs/xfs_log_cil.c      |   30 +++++++++++-------------------
+ fs/xfs/xfs_log_priv.h     |   15 +++++++--------
+ fs/xfs/xfs_trans.c        |    6 +++---
+ fs/xfs/xfs_trans.h        |    4 ++--
+ 13 files changed, 56 insertions(+), 65 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_types.h
++++ b/fs/xfs/libxfs/xfs_types.h
+@@ -21,6 +21,7 @@ typedef int32_t              xfs_suminfo_t;  /* type
+ typedef uint32_t      xfs_rtword_t;   /* word type for bitmap manipulations */
+ typedef int64_t               xfs_lsn_t;      /* log sequence number */
++typedef int64_t               xfs_csn_t;      /* CIL sequence number */
+ typedef uint32_t      xfs_dablk_t;    /* dir/attr block number (in file) */
+ typedef uint32_t      xfs_dahash_t;   /* dir/attr hash value */
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -632,7 +632,7 @@ xfs_buf_item_release(
+ STATIC void
+ xfs_buf_item_committing(
+       struct xfs_log_item     *lip,
+-      xfs_lsn_t               commit_lsn)
++      xfs_csn_t               seq)
+ {
+       return xfs_buf_item_release(lip);
+ }
+--- a/fs/xfs/xfs_dquot_item.c
++++ b/fs/xfs/xfs_dquot_item.c
+@@ -188,7 +188,7 @@ xfs_qm_dquot_logitem_release(
+ STATIC void
+ xfs_qm_dquot_logitem_committing(
+       struct xfs_log_item     *lip,
+-      xfs_lsn_t               commit_lsn)
++      xfs_csn_t               seq)
+ {
+       return xfs_qm_dquot_logitem_release(lip);
+ }
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -118,8 +118,8 @@ xfs_dir_fsync(
+       return xfs_log_force_inode(ip);
+ }
+-static xfs_lsn_t
+-xfs_fsync_lsn(
++static xfs_csn_t
++xfs_fsync_seq(
+       struct xfs_inode        *ip,
+       bool                    datasync)
+ {
+@@ -127,7 +127,7 @@ xfs_fsync_lsn(
+               return 0;
+       if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+               return 0;
+-      return ip->i_itemp->ili_last_lsn;
++      return ip->i_itemp->ili_commit_seq;
+ }
+ /*
+@@ -150,12 +150,12 @@ xfs_fsync_flush_log(
+       int                     *log_flushed)
+ {
+       int                     error = 0;
+-      xfs_lsn_t               lsn;
++      xfs_csn_t               seq;
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+-      lsn = xfs_fsync_lsn(ip, datasync);
+-      if (lsn) {
+-              error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC,
++      seq = xfs_fsync_seq(ip, datasync);
++      if (seq) {
++              error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
+                                         log_flushed);
+               spin_lock(&ip->i_itemp->ili_lock);
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2754,7 +2754,7 @@ xfs_iunpin(
+       trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
+       /* Give the log a push to start the unpinning I/O */
+-      xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL);
++      xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
+ }
+@@ -3716,16 +3716,16 @@ int
+ xfs_log_force_inode(
+       struct xfs_inode        *ip)
+ {
+-      xfs_lsn_t               lsn = 0;
++      xfs_csn_t               seq = 0;
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+       if (xfs_ipincount(ip))
+-              lsn = ip->i_itemp->ili_last_lsn;
++              seq = ip->i_itemp->ili_commit_seq;
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+-      if (!lsn)
++      if (!seq)
+               return 0;
+-      return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
++      return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
+ }
+ /*
+--- a/fs/xfs/xfs_inode_item.c
++++ b/fs/xfs/xfs_inode_item.c
+@@ -617,9 +617,9 @@ xfs_inode_item_committed(
+ STATIC void
+ xfs_inode_item_committing(
+       struct xfs_log_item     *lip,
+-      xfs_lsn_t               commit_lsn)
++      xfs_csn_t               seq)
+ {
+-      INODE_ITEM(lip)->ili_last_lsn = commit_lsn;
++      INODE_ITEM(lip)->ili_commit_seq = seq;
+       return xfs_inode_item_release(lip);
+ }
+--- a/fs/xfs/xfs_inode_item.h
++++ b/fs/xfs/xfs_inode_item.h
+@@ -33,7 +33,7 @@ struct xfs_inode_log_item {
+       unsigned int            ili_fields;        /* fields to be logged */
+       unsigned int            ili_fsync_fields;  /* logged since last fsync */
+       xfs_lsn_t               ili_flush_lsn;     /* lsn at last flush */
+-      xfs_lsn_t               ili_last_lsn;      /* lsn at last transaction */
++      xfs_csn_t               ili_commit_seq;    /* last transaction commit */
+ };
+ static inline int xfs_inode_clean(struct xfs_inode *ip)
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -3210,14 +3210,13 @@ out_error:
+ }
+ static int
+-__xfs_log_force_lsn(
+-      struct xfs_mount        *mp,
++xlog_force_lsn(
++      struct xlog             *log,
+       xfs_lsn_t               lsn,
+       uint                    flags,
+       int                     *log_flushed,
+       bool                    already_slept)
+ {
+-      struct xlog             *log = mp->m_log;
+       struct xlog_in_core     *iclog;
+       spin_lock(&log->l_icloglock);
+@@ -3250,8 +3249,6 @@ __xfs_log_force_lsn(
+               if (!already_slept &&
+                   (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC ||
+                    iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) {
+-                      XFS_STATS_INC(mp, xs_log_force_sleep);
+-
+                       xlog_wait(&iclog->ic_prev->ic_write_wait,
+                                       &log->l_icloglock);
+                       return -EAGAIN;
+@@ -3289,25 +3286,29 @@ out_error:
+  * to disk, that thread will wake up all threads waiting on the queue.
+  */
+ int
+-xfs_log_force_lsn(
++xfs_log_force_seq(
+       struct xfs_mount        *mp,
+-      xfs_lsn_t               lsn,
++      xfs_csn_t               seq,
+       uint                    flags,
+       int                     *log_flushed)
+ {
++      struct xlog             *log = mp->m_log;
++      xfs_lsn_t               lsn;
+       int                     ret;
+-      ASSERT(lsn != 0);
++      ASSERT(seq != 0);
+       XFS_STATS_INC(mp, xs_log_force);
+-      trace_xfs_log_force(mp, lsn, _RET_IP_);
++      trace_xfs_log_force(mp, seq, _RET_IP_);
+-      lsn = xlog_cil_force_lsn(mp->m_log, lsn);
++      lsn = xlog_cil_force_seq(log, seq);
+       if (lsn == NULLCOMMITLSN)
+               return 0;
+-      ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false);
+-      if (ret == -EAGAIN)
+-              ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true);
++      ret = xlog_force_lsn(log, lsn, flags, log_flushed, false);
++      if (ret == -EAGAIN) {
++              XFS_STATS_INC(mp, xs_log_force_sleep);
++              ret = xlog_force_lsn(log, lsn, flags, log_flushed, true);
++      }
+       return ret;
+ }
+--- a/fs/xfs/xfs_log.h
++++ b/fs/xfs/xfs_log.h
+@@ -106,7 +106,7 @@ struct xfs_item_ops;
+ struct xfs_trans;
+ int     xfs_log_force(struct xfs_mount *mp, uint flags);
+-int     xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
++int     xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags,
+               int *log_forced);
+ int     xfs_log_mount(struct xfs_mount        *mp,
+                       struct xfs_buftarg      *log_target,
+@@ -132,8 +132,6 @@ bool       xfs_log_writable(struct xfs_mount *
+ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
+ void    xfs_log_ticket_put(struct xlog_ticket *ticket);
+-void  xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+-                              xfs_lsn_t *commit_lsn, bool regrant);
+ void  xlog_cil_process_committed(struct list_head *list);
+ bool  xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
+--- a/fs/xfs/xfs_log_cil.c
++++ b/fs/xfs/xfs_log_cil.c
+@@ -777,7 +777,7 @@ xlog_cil_push_work(
+        * that higher sequences will wait for us to write out a commit record
+        * before they do.
+        *
+-       * xfs_log_force_lsn requires us to mirror the new sequence into the cil
++       * xfs_log_force_seq requires us to mirror the new sequence into the cil
+        * structure atomically with the addition of this sequence to the
+        * committing list. This also ensures that we can do unlocked checks
+        * against the current sequence in log forces without risking
+@@ -1020,16 +1020,14 @@ xlog_cil_empty(
+  * allowed again.
+  */
+ void
+-xfs_log_commit_cil(
+-      struct xfs_mount        *mp,
++xlog_cil_commit(
++      struct xlog             *log,
+       struct xfs_trans        *tp,
+-      xfs_lsn_t               *commit_lsn,
++      xfs_csn_t               *commit_seq,
+       bool                    regrant)
+ {
+-      struct xlog             *log = mp->m_log;
+       struct xfs_cil          *cil = log->l_cilp;
+       struct xfs_log_item     *lip, *next;
+-      xfs_lsn_t               xc_commit_lsn;
+       /*
+        * Do all necessary memory allocation before we lock the CIL.
+@@ -1043,10 +1041,6 @@ xfs_log_commit_cil(
+       xlog_cil_insert_items(log, tp);
+-      xc_commit_lsn = cil->xc_ctx->sequence;
+-      if (commit_lsn)
+-              *commit_lsn = xc_commit_lsn;
+-
+       if (regrant && !XLOG_FORCED_SHUTDOWN(log))
+               xfs_log_ticket_regrant(log, tp->t_ticket);
+       else
+@@ -1069,8 +1063,10 @@ xfs_log_commit_cil(
+       list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+               xfs_trans_del_item(lip);
+               if (lip->li_ops->iop_committing)
+-                      lip->li_ops->iop_committing(lip, xc_commit_lsn);
++                      lip->li_ops->iop_committing(lip, cil->xc_ctx->sequence);
+       }
++      if (commit_seq)
++              *commit_seq = cil->xc_ctx->sequence;
+       /* xlog_cil_push_background() releases cil->xc_ctx_lock */
+       xlog_cil_push_background(log);
+@@ -1087,9 +1083,9 @@ xfs_log_commit_cil(
+  * iclog flush is necessary following this call.
+  */
+ xfs_lsn_t
+-xlog_cil_force_lsn(
++xlog_cil_force_seq(
+       struct xlog     *log,
+-      xfs_lsn_t       sequence)
++      xfs_csn_t       sequence)
+ {
+       struct xfs_cil          *cil = log->l_cilp;
+       struct xfs_cil_ctx      *ctx;
+@@ -1185,21 +1181,17 @@ bool
+ xfs_log_item_in_current_chkpt(
+       struct xfs_log_item *lip)
+ {
+-      struct xfs_cil_ctx *ctx;
++      struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+       if (list_empty(&lip->li_cil))
+               return false;
+-      ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+-
+       /*
+        * li_seq is written on the first commit of a log item to record the
+        * first checkpoint it is written to. Hence if it is different to the
+        * current sequence, we're in a new checkpoint.
+        */
+-      if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
+-              return false;
+-      return true;
++      return lip->li_seq == ctx->sequence;
+ }
+ /*
+--- a/fs/xfs/xfs_log_priv.h
++++ b/fs/xfs/xfs_log_priv.h
+@@ -230,7 +230,7 @@ struct xfs_cil;
+ struct xfs_cil_ctx {
+       struct xfs_cil          *cil;
+-      xfs_lsn_t               sequence;       /* chkpt sequence # */
++      xfs_csn_t               sequence;       /* chkpt sequence # */
+       xfs_lsn_t               start_lsn;      /* first LSN of chkpt commit */
+       xfs_lsn_t               commit_lsn;     /* chkpt commit record lsn */
+       struct xlog_ticket      *ticket;        /* chkpt ticket */
+@@ -268,10 +268,10 @@ struct xfs_cil {
+       struct xfs_cil_ctx      *xc_ctx;
+       spinlock_t              xc_push_lock ____cacheline_aligned_in_smp;
+-      xfs_lsn_t               xc_push_seq;
++      xfs_csn_t               xc_push_seq;
+       struct list_head        xc_committing;
+       wait_queue_head_t       xc_commit_wait;
+-      xfs_lsn_t               xc_current_sequence;
++      xfs_csn_t               xc_current_sequence;
+       struct work_struct      xc_push_work;
+       wait_queue_head_t       xc_push_wait;   /* background push throttle */
+ } ____cacheline_aligned_in_smp;
+@@ -547,19 +547,18 @@ int      xlog_cil_init(struct xlog *log);
+ void  xlog_cil_init_post_recovery(struct xlog *log);
+ void  xlog_cil_destroy(struct xlog *log);
+ bool  xlog_cil_empty(struct xlog *log);
++void  xlog_cil_commit(struct xlog *log, struct xfs_trans *tp,
++                      xfs_csn_t *commit_seq, bool regrant);
+ /*
+  * CIL force routines
+  */
+-xfs_lsn_t
+-xlog_cil_force_lsn(
+-      struct xlog *log,
+-      xfs_lsn_t sequence);
++xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);
+ static inline void
+ xlog_cil_force(struct xlog *log)
+ {
+-      xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
++      xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence);
+ }
+ /*
+--- a/fs/xfs/xfs_trans.c
++++ b/fs/xfs/xfs_trans.c
+@@ -832,7 +832,7 @@ __xfs_trans_commit(
+       bool                    regrant)
+ {
+       struct xfs_mount        *mp = tp->t_mountp;
+-      xfs_lsn_t               commit_lsn = -1;
++      xfs_csn_t               commit_seq = 0;
+       int                     error = 0;
+       int                     sync = tp->t_flags & XFS_TRANS_SYNC;
+@@ -874,7 +874,7 @@ __xfs_trans_commit(
+               xfs_trans_apply_sb_deltas(tp);
+       xfs_trans_apply_dquot_deltas(tp);
+-      xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
++      xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant);
+       xfs_trans_free(tp);
+@@ -883,7 +883,7 @@ __xfs_trans_commit(
+        * log out now and wait for it.
+        */
+       if (sync) {
+-              error = xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
++              error = xfs_log_force_seq(mp, commit_seq, XFS_LOG_SYNC, NULL);
+               XFS_STATS_INC(mp, xs_trans_sync);
+       } else {
+               XFS_STATS_INC(mp, xs_trans_async);
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -43,7 +43,7 @@ struct xfs_log_item {
+       struct list_head                li_cil;         /* CIL pointers */
+       struct xfs_log_vec              *li_lv;         /* active log vector */
+       struct xfs_log_vec              *li_lv_shadow;  /* standby vector */
+-      xfs_lsn_t                       li_seq;         /* CIL commit seq */
++      xfs_csn_t                       li_seq;         /* CIL commit seq */
+ };
+ /*
+@@ -69,7 +69,7 @@ struct xfs_item_ops {
+       void (*iop_pin)(struct xfs_log_item *);
+       void (*iop_unpin)(struct xfs_log_item *, int remove);
+       uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+-      void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
++      void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq);
+       void (*iop_release)(struct xfs_log_item *);
+       xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
+       int (*iop_recover)(struct xfs_log_item *lip,