From 198bab6df9db2f47d2c4f4a26474b8e06a0d245c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Mon, 24 Jul 2023 08:21:00 +0200
Subject: [PATCH] 6.4-stable patches

added patches:
      jbd2-recheck-chechpointing-non-dirty-buffer.patch
---
 ...check-chechpointing-non-dirty-buffer.patch | 191 ++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 queue-6.4/jbd2-recheck-chechpointing-non-dirty-buffer.patch

diff --git a/queue-6.4/jbd2-recheck-chechpointing-non-dirty-buffer.patch b/queue-6.4/jbd2-recheck-chechpointing-non-dirty-buffer.patch
new file mode 100644
index 00000000000..2cd2baafb78
--- /dev/null
+++ b/queue-6.4/jbd2-recheck-chechpointing-non-dirty-buffer.patch
@@ -0,0 +1,191 @@
+From c2d6fd9d6f35079f1669f0100f05b46708c74b7f Mon Sep 17 00:00:00 2001
+From: Zhang Yi
+Date: Tue, 6 Jun 2023 21:59:23 +0800
+Subject: jbd2: recheck chechpointing non-dirty buffer
+
+From: Zhang Yi
+
+commit c2d6fd9d6f35079f1669f0100f05b46708c74b7f upstream.
+
+There is a long-standing metadata corruption issue that happens from
+time to time, but it is very difficult to reproduce and analyse. With
+the help of the JBD2_CYCLE_RECORD option, we found that the problem is
+the checkpointing process failing to write out some buffers because it
+races with another do_get_write_access(). See the trace below for details.
+
+jbd2_log_do_checkpoint()    //transaction X
+ //buffer A is dirty and does not belong to any transaction
+ __buffer_relink_io()       //move it to the IO list
+ __flush_batch()
+  write_dirty_buffer()
+                            do_get_write_access()
+                            clear_buffer_dirty
+                            __jbd2_journal_file_buffer()
+                            //add buffer A to a new transaction Y
+   lock_buffer(bh)
+  //buffer A is already clean, nothing is written out
+ __jbd2_journal_remove_checkpoint()
+ //finish checkpoint except buffer A
+ //filesystem corrupts if the new transaction Y isn't fully written out.
+
+The loop that walks t_checkpoint_list in jbd2_log_do_checkpoint()
+already handles waiting for buffers that are under IO or have been
+re-added to a new committing transaction, and it also removes cleaned
+buffers, which makes sure the list will eventually become empty. So it
+is fine to leave buffers on the t_checkpoint_list while flushing them
+out and to stop using the t_checkpoint_io_list completely.
+
+Cc: stable@vger.kernel.org
+Suggested-by: Jan Kara
+Signed-off-by: Zhang Yi
+Tested-by: Zhihao Cheng
+Reviewed-by: Jan Kara
+Link: https://lore.kernel.org/r/20230606135928.434610-2-yi.zhang@huaweicloud.com
+Signed-off-by: Theodore Ts'o
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/jbd2/checkpoint.c | 102 ++++++++++++++-------------------------------------
+ 1 file changed, 29 insertions(+), 73 deletions(-)
+
+--- a/fs/jbd2/checkpoint.c
++++ b/fs/jbd2/checkpoint.c
+@@ -58,28 +58,6 @@ static inline void __buffer_unlink(struc
+ }
+ 
+ /*
+- * Move a buffer from the checkpoint list to the checkpoint io list
+- *
+- * Called with j_list_lock held
+- */
+-static inline void __buffer_relink_io(struct journal_head *jh)
+-{
+-	transaction_t *transaction = jh->b_cp_transaction;
+-
+-	__buffer_unlink_first(jh);
+-
+-	if (!transaction->t_checkpoint_io_list) {
+-		jh->b_cpnext = jh->b_cpprev = jh;
+-	} else {
+-		jh->b_cpnext = transaction->t_checkpoint_io_list;
+-		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+-		jh->b_cpprev->b_cpnext = jh;
+-		jh->b_cpnext->b_cpprev = jh;
+-	}
+-	transaction->t_checkpoint_io_list = jh;
+-}
+-
+-/*
+  * Check a checkpoint buffer could be release or not.
+  *
+  * Requires j_list_lock
+@@ -183,6 +161,7 @@ __flush_batch(journal_t *journal, int *b
+ 		struct buffer_head *bh = journal->j_chkpt_bhs[i];
+ 		BUFFER_TRACE(bh, "brelse");
+ 		__brelse(bh);
++		journal->j_chkpt_bhs[i] = NULL;
+ 	}
+ 	*batch_count = 0;
+ }
+@@ -242,6 +221,11 @@ restart:
+ 		jh = transaction->t_checkpoint_list;
+ 		bh = jh2bh(jh);
+ 
++		/*
++		 * The buffer may be writing back, or flushing out in the
++		 * last couple of cycles, or re-adding into a new transaction,
++		 * need to check it again until it's unlocked.
++		 */
+ 		if (buffer_locked(bh)) {
+ 			get_bh(bh);
+ 			spin_unlock(&journal->j_list_lock);
+@@ -287,28 +271,32 @@ restart:
+ 		}
+ 		if (!buffer_dirty(bh)) {
+ 			BUFFER_TRACE(bh, "remove from checkpoint");
+-			if (__jbd2_journal_remove_checkpoint(jh))
+-				/* The transaction was released; we're done */
++			/*
++			 * If the transaction was released or the checkpoint
++			 * list was empty, we're done.
++			 */
++			if (__jbd2_journal_remove_checkpoint(jh) ||
++			    !transaction->t_checkpoint_list)
+ 				goto out;
+-			continue;
++		} else {
++			/*
++			 * We are about to write the buffer, it could be
++			 * raced by some other transaction shrink or buffer
++			 * re-log logic once we release the j_list_lock,
++			 * leave it on the checkpoint list and check status
++			 * again to make sure it's clean.
++			 */
++			BUFFER_TRACE(bh, "queue");
++			get_bh(bh);
++			J_ASSERT_BH(bh, !buffer_jwrite(bh));
++			journal->j_chkpt_bhs[batch_count++] = bh;
++			transaction->t_chp_stats.cs_written++;
++			transaction->t_checkpoint_list = jh->b_cpnext;
+ 		}
+-		/*
+-		 * Important: we are about to write the buffer, and
+-		 * possibly block, while still holding the journal
+-		 * lock. We cannot afford to let the transaction
+-		 * logic start messing around with this buffer before
+-		 * we write it to disk, as that would break
+-		 * recoverability.
+-		 */
+-		BUFFER_TRACE(bh, "queue");
+-		get_bh(bh);
+-		J_ASSERT_BH(bh, !buffer_jwrite(bh));
+-		journal->j_chkpt_bhs[batch_count++] = bh;
+-		__buffer_relink_io(jh);
+-		transaction->t_chp_stats.cs_written++;
++
+ 		if ((batch_count == JBD2_NR_BATCH) ||
+-		    need_resched() ||
+-		    spin_needbreak(&journal->j_list_lock))
++		    need_resched() || spin_needbreak(&journal->j_list_lock) ||
++		    jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0])
+ 			goto unlock_and_flush;
+ 	}
+ 
+@@ -322,38 +310,6 @@ restart:
+ 		goto restart;
+ 	}
+ 
+-	/*
+-	 * Now we issued all of the transaction's buffers, let's deal
+-	 * with the buffers that are out for I/O.
+-	 */
+-restart2:
+-	/* Did somebody clean up the transaction in the meanwhile? */
+-	if (journal->j_checkpoint_transactions != transaction ||
+-	    transaction->t_tid != this_tid)
+-		goto out;
+-
+-	while (transaction->t_checkpoint_io_list) {
+-		jh = transaction->t_checkpoint_io_list;
+-		bh = jh2bh(jh);
+-		if (buffer_locked(bh)) {
+-			get_bh(bh);
+-			spin_unlock(&journal->j_list_lock);
+-			wait_on_buffer(bh);
+-			/* the journal_head may have gone by now */
+-			BUFFER_TRACE(bh, "brelse");
+-			__brelse(bh);
+-			spin_lock(&journal->j_list_lock);
+-			goto restart2;
+-		}
+-
+-		/*
+-		 * Now in whatever state the buffer currently is, we
+-		 * know that it has been written out and so we can
+-		 * drop it from the list
+-		 */
+-		if (__jbd2_journal_remove_checkpoint(jh))
+-			break;
+-	}
+ out:
+ 	spin_unlock(&journal->j_list_lock);
+ 	result = jbd2_cleanup_journal_tail(journal);
-- 
2.47.3