From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 13 Aug 2021 12:23:12 +0000 (+0200)
Subject: 5.4-stable patches
X-Git-Tag: v4.4.281~3
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=52c0db7cb402cb73cab4efaf3949808645a7897d;p=thirdparty%2Fkernel%2Fstable-queue.git

5.4-stable patches

added patches:
	btrfs-don-t-flush-from-btrfs_delayed_inode_reserve_metadata.patch
	btrfs-export-and-rename-qgroup_reserve_meta.patch
	btrfs-qgroup-don-t-commit-transaction-when-we-already-hold-the-handle.patch
---

diff --git a/queue-5.4/btrfs-don-t-flush-from-btrfs_delayed_inode_reserve_metadata.patch b/queue-5.4/btrfs-don-t-flush-from-btrfs_delayed_inode_reserve_metadata.patch
new file mode 100644
index 00000000000..c333c35bbe2
--- /dev/null
+++ b/queue-5.4/btrfs-don-t-flush-from-btrfs_delayed_inode_reserve_metadata.patch
@@ -0,0 +1,120 @@
+From foo@baz Fri Aug 13 02:22:25 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 20:12:25 +0800
+Subject: btrfs: don't flush from btrfs_delayed_inode_reserve_metadata
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, fdmanana@suse.com, Nikolay Borisov <nborisov@suse.com>, Qu Wenruo <wqu@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <67b189dab7575f9edbc41caa4118ee68856a56a2.1628854236.git.anand.jain@oracle.com>
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit 4d14c5cde5c268a2bc26addecf09489cb953ef64 upstream
+
+Calling btrfs_qgroup_reserve_meta_prealloc from
+btrfs_delayed_inode_reserve_metadata can result in flushing delalloc
+while holding a transaction and delayed node locks. This is deadlock
+prone. In the past multiple commits:
+
+ * ae5e070eaca9 ("btrfs: qgroup: don't try to wait flushing if we're
+already holding a transaction")
+
+ * 6f23277a49e6 ("btrfs: qgroup: don't commit transaction when we already
+ hold the handle")
+
+Tried to solve various aspects of this but this was always a
+whack-a-mole game. Unfortunately those 2 fixes don't solve a deadlock
+scenario involving btrfs_delayed_node::mutex. Namely, one thread
+can call btrfs_dirty_inode as a result of reading a file and modifying
+its atime:
+
+  PID: 6963   TASK: ffff8c7f3f94c000  CPU: 2   COMMAND: "test"
+  #0  __schedule at ffffffffa529e07d
+  #1  schedule at ffffffffa529e4ff
+  #2  schedule_timeout at ffffffffa52a1bdd
+  #3  wait_for_completion at ffffffffa529eeea             <-- sleeps with delayed node mutex held
+  #4  start_delalloc_inodes at ffffffffc0380db5
+  #5  btrfs_start_delalloc_snapshot at ffffffffc0393836
+  #6  try_flush_qgroup at ffffffffc03f04b2
+  #7  __btrfs_qgroup_reserve_meta at ffffffffc03f5bb6     <-- tries to reserve space and starts delalloc inodes.
+  #8  btrfs_delayed_update_inode at ffffffffc03e31aa      <-- acquires delayed node mutex
+  #9  btrfs_update_inode at ffffffffc0385ba8
+ #10  btrfs_dirty_inode at ffffffffc038627b               <-- TRANSACTIION OPENED
+ #11  touch_atime at ffffffffa4cf0000
+ #12  generic_file_read_iter at ffffffffa4c1f123
+ #13  new_sync_read at ffffffffa4ccdc8a
+ #14  vfs_read at ffffffffa4cd0849
+ #15  ksys_read at ffffffffa4cd0bd1
+ #16  do_syscall_64 at ffffffffa4a052eb
+ #17  entry_SYSCALL_64_after_hwframe at ffffffffa540008c
+
+This will cause an asynchronous work to flush the delalloc inodes to
+happen which can try to acquire the same delayed_node mutex:
+
+  PID: 455    TASK: ffff8c8085fa4000  CPU: 5   COMMAND: "kworker/u16:30"
+  #0  __schedule at ffffffffa529e07d
+  #1  schedule at ffffffffa529e4ff
+  #2  schedule_preempt_disabled at ffffffffa529e80a
+  #3  __mutex_lock at ffffffffa529fdcb                    <-- goes to sleep, never wakes up.
+  #4  btrfs_delayed_update_inode at ffffffffc03e3143      <-- tries to acquire the mutex
+  #5  btrfs_update_inode at ffffffffc0385ba8              <-- this is the same inode that pid 6963 is holding
+  #6  cow_file_range_inline.constprop.78 at ffffffffc0386be7
+  #7  cow_file_range at ffffffffc03879c1
+  #8  btrfs_run_delalloc_range at ffffffffc038894c
+  #9  writepage_delalloc at ffffffffc03a3c8f
+ #10  __extent_writepage at ffffffffc03a4c01
+ #11  extent_write_cache_pages at ffffffffc03a500b
+ #12  extent_writepages at ffffffffc03a6de2
+ #13  do_writepages at ffffffffa4c277eb
+ #14  __filemap_fdatawrite_range at ffffffffa4c1e5bb
+ #15  btrfs_run_delalloc_work at ffffffffc0380987         <-- starts running delayed nodes
+ #16  normal_work_helper at ffffffffc03b706c
+ #17  process_one_work at ffffffffa4aba4e4
+ #18  worker_thread at ffffffffa4aba6fd
+ #19  kthread at ffffffffa4ac0a3d
+ #20  ret_from_fork at ffffffffa54001ff
+
+To fully address those cases the complete fix is to never issue any
+flushing while holding the transaction or the delayed node lock. This
+patch achieves it by calling qgroup_reserve_meta directly which will
+either succeed without flushing or will fail and return -EDQUOT. In the
+latter case that return value is going to be propagated to
+btrfs_dirty_inode which will fallback to start a new transaction. That's
+fine as the majority of time we expect the inode will have
+BTRFS_DELAYED_NODE_INODE_DIRTY flag set which will result in directly
+copying the in-memory state.
+
+Fixes: c53e9653605d ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT")
+CC: stable@vger.kernel.org # 5.10+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/delayed-inode.c |    3 ++-
+ fs/btrfs/inode.c         |    2 +-
+ 2 files changed, 3 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/delayed-inode.c
++++ b/fs/btrfs/delayed-inode.c
+@@ -627,7 +627,8 @@ static int btrfs_delayed_inode_reserve_m
+ 	 */
+ 	if (!src_rsv || (!trans->bytes_reserved &&
+ 			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+-		ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
++		ret = btrfs_qgroup_reserve_meta(root, num_bytes,
++					  BTRFS_QGROUP_RSV_META_PREALLOC, true);
+ 		if (ret < 0)
+ 			return ret;
+ 		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -6375,7 +6375,7 @@ static int btrfs_dirty_inode(struct inod
+ 		return PTR_ERR(trans);
+ 
+ 	ret = btrfs_update_inode(trans, root, inode);
+-	if (ret && ret == -ENOSPC) {
++	if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
+ 		/* whoops, lets try again with the full transaction */
+ 		btrfs_end_transaction(trans);
+ 		trans = btrfs_start_transaction(root, 1);
diff --git a/queue-5.4/btrfs-export-and-rename-qgroup_reserve_meta.patch b/queue-5.4/btrfs-export-and-rename-qgroup_reserve_meta.patch
new file mode 100644
index 00000000000..f3934725002
--- /dev/null
+++ b/queue-5.4/btrfs-export-and-rename-qgroup_reserve_meta.patch
@@ -0,0 +1,64 @@
+From foo@baz Fri Aug 13 02:22:24 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 20:12:24 +0800
+Subject: btrfs: export and rename qgroup_reserve_meta
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, fdmanana@suse.com, Nikolay Borisov <nborisov@suse.com>, David Sterba <dsterba@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <4cdae9f94a8feae8f848574c214016ae6a923078.1628854236.git.anand.jain@oracle.com>
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit 80e9baed722c853056e0c5374f51524593cb1031 upstream
+
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c |    8 ++++----
+ fs/btrfs/qgroup.h |    3 ++-
+ 2 files changed, 6 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3800,8 +3800,8 @@ static int sub_root_meta_rsv(struct btrf
+ 	return num_bytes;
+ }
+ 
+-static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+-				enum btrfs_qgroup_rsv_type type, bool enforce)
++int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
++			      enum btrfs_qgroup_rsv_type type, bool enforce)
+ {
+ 	struct btrfs_fs_info *fs_info = root->fs_info;
+ 	int ret;
+@@ -3832,14 +3832,14 @@ int __btrfs_qgroup_reserve_meta(struct b
+ {
+ 	int ret;
+ 
+-	ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
++	ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+ 	if (ret <= 0 && ret != -EDQUOT)
+ 		return ret;
+ 
+ 	ret = try_flush_qgroup(root);
+ 	if (ret < 0)
+ 		return ret;
+-	return qgroup_reserve_meta(root, num_bytes, type, enforce);
++	return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+ }
+ 
+ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
+--- a/fs/btrfs/qgroup.h
++++ b/fs/btrfs/qgroup.h
+@@ -349,7 +349,8 @@ int btrfs_qgroup_reserve_data(struct btr
+ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+ int btrfs_qgroup_free_data(struct inode *inode,
+ 			struct extent_changeset *reserved, u64 start, u64 len);
+-
++int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
++			      enum btrfs_qgroup_rsv_type type, bool enforce);
+ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ 				enum btrfs_qgroup_rsv_type type, bool enforce);
+ /* Reserve metadata space for pertrans and prealloc type */
diff --git a/queue-5.4/btrfs-qgroup-don-t-commit-transaction-when-we-already-hold-the-handle.patch b/queue-5.4/btrfs-qgroup-don-t-commit-transaction-when-we-already-hold-the-handle.patch
new file mode 100644
index 00000000000..d95fd50e551
--- /dev/null
+++ b/queue-5.4/btrfs-qgroup-don-t-commit-transaction-when-we-already-hold-the-handle.patch
@@ -0,0 +1,132 @@
+From foo@baz Fri Aug 13 02:22:24 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 20:12:23 +0800
+Subject: btrfs: qgroup: don't commit transaction when we already hold the handle
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, fdmanana@suse.com, Qu Wenruo <wqu@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <387bbbec5e040ec46f5d15b6a754b7a7af57b302.1628854236.git.anand.jain@oracle.com>
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 6f23277a49e68f8a9355385c846939ad0b1261e7 upstream
+
+[BUG]
+When running the following script, btrfs will trigger an ASSERT():
+
+  #/bin/bash
+  mkfs.btrfs -f $dev
+  mount $dev $mnt
+  xfs_io -f -c "pwrite 0 1G" $mnt/file
+  sync
+  btrfs quota enable $mnt
+  btrfs quota rescan -w $mnt
+
+  # Manually set the limit below current usage
+  btrfs qgroup limit 512M $mnt $mnt
+
+  # Crash happens
+  touch $mnt/file
+
+The dmesg looks like this:
+
+  assertion failed: refcount_read(&trans->use_count) == 1, in fs/btrfs/transaction.c:2022
+  ------------[ cut here ]------------
+  kernel BUG at fs/btrfs/ctree.h:3230!
+  invalid opcode: 0000 [#1] SMP PTI
+  RIP: 0010:assertfail.constprop.0+0x18/0x1a [btrfs]
+   btrfs_commit_transaction.cold+0x11/0x5d [btrfs]
+   try_flush_qgroup+0x67/0x100 [btrfs]
+   __btrfs_qgroup_reserve_meta+0x3a/0x60 [btrfs]
+   btrfs_delayed_update_inode+0xaa/0x350 [btrfs]
+   btrfs_update_inode+0x9d/0x110 [btrfs]
+   btrfs_dirty_inode+0x5d/0xd0 [btrfs]
+   touch_atime+0xb5/0x100
+   iterate_dir+0xf1/0x1b0
+   __x64_sys_getdents64+0x78/0x110
+   do_syscall_64+0x33/0x80
+   entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  RIP: 0033:0x7fb5afe588db
+
+[CAUSE]
+In try_flush_qgroup(), we assume we don't hold a transaction handle at
+all.  This is true for data reservation and mostly true for metadata.
+Since data space reservation always happens before we start a
+transaction, and for most metadata operation we reserve space in
+start_transaction().
+
+But there is an exception, btrfs_delayed_inode_reserve_metadata().
+It holds a transaction handle, while still trying to reserve extra
+metadata space.
+
+When we hit EDQUOT inside btrfs_delayed_inode_reserve_metadata(), we
+will join current transaction and commit, while we still have
+transaction handle from qgroup code.
+
+[FIX]
+Let's check current->journal before we join the transaction.
+
+If current->journal is unset or BTRFS_SEND_TRANS_STUB, it means
+we are not holding a transaction, thus are able to join and then commit
+transaction.
+
+If current->journal is a valid transaction handle, we avoid committing
+transaction and just end it
+
+This is less effective than committing current transaction, as it won't
+free metadata reserved space, but we may still free some data space
+before new data writes.
+
+Bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1178634
+Fixes: c53e9653605d ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT")
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c |   20 +++++++++++++++++++-
+ 1 file changed, 19 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3502,6 +3502,7 @@ static int try_flush_qgroup(struct btrfs
+ {
+ 	struct btrfs_trans_handle *trans;
+ 	int ret;
++	bool can_commit = true;
+ 
+ 	/*
+ 	 * We don't want to run flush again and again, so if there is a running
+@@ -3513,6 +3514,20 @@ static int try_flush_qgroup(struct btrfs
+ 		return 0;
+ 	}
+ 
++	/*
++	 * If current process holds a transaction, we shouldn't flush, as we
++	 * assume all space reservation happens before a transaction handle is
++	 * held.
++	 *
++	 * But there are cases like btrfs_delayed_item_reserve_metadata() where
++	 * we try to reserve space with one transction handle already held.
++	 * In that case we can't commit transaction, but at least try to end it
++	 * and hope the started data writes can free some space.
++	 */
++	if (current->journal_info &&
++	    current->journal_info != BTRFS_SEND_TRANS_STUB)
++		can_commit = false;
++
+ 	ret = btrfs_start_delalloc_snapshot(root);
+ 	if (ret < 0)
+ 		goto out;
+@@ -3524,7 +3539,10 @@ static int try_flush_qgroup(struct btrfs
+ 		goto out;
+ 	}
+ 
+-	ret = btrfs_commit_transaction(trans);
++	if (can_commit)
++		ret = btrfs_commit_transaction(trans);
++	else
++		ret = btrfs_end_transaction(trans);
+ out:
+ 	clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
+ 	wake_up(&root->qgroup_flush_wait);
diff --git a/queue-5.4/series b/queue-5.4/series
index f2bb82ace48..babd7d925fc 100644
--- a/queue-5.4/series
+++ b/queue-5.4/series
@@ -22,3 +22,6 @@ btrfs-transaction-cleanup-unused-trans_state_blocked.patch
 btrfs-qgroup-remove-async_commit-mechanism-in-favor-of-reserve-retry-after-edquot.patch
 btrfs-fix-lockdep-splat-when-enabling-and-disabling-qgroups.patch
 net-xilinx_emaclite-do-not-print-real-iomem-pointer.patch
+btrfs-qgroup-don-t-commit-transaction-when-we-already-hold-the-handle.patch
+btrfs-export-and-rename-qgroup_reserve_meta.patch
+btrfs-don-t-flush-from-btrfs_delayed_inode_reserve_metadata.patch