--- /dev/null
+From foo@baz Fri Aug 13 02:22:25 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 20:12:25 +0800
+Subject: btrfs: don't flush from btrfs_delayed_inode_reserve_metadata
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, fdmanana@suse.com, Nikolay Borisov <nborisov@suse.com>, Qu Wenruo <wqu@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <67b189dab7575f9edbc41caa4118ee68856a56a2.1628854236.git.anand.jain@oracle.com>
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit 4d14c5cde5c268a2bc26addecf09489cb953ef64 upstream
+
+Calling btrfs_qgroup_reserve_meta_prealloc from
+btrfs_delayed_inode_reserve_metadata can result in flushing delalloc
+while holding a transaction and delayed node locks. This is deadlock
+prone. In the past multiple commits:
+
+ * ae5e070eaca9 ("btrfs: qgroup: don't try to wait flushing if we're
+already holding a transaction")
+
+ * 6f23277a49e6 ("btrfs: qgroup: don't commit transaction when we already
+ hold the handle")
+
+Tried to solve various aspects of this but this was always a
+whack-a-mole game. Unfortunately those 2 fixes don't solve a deadlock
+scenario involving btrfs_delayed_node::mutex. Namely, one thread
+can call btrfs_dirty_inode as a result of reading a file and modifying
+its atime:
+
+ PID: 6963 TASK: ffff8c7f3f94c000 CPU: 2 COMMAND: "test"
+ #0 __schedule at ffffffffa529e07d
+ #1 schedule at ffffffffa529e4ff
+ #2 schedule_timeout at ffffffffa52a1bdd
+ #3 wait_for_completion at ffffffffa529eeea <-- sleeps with delayed node mutex held
+ #4 start_delalloc_inodes at ffffffffc0380db5
+ #5 btrfs_start_delalloc_snapshot at ffffffffc0393836
+ #6 try_flush_qgroup at ffffffffc03f04b2
+ #7 __btrfs_qgroup_reserve_meta at ffffffffc03f5bb6 <-- tries to reserve space and starts delalloc inodes.
+ #8 btrfs_delayed_update_inode at ffffffffc03e31aa <-- acquires delayed node mutex
+ #9 btrfs_update_inode at ffffffffc0385ba8
+ #10 btrfs_dirty_inode at ffffffffc038627b <-- TRANSACTIION OPENED
+ #11 touch_atime at ffffffffa4cf0000
+ #12 generic_file_read_iter at ffffffffa4c1f123
+ #13 new_sync_read at ffffffffa4ccdc8a
+ #14 vfs_read at ffffffffa4cd0849
+ #15 ksys_read at ffffffffa4cd0bd1
+ #16 do_syscall_64 at ffffffffa4a052eb
+ #17 entry_SYSCALL_64_after_hwframe at ffffffffa540008c
+
+This will cause an asynchronous work to flush the delalloc inodes to
+happen which can try to acquire the same delayed_node mutex:
+
+ PID: 455 TASK: ffff8c8085fa4000 CPU: 5 COMMAND: "kworker/u16:30"
+ #0 __schedule at ffffffffa529e07d
+ #1 schedule at ffffffffa529e4ff
+ #2 schedule_preempt_disabled at ffffffffa529e80a
+ #3 __mutex_lock at ffffffffa529fdcb <-- goes to sleep, never wakes up.
+ #4 btrfs_delayed_update_inode at ffffffffc03e3143 <-- tries to acquire the mutex
+ #5 btrfs_update_inode at ffffffffc0385ba8 <-- this is the same inode that pid 6963 is holding
+ #6 cow_file_range_inline.constprop.78 at ffffffffc0386be7
+ #7 cow_file_range at ffffffffc03879c1
+ #8 btrfs_run_delalloc_range at ffffffffc038894c
+ #9 writepage_delalloc at ffffffffc03a3c8f
+ #10 __extent_writepage at ffffffffc03a4c01
+ #11 extent_write_cache_pages at ffffffffc03a500b
+ #12 extent_writepages at ffffffffc03a6de2
+ #13 do_writepages at ffffffffa4c277eb
+ #14 __filemap_fdatawrite_range at ffffffffa4c1e5bb
+ #15 btrfs_run_delalloc_work at ffffffffc0380987 <-- starts running delayed nodes
+ #16 normal_work_helper at ffffffffc03b706c
+ #17 process_one_work at ffffffffa4aba4e4
+ #18 worker_thread at ffffffffa4aba6fd
+ #19 kthread at ffffffffa4ac0a3d
+ #20 ret_from_fork at ffffffffa54001ff
+
+To fully address those cases the complete fix is to never issue any
+flushing while holding the transaction or the delayed node lock. This
+patch achieves it by calling qgroup_reserve_meta directly which will
+either succeed without flushing or will fail and return -EDQUOT. In the
+latter case that return value is going to be propagated to
+btrfs_dirty_inode which will fallback to start a new transaction. That's
+fine as the majority of time we expect the inode will have
+BTRFS_DELAYED_NODE_INODE_DIRTY flag set which will result in directly
+copying the in-memory state.
+
+Fixes: c53e9653605d ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT")
+CC: stable@vger.kernel.org # 5.10+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/delayed-inode.c | 3 ++-
+ fs/btrfs/inode.c | 2 +-
+ 2 files changed, 3 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/delayed-inode.c
++++ b/fs/btrfs/delayed-inode.c
+@@ -627,7 +627,8 @@ static int btrfs_delayed_inode_reserve_m
+ */
+ if (!src_rsv || (!trans->bytes_reserved &&
+ src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+- ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
++ ret = btrfs_qgroup_reserve_meta(root, num_bytes,
++ BTRFS_QGROUP_RSV_META_PREALLOC, true);
+ if (ret < 0)
+ return ret;
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -6375,7 +6375,7 @@ static int btrfs_dirty_inode(struct inod
+ return PTR_ERR(trans);
+
+ ret = btrfs_update_inode(trans, root, inode);
+- if (ret && ret == -ENOSPC) {
++ if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
+ /* whoops, lets try again with the full transaction */
+ btrfs_end_transaction(trans);
+ trans = btrfs_start_transaction(root, 1);
--- /dev/null
+From foo@baz Fri Aug 13 02:22:24 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 20:12:24 +0800
+Subject: btrfs: export and rename qgroup_reserve_meta
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, fdmanana@suse.com, Nikolay Borisov <nborisov@suse.com>, David Sterba <dsterba@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <4cdae9f94a8feae8f848574c214016ae6a923078.1628854236.git.anand.jain@oracle.com>
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit 80e9baed722c853056e0c5374f51524593cb1031 upstream
+
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c | 8 ++++----
+ fs/btrfs/qgroup.h | 3 ++-
+ 2 files changed, 6 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3800,8 +3800,8 @@ static int sub_root_meta_rsv(struct btrf
+ return num_bytes;
+ }
+
+-static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+- enum btrfs_qgroup_rsv_type type, bool enforce)
++int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
++ enum btrfs_qgroup_rsv_type type, bool enforce)
+ {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+@@ -3832,14 +3832,14 @@ int __btrfs_qgroup_reserve_meta(struct b
+ {
+ int ret;
+
+- ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
++ ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+ if (ret <= 0 && ret != -EDQUOT)
+ return ret;
+
+ ret = try_flush_qgroup(root);
+ if (ret < 0)
+ return ret;
+- return qgroup_reserve_meta(root, num_bytes, type, enforce);
++ return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+ }
+
+ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
+--- a/fs/btrfs/qgroup.h
++++ b/fs/btrfs/qgroup.h
+@@ -349,7 +349,8 @@ int btrfs_qgroup_reserve_data(struct btr
+ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+ int btrfs_qgroup_free_data(struct inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len);
+-
++int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
++ enum btrfs_qgroup_rsv_type type, bool enforce);
+ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce);
+ /* Reserve metadata space for pertrans and prealloc type */
--- /dev/null
+From foo@baz Fri Aug 13 02:22:24 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 20:12:23 +0800
+Subject: btrfs: qgroup: don't commit transaction when we already hold the handle
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, fdmanana@suse.com, Qu Wenruo <wqu@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <387bbbec5e040ec46f5d15b6a754b7a7af57b302.1628854236.git.anand.jain@oracle.com>
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 6f23277a49e68f8a9355385c846939ad0b1261e7 upstream
+
+[BUG]
+When running the following script, btrfs will trigger an ASSERT():
+
+ #/bin/bash
+ mkfs.btrfs -f $dev
+ mount $dev $mnt
+ xfs_io -f -c "pwrite 0 1G" $mnt/file
+ sync
+ btrfs quota enable $mnt
+ btrfs quota rescan -w $mnt
+
+ # Manually set the limit below current usage
+ btrfs qgroup limit 512M $mnt $mnt
+
+ # Crash happens
+ touch $mnt/file
+
+The dmesg looks like this:
+
+ assertion failed: refcount_read(&trans->use_count) == 1, in fs/btrfs/transaction.c:2022
+ ------------[ cut here ]------------
+ kernel BUG at fs/btrfs/ctree.h:3230!
+ invalid opcode: 0000 [#1] SMP PTI
+ RIP: 0010:assertfail.constprop.0+0x18/0x1a [btrfs]
+ btrfs_commit_transaction.cold+0x11/0x5d [btrfs]
+ try_flush_qgroup+0x67/0x100 [btrfs]
+ __btrfs_qgroup_reserve_meta+0x3a/0x60 [btrfs]
+ btrfs_delayed_update_inode+0xaa/0x350 [btrfs]
+ btrfs_update_inode+0x9d/0x110 [btrfs]
+ btrfs_dirty_inode+0x5d/0xd0 [btrfs]
+ touch_atime+0xb5/0x100
+ iterate_dir+0xf1/0x1b0
+ __x64_sys_getdents64+0x78/0x110
+ do_syscall_64+0x33/0x80
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ RIP: 0033:0x7fb5afe588db
+
+[CAUSE]
+In try_flush_qgroup(), we assume we don't hold a transaction handle at
+all. This is true for data reservation and mostly true for metadata.
+Since data space reservation always happens before we start a
+transaction, and for most metadata operation we reserve space in
+start_transaction().
+
+But there is an exception, btrfs_delayed_inode_reserve_metadata().
+It holds a transaction handle, while still trying to reserve extra
+metadata space.
+
+When we hit EDQUOT inside btrfs_delayed_inode_reserve_metadata(), we
+will join current transaction and commit, while we still have
+transaction handle from qgroup code.
+
+[FIX]
+Let's check current->journal before we join the transaction.
+
+If current->journal is unset or BTRFS_SEND_TRANS_STUB, it means
+we are not holding a transaction, thus are able to join and then commit
+transaction.
+
+If current->journal is a valid transaction handle, we avoid committing
+transaction and just end it
+
+This is less effective than committing current transaction, as it won't
+free metadata reserved space, but we may still free some data space
+before new data writes.
+
+Bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1178634
+Fixes: c53e9653605d ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT")
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c | 20 +++++++++++++++++++-
+ 1 file changed, 19 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3502,6 +3502,7 @@ static int try_flush_qgroup(struct btrfs
+ {
+ struct btrfs_trans_handle *trans;
+ int ret;
++ bool can_commit = true;
+
+ /*
+ * We don't want to run flush again and again, so if there is a running
+@@ -3513,6 +3514,20 @@ static int try_flush_qgroup(struct btrfs
+ return 0;
+ }
+
++ /*
++ * If current process holds a transaction, we shouldn't flush, as we
++ * assume all space reservation happens before a transaction handle is
++ * held.
++ *
++ * But there are cases like btrfs_delayed_item_reserve_metadata() where
++ * we try to reserve space with one transction handle already held.
++ * In that case we can't commit transaction, but at least try to end it
++ * and hope the started data writes can free some space.
++ */
++ if (current->journal_info &&
++ current->journal_info != BTRFS_SEND_TRANS_STUB)
++ can_commit = false;
++
+ ret = btrfs_start_delalloc_snapshot(root);
+ if (ret < 0)
+ goto out;
+@@ -3524,7 +3539,10 @@ static int try_flush_qgroup(struct btrfs
+ goto out;
+ }
+
+- ret = btrfs_commit_transaction(trans);
++ if (can_commit)
++ ret = btrfs_commit_transaction(trans);
++ else
++ ret = btrfs_end_transaction(trans);
+ out:
+ clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
+ wake_up(&root->qgroup_flush_wait);
btrfs-qgroup-remove-async_commit-mechanism-in-favor-of-reserve-retry-after-edquot.patch
btrfs-fix-lockdep-splat-when-enabling-and-disabling-qgroups.patch
net-xilinx_emaclite-do-not-print-real-iomem-pointer.patch
+btrfs-qgroup-don-t-commit-transaction-when-we-already-hold-the-handle.patch
+btrfs-export-and-rename-qgroup_reserve_meta.patch
+btrfs-don-t-flush-from-btrfs_delayed_inode_reserve_metadata.patch