From: Greg Kroah-Hartman Date: Fri, 13 Aug 2021 12:23:12 +0000 (+0200) Subject: 5.4-stable patches X-Git-Tag: v4.4.281~3 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=52c0db7cb402cb73cab4efaf3949808645a7897d;p=thirdparty%2Fkernel%2Fstable-queue.git 5.4-stable patches added patches: btrfs-don-t-flush-from-btrfs_delayed_inode_reserve_metadata.patch btrfs-export-and-rename-qgroup_reserve_meta.patch btrfs-qgroup-don-t-commit-transaction-when-we-already-hold-the-handle.patch --- diff --git a/queue-5.4/btrfs-don-t-flush-from-btrfs_delayed_inode_reserve_metadata.patch b/queue-5.4/btrfs-don-t-flush-from-btrfs_delayed_inode_reserve_metadata.patch new file mode 100644 index 00000000000..c333c35bbe2 --- /dev/null +++ b/queue-5.4/btrfs-don-t-flush-from-btrfs_delayed_inode_reserve_metadata.patch @@ -0,0 +1,120 @@ +From foo@baz Fri Aug 13 02:22:25 PM CEST 2021 +From: Anand Jain +Date: Fri, 13 Aug 2021 20:12:25 +0800 +Subject: btrfs: don't flush from btrfs_delayed_inode_reserve_metadata +To: linux-kernel@vger.kernel.org, stable@vger.kernel.org +Cc: linux-btrfs@vger.kernel.org, fdmanana@suse.com, Nikolay Borisov , Qu Wenruo , Anand Jain +Message-ID: <67b189dab7575f9edbc41caa4118ee68856a56a2.1628854236.git.anand.jain@oracle.com> + +From: Nikolay Borisov + +commit 4d14c5cde5c268a2bc26addecf09489cb953ef64 upstream + +Calling btrfs_qgroup_reserve_meta_prealloc from +btrfs_delayed_inode_reserve_metadata can result in flushing delalloc +while holding a transaction and delayed node locks. This is deadlock +prone. In the past multiple commits: + + * ae5e070eaca9 ("btrfs: qgroup: don't try to wait flushing if we're +already holding a transaction") + + * 6f23277a49e6 ("btrfs: qgroup: don't commit transaction when we already + hold the handle") + +Tried to solve various aspects of this but this was always a +whack-a-mole game. Unfortunately those 2 fixes don't solve a deadlock +scenario involving btrfs_delayed_node::mutex. Namely, one thread +can call btrfs_dirty_inode as a result of reading a file and modifying +its atime: + + PID: 6963 TASK: ffff8c7f3f94c000 CPU: 2 COMMAND: "test" + #0 __schedule at ffffffffa529e07d + #1 schedule at ffffffffa529e4ff + #2 schedule_timeout at ffffffffa52a1bdd + #3 wait_for_completion at ffffffffa529eeea <-- sleeps with delayed node mutex held + #4 start_delalloc_inodes at ffffffffc0380db5 + #5 btrfs_start_delalloc_snapshot at ffffffffc0393836 + #6 try_flush_qgroup at ffffffffc03f04b2 + #7 __btrfs_qgroup_reserve_meta at ffffffffc03f5bb6 <-- tries to reserve space and starts delalloc inodes. + #8 btrfs_delayed_update_inode at ffffffffc03e31aa <-- acquires delayed node mutex + #9 btrfs_update_inode at ffffffffc0385ba8 + #10 btrfs_dirty_inode at ffffffffc038627b <-- TRANSACTIION OPENED + #11 touch_atime at ffffffffa4cf0000 + #12 generic_file_read_iter at ffffffffa4c1f123 + #13 new_sync_read at ffffffffa4ccdc8a + #14 vfs_read at ffffffffa4cd0849 + #15 ksys_read at ffffffffa4cd0bd1 + #16 do_syscall_64 at ffffffffa4a052eb + #17 entry_SYSCALL_64_after_hwframe at ffffffffa540008c + +This will cause an asynchronous work to flush the delalloc inodes to +happen which can try to acquire the same delayed_node mutex: + + PID: 455 TASK: ffff8c8085fa4000 CPU: 5 COMMAND: "kworker/u16:30" + #0 __schedule at ffffffffa529e07d + #1 schedule at ffffffffa529e4ff + #2 schedule_preempt_disabled at ffffffffa529e80a + #3 __mutex_lock at ffffffffa529fdcb <-- goes to sleep, never wakes up. + #4 btrfs_delayed_update_inode at ffffffffc03e3143 <-- tries to acquire the mutex + #5 btrfs_update_inode at ffffffffc0385ba8 <-- this is the same inode that pid 6963 is holding + #6 cow_file_range_inline.constprop.78 at ffffffffc0386be7 + #7 cow_file_range at ffffffffc03879c1 + #8 btrfs_run_delalloc_range at ffffffffc038894c + #9 writepage_delalloc at ffffffffc03a3c8f + #10 __extent_writepage at ffffffffc03a4c01 + #11 extent_write_cache_pages at ffffffffc03a500b + #12 extent_writepages at ffffffffc03a6de2 + #13 do_writepages at ffffffffa4c277eb + #14 __filemap_fdatawrite_range at ffffffffa4c1e5bb + #15 btrfs_run_delalloc_work at ffffffffc0380987 <-- starts running delayed nodes + #16 normal_work_helper at ffffffffc03b706c + #17 process_one_work at ffffffffa4aba4e4 + #18 worker_thread at ffffffffa4aba6fd + #19 kthread at ffffffffa4ac0a3d + #20 ret_from_fork at ffffffffa54001ff + +To fully address those cases the complete fix is to never issue any +flushing while holding the transaction or the delayed node lock. This +patch achieves it by calling qgroup_reserve_meta directly which will +either succeed without flushing or will fail and return -EDQUOT. In the +latter case that return value is going to be propagated to +btrfs_dirty_inode which will fallback to start a new transaction. That's +fine as the majority of time we expect the inode will have +BTRFS_DELAYED_NODE_INODE_DIRTY flag set which will result in directly +copying the in-memory state. + +Fixes: c53e9653605d ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT") +CC: stable@vger.kernel.org # 5.10+ +Reviewed-by: Qu Wenruo +Signed-off-by: Nikolay Borisov +Signed-off-by: David Sterba +Signed-off-by: Anand Jain +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/delayed-inode.c | 3 ++- + fs/btrfs/inode.c | 2 +- + 2 files changed, 3 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/delayed-inode.c ++++ b/fs/btrfs/delayed-inode.c +@@ -627,7 +627,8 @@ static int btrfs_delayed_inode_reserve_m + */ + if (!src_rsv || (!trans->bytes_reserved && + src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { +- ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); ++ ret = btrfs_qgroup_reserve_meta(root, num_bytes, ++ BTRFS_QGROUP_RSV_META_PREALLOC, true); + if (ret < 0) + return ret; + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -6375,7 +6375,7 @@ static int btrfs_dirty_inode(struct inod + return PTR_ERR(trans); + + ret = btrfs_update_inode(trans, root, inode); +- if (ret && ret == -ENOSPC) { ++ if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { + /* whoops, lets try again with the full transaction */ + btrfs_end_transaction(trans); + trans = btrfs_start_transaction(root, 1); diff --git a/queue-5.4/btrfs-export-and-rename-qgroup_reserve_meta.patch b/queue-5.4/btrfs-export-and-rename-qgroup_reserve_meta.patch new file mode 100644 index 00000000000..f3934725002 --- /dev/null +++ b/queue-5.4/btrfs-export-and-rename-qgroup_reserve_meta.patch @@ -0,0 +1,64 @@ +From foo@baz Fri Aug 13 02:22:24 PM CEST 2021 +From: Anand Jain +Date: Fri, 13 Aug 2021 20:12:24 +0800 +Subject: btrfs: export and rename qgroup_reserve_meta +To: linux-kernel@vger.kernel.org, stable@vger.kernel.org +Cc: linux-btrfs@vger.kernel.org, fdmanana@suse.com, Nikolay Borisov , David Sterba , Anand Jain +Message-ID: <4cdae9f94a8feae8f848574c214016ae6a923078.1628854236.git.anand.jain@oracle.com> + +From: Nikolay Borisov + +commit 80e9baed722c853056e0c5374f51524593cb1031 upstream + +Signed-off-by: Nikolay Borisov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Anand Jain +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/qgroup.c | 8 ++++---- + fs/btrfs/qgroup.h | 3 ++- + 2 files changed, 6 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3800,8 +3800,8 @@ static int sub_root_meta_rsv(struct btrf + return num_bytes; + } + +-static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, +- enum btrfs_qgroup_rsv_type type, bool enforce) ++int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, ++ enum btrfs_qgroup_rsv_type type, bool enforce) + { + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; +@@ -3832,14 +3832,14 @@ int __btrfs_qgroup_reserve_meta(struct b + { + int ret; + +- ret = qgroup_reserve_meta(root, num_bytes, type, enforce); ++ ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); + if (ret <= 0 && ret != -EDQUOT) + return ret; + + ret = try_flush_qgroup(root); + if (ret < 0) + return ret; +- return qgroup_reserve_meta(root, num_bytes, type, enforce); ++ return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); + } + + void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) +--- a/fs/btrfs/qgroup.h ++++ b/fs/btrfs/qgroup.h +@@ -349,7 +349,8 @@ int btrfs_qgroup_reserve_data(struct btr + int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); + int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +- ++int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, ++ enum btrfs_qgroup_rsv_type type, bool enforce); + int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce); + /* Reserve metadata space for pertrans and prealloc type */ diff --git a/queue-5.4/btrfs-qgroup-don-t-commit-transaction-when-we-already-hold-the-handle.patch b/queue-5.4/btrfs-qgroup-don-t-commit-transaction-when-we-already-hold-the-handle.patch new file mode 100644 index 00000000000..d95fd50e551 --- /dev/null +++ b/queue-5.4/btrfs-qgroup-don-t-commit-transaction-when-we-already-hold-the-handle.patch @@ -0,0 +1,132 @@ +From foo@baz Fri Aug 13 02:22:24 PM CEST 2021 +From: Anand Jain +Date: Fri, 13 Aug 2021 20:12:23 +0800 +Subject: btrfs: qgroup: don't commit transaction when we already hold the handle +To: linux-kernel@vger.kernel.org, stable@vger.kernel.org +Cc: linux-btrfs@vger.kernel.org, fdmanana@suse.com, Qu Wenruo , Anand Jain +Message-ID: <387bbbec5e040ec46f5d15b6a754b7a7af57b302.1628854236.git.anand.jain@oracle.com> + +From: Qu Wenruo + +commit 6f23277a49e68f8a9355385c846939ad0b1261e7 upstream + +[BUG] +When running the following script, btrfs will trigger an ASSERT(): + + #/bin/bash + mkfs.btrfs -f $dev + mount $dev $mnt + xfs_io -f -c "pwrite 0 1G" $mnt/file + sync + btrfs quota enable $mnt + btrfs quota rescan -w $mnt + + # Manually set the limit below current usage + btrfs qgroup limit 512M $mnt $mnt + + # Crash happens + touch $mnt/file + +The dmesg looks like this: + + assertion failed: refcount_read(&trans->use_count) == 1, in fs/btrfs/transaction.c:2022 + ------------[ cut here ]------------ + kernel BUG at fs/btrfs/ctree.h:3230! + invalid opcode: 0000 [#1] SMP PTI + RIP: 0010:assertfail.constprop.0+0x18/0x1a [btrfs] + btrfs_commit_transaction.cold+0x11/0x5d [btrfs] + try_flush_qgroup+0x67/0x100 [btrfs] + __btrfs_qgroup_reserve_meta+0x3a/0x60 [btrfs] + btrfs_delayed_update_inode+0xaa/0x350 [btrfs] + btrfs_update_inode+0x9d/0x110 [btrfs] + btrfs_dirty_inode+0x5d/0xd0 [btrfs] + touch_atime+0xb5/0x100 + iterate_dir+0xf1/0x1b0 + __x64_sys_getdents64+0x78/0x110 + do_syscall_64+0x33/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + RIP: 0033:0x7fb5afe588db + +[CAUSE] +In try_flush_qgroup(), we assume we don't hold a transaction handle at +all. This is true for data reservation and mostly true for metadata. +Since data space reservation always happens before we start a +transaction, and for most metadata operation we reserve space in +start_transaction(). + +But there is an exception, btrfs_delayed_inode_reserve_metadata(). +It holds a transaction handle, while still trying to reserve extra +metadata space. + +When we hit EDQUOT inside btrfs_delayed_inode_reserve_metadata(), we +will join current transaction and commit, while we still have +transaction handle from qgroup code. + +[FIX] +Let's check current->journal before we join the transaction. + +If current->journal is unset or BTRFS_SEND_TRANS_STUB, it means +we are not holding a transaction, thus are able to join and then commit +transaction. + +If current->journal is a valid transaction handle, we avoid committing +transaction and just end it + +This is less effective than committing current transaction, as it won't +free metadata reserved space, but we may still free some data space +before new data writes. + +Bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1178634 +Fixes: c53e9653605d ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT") +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Anand Jain +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/qgroup.c | 20 +++++++++++++++++++- + 1 file changed, 19 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3502,6 +3502,7 @@ static int try_flush_qgroup(struct btrfs + { + struct btrfs_trans_handle *trans; + int ret; ++ bool can_commit = true; + + /* + * We don't want to run flush again and again, so if there is a running +@@ -3513,6 +3514,20 @@ static int try_flush_qgroup(struct btrfs + return 0; + } + ++ /* ++ * If current process holds a transaction, we shouldn't flush, as we ++ * assume all space reservation happens before a transaction handle is ++ * held. ++ * ++ * But there are cases like btrfs_delayed_item_reserve_metadata() where ++ * we try to reserve space with one transction handle already held. ++ * In that case we can't commit transaction, but at least try to end it ++ * and hope the started data writes can free some space. ++ */ ++ if (current->journal_info && ++ current->journal_info != BTRFS_SEND_TRANS_STUB) ++ can_commit = false; ++ + ret = btrfs_start_delalloc_snapshot(root); + if (ret < 0) + goto out; +@@ -3524,7 +3539,10 @@ static int try_flush_qgroup(struct btrfs + goto out; + } + +- ret = btrfs_commit_transaction(trans); ++ if (can_commit) ++ ret = btrfs_commit_transaction(trans); ++ else ++ ret = btrfs_end_transaction(trans); + out: + clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); + wake_up(&root->qgroup_flush_wait); diff --git a/queue-5.4/series b/queue-5.4/series index f2bb82ace48..babd7d925fc 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -22,3 +22,6 @@ btrfs-transaction-cleanup-unused-trans_state_blocked.patch btrfs-qgroup-remove-async_commit-mechanism-in-favor-of-reserve-retry-after-edquot.patch btrfs-fix-lockdep-splat-when-enabling-and-disabling-qgroups.patch net-xilinx_emaclite-do-not-print-real-iomem-pointer.patch +btrfs-qgroup-don-t-commit-transaction-when-we-already-hold-the-handle.patch +btrfs-export-and-rename-qgroup_reserve_meta.patch +btrfs-don-t-flush-from-btrfs_delayed_inode_reserve_metadata.patch