From 68ef0607b38c1bf0368ab94a61a9371e8415ccc3 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sashal@kernel.org>
Date: Wed, 15 Apr 2020 16:24:06 -0400
Subject: [PATCH] Fixes for 4.14

Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 ...during-unmount-due-to-race-with-dela.patch | 231 +++++++++++++++++
 ...llocations-for-running-delayed-items.patch | 236 ++++++++++++++++++
 queue-4.14/series                             |   2 +
 3 files changed, 469 insertions(+)
 create mode 100644 queue-4.14/btrfs-fix-crash-during-unmount-due-to-race-with-dela.patch
 create mode 100644 queue-4.14/btrfs-use-nofs-allocations-for-running-delayed-items.patch

diff --git a/queue-4.14/btrfs-fix-crash-during-unmount-due-to-race-with-dela.patch b/queue-4.14/btrfs-fix-crash-during-unmount-due-to-race-with-dela.patch
new file mode 100644
index 00000000000..11c4db11a37
--- /dev/null
+++ b/queue-4.14/btrfs-fix-crash-during-unmount-due-to-race-with-dela.patch
@@ -0,0 +1,231 @@
+From 97f687985d1496da37e3235a81d24ac18b9784ff Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 28 Feb 2020 13:04:36 +0000
+Subject: Btrfs: fix crash during unmount due to race with delayed inode
+ workers
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit f0cc2cd70164efe8f75c5d99560f0f69969c72e4 ]
+
+During unmount we can have a job from the delayed inode items work queue
+still running, that can lead to at least two bad things:
+
+1) A crash, because the worker can try to create a transaction just
+   after the fs roots were freed;
+
+2) A transaction leak, because the worker can create a transaction
+   before the fs roots are freed and just after we committed the last
+   transaction and after we stopped the transaction kthread.
+
+A stack trace example of the crash:
+
+ [79011.691214] kernel BUG at lib/radix-tree.c:982!
+ [79011.692056] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
+ [79011.693180] CPU: 3 PID: 1394 Comm: kworker/u8:2 Tainted: G        W         5.6.0-rc2-btrfs-next-54 #2
+ (...)
+ [79011.696789] Workqueue: btrfs-delayed-meta btrfs_work_helper [btrfs]
+ [79011.697904] RIP: 0010:radix_tree_tag_set+0xe7/0x170
+ (...)
+ [79011.702014] RSP: 0018:ffffb3c84a317ca0 EFLAGS: 00010293
+ [79011.702949] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
+ [79011.704202] RDX: ffffb3c84a317cb0 RSI: ffffb3c84a317ca8 RDI: ffff8db3931340a0
+ [79011.705463] RBP: 0000000000000005 R08: 0000000000000005 R09: ffffffff974629d0
+ [79011.706756] R10: ffffb3c84a317bc0 R11: 0000000000000001 R12: ffff8db393134000
+ [79011.708010] R13: ffff8db3931340a0 R14: ffff8db393134068 R15: 0000000000000001
+ [79011.709270] FS:  0000000000000000(0000) GS:ffff8db3b6a00000(0000) knlGS:0000000000000000
+ [79011.710699] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ [79011.711710] CR2: 00007f22c2a0a000 CR3: 0000000232ad4005 CR4: 00000000003606e0
+ [79011.712958] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ [79011.714205] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ [79011.715448] Call Trace:
+ [79011.715925]  record_root_in_trans+0x72/0xf0 [btrfs]
+ [79011.716819]  btrfs_record_root_in_trans+0x4b/0x70 [btrfs]
+ [79011.717925]  start_transaction+0xdd/0x5c0 [btrfs]
+ [79011.718829]  btrfs_async_run_delayed_root+0x17e/0x2b0 [btrfs]
+ [79011.719915]  btrfs_work_helper+0xaa/0x720 [btrfs]
+ [79011.720773]  process_one_work+0x26d/0x6a0
+ [79011.721497]  worker_thread+0x4f/0x3e0
+ [79011.722153]  ? process_one_work+0x6a0/0x6a0
+ [79011.722901]  kthread+0x103/0x140
+ [79011.723481]  ? kthread_create_worker_on_cpu+0x70/0x70
+ [79011.724379]  ret_from_fork+0x3a/0x50
+ (...)
+
+The following diagram shows a sequence of steps that lead to the crash
+during ummount of the filesystem:
+
+        CPU 1                                             CPU 2                                CPU 3
+
+ btrfs_punch_hole()
+   btrfs_btree_balance_dirty()
+     btrfs_balance_delayed_items()
+       --> sees
+           fs_info->delayed_root->items
+           with value 200, which is greater
+           than
+           BTRFS_DELAYED_BACKGROUND (128)
+           and smaller than
+           BTRFS_DELAYED_WRITEBACK (512)
+       btrfs_wq_run_delayed_node()
+         --> queues a job for
+             fs_info->delayed_workers to run
+             btrfs_async_run_delayed_root()
+
+                                                                                            btrfs_async_run_delayed_root()
+                                                                                              --> job queued by CPU 1
+
+                                                                                              --> starts picking and running
+                                                                                                  delayed nodes from the
+                                                                                                  prepare_list list
+
+                                                 close_ctree()
+
+                                                   btrfs_delete_unused_bgs()
+
+                                                   btrfs_commit_super()
+
+                                                     btrfs_join_transaction()
+                                                       --> gets transaction N
+
+                                                     btrfs_commit_transaction(N)
+                                                       --> set transaction state
+                                                        to TRANTS_STATE_COMMIT_START
+
+                                                                                             btrfs_first_prepared_delayed_node()
+                                                                                               --> picks delayed node X through
+                                                                                                   the prepared_list list
+
+                                                       btrfs_run_delayed_items()
+
+                                                         btrfs_first_delayed_node()
+                                                           --> also picks delayed node X
+                                                               but through the node_list
+                                                               list
+
+                                                         __btrfs_commit_inode_delayed_items()
+                                                            --> runs all delayed items from
+                                                                this node and drops the
+                                                                node's item count to 0
+                                                                through call to
+                                                                btrfs_release_delayed_inode()
+
+                                                         --> finishes running any remaining
+                                                             delayed nodes
+
+                                                       --> finishes transaction commit
+
+                                                   --> stops cleaner and transaction threads
+
+                                                   btrfs_free_fs_roots()
+                                                     --> frees all roots and removes them
+                                                         from the radix tree
+                                                         fs_info->fs_roots_radix
+
+                                                                                             btrfs_join_transaction()
+                                                                                               start_transaction()
+                                                                                                 btrfs_record_root_in_trans()
+                                                                                                   record_root_in_trans()
+                                                                                                     radix_tree_tag_set()
+                                                                                                       --> crashes because
+                                                                                                           the root is not in
+                                                                                                           the radix tree
+                                                                                                           anymore
+
+If the worker is able to call btrfs_join_transaction() before the unmount
+task frees the fs roots, we end up leaking a transaction and all its
+resources, since after the call to btrfs_commit_super() and stopping the
+transaction kthread, we don't expect to have any transaction open anymore.
+
+When this situation happens the worker has a delayed node that has no
+more items to run, since the task calling btrfs_run_delayed_items(),
+which is doing a transaction commit, picks the same node and runs all
+its items first.
+
+We can not wait for the worker to complete when running delayed items
+through btrfs_run_delayed_items(), because we call that function in
+several phases of a transaction commit, and that could cause a deadlock
+because the worker calls btrfs_join_transaction() and the task doing the
+transaction commit may have already set the transaction state to
+TRANS_STATE_COMMIT_DOING.
+
+Also it's not possible to get into a situation where only some of the
+items of a delayed node are added to the fs/subvolume tree in the current
+transaction and the remaining ones in the next transaction, because when
+running the items of a delayed inode we lock its mutex, effectively
+waiting for the worker if the worker is running the items of the delayed
+node already.
+
+Since this can only cause issues when unmounting a filesystem, fix it in
+a simple way by waiting for any jobs on the delayed workers queue before
+calling btrfs_commit_supper() at close_ctree(). This works because at this
+point no one can call btrfs_btree_balance_dirty() or
+btrfs_balance_delayed_items(), and if we end up waiting for any worker to
+complete, btrfs_commit_super() will commit the transaction created by the
+worker.
+
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/async-thread.c |  8 ++++++++
+ fs/btrfs/async-thread.h |  2 ++
+ fs/btrfs/disk-io.c      | 13 +++++++++++++
+ 3 files changed, 23 insertions(+)
+
+diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
+index 72d7589072f52..92615badc1734 100644
+--- a/fs/btrfs/async-thread.c
++++ b/fs/btrfs/async-thread.c
+@@ -447,3 +447,11 @@ void btrfs_set_work_high_priority(struct btrfs_work *work)
+ {
+ 	set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ }
++
++void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
++{
++	if (wq->high)
++		flush_workqueue(wq->high->normal_wq);
++
++	flush_workqueue(wq->normal->normal_wq);
++}
+diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
+index fc957e00cef14..2a25aef6ef2a5 100644
+--- a/fs/btrfs/async-thread.h
++++ b/fs/btrfs/async-thread.h
+@@ -85,4 +85,6 @@ void btrfs_set_work_high_priority(struct btrfs_work *work);
+ struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work);
+ struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
++void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
++
+ #endif
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index 6b4fee5c79f9d..096c015b22a46 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3725,6 +3725,19 @@ void close_ctree(struct btrfs_fs_info *fs_info)
+ 		 */
+ 		btrfs_delete_unused_bgs(fs_info);
+ 
++		/*
++		 * There might be existing delayed inode workers still running
++		 * and holding an empty delayed inode item. We must wait for
++		 * them to complete first because they can create a transaction.
++		 * This happens when someone calls btrfs_balance_delayed_items()
++		 * and then a transaction commit runs the same delayed nodes
++		 * before any delayed worker has done something with the nodes.
++		 * We must wait for any worker here and not at transaction
++		 * commit time since that could cause a deadlock.
++		 * This is a very rare case.
++		 */
++		btrfs_flush_workqueue(fs_info->delayed_workers);
++
+ 		ret = btrfs_commit_super(fs_info);
+ 		if (ret)
+ 			btrfs_err(fs_info, "commit super ret %d", ret);
+-- 
+2.20.1
+
diff --git a/queue-4.14/btrfs-use-nofs-allocations-for-running-delayed-items.patch b/queue-4.14/btrfs-use-nofs-allocations-for-running-delayed-items.patch
new file mode 100644
index 00000000000..24433cd29bb
--- /dev/null
+++ b/queue-4.14/btrfs-use-nofs-allocations-for-running-delayed-items.patch
@@ -0,0 +1,236 @@
+From cd942d42304f3ceb885e9df6284970c46616d599 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 19 Mar 2020 10:11:32 -0400
+Subject: btrfs: use nofs allocations for running delayed items
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+[ Upstream commit 351cbf6e4410e7ece05e35d0a07320538f2418b4 ]
+
+Zygo reported the following lockdep splat while testing the balance
+patches
+
+======================================================
+WARNING: possible circular locking dependency detected
+5.6.0-c6f0579d496a+ #53 Not tainted
+------------------------------------------------------
+kswapd0/1133 is trying to acquire lock:
+ffff888092f622c0 (&delayed_node->mutex){+.+.}, at: __btrfs_release_delayed_node+0x7c/0x5b0
+
+but task is already holding lock:
+ffffffff8fc5f860 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30
+
+which lock already depends on the new lock.
+
+the existing dependency chain (in reverse order) is:
+
+-> #1 (fs_reclaim){+.+.}:
+       fs_reclaim_acquire.part.91+0x29/0x30
+       fs_reclaim_acquire+0x19/0x20
+       kmem_cache_alloc_trace+0x32/0x740
+       add_block_entry+0x45/0x260
+       btrfs_ref_tree_mod+0x6e2/0x8b0
+       btrfs_alloc_tree_block+0x789/0x880
+       alloc_tree_block_no_bg_flush+0xc6/0xf0
+       __btrfs_cow_block+0x270/0x940
+       btrfs_cow_block+0x1ba/0x3a0
+       btrfs_search_slot+0x999/0x1030
+       btrfs_insert_empty_items+0x81/0xe0
+       btrfs_insert_delayed_items+0x128/0x7d0
+       __btrfs_run_delayed_items+0xf4/0x2a0
+       btrfs_run_delayed_items+0x13/0x20
+       btrfs_commit_transaction+0x5cc/0x1390
+       insert_balance_item.isra.39+0x6b2/0x6e0
+       btrfs_balance+0x72d/0x18d0
+       btrfs_ioctl_balance+0x3de/0x4c0
+       btrfs_ioctl+0x30ab/0x44a0
+       ksys_ioctl+0xa1/0xe0
+       __x64_sys_ioctl+0x43/0x50
+       do_syscall_64+0x77/0x2c0
+       entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+-> #0 (&delayed_node->mutex){+.+.}:
+       __lock_acquire+0x197e/0x2550
+       lock_acquire+0x103/0x220
+       __mutex_lock+0x13d/0xce0
+       mutex_lock_nested+0x1b/0x20
+       __btrfs_release_delayed_node+0x7c/0x5b0
+       btrfs_remove_delayed_node+0x49/0x50
+       btrfs_evict_inode+0x6fc/0x900
+       evict+0x19a/0x2c0
+       dispose_list+0xa0/0xe0
+       prune_icache_sb+0xbd/0xf0
+       super_cache_scan+0x1b5/0x250
+       do_shrink_slab+0x1f6/0x530
+       shrink_slab+0x32e/0x410
+       shrink_node+0x2a5/0xba0
+       balance_pgdat+0x4bd/0x8a0
+       kswapd+0x35a/0x800
+       kthread+0x1e9/0x210
+       ret_from_fork+0x3a/0x50
+
+other info that might help us debug this:
+
+ Possible unsafe locking scenario:
+
+       CPU0                    CPU1
+       ----                    ----
+  lock(fs_reclaim);
+                               lock(&delayed_node->mutex);
+                               lock(fs_reclaim);
+  lock(&delayed_node->mutex);
+
+ *** DEADLOCK ***
+
+3 locks held by kswapd0/1133:
+ #0: ffffffff8fc5f860 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30
+ #1: ffffffff8fc380d8 (shrinker_rwsem){++++}, at: shrink_slab+0x1e8/0x410
+ #2: ffff8881e0e6c0e8 (&type->s_umount_key#42){++++}, at: trylock_super+0x1b/0x70
+
+stack backtrace:
+CPU: 2 PID: 1133 Comm: kswapd0 Not tainted 5.6.0-c6f0579d496a+ #53
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
+Call Trace:
+ dump_stack+0xc1/0x11a
+ print_circular_bug.isra.38.cold.57+0x145/0x14a
+ check_noncircular+0x2a9/0x2f0
+ ? print_circular_bug.isra.38+0x130/0x130
+ ? stack_trace_consume_entry+0x90/0x90
+ ? save_trace+0x3cc/0x420
+ __lock_acquire+0x197e/0x2550
+ ? btrfs_inode_clear_file_extent_range+0x9b/0xb0
+ ? register_lock_class+0x960/0x960
+ lock_acquire+0x103/0x220
+ ? __btrfs_release_delayed_node+0x7c/0x5b0
+ __mutex_lock+0x13d/0xce0
+ ? __btrfs_release_delayed_node+0x7c/0x5b0
+ ? __asan_loadN+0xf/0x20
+ ? pvclock_clocksource_read+0xeb/0x190
+ ? __btrfs_release_delayed_node+0x7c/0x5b0
+ ? mutex_lock_io_nested+0xc20/0xc20
+ ? __kasan_check_read+0x11/0x20
+ ? check_chain_key+0x1e6/0x2e0
+ mutex_lock_nested+0x1b/0x20
+ ? mutex_lock_nested+0x1b/0x20
+ __btrfs_release_delayed_node+0x7c/0x5b0
+ btrfs_remove_delayed_node+0x49/0x50
+ btrfs_evict_inode+0x6fc/0x900
+ ? btrfs_setattr+0x840/0x840
+ ? do_raw_spin_unlock+0xa8/0x140
+ evict+0x19a/0x2c0
+ dispose_list+0xa0/0xe0
+ prune_icache_sb+0xbd/0xf0
+ ? invalidate_inodes+0x310/0x310
+ super_cache_scan+0x1b5/0x250
+ do_shrink_slab+0x1f6/0x530
+ shrink_slab+0x32e/0x410
+ ? do_shrink_slab+0x530/0x530
+ ? do_shrink_slab+0x530/0x530
+ ? __kasan_check_read+0x11/0x20
+ ? mem_cgroup_protected+0x13d/0x260
+ shrink_node+0x2a5/0xba0
+ balance_pgdat+0x4bd/0x8a0
+ ? mem_cgroup_shrink_node+0x490/0x490
+ ? _raw_spin_unlock_irq+0x27/0x40
+ ? finish_task_switch+0xce/0x390
+ ? rcu_read_lock_bh_held+0xb0/0xb0
+ kswapd+0x35a/0x800
+ ? _raw_spin_unlock_irqrestore+0x4c/0x60
+ ? balance_pgdat+0x8a0/0x8a0
+ ? finish_wait+0x110/0x110
+ ? __kasan_check_read+0x11/0x20
+ ? __kthread_parkme+0xc6/0xe0
+ ? balance_pgdat+0x8a0/0x8a0
+ kthread+0x1e9/0x210
+ ? kthread_create_worker_on_cpu+0xc0/0xc0
+ ret_from_fork+0x3a/0x50
+
+This is because we hold that delayed node's mutex while doing tree
+operations.  Fix this by just wrapping the searches in nofs.
+
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/delayed-inode.c | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
+index 87414fc9e268f..416fb50a5378c 100644
+--- a/fs/btrfs/delayed-inode.c
++++ b/fs/btrfs/delayed-inode.c
+@@ -18,6 +18,7 @@
+  */
+ 
+ #include <linux/slab.h>
++#include <linux/sched/mm.h>
+ #include "delayed-inode.h"
+ #include "disk-io.h"
+ #include "transaction.h"
+@@ -833,11 +834,14 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+ {
+ 	struct btrfs_fs_info *fs_info = root->fs_info;
+ 	struct extent_buffer *leaf;
++	unsigned int nofs_flag;
+ 	char *ptr;
+ 	int ret;
+ 
++	nofs_flag = memalloc_nofs_save();
+ 	ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
+ 				      delayed_item->data_len);
++	memalloc_nofs_restore(nofs_flag);
+ 	if (ret < 0 && ret != -EEXIST)
+ 		return ret;
+ 
+@@ -966,6 +970,7 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
+ 				      struct btrfs_delayed_node *node)
+ {
+ 	struct btrfs_delayed_item *curr, *prev;
++	unsigned int nofs_flag;
+ 	int ret = 0;
+ 
+ do_again:
+@@ -974,7 +979,9 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
+ 	if (!curr)
+ 		goto delete_fail;
+ 
++	nofs_flag = memalloc_nofs_save();
+ 	ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
++	memalloc_nofs_restore(nofs_flag);
+ 	if (ret < 0)
+ 		goto delete_fail;
+ 	else if (ret > 0) {
+@@ -1041,6 +1048,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ 	struct btrfs_key key;
+ 	struct btrfs_inode_item *inode_item;
+ 	struct extent_buffer *leaf;
++	unsigned int nofs_flag;
+ 	int mod;
+ 	int ret;
+ 
+@@ -1053,7 +1061,9 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ 	else
+ 		mod = 1;
+ 
++	nofs_flag = memalloc_nofs_save();
+ 	ret = btrfs_lookup_inode(trans, root, path, &key, mod);
++	memalloc_nofs_restore(nofs_flag);
+ 	if (ret > 0) {
+ 		btrfs_release_path(path);
+ 		return -ENOENT;
+@@ -1104,7 +1114,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ 
+ 	key.type = BTRFS_INODE_EXTREF_KEY;
+ 	key.offset = -1;
++
++	nofs_flag = memalloc_nofs_save();
+ 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
++	memalloc_nofs_restore(nofs_flag);
+ 	if (ret < 0)
+ 		goto err_out;
+ 	ASSERT(ret);
+-- 
+2.20.1
+
diff --git a/queue-4.14/series b/queue-4.14/series
index 352b7a1bb42..5ffc4bebb5c 100644
--- a/queue-4.14/series
+++ b/queue-4.14/series
@@ -92,3 +92,5 @@ powerpc-kprobes-ignore-traps-that-happened-in-real-mode.patch
 scsi-mpt3sas-fix-kernel-panic-observed-on-soft-hba-unplug.patch
 powerpc-add-attributes-for-setjmp-longjmp.patch
 powerpc-make-setjmp-longjmp-signature-standard.patch
+btrfs-fix-crash-during-unmount-due-to-race-with-dela.patch
+btrfs-use-nofs-allocations-for-running-delayed-items.patch
-- 
2.47.3