--- /dev/null
+From 572c83acdcdafeb04e70aa46be1fa539310be20c Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Tue, 29 Sep 2020 08:53:54 -0400
+Subject: btrfs: cleanup cow block on error
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 572c83acdcdafeb04e70aa46be1fa539310be20c upstream.
+
+In fstest btrfs/064 a transaction abort in __btrfs_cow_block could lead
+to a system lockup. It gets stuck trying to write back inodes, and the
+write back thread was trying to lock an extent buffer:
+
+ $ cat /proc/2143497/stack
+ [<0>] __btrfs_tree_lock+0x108/0x250
+ [<0>] lock_extent_buffer_for_io+0x35e/0x3a0
+ [<0>] btree_write_cache_pages+0x15a/0x3b0
+ [<0>] do_writepages+0x28/0xb0
+ [<0>] __writeback_single_inode+0x54/0x5c0
+ [<0>] writeback_sb_inodes+0x1e8/0x510
+ [<0>] wb_writeback+0xcc/0x440
+ [<0>] wb_workfn+0xd7/0x650
+ [<0>] process_one_work+0x236/0x560
+ [<0>] worker_thread+0x55/0x3c0
+ [<0>] kthread+0x13a/0x150
+ [<0>] ret_from_fork+0x1f/0x30
+
+This is because we got an error while COWing a block, specifically here:
+
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+ ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
+ [16402.241552] BTRFS: Transaction aborted (error -2)
+ [16402.242362] WARNING: CPU: 1 PID: 2563188 at fs/btrfs/ctree.c:1074 __btrfs_cow_block+0x376/0x540
+ [16402.249469] CPU: 1 PID: 2563188 Comm: fsstress Not tainted 5.9.0-rc6+ #8
+ [16402.249936] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
+ [16402.250525] RIP: 0010:__btrfs_cow_block+0x376/0x540
+ [16402.252417] RSP: 0018:ffff9cca40e578b0 EFLAGS: 00010282
+ [16402.252787] RAX: 0000000000000025 RBX: 0000000000000002 RCX: ffff9132bbd19388
+ [16402.253278] RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9132bbd19380
+ [16402.254063] RBP: ffff9132b41a49c0 R08: 0000000000000000 R09: 0000000000000000
+ [16402.254887] R10: 0000000000000000 R11: ffff91324758b080 R12: ffff91326ef17ce0
+ [16402.255694] R13: ffff91325fc0f000 R14: ffff91326ef176b0 R15: ffff9132815e2000
+ [16402.256321] FS: 00007f542c6d7b80(0000) GS:ffff9132bbd00000(0000) knlGS:0000000000000000
+ [16402.256973] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ [16402.257374] CR2: 00007f127b83f250 CR3: 0000000133480002 CR4: 0000000000370ee0
+ [16402.257867] Call Trace:
+ [16402.258072] btrfs_cow_block+0x109/0x230
+ [16402.258356] btrfs_search_slot+0x530/0x9d0
+ [16402.258655] btrfs_lookup_file_extent+0x37/0x40
+ [16402.259155] __btrfs_drop_extents+0x13c/0xd60
+ [16402.259628] ? btrfs_block_rsv_migrate+0x4f/0xb0
+ [16402.259949] btrfs_replace_file_extents+0x190/0x820
+ [16402.260873] btrfs_clone+0x9ae/0xc00
+ [16402.261139] btrfs_extent_same_range+0x66/0x90
+ [16402.261771] btrfs_remap_file_range+0x353/0x3b1
+ [16402.262333] vfs_dedupe_file_range_one.part.0+0xd5/0x140
+ [16402.262821] vfs_dedupe_file_range+0x189/0x220
+ [16402.263150] do_vfs_ioctl+0x552/0x700
+ [16402.263662] __x64_sys_ioctl+0x62/0xb0
+ [16402.264023] do_syscall_64+0x33/0x40
+ [16402.264364] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ [16402.264862] RIP: 0033:0x7f542c7d15cb
+ [16402.266901] RSP: 002b:00007ffd35944ea8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+ [16402.267627] RAX: ffffffffffffffda RBX: 00000000009d1968 RCX: 00007f542c7d15cb
+ [16402.268298] RDX: 00000000009d2490 RSI: 00000000c0189436 RDI: 0000000000000003
+ [16402.268958] RBP: 00000000009d2520 R08: 0000000000000036 R09: 00000000009d2e64
+ [16402.269726] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002
+ [16402.270659] R13: 000000000001f000 R14: 00000000009d1970 R15: 00000000009d2e80
+ [16402.271498] irq event stamp: 0
+ [16402.271846] hardirqs last enabled at (0): [<0000000000000000>] 0x0
+ [16402.272497] hardirqs last disabled at (0): [<ffffffff910dbf59>] copy_process+0x6b9/0x1ba0
+ [16402.273343] softirqs last enabled at (0): [<ffffffff910dbf59>] copy_process+0x6b9/0x1ba0
+ [16402.273905] softirqs last disabled at (0): [<0000000000000000>] 0x0
+ [16402.274338] ---[ end trace 737874a5a41a8236 ]---
+ [16402.274669] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry
+ [16402.276179] BTRFS info (device dm-9): forced readonly
+ [16402.277046] BTRFS: error (device dm-9) in btrfs_replace_file_extents:2723: errno=-2 No such entry
+ [16402.278744] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry
+ [16402.279968] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry
+ [16402.280582] BTRFS info (device dm-9): balance: ended with status: -30
+
+The problem here is that as soon as we allocate the new block it is
+locked and marked dirty in the btree inode. This means that we could
+attempt to writeback this block and need to lock the extent buffer.
+However we're not unlocking it here and thus we deadlock.
+
+Fix this by unlocking the cow block if we have any errors inside of
+__btrfs_cow_block, and also free it so we do not leak it.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ctree.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -1061,6 +1061,8 @@ static noinline int __btrfs_cow_block(st
+
+ ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
+ if (ret) {
++ btrfs_tree_unlock(cow);
++ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+@@ -1068,6 +1070,8 @@ static noinline int __btrfs_cow_block(st
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+ ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+ if (ret) {
++ btrfs_tree_unlock(cow);
++ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+@@ -1100,6 +1104,8 @@ static noinline int __btrfs_cow_block(st
+ if (last_ref) {
+ ret = tree_mod_log_free_eb(buf);
+ if (ret) {
++ btrfs_tree_unlock(cow);
++ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
--- /dev/null
+From 7837fa88704a66257404bb14144c9e4ab631a28a Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 14 Oct 2020 17:00:51 -0400
+Subject: btrfs: drop the path before adding block group sysfs files
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 7837fa88704a66257404bb14144c9e4ab631a28a upstream.
+
+Dave reported a problem with my rwsem conversion patch where we got the
+following lockdep splat:
+
+ ======================================================
+ WARNING: possible circular locking dependency detected
+ 5.9.0-default+ #1297 Not tainted
+ ------------------------------------------------------
+ kswapd0/76 is trying to acquire lock:
+ ffff9d5d25df2530 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+
+ but task is already holding lock:
+ ffffffffa40cbba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #4 (fs_reclaim){+.+.}-{0:0}:
+ __lock_acquire+0x582/0xac0
+ lock_acquire+0xca/0x430
+ fs_reclaim_acquire.part.0+0x25/0x30
+ kmem_cache_alloc+0x30/0x9c0
+ alloc_inode+0x81/0x90
+ iget_locked+0xcd/0x1a0
+ kernfs_get_inode+0x1b/0x130
+ kernfs_get_tree+0x136/0x210
+ sysfs_get_tree+0x1a/0x50
+ vfs_get_tree+0x1d/0xb0
+ path_mount+0x70f/0xa80
+ do_mount+0x75/0x90
+ __x64_sys_mount+0x8e/0xd0
+ do_syscall_64+0x2d/0x70
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+ -> #3 (kernfs_mutex){+.+.}-{3:3}:
+ __lock_acquire+0x582/0xac0
+ lock_acquire+0xca/0x430
+ __mutex_lock+0xa0/0xaf0
+ kernfs_add_one+0x23/0x150
+ kernfs_create_dir_ns+0x58/0x80
+ sysfs_create_dir_ns+0x70/0xd0
+ kobject_add_internal+0xbb/0x2d0
+ kobject_add+0x7a/0xd0
+ btrfs_sysfs_add_block_group_type+0x141/0x1d0 [btrfs]
+ btrfs_read_block_groups+0x1f1/0x8c0 [btrfs]
+ open_ctree+0x981/0x1108 [btrfs]
+ btrfs_mount_root.cold+0xe/0xb0 [btrfs]
+ legacy_get_tree+0x2d/0x60
+ vfs_get_tree+0x1d/0xb0
+ fc_mount+0xe/0x40
+ vfs_kern_mount.part.0+0x71/0x90
+ btrfs_mount+0x13b/0x3e0 [btrfs]
+ legacy_get_tree+0x2d/0x60
+ vfs_get_tree+0x1d/0xb0
+ path_mount+0x70f/0xa80
+ do_mount+0x75/0x90
+ __x64_sys_mount+0x8e/0xd0
+ do_syscall_64+0x2d/0x70
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+ -> #2 (btrfs-extent-00){++++}-{3:3}:
+ __lock_acquire+0x582/0xac0
+ lock_acquire+0xca/0x430
+ down_read_nested+0x45/0x220
+ __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
+ __btrfs_read_lock_root_node+0x3a/0x50 [btrfs]
+ btrfs_search_slot+0x6d4/0xfd0 [btrfs]
+ check_committed_ref+0x69/0x200 [btrfs]
+ btrfs_cross_ref_exist+0x65/0xb0 [btrfs]
+ run_delalloc_nocow+0x446/0x9b0 [btrfs]
+ btrfs_run_delalloc_range+0x61/0x6a0 [btrfs]
+ writepage_delalloc+0xae/0x160 [btrfs]
+ __extent_writepage+0x262/0x420 [btrfs]
+ extent_write_cache_pages+0x2b6/0x510 [btrfs]
+ extent_writepages+0x43/0x90 [btrfs]
+ do_writepages+0x40/0xe0
+ __writeback_single_inode+0x62/0x610
+ writeback_sb_inodes+0x20f/0x500
+ wb_writeback+0xef/0x4a0
+ wb_do_writeback+0x49/0x2e0
+ wb_workfn+0x81/0x340
+ process_one_work+0x233/0x5d0
+ worker_thread+0x50/0x3b0
+ kthread+0x137/0x150
+ ret_from_fork+0x1f/0x30
+
+ -> #1 (btrfs-fs-00){++++}-{3:3}:
+ __lock_acquire+0x582/0xac0
+ lock_acquire+0xca/0x430
+ down_read_nested+0x45/0x220
+ __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
+ __btrfs_read_lock_root_node+0x3a/0x50 [btrfs]
+ btrfs_search_slot+0x6d4/0xfd0 [btrfs]
+ btrfs_lookup_inode+0x3a/0xc0 [btrfs]
+ __btrfs_update_delayed_inode+0x93/0x2c0 [btrfs]
+ __btrfs_commit_inode_delayed_items+0x7de/0x850 [btrfs]
+ __btrfs_run_delayed_items+0x8e/0x140 [btrfs]
+ btrfs_commit_transaction+0x367/0xbc0 [btrfs]
+ btrfs_mksubvol+0x2db/0x470 [btrfs]
+ btrfs_mksnapshot+0x7b/0xb0 [btrfs]
+ __btrfs_ioctl_snap_create+0x16f/0x1a0 [btrfs]
+ btrfs_ioctl_snap_create_v2+0xb0/0xf0 [btrfs]
+ btrfs_ioctl+0xd0b/0x2690 [btrfs]
+ __x64_sys_ioctl+0x6f/0xa0
+ do_syscall_64+0x2d/0x70
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+ -> #0 (&delayed_node->mutex){+.+.}-{3:3}:
+ check_prev_add+0x91/0xc60
+ validate_chain+0xa6e/0x2a20
+ __lock_acquire+0x582/0xac0
+ lock_acquire+0xca/0x430
+ __mutex_lock+0xa0/0xaf0
+ __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+ btrfs_evict_inode+0x3cc/0x560 [btrfs]
+ evict+0xd6/0x1c0
+ dispose_list+0x48/0x70
+ prune_icache_sb+0x54/0x80
+ super_cache_scan+0x121/0x1a0
+ do_shrink_slab+0x16d/0x3b0
+ shrink_slab+0xb1/0x2e0
+ shrink_node+0x230/0x6a0
+ balance_pgdat+0x325/0x750
+ kswapd+0x206/0x4d0
+ kthread+0x137/0x150
+ ret_from_fork+0x1f/0x30
+
+ other info that might help us debug this:
+
+ Chain exists of:
+ &delayed_node->mutex --> kernfs_mutex --> fs_reclaim
+
+ Possible unsafe locking scenario:
+
+ CPU0 CPU1
+ ---- ----
+ lock(fs_reclaim);
+ lock(kernfs_mutex);
+ lock(fs_reclaim);
+ lock(&delayed_node->mutex);
+
+ *** DEADLOCK ***
+
+ 3 locks held by kswapd0/76:
+ #0: ffffffffa40cbba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30
+ #1: ffffffffa40b8b58 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x54/0x2e0
+ #2: ffff9d5d322390e8 (&type->s_umount_key#26){++++}-{3:3}, at: trylock_super+0x16/0x50
+
+ stack backtrace:
+ CPU: 2 PID: 76 Comm: kswapd0 Not tainted 5.9.0-default+ #1297
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
+ Call Trace:
+ dump_stack+0x77/0x97
+ check_noncircular+0xff/0x110
+ ? save_trace+0x50/0x470
+ check_prev_add+0x91/0xc60
+ validate_chain+0xa6e/0x2a20
+ ? save_trace+0x50/0x470
+ __lock_acquire+0x582/0xac0
+ lock_acquire+0xca/0x430
+ ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+ __mutex_lock+0xa0/0xaf0
+ ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+ ? __lock_acquire+0x582/0xac0
+ ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+ ? btrfs_evict_inode+0x30b/0x560 [btrfs]
+ ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+ __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+ btrfs_evict_inode+0x3cc/0x560 [btrfs]
+ evict+0xd6/0x1c0
+ dispose_list+0x48/0x70
+ prune_icache_sb+0x54/0x80
+ super_cache_scan+0x121/0x1a0
+ do_shrink_slab+0x16d/0x3b0
+ shrink_slab+0xb1/0x2e0
+ shrink_node+0x230/0x6a0
+ balance_pgdat+0x325/0x750
+ kswapd+0x206/0x4d0
+ ? finish_wait+0x90/0x90
+ ? balance_pgdat+0x750/0x750
+ kthread+0x137/0x150
+ ? kthread_mod_delayed_work+0xc0/0xc0
+ ret_from_fork+0x1f/0x30
+
+This happens because we are still holding the path open when we start
+adding the sysfs files for the block groups, which creates a dependency
+on fs_reclaim via the tree lock. Fix this by dropping the path before
+we start doing anything with sysfs.
+
+Reported-by: David Sterba <dsterba@suse.com>
+CC: stable@vger.kernel.org # 5.8+
+Reviewed-by: Anand Jain <anand.jain@oracle.com>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/block-group.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -2034,6 +2034,7 @@ int btrfs_read_block_groups(struct btrfs
+ key.offset = 0;
+ btrfs_release_path(path);
+ }
++ btrfs_release_path(path);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(space_info, &info->space_info, list) {
--- /dev/null
+From 66d204a16c94f24ad08290a7663ab67e7fc04e82 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 12 Oct 2020 11:55:24 +0100
+Subject: btrfs: fix readahead hang and use-after-free after removing a device
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 66d204a16c94f24ad08290a7663ab67e7fc04e82 upstream.
+
+Very sporadically I had test case btrfs/069 from fstests hanging (for
+years, it is not a recent regression), with the following traces in
+dmesg/syslog:
+
+ [162301.160628] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg started
+ [162301.181196] BTRFS info (device sdc): scrub: finished on devid 4 with status: 0
+ [162301.287162] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg finished
+ [162513.513792] INFO: task btrfs-transacti:1356167 blocked for more than 120 seconds.
+ [162513.514318] Not tainted 5.9.0-rc6-btrfs-next-69 #1
+ [162513.514522] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ [162513.514747] task:btrfs-transacti state:D stack: 0 pid:1356167 ppid: 2 flags:0x00004000
+ [162513.514751] Call Trace:
+ [162513.514761] __schedule+0x5ce/0xd00
+ [162513.514765] ? _raw_spin_unlock_irqrestore+0x3c/0x60
+ [162513.514771] schedule+0x46/0xf0
+ [162513.514844] wait_current_trans+0xde/0x140 [btrfs]
+ [162513.514850] ? finish_wait+0x90/0x90
+ [162513.514864] start_transaction+0x37c/0x5f0 [btrfs]
+ [162513.514879] transaction_kthread+0xa4/0x170 [btrfs]
+ [162513.514891] ? btrfs_cleanup_transaction+0x660/0x660 [btrfs]
+ [162513.514894] kthread+0x153/0x170
+ [162513.514897] ? kthread_stop+0x2c0/0x2c0
+ [162513.514902] ret_from_fork+0x22/0x30
+ [162513.514916] INFO: task fsstress:1356184 blocked for more than 120 seconds.
+ [162513.515192] Not tainted 5.9.0-rc6-btrfs-next-69 #1
+ [162513.515431] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ [162513.515680] task:fsstress state:D stack: 0 pid:1356184 ppid:1356177 flags:0x00004000
+ [162513.515682] Call Trace:
+ [162513.515688] __schedule+0x5ce/0xd00
+ [162513.515691] ? _raw_spin_unlock_irqrestore+0x3c/0x60
+ [162513.515697] schedule+0x46/0xf0
+ [162513.515712] wait_current_trans+0xde/0x140 [btrfs]
+ [162513.515716] ? finish_wait+0x90/0x90
+ [162513.515729] start_transaction+0x37c/0x5f0 [btrfs]
+ [162513.515743] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs]
+ [162513.515753] btrfs_sync_fs+0x61/0x1c0 [btrfs]
+ [162513.515758] ? __ia32_sys_fdatasync+0x20/0x20
+ [162513.515761] iterate_supers+0x87/0xf0
+ [162513.515765] ksys_sync+0x60/0xb0
+ [162513.515768] __do_sys_sync+0xa/0x10
+ [162513.515771] do_syscall_64+0x33/0x80
+ [162513.515774] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ [162513.515781] RIP: 0033:0x7f5238f50bd7
+ [162513.515782] Code: Bad RIP value.
+ [162513.515784] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2
+ [162513.515786] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7
+ [162513.515788] RDX: 00000000ffffffff RSI: 000000000daf0e74 RDI: 000000000000003a
+ [162513.515789] RBP: 0000000000000032 R08: 000000000000000a R09: 00007f5239019be0
+ [162513.515791] R10: fffffffffffff24f R11: 0000000000000206 R12: 000000000000003a
+ [162513.515792] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340
+ [162513.515804] INFO: task fsstress:1356185 blocked for more than 120 seconds.
+ [162513.516064] Not tainted 5.9.0-rc6-btrfs-next-69 #1
+ [162513.516329] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ [162513.516617] task:fsstress state:D stack: 0 pid:1356185 ppid:1356177 flags:0x00000000
+ [162513.516620] Call Trace:
+ [162513.516625] __schedule+0x5ce/0xd00
+ [162513.516628] ? _raw_spin_unlock_irqrestore+0x3c/0x60
+ [162513.516634] schedule+0x46/0xf0
+ [162513.516647] wait_current_trans+0xde/0x140 [btrfs]
+ [162513.516650] ? finish_wait+0x90/0x90
+ [162513.516662] start_transaction+0x4d7/0x5f0 [btrfs]
+ [162513.516679] btrfs_setxattr_trans+0x3c/0x100 [btrfs]
+ [162513.516686] __vfs_setxattr+0x66/0x80
+ [162513.516691] __vfs_setxattr_noperm+0x70/0x200
+ [162513.516697] vfs_setxattr+0x6b/0x120
+ [162513.516703] setxattr+0x125/0x240
+ [162513.516709] ? lock_acquire+0xb1/0x480
+ [162513.516712] ? mnt_want_write+0x20/0x50
+ [162513.516721] ? rcu_read_lock_any_held+0x8e/0xb0
+ [162513.516723] ? preempt_count_add+0x49/0xa0
+ [162513.516725] ? __sb_start_write+0x19b/0x290
+ [162513.516727] ? preempt_count_add+0x49/0xa0
+ [162513.516732] path_setxattr+0xba/0xd0
+ [162513.516739] __x64_sys_setxattr+0x27/0x30
+ [162513.516741] do_syscall_64+0x33/0x80
+ [162513.516743] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ [162513.516745] RIP: 0033:0x7f5238f56d5a
+ [162513.516746] Code: Bad RIP value.
+ [162513.516748] RSP: 002b:00007fff67b97868 EFLAGS: 00000202 ORIG_RAX: 00000000000000bc
+ [162513.516750] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f5238f56d5a
+ [162513.516751] RDX: 000055b1fbb0d5a0 RSI: 00007fff67b978a0 RDI: 000055b1fbb0d470
+ [162513.516753] RBP: 000055b1fbb0d5a0 R08: 0000000000000001 R09: 00007fff67b97700
+ [162513.516754] R10: 0000000000000004 R11: 0000000000000202 R12: 0000000000000004
+ [162513.516756] R13: 0000000000000024 R14: 0000000000000001 R15: 00007fff67b978a0
+ [162513.516767] INFO: task fsstress:1356196 blocked for more than 120 seconds.
+ [162513.517064] Not tainted 5.9.0-rc6-btrfs-next-69 #1
+ [162513.517365] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ [162513.517763] task:fsstress state:D stack: 0 pid:1356196 ppid:1356177 flags:0x00004000
+ [162513.517780] Call Trace:
+ [162513.517786] __schedule+0x5ce/0xd00
+ [162513.517789] ? _raw_spin_unlock_irqrestore+0x3c/0x60
+ [162513.517796] schedule+0x46/0xf0
+ [162513.517810] wait_current_trans+0xde/0x140 [btrfs]
+ [162513.517814] ? finish_wait+0x90/0x90
+ [162513.517829] start_transaction+0x37c/0x5f0 [btrfs]
+ [162513.517845] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs]
+ [162513.517857] btrfs_sync_fs+0x61/0x1c0 [btrfs]
+ [162513.517862] ? __ia32_sys_fdatasync+0x20/0x20
+ [162513.517865] iterate_supers+0x87/0xf0
+ [162513.517869] ksys_sync+0x60/0xb0
+ [162513.517872] __do_sys_sync+0xa/0x10
+ [162513.517875] do_syscall_64+0x33/0x80
+ [162513.517878] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ [162513.517881] RIP: 0033:0x7f5238f50bd7
+ [162513.517883] Code: Bad RIP value.
+ [162513.517885] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2
+ [162513.517887] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7
+ [162513.517889] RDX: 0000000000000000 RSI: 000000007660add2 RDI: 0000000000000053
+ [162513.517891] RBP: 0000000000000032 R08: 0000000000000067 R09: 00007f5239019be0
+ [162513.517893] R10: fffffffffffff24f R11: 0000000000000206 R12: 0000000000000053
+ [162513.517895] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340
+ [162513.517908] INFO: task fsstress:1356197 blocked for more than 120 seconds.
+ [162513.518298] Not tainted 5.9.0-rc6-btrfs-next-69 #1
+ [162513.518672] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ [162513.519157] task:fsstress state:D stack: 0 pid:1356197 ppid:1356177 flags:0x00000000
+ [162513.519160] Call Trace:
+ [162513.519165] __schedule+0x5ce/0xd00
+ [162513.519168] ? _raw_spin_unlock_irqrestore+0x3c/0x60
+ [162513.519174] schedule+0x46/0xf0
+ [162513.519190] wait_current_trans+0xde/0x140 [btrfs]
+ [162513.519193] ? finish_wait+0x90/0x90
+ [162513.519206] start_transaction+0x4d7/0x5f0 [btrfs]
+ [162513.519222] btrfs_create+0x57/0x200 [btrfs]
+ [162513.519230] lookup_open+0x522/0x650
+ [162513.519246] path_openat+0x2b8/0xa50
+ [162513.519270] do_filp_open+0x91/0x100
+ [162513.519275] ? find_held_lock+0x32/0x90
+ [162513.519280] ? lock_acquired+0x33b/0x470
+ [162513.519285] ? do_raw_spin_unlock+0x4b/0xc0
+ [162513.519287] ? _raw_spin_unlock+0x29/0x40
+ [162513.519295] do_sys_openat2+0x20d/0x2d0
+ [162513.519300] do_sys_open+0x44/0x80
+ [162513.519304] do_syscall_64+0x33/0x80
+ [162513.519307] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ [162513.519309] RIP: 0033:0x7f5238f4a903
+ [162513.519310] Code: Bad RIP value.
+ [162513.519312] RSP: 002b:00007fff67b97758 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
+ [162513.519314] RAX: ffffffffffffffda RBX: 00000000ffffffff RCX: 00007f5238f4a903
+ [162513.519316] RDX: 0000000000000000 RSI: 00000000000001b6 RDI: 000055b1fbb0d470
+ [162513.519317] RBP: 00007fff67b978c0 R08: 0000000000000001 R09: 0000000000000002
+ [162513.519319] R10: 00007fff67b974f7 R11: 0000000000000246 R12: 0000000000000013
+ [162513.519320] R13: 00000000000001b6 R14: 00007fff67b97906 R15: 000055b1fad1c620
+ [162513.519332] INFO: task btrfs:1356211 blocked for more than 120 seconds.
+ [162513.519727] Not tainted 5.9.0-rc6-btrfs-next-69 #1
+ [162513.520115] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ [162513.520508] task:btrfs state:D stack: 0 pid:1356211 ppid:1356178 flags:0x00004002
+ [162513.520511] Call Trace:
+ [162513.520516] __schedule+0x5ce/0xd00
+ [162513.520519] ? _raw_spin_unlock_irqrestore+0x3c/0x60
+ [162513.520525] schedule+0x46/0xf0
+ [162513.520544] btrfs_scrub_pause+0x11f/0x180 [btrfs]
+ [162513.520548] ? finish_wait+0x90/0x90
+ [162513.520562] btrfs_commit_transaction+0x45a/0xc30 [btrfs]
+ [162513.520574] ? start_transaction+0xe0/0x5f0 [btrfs]
+ [162513.520596] btrfs_dev_replace_finishing+0x6d8/0x711 [btrfs]
+ [162513.520619] btrfs_dev_replace_by_ioctl.cold+0x1cc/0x1fd [btrfs]
+ [162513.520639] btrfs_ioctl+0x2a25/0x36f0 [btrfs]
+ [162513.520643] ? do_sigaction+0xf3/0x240
+ [162513.520645] ? find_held_lock+0x32/0x90
+ [162513.520648] ? do_sigaction+0xf3/0x240
+ [162513.520651] ? lock_acquired+0x33b/0x470
+ [162513.520655] ? _raw_spin_unlock_irq+0x24/0x50
+ [162513.520657] ? lockdep_hardirqs_on+0x7d/0x100
+ [162513.520660] ? _raw_spin_unlock_irq+0x35/0x50
+ [162513.520662] ? do_sigaction+0xf3/0x240
+ [162513.520671] ? __x64_sys_ioctl+0x83/0xb0
+ [162513.520672] __x64_sys_ioctl+0x83/0xb0
+ [162513.520677] do_syscall_64+0x33/0x80
+ [162513.520679] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ [162513.520681] RIP: 0033:0x7fc3cd307d87
+ [162513.520682] Code: Bad RIP value.
+ [162513.520684] RSP: 002b:00007ffe30a56bb8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
+ [162513.520686] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fc3cd307d87
+ [162513.520687] RDX: 00007ffe30a57a30 RSI: 00000000ca289435 RDI: 0000000000000003
+ [162513.520689] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
+ [162513.520690] R10: 0000000000000008 R11: 0000000000000202 R12: 0000000000000003
+ [162513.520692] R13: 0000557323a212e0 R14: 00007ffe30a5a520 R15: 0000000000000001
+ [162513.520703]
+ Showing all locks held in the system:
+ [162513.520712] 1 lock held by khungtaskd/54:
+ [162513.520713] #0: ffffffffb40a91a0 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x15/0x197
+ [162513.520728] 1 lock held by in:imklog/596:
+ [162513.520729] #0: ffff8f3f0d781400 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x4d/0x60
+ [162513.520782] 1 lock held by btrfs-transacti/1356167:
+ [162513.520784] #0: ffff8f3d810cc848 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0x4a/0x170 [btrfs]
+ [162513.520798] 1 lock held by btrfs/1356190:
+ [162513.520800] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write_file+0x22/0x60
+ [162513.520805] 1 lock held by fsstress/1356184:
+ [162513.520806] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0
+ [162513.520811] 3 locks held by fsstress/1356185:
+ [162513.520812] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50
+ [162513.520815] #1: ffff8f3d80a650b8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: vfs_setxattr+0x50/0x120
+ [162513.520820] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
+ [162513.520833] 1 lock held by fsstress/1356196:
+ [162513.520834] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0
+ [162513.520838] 3 locks held by fsstress/1356197:
+ [162513.520839] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50
+ [162513.520843] #1: ffff8f3d506465e8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: path_openat+0x2a7/0xa50
+ [162513.520846] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
+ [162513.520858] 2 locks held by btrfs/1356211:
+ [162513.520859] #0: ffff8f3d810cde30 (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.}-{3:3}, at: btrfs_dev_replace_finishing+0x52/0x711 [btrfs]
+ [162513.520877] #1: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
+
+This was weird because the stack traces show that a transaction commit,
+triggered by a device replace operation, is blocking trying to pause any
+running scrubs but there are no stack traces of blocked tasks doing a
+scrub.
+
+After poking around with drgn, I noticed there was a scrub task that was
+constantly running and blocking for short periods of time:
+
+ >>> t = find_task(prog, 1356190)
+ >>> prog.stack_trace(t)
+ #0 __schedule+0x5ce/0xcfc
+ #1 schedule+0x46/0xe4
+ #2 schedule_timeout+0x1df/0x475
+ #3 btrfs_reada_wait+0xda/0x132
+ #4 scrub_stripe+0x2a8/0x112f
+ #5 scrub_chunk+0xcd/0x134
+ #6 scrub_enumerate_chunks+0x29e/0x5ee
+ #7 btrfs_scrub_dev+0x2d5/0x91b
+ #8 btrfs_ioctl+0x7f5/0x36e7
+ #9 __x64_sys_ioctl+0x83/0xb0
+ #10 do_syscall_64+0x33/0x77
+ #11 entry_SYSCALL_64+0x7c/0x156
+
+Which corresponds to:
+
+int btrfs_reada_wait(void *handle)
+{
+ struct reada_control *rc = handle;
+ struct btrfs_fs_info *fs_info = rc->fs_info;
+
+ while (atomic_read(&rc->elems)) {
+ if (!atomic_read(&fs_info->reada_works_cnt))
+ reada_start_machine(fs_info);
+ wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
+ (HZ + 9) / 10);
+ }
+(...)
+
+So the counter "rc->elems" was set to 1 and never decreased to 0, causing
+the scrub task to loop forever in that function. Then I used the following
+script for drgn to check the readahead requests:
+
+ $ cat dump_reada.py
+ import sys
+ import drgn
+ from drgn import NULL, Object, cast, container_of, execscript, \
+ reinterpret, sizeof
+ from drgn.helpers.linux import *
+
+ mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1"
+
+ mnt = None
+ for mnt in for_each_mount(prog, dst = mnt_path):
+ pass
+
+ if mnt is None:
+ sys.stderr.write(f'Error: mount point {mnt_path} not found\n')
+ sys.exit(1)
+
+ fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info)
+
+ def dump_re(re):
+ nzones = re.nzones.value_()
+ print(f're at {hex(re.value_())}')
+ print(f'\t logical {re.logical.value_()}')
+ print(f'\t refcnt {re.refcnt.value_()}')
+ print(f'\t nzones {nzones}')
+ for i in range(nzones):
+ dev = re.zones[i].device
+ name = dev.name.str.string_()
+ print(f'\t\t dev id {dev.devid.value_()} name {name}')
+ print()
+
+ for _, e in radix_tree_for_each(fs_info.reada_tree):
+ re = cast('struct reada_extent *', e)
+ dump_re(re)
+
+ $ drgn dump_reada.py
+ re at 0xffff8f3da9d25ad8
+ logical 38928384
+ refcnt 1
+ nzones 1
+ dev id 0 name b'/dev/sdd'
+ $
+
+So there was one readahead extent with a single zone corresponding to the
+source device of that last device replace operation logged in dmesg/syslog.
+Also the ID of that zone's device was 0 which is a special value set in
+the source device of a device replace operation when the operation finishes
+(constant BTRFS_DEV_REPLACE_DEVID set at btrfs_dev_replace_finishing()),
+confirming again that device /dev/sdd was the source of a device replace
+operation.
+
+Normally there should be as many zones in the readahead extent as there are
+devices, and I wasn't expecting the extent to be in a block group with a
+'single' profile, so I went and confirmed with the following drgn script
+that there weren't any single profile block groups:
+
+ $ cat dump_block_groups.py
+ import sys
+ import drgn
+ from drgn import NULL, Object, cast, container_of, execscript, \
+ reinterpret, sizeof
+ from drgn.helpers.linux import *
+
+ mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1"
+
+ mnt = None
+ for mnt in for_each_mount(prog, dst = mnt_path):
+ pass
+
+ if mnt is None:
+ sys.stderr.write(f'Error: mount point {mnt_path} not found\n')
+ sys.exit(1)
+
+ fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info)
+
+ BTRFS_BLOCK_GROUP_DATA = (1 << 0)
+ BTRFS_BLOCK_GROUP_SYSTEM = (1 << 1)
+ BTRFS_BLOCK_GROUP_METADATA = (1 << 2)
+ BTRFS_BLOCK_GROUP_RAID0 = (1 << 3)
+ BTRFS_BLOCK_GROUP_RAID1 = (1 << 4)
+ BTRFS_BLOCK_GROUP_DUP = (1 << 5)
+ BTRFS_BLOCK_GROUP_RAID10 = (1 << 6)
+ BTRFS_BLOCK_GROUP_RAID5 = (1 << 7)
+ BTRFS_BLOCK_GROUP_RAID6 = (1 << 8)
+ BTRFS_BLOCK_GROUP_RAID1C3 = (1 << 9)
+ BTRFS_BLOCK_GROUP_RAID1C4 = (1 << 10)
+
+ def bg_flags_string(bg):
+ flags = bg.flags.value_()
+ ret = ''
+ if flags & BTRFS_BLOCK_GROUP_DATA:
+ ret = 'data'
+ if flags & BTRFS_BLOCK_GROUP_METADATA:
+ if len(ret) > 0:
+ ret += '|'
+ ret += 'meta'
+ if flags & BTRFS_BLOCK_GROUP_SYSTEM:
+ if len(ret) > 0:
+ ret += '|'
+ ret += 'system'
+ if flags & BTRFS_BLOCK_GROUP_RAID0:
+ ret += ' raid0'
+ elif flags & BTRFS_BLOCK_GROUP_RAID1:
+ ret += ' raid1'
+ elif flags & BTRFS_BLOCK_GROUP_DUP:
+ ret += ' dup'
+ elif flags & BTRFS_BLOCK_GROUP_RAID10:
+ ret += ' raid10'
+ elif flags & BTRFS_BLOCK_GROUP_RAID5:
+ ret += ' raid5'
+ elif flags & BTRFS_BLOCK_GROUP_RAID6:
+ ret += ' raid6'
+ elif flags & BTRFS_BLOCK_GROUP_RAID1C3:
+ ret += ' raid1c3'
+ elif flags & BTRFS_BLOCK_GROUP_RAID1C4:
+ ret += ' raid1c4'
+ else:
+ ret += ' single'
+
+ return ret
+
+ def dump_bg(bg):
+ print()
+ print(f'block group at {hex(bg.value_())}')
+ print(f'\t start {bg.start.value_()} length {bg.length.value_()}')
+ print(f'\t flags {bg.flags.value_()} - {bg_flags_string(bg)}')
+
+ bg_root = fs_info.block_group_cache_tree.address_of_()
+ for bg in rbtree_inorder_for_each_entry('struct btrfs_block_group', bg_root, 'cache_node'):
+ dump_bg(bg)
+
+ $ drgn dump_block_groups.py
+
+ block group at 0xffff8f3d673b0400
+ start 22020096 length 16777216
+ flags 258 - system raid6
+
+ block group at 0xffff8f3d53ddb400
+ start 38797312 length 536870912
+ flags 260 - meta raid6
+
+ block group at 0xffff8f3d5f4d9c00
+ start 575668224 length 2147483648
+ flags 257 - data raid6
+
+ block group at 0xffff8f3d08189000
+ start 2723151872 length 67108864
+ flags 258 - system raid6
+
+ block group at 0xffff8f3db70ff000
+ start 2790260736 length 1073741824
+ flags 260 - meta raid6
+
+ block group at 0xffff8f3d5f4dd800
+ start 3864002560 length 67108864
+ flags 258 - system raid6
+
+ block group at 0xffff8f3d67037000
+ start 3931111424 length 2147483648
+ flags 257 - data raid6
+ $
+
+So there were only 2 reasons left for having a readahead extent with a
+single zone: reada_find_zone(), called when creating a readahead extent,
+returned NULL either because we failed to find the corresponding block
+group or because a memory allocation failed. With some additional and
+custom tracing I figured out that on every further occurrence of the
+problem the block group had just been deleted when we were looping to
+create the zones for the readahead extent (at reada_find_extent()), so we
+ended up with only one zone in the readahead extent, corresponding to a
+device that ends up getting replaced.
+
+So after figuring that out it became obvious why the hang happens:
+
+1) Task A starts a scrub on any device of the filesystem, except for
+ device /dev/sdd;
+
+2) Task B starts a device replace with /dev/sdd as the source device;
+
+3) Task A calls btrfs_reada_add() from scrub_stripe() and it is currently
+ starting to scrub a stripe from block group X. This call to
+ btrfs_reada_add() is the one for the extent tree. When btrfs_reada_add()
+ calls reada_add_block(), it passes the logical address of the extent
+ tree's root node as its 'logical' argument - a value of 38928384;
+
+4) Task A then enters reada_find_extent(), called from reada_add_block().
+ It finds there isn't any existing readahead extent for the logical
+ address 38928384, so it proceeds to the path of creating a new one.
+
+ It calls btrfs_map_block() to find out which stripes exist for the block
+ group X. On the first iteration of the for loop that iterates over the
+ stripes, it finds the stripe for device /dev/sdd, so it creates one
+ zone for that device and adds it to the readahead extent. Before getting
+ into the second iteration of the loop, the cleanup kthread deletes block
+ group X because it was empty. So in the iterations for the remaining
+ stripes it does not add more zones to the readahead extent, because the
+ calls to reada_find_zone() returned NULL because they couldn't find
+ block group X anymore.
+
+ As a result the new readahead extent has a single zone, corresponding to
+ the device /dev/sdd;
+
+5) Before task A returns to btrfs_reada_add() and queues the readahead job
+ for the readahead work queue, task B finishes the device replace and at
+ btrfs_dev_replace_finishing() swaps the device /dev/sdd with the new
+ device /dev/sdg;
+
+6) Task A returns to reada_add_block(), which increments the counter
+ "->elems" of the reada_control structure allocated at btrfs_reada_add().
+
+ Then it returns back to btrfs_reada_add() and calls
+ reada_start_machine(). This queues a job in the readahead work queue to
+ run the function reada_start_machine_worker(), which calls
+ __reada_start_machine().
+
+ At __reada_start_machine() we take the device list mutex and for each
+ device found in the current device list, we call
+ reada_start_machine_dev() to start the readahead work. However at this
+ point the device /dev/sdd was already freed and is not in the device
+ list anymore.
+
+ This means the corresponding readahead for the extent at 38928384 is
+ never started, and therefore the "->elems" counter of the reada_control
+ structure allocated at btrfs_reada_add() never goes down to 0, causing
+ the call to btrfs_reada_wait(), done by the scrub task, to wait forever.
+
+Note that the readahead request can be made either after the device replace
+started or before it started, however in practice it is very unlikely that a
+device replace is able to start after a readahead request is made and is
+able to complete before the readahead request completes - maybe only on a
+very small and nearly empty filesystem.
+
+This hang however is not the only problem we can have with readahead and
+device removals. When the readahead extent has zones other than the
+one corresponding to the device that is being removed (either by a device
+replace or a device remove operation), we risk having a use-after-free on
+the device when dropping the last reference of the readahead extent.
+
+For example if we create a readahead extent with two zones, one for the
+device /dev/sdd and one for the device /dev/sde:
+
+1) Before the readahead worker starts, the device /dev/sdd is removed,
+ and the corresponding btrfs_device structure is freed. However the
+ readahead extent still has the zone pointing to the device structure;
+
+2) When the readahead worker starts, it only finds device /dev/sde in the
+ current device list of the filesystem;
+
+3) It starts the readahead work, at reada_start_machine_dev(), using the
+ device /dev/sde;
+
+4) Then when it finishes reading the extent from device /dev/sde, it calls
+ __readahead_hook() which ends up dropping the last reference on the
+ readahead extent through the last call to reada_extent_put();
+
+5) At reada_extent_put() it iterates over each zone of the readahead extent
+ and attempts to delete an element from the device's 'reada_extents'
+ radix tree, resulting in a use-after-free, as the device pointer of the
+ zone for /dev/sdd is now stale. We can also access the device after
+ dropping the last reference of a zone, through reada_zone_release(),
+ also called by reada_extent_put().
+
+And a device remove suffers the same problem, however since it shrinks the
+device size down to zero before removing the device, it is very unlikely to
+still have readahead requests not completed by the time we free the device,
+the only possibility is if the device has very little space allocated.
+
+While the hang problem is exclusive to scrub, since it is currently the
+only user of btrfs_reada_add() and btrfs_reada_wait(), the use-after-free
+problem affects any path that triggers readahead, which includes
+btree_readahead_hook() and __readahead_hook() (a readahead worker can
+trigger readahead for the children of a node) for example - any path that
+ends up calling reada_add_block() can trigger the use-after-free after a
+device is removed.
+
+So fix this by waiting for any readahead requests for a device to complete
+before removing a device, ensuring that while waiting for existing ones no
+new ones can be made.
+
+This problem has been around for a very long time - the readahead code was
+added in 2011, device remove exists since 2008 and device replace was
+introduced in 2013, hard to pick a specific commit for a git Fixes tag.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ctree.h | 2 ++
+ fs/btrfs/dev-replace.c | 5 +++++
+ fs/btrfs/reada.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
+ fs/btrfs/volumes.c | 3 +++
+ fs/btrfs/volumes.h | 1 +
+ 5 files changed, 56 insertions(+)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -3517,6 +3517,8 @@ struct reada_control *btrfs_reada_add(st
+ int btrfs_reada_wait(void *handle);
+ void btrfs_reada_detach(void *handle);
+ int btree_readahead_hook(struct extent_buffer *eb, int err);
++void btrfs_reada_remove_dev(struct btrfs_device *dev);
++void btrfs_reada_undo_remove_dev(struct btrfs_device *dev);
+
+ static inline int is_fstree(u64 rootid)
+ {
+--- a/fs/btrfs/dev-replace.c
++++ b/fs/btrfs/dev-replace.c
+@@ -668,6 +668,9 @@ static int btrfs_dev_replace_finishing(s
+ }
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+
++ if (!scrub_ret)
++ btrfs_reada_remove_dev(src_device);
++
+ /*
+ * We have to use this loop approach because at this point src_device
+ * has to be available for transaction commit to complete, yet new
+@@ -676,6 +679,7 @@ static int btrfs_dev_replace_finishing(s
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
++ btrfs_reada_undo_remove_dev(src_device);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+@@ -726,6 +730,7 @@ error:
+ up_write(&dev_replace->rwsem);
+ mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
++ btrfs_reada_undo_remove_dev(src_device);
+ btrfs_rm_dev_replace_blocked(fs_info);
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(tgt_device);
+--- a/fs/btrfs/reada.c
++++ b/fs/btrfs/reada.c
+@@ -421,6 +421,9 @@ static struct reada_extent *reada_find_e
+ if (!dev->bdev)
+ continue;
+
++ if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
++ continue;
++
+ if (dev_replace_is_ongoing &&
+ dev == fs_info->dev_replace.tgtdev) {
+ /*
+@@ -1014,3 +1017,45 @@ void btrfs_reada_detach(void *handle)
+
+ kref_put(&rc->refcnt, reada_control_release);
+ }
++
++/*
++ * Before removing a device (device replace or device remove ioctls), call this
++ * function to wait for all existing readahead requests on the device and to
++ * make sure no one queues more readahead requests for the device.
++ *
++ * Must be called without holding either the device list mutex or the device
++ * replace semaphore, otherwise it will deadlock.
++ */
++void btrfs_reada_remove_dev(struct btrfs_device *dev)
++{
++ struct btrfs_fs_info *fs_info = dev->fs_info;
++
++ /* Serialize with readahead extent creation at reada_find_extent(). */
++ spin_lock(&fs_info->reada_lock);
++ set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
++ spin_unlock(&fs_info->reada_lock);
++
++ /*
++ * There might be readahead requests added to the radix trees which
++ * were not yet added to the readahead work queue. We need to start
++ * them and wait for their completion, otherwise we can end up with
++ * use-after-free problems when dropping the last reference on the
++ * readahead extents and their zones, as they need to access the
++ * device structure.
++ */
++ reada_start_machine(fs_info);
++ btrfs_flush_workqueue(fs_info->readahead_workers);
++}
++
++/*
++ * If when removing a device (device replace or device remove ioctls) an error
++ * happens after calling btrfs_reada_remove_dev(), call this to undo what that
++ * function did. This is safe to call even if btrfs_reada_remove_dev() was not
++ * called before.
++ */
++void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
++{
++ spin_lock(&dev->fs_info->reada_lock);
++ clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
++ spin_unlock(&dev->fs_info->reada_lock);
++}
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -2104,6 +2104,8 @@ int btrfs_rm_device(struct btrfs_fs_info
+
+ mutex_unlock(&uuid_mutex);
+ ret = btrfs_shrink_device(device, 0);
++ if (!ret)
++ btrfs_reada_remove_dev(device);
+ mutex_lock(&uuid_mutex);
+ if (ret)
+ goto error_undo;
+@@ -2191,6 +2193,7 @@ out:
+ return ret;
+
+ error_undo:
++ btrfs_reada_undo_remove_dev(device);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ mutex_lock(&fs_info->chunk_mutex);
+ list_add(&device->dev_alloc_list,
+--- a/fs/btrfs/volumes.h
++++ b/fs/btrfs/volumes.h
+@@ -50,6 +50,7 @@ struct btrfs_io_geometry {
+ #define BTRFS_DEV_STATE_MISSING (2)
+ #define BTRFS_DEV_STATE_REPLACE_TGT (3)
+ #define BTRFS_DEV_STATE_FLUSH_SENT (4)
++#define BTRFS_DEV_STATE_NO_READA (5)
+
+ struct btrfs_device {
+ struct list_head dev_list; /* device_list_mutex */
--- /dev/null
+From 83bc1560e02e25c6439341352024ebe8488f4fbd Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 12 Oct 2020 11:55:23 +0100
+Subject: btrfs: fix use-after-free on readahead extent after failure to create it
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 83bc1560e02e25c6439341352024ebe8488f4fbd upstream.
+
+If we fail to find suitable zones for a new readahead extent, we end up
+leaving a stale pointer in the global readahead extents radix tree
+(fs_info->reada_tree), which can trigger the following trace later on:
+
+ [13367.696354] BUG: kernel NULL pointer dereference, address: 00000000000000b0
+ [13367.696802] #PF: supervisor read access in kernel mode
+ [13367.697249] #PF: error_code(0x0000) - not-present page
+ [13367.697721] PGD 0 P4D 0
+ [13367.698171] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
+ [13367.698632] CPU: 6 PID: 851214 Comm: btrfs Tainted: G W 5.9.0-rc6-btrfs-next-69 #1
+ [13367.699100] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+ [13367.700069] RIP: 0010:__lock_acquire+0x20a/0x3970
+ [13367.700562] Code: ff 1f 0f b7 c0 48 0f (...)
+ [13367.701609] RSP: 0018:ffffb14448f57790 EFLAGS: 00010046
+ [13367.702140] RAX: 0000000000000000 RBX: 29b935140c15e8cf RCX: 0000000000000000
+ [13367.702698] RDX: 0000000000000002 RSI: ffffffffb3d66bd0 RDI: 0000000000000046
+ [13367.703240] RBP: ffff8a52ba8ac040 R08: 00000c2866ad9288 R09: 0000000000000001
+ [13367.703783] R10: 0000000000000001 R11: 00000000b66d9b53 R12: ffff8a52ba8ac9b0
+ [13367.704330] R13: 0000000000000000 R14: ffff8a532b6333e8 R15: 0000000000000000
+ [13367.704880] FS: 00007fe1df6b5700(0000) GS:ffff8a5376600000(0000) knlGS:0000000000000000
+ [13367.705438] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ [13367.705995] CR2: 00000000000000b0 CR3: 000000022cca8004 CR4: 00000000003706e0
+ [13367.706565] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ [13367.707127] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ [13367.707686] Call Trace:
+ [13367.708246] ? ___slab_alloc+0x395/0x740
+ [13367.708820] ? reada_add_block+0xae/0xee0 [btrfs]
+ [13367.709383] lock_acquire+0xb1/0x480
+ [13367.709955] ? reada_add_block+0xe0/0xee0 [btrfs]
+ [13367.710537] ? reada_add_block+0xae/0xee0 [btrfs]
+ [13367.711097] ? rcu_read_lock_sched_held+0x5d/0x90
+ [13367.711659] ? kmem_cache_alloc_trace+0x8d2/0x990
+ [13367.712221] ? lock_acquired+0x33b/0x470
+ [13367.712784] _raw_spin_lock+0x34/0x80
+ [13367.713356] ? reada_add_block+0xe0/0xee0 [btrfs]
+ [13367.713966] reada_add_block+0xe0/0xee0 [btrfs]
+ [13367.714529] ? btrfs_root_node+0x15/0x1f0 [btrfs]
+ [13367.715077] btrfs_reada_add+0x117/0x170 [btrfs]
+ [13367.715620] scrub_stripe+0x21e/0x10d0 [btrfs]
+ [13367.716141] ? kvm_sched_clock_read+0x5/0x10
+ [13367.716657] ? __lock_acquire+0x41e/0x3970
+ [13367.717184] ? scrub_chunk+0x60/0x140 [btrfs]
+ [13367.717697] ? find_held_lock+0x32/0x90
+ [13367.718254] ? scrub_chunk+0x60/0x140 [btrfs]
+ [13367.718773] ? lock_acquired+0x33b/0x470
+ [13367.719278] ? scrub_chunk+0xcd/0x140 [btrfs]
+ [13367.719786] scrub_chunk+0xcd/0x140 [btrfs]
+ [13367.720291] scrub_enumerate_chunks+0x270/0x5c0 [btrfs]
+ [13367.720787] ? finish_wait+0x90/0x90
+ [13367.721281] btrfs_scrub_dev+0x1ee/0x620 [btrfs]
+ [13367.721762] ? rcu_read_lock_any_held+0x8e/0xb0
+ [13367.722235] ? preempt_count_add+0x49/0xa0
+ [13367.722710] ? __sb_start_write+0x19b/0x290
+ [13367.723192] btrfs_ioctl+0x7f5/0x36f0 [btrfs]
+ [13367.723660] ? __fget_files+0x101/0x1d0
+ [13367.724118] ? find_held_lock+0x32/0x90
+ [13367.724559] ? __fget_files+0x101/0x1d0
+ [13367.724982] ? __x64_sys_ioctl+0x83/0xb0
+ [13367.725399] __x64_sys_ioctl+0x83/0xb0
+ [13367.725802] do_syscall_64+0x33/0x80
+ [13367.726188] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ [13367.726574] RIP: 0033:0x7fe1df7add87
+ [13367.726948] Code: 00 00 00 48 8b 05 09 91 (...)
+ [13367.727763] RSP: 002b:00007fe1df6b4d48 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+ [13367.728179] RAX: ffffffffffffffda RBX: 000055ce1fb596a0 RCX: 00007fe1df7add87
+ [13367.728604] RDX: 000055ce1fb596a0 RSI: 00000000c400941b RDI: 0000000000000003
+ [13367.729021] RBP: 0000000000000000 R08: 00007fe1df6b5700 R09: 0000000000000000
+ [13367.729431] R10: 00007fe1df6b5700 R11: 0000000000000246 R12: 00007ffd922b07de
+ [13367.729842] R13: 00007ffd922b07df R14: 00007fe1df6b4e40 R15: 0000000000802000
+ [13367.730275] Modules linked in: btrfs blake2b_generic xor (...)
+ [13367.732638] CR2: 00000000000000b0
+ [13367.733166] ---[ end trace d298b6805556acd9 ]---
+
+What happens is the following:
+
+1) At reada_find_extent() we don't find any existing readahead extent for
+ the metadata extent starting at logical address X;
+
+2) So we proceed to create a new one. We then call btrfs_map_block() to get
+ information about which stripes contain extent X;
+
+3) After that we iterate over the stripes and create only one zone for the
+ readahead extent - only one because reada_find_zone() returned NULL for
+ all iterations except for one, either because a memory allocation failed
+ or it couldn't find the block group of the extent (it may have just been
+ deleted);
+
+4) We then add the new readahead extent to the readahead extents radix
+ tree at fs_info->reada_tree;
+
+5) Then we iterate over each zone of the new readahead extent, and find
+ that the device used for that zone no longer exists, because it was
+ removed or it was the source device of a device replace operation.
+ Since this left 'have_zone' set to 0, after finishing the loop we jump
+ to the 'error' label, call kfree() on the new readahead extent and
+ return without removing it from the radix tree at fs_info->reada_tree;
+
+6) Any future call to reada_find_extent() for the logical address X will
+ find the stale pointer in the readahead extents radix tree, increment
+ its reference counter, which can trigger the use-after-free right
+ away or return it to the caller reada_add_block() that results in the
+ use-after-free of the example trace above.
+
+So fix this by making sure we delete the readahead extent from the radix
+tree if we fail to setup zones for it (when 'have_zone = 0').
+
+Fixes: 319450211842ba ("btrfs: reada: bypass adding extent when all zone failed")
+CC: stable@vger.kernel.org # 4.9+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/reada.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/btrfs/reada.c
++++ b/fs/btrfs/reada.c
+@@ -445,6 +445,8 @@ static struct reada_extent *reada_find_e
+ }
+ have_zone = 1;
+ }
++ if (!have_zone)
++ radix_tree_delete(&fs_info->reada_tree, index);
+ spin_unlock(&fs_info->reada_lock);
+ up_read(&fs_info->dev_replace.rwsem);
+
--- /dev/null
+From 79dae17d8d44b2d15779e332180080af45df5352 Mon Sep 17 00:00:00 2001
+From: Anand Jain <anand.jain@oracle.com>
+Date: Thu, 3 Sep 2020 21:30:12 +0800
+Subject: btrfs: improve device scanning messages
+
+From: Anand Jain <anand.jain@oracle.com>
+
+commit 79dae17d8d44b2d15779e332180080af45df5352 upstream.
+
+Systems booting without the initramfs seem to scan an unusual kind
+of device path (/dev/root). And at a later time, the device is updated
+to the correct path. We generally print the process name and PID of the
+process scanning the device but we don't capture the same information if
+the device path is rescanned with a different pathname.
+
+The current message is too long, so drop the unnecessary UUID and add
+process name and PID.
+
+While at it, also update the duplicate device warning to include the
+process name and PID so the messages are consistent.
+
+CC: stable@vger.kernel.org # 4.19+
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=89721
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/volumes.c | 14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -942,16 +942,18 @@ static noinline struct btrfs_device *dev
+ bdput(path_bdev);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_warn_in_rcu(device->fs_info,
+- "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
+- disk_super->fsid, devid,
+- rcu_str_deref(device->name), path);
++ "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
++ path, devid, found_transid,
++ current->comm,
++ task_pid_nr(current));
+ return ERR_PTR(-EEXIST);
+ }
+ bdput(path_bdev);
+ btrfs_info_in_rcu(device->fs_info,
+- "device fsid %pU devid %llu moved old:%s new:%s",
+- disk_super->fsid, devid,
+- rcu_str_deref(device->name), path);
++ "devid %llu device path %s changed to %s scanned by %s (%d)",
++ devid, rcu_str_deref(device->name),
++ path, current->comm,
++ task_pid_nr(current));
+ }
+
+ name = rcu_string_strdup(path, GFP_NOFS);
--- /dev/null
+From e85fde5162bf1b242cbd6daf7dba0f9b457d592b Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 24 Jul 2020 14:46:10 +0800
+Subject: btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit e85fde5162bf1b242cbd6daf7dba0f9b457d592b upstream.
+
+[BUG]
+When quota is enabled for TEST_DEV, generic/013 sometimes fails like this:
+
+ generic/013 14s ... _check_dmesg: something found in dmesg (see xfstests-dev/results//generic/013.dmesg)
+
+And with the following metadata leak:
+
+ BTRFS warning (device dm-3): qgroup 0/1370 has unreleased space, type 2 rsv 49152
+ ------------[ cut here ]------------
+ WARNING: CPU: 2 PID: 47912 at fs/btrfs/disk-io.c:4078 close_ctree+0x1dc/0x323 [btrfs]
+ Call Trace:
+ btrfs_put_super+0x15/0x17 [btrfs]
+ generic_shutdown_super+0x72/0x110
+ kill_anon_super+0x18/0x30
+ btrfs_kill_super+0x17/0x30 [btrfs]
+ deactivate_locked_super+0x3b/0xa0
+ deactivate_super+0x40/0x50
+ cleanup_mnt+0x135/0x190
+ __cleanup_mnt+0x12/0x20
+ task_work_run+0x64/0xb0
+ __prepare_exit_to_usermode+0x1bc/0x1c0
+ __syscall_return_slowpath+0x47/0x230
+ do_syscall_64+0x64/0xb0
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ ---[ end trace a6cfd45ba80e4e06 ]---
+ BTRFS error (device dm-3): qgroup reserved space leaked
+ BTRFS info (device dm-3): disk space caching is enabled
+ BTRFS info (device dm-3): has skinny extents
+
+[CAUSE]
+The qgroup preallocated meta rsv operations of that offending root are:
+
+ btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
+ btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
+ btrfs_subvolume_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=49152
+ btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
+ btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
+
+It's pretty obvious that we reserve qgroup meta rsv in
+btrfs_subvolume_reserve_metadata(), but don't have corresponding
+release/convert calls in btrfs_subvolume_release_metadata().
+
+This leads to the leakage.
+
+[FIX]
+To fix this bug, we should follow what we're doing in
+btrfs_delalloc_reserve_metadata(), where we reserve qgroup space, and
+add it to block_rsv->qgroup_rsv_reserved.
+
+And free the qgroup reserved metadata space when releasing the
+block_rsv.
+
+To do this, we need to change the btrfs_subvolume_release_metadata() to
+accept btrfs_root, and record the qgroup_to_release number, and call
+btrfs_qgroup_convert_reserved_meta() for it.
+
+Fixes: 733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans")
+CC: stable@vger.kernel.org # 4.19+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ctree.h | 2 +-
+ fs/btrfs/inode.c | 2 +-
+ fs/btrfs/ioctl.c | 6 +++---
+ fs/btrfs/root-tree.c | 13 +++++++++++--
+ 4 files changed, 16 insertions(+), 7 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -2619,7 +2619,7 @@ enum btrfs_flush_state {
+ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int nitems, bool use_global_rsv);
+-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
++void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
+ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -4051,7 +4051,7 @@ out_end_trans:
+ err = ret;
+ inode->i_flags |= S_DEAD;
+ out_release:
+- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
++ btrfs_subvolume_release_metadata(root, &block_rsv);
+ out_up_write:
+ up_write(&fs_info->subvol_sem);
+ if (err) {
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -618,7 +618,7 @@ static noinline int create_subvol(struct
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
++ btrfs_subvolume_release_metadata(root, &block_rsv);
+ goto fail_free;
+ }
+ trans->block_rsv = &block_rsv;
+@@ -742,7 +742,7 @@ fail:
+ kfree(root_item);
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
+- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
++ btrfs_subvolume_release_metadata(root, &block_rsv);
+
+ err = btrfs_commit_transaction(trans);
+ if (err && !ret)
+@@ -856,7 +856,7 @@ fail:
+ if (ret && pending_snapshot->snap)
+ pending_snapshot->snap->anon_dev = 0;
+ btrfs_put_root(pending_snapshot->snap);
+- btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
++ btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
+ free_pending:
+ if (pending_snapshot->anon_dev)
+ free_anon_bdev(pending_snapshot->anon_dev);
+--- a/fs/btrfs/root-tree.c
++++ b/fs/btrfs/root-tree.c
+@@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(str
+ if (ret && qgroup_num_bytes)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+
++ if (!ret) {
++ spin_lock(&rsv->lock);
++ rsv->qgroup_rsv_reserved += qgroup_num_bytes;
++ spin_unlock(&rsv->lock);
++ }
+ return ret;
+ }
+
+-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
++void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
+ {
+- btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
++ struct btrfs_fs_info *fs_info = root->fs_info;
++ u64 qgroup_to_release;
++
++ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
++ btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
+ }
--- /dev/null
+From b4c5d8fdfff3e2b6c4fa4a5043e8946dff500f8c Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 24 Jul 2020 14:46:09 +0800
+Subject: btrfs: qgroup: fix wrong qgroup metadata reserve for delayed inode
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit b4c5d8fdfff3e2b6c4fa4a5043e8946dff500f8c upstream.
+
+For delayed inode facility, qgroup metadata is reserved for it, and
+later freed.
+
+However we're freeing more bytes than we reserved.
+In btrfs_delayed_inode_reserve_metadata():
+
+ num_bytes = btrfs_calc_metadata_size(fs_info, 1);
+ ...
+ ret = btrfs_qgroup_reserve_meta_prealloc(root,
+ fs_info->nodesize, true);
+ ...
+ if (!ret) {
+ node->bytes_reserved = num_bytes;
+
+But in btrfs_delayed_inode_release_metadata():
+
+ if (qgroup_free)
+ btrfs_qgroup_free_meta_prealloc(node->root,
+ node->bytes_reserved);
+ else
+ btrfs_qgroup_convert_reserved_meta(node->root,
+ node->bytes_reserved);
+
+This means, we're always releasing more qgroup metadata rsv than we have
+reserved.
+
+This won't trigger selftest warning, as btrfs qgroup metadata rsv has
+extra protection against cases like quota enabled half-way.
+
+But we still need to fix this problem anyway.
+
+This patch will use the same num_bytes for qgroup metadata rsv so we
+could handle it correctly.
+
+Fixes: f218ea6c4792 ("btrfs: delayed-inode: Remove wrong qgroup meta reservation calls")
+CC: stable@vger.kernel.org # 4.19+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/delayed-inode.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/btrfs/delayed-inode.c
++++ b/fs/btrfs/delayed-inode.c
+@@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_m
+ */
+ if (!src_rsv || (!trans->bytes_reserved &&
+ src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+- ret = btrfs_qgroup_reserve_meta_prealloc(root,
+- fs_info->nodesize, true);
++ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ if (ret < 0)
+ return ret;
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
--- /dev/null
+From bb56f02f26fe23798edb1b2175707419b28c752a Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 14 Sep 2020 15:27:50 +0100
+Subject: btrfs: reschedule if necessary when logging directory items
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit bb56f02f26fe23798edb1b2175707419b28c752a upstream.
+
+Logging directories with many entries can take a significant amount of
+time, and in some cases monopolize a cpu/core for a long time if the
+logging task doesn't happen to block often enough.
+
+Johannes and Lu Fengqi reported test case generic/041 triggering a soft
+lockup when the kernel has CONFIG_SOFTLOCKUP_DETECTOR=y. For this test
+case we log an inode with 3002 hard links, and because the test removed
+one hard link before fsyncing the file, the inode logging causes the
+parent directory to be logged as well, which has 6004 directory items to
+log (3002 BTRFS_DIR_ITEM_KEY items plus 3002 BTRFS_DIR_INDEX_KEY items),
+so it can take a significant amount of time and trigger the soft lockup.
+
+So just make tree-log.c:log_dir_items() reschedule when necessary,
+releasing the current search path before doing so and then resume from
+where it was before the reschedule.
+
+The stack trace produced when the soft lockup happens is the following:
+
+[10480.277653] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [xfs_io:28172]
+[10480.279418] Modules linked in: dm_thin_pool dm_persistent_data (...)
+[10480.284915] irq event stamp: 29646366
+[10480.285987] hardirqs last enabled at (29646365): [<ffffffff85249b66>] __slab_alloc.constprop.0+0x56/0x60
+[10480.288482] hardirqs last disabled at (29646366): [<ffffffff8579b00d>] irqentry_enter+0x1d/0x50
+[10480.290856] softirqs last enabled at (4612): [<ffffffff85a00323>] __do_softirq+0x323/0x56c
+[10480.293615] softirqs last disabled at (4483): [<ffffffff85800dbf>] asm_call_on_stack+0xf/0x20
+[10480.296428] CPU: 2 PID: 28172 Comm: xfs_io Not tainted 5.9.0-rc4-default+ #1248
+[10480.298948] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
+[10480.302455] RIP: 0010:__slab_alloc.constprop.0+0x19/0x60
+[10480.304151] Code: 86 e8 31 75 21 00 66 66 2e 0f 1f 84 00 00 00 (...)
+[10480.309558] RSP: 0018:ffffadbe09397a58 EFLAGS: 00000282
+[10480.311179] RAX: ffff8a495ab92840 RBX: 0000000000000282 RCX: 0000000000000006
+[10480.313242] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff85249b66
+[10480.315260] RBP: ffff8a497d04b740 R08: 0000000000000001 R09: 0000000000000001
+[10480.317229] R10: ffff8a497d044800 R11: ffff8a495ab93c40 R12: 0000000000000000
+[10480.319169] R13: 0000000000000000 R14: 0000000000000c40 R15: ffffffffc01daf70
+[10480.321104] FS: 00007fa1dc5c0e40(0000) GS:ffff8a497da00000(0000) knlGS:0000000000000000
+[10480.323559] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[10480.325235] CR2: 00007fa1dc5befb8 CR3: 0000000004f8a006 CR4: 0000000000170ea0
+[10480.327259] Call Trace:
+[10480.328286] ? overwrite_item+0x1f0/0x5a0 [btrfs]
+[10480.329784] __kmalloc+0x831/0xa20
+[10480.331009] ? btrfs_get_32+0xb0/0x1d0 [btrfs]
+[10480.332464] overwrite_item+0x1f0/0x5a0 [btrfs]
+[10480.333948] log_dir_items+0x2ee/0x570 [btrfs]
+[10480.335413] log_directory_changes+0x82/0xd0 [btrfs]
+[10480.336926] btrfs_log_inode+0xc9b/0xda0 [btrfs]
+[10480.338374] ? init_once+0x20/0x20 [btrfs]
+[10480.339711] btrfs_log_inode_parent+0x8d3/0xd10 [btrfs]
+[10480.341257] ? dget_parent+0x97/0x2e0
+[10480.342480] btrfs_log_dentry_safe+0x3a/0x50 [btrfs]
+[10480.343977] btrfs_sync_file+0x24b/0x5e0 [btrfs]
+[10480.345381] do_fsync+0x38/0x70
+[10480.346483] __x64_sys_fsync+0x10/0x20
+[10480.347703] do_syscall_64+0x2d/0x70
+[10480.348891] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+[10480.350444] RIP: 0033:0x7fa1dc80970b
+[10480.351642] Code: 0f 05 48 3d 00 f0 ff ff 77 45 c3 0f 1f 40 00 48 (...)
+[10480.356952] RSP: 002b:00007fffb3d081d0 EFLAGS: 00000293 ORIG_RAX: 000000000000004a
+[10480.359458] RAX: ffffffffffffffda RBX: 0000562d93d45e40 RCX: 00007fa1dc80970b
+[10480.361426] RDX: 0000562d93d44ab0 RSI: 0000562d93d45e60 RDI: 0000000000000003
+[10480.363367] RBP: 0000000000000001 R08: 0000000000000000 R09: 00007fa1dc7b2a40
+[10480.365317] R10: 0000562d93d0e366 R11: 0000000000000293 R12: 0000000000000001
+[10480.367299] R13: 0000562d93d45290 R14: 0000562d93d45e40 R15: 0000562d93d45e60
+
+Link: https://lore.kernel.org/linux-btrfs/20180713090216.GC575@fnst.localdomain/
+Reported-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+CC: stable@vger.kernel.org # 4.4+
+Tested-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-log.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3615,6 +3615,7 @@ static noinline int log_dir_items(struct
+ * search and this search we'll not find the key again and can just
+ * bail.
+ */
++search:
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+ if (ret != 0)
+ goto done;
+@@ -3634,6 +3635,13 @@ static noinline int log_dir_items(struct
+
+ if (min_key.objectid != ino || min_key.type != key_type)
+ goto done;
++
++ if (need_resched()) {
++ btrfs_release_path(path);
++ cond_resched();
++ goto search;
++ }
++
+ ret = overwrite_item(trans, log, dst_path, src, i,
+ &min_key);
+ if (ret) {
--- /dev/null
+From 6b613cc97f0ace77f92f7bc112b8f6ad3f52baf8 Mon Sep 17 00:00:00 2001
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Date: Tue, 22 Sep 2020 17:27:29 +0900
+Subject: btrfs: reschedule when cloning lots of extents
+
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+
+commit 6b613cc97f0ace77f92f7bc112b8f6ad3f52baf8 upstream.
+
+We have several occurrences of a soft lockup from fstest's generic/175
+testcase, which look more or less like this one:
+
+ watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [xfs_io:10030]
+ Kernel panic - not syncing: softlockup: hung tasks
+ CPU: 0 PID: 10030 Comm: xfs_io Tainted: G L 5.9.0-rc5+ #768
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4-rebuilt.opensuse.org 04/01/2014
+ Call Trace:
+ <IRQ>
+ dump_stack+0x77/0xa0
+ panic+0xfa/0x2cb
+ watchdog_timer_fn.cold+0x85/0xa5
+ ? lockup_detector_update_enable+0x50/0x50
+ __hrtimer_run_queues+0x99/0x4c0
+ ? recalibrate_cpu_khz+0x10/0x10
+ hrtimer_run_queues+0x9f/0xb0
+ update_process_times+0x28/0x80
+ tick_handle_periodic+0x1b/0x60
+ __sysvec_apic_timer_interrupt+0x76/0x210
+ asm_call_on_stack+0x12/0x20
+ </IRQ>
+ sysvec_apic_timer_interrupt+0x7f/0x90
+ asm_sysvec_apic_timer_interrupt+0x12/0x20
+ RIP: 0010:btrfs_tree_unlock+0x91/0x1a0 [btrfs]
+ RSP: 0018:ffffc90007123a58 EFLAGS: 00000282
+ RAX: ffff8881cea2fbe0 RBX: ffff8881cea2fbe0 RCX: 0000000000000000
+ RDX: ffff8881d23fd200 RSI: ffffffff82045220 RDI: ffff8881cea2fba0
+ RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000032
+ R10: 0000160000000000 R11: 0000000000001000 R12: 0000000000001000
+ R13: ffff8882357fd5b0 R14: ffff88816fa76e70 R15: ffff8881cea2fad0
+ ? btrfs_tree_unlock+0x15b/0x1a0 [btrfs]
+ btrfs_release_path+0x67/0x80 [btrfs]
+ btrfs_insert_replace_extent+0x177/0x2c0 [btrfs]
+ btrfs_replace_file_extents+0x472/0x7c0 [btrfs]
+ btrfs_clone+0x9ba/0xbd0 [btrfs]
+ btrfs_clone_files.isra.0+0xeb/0x140 [btrfs]
+ ? file_update_time+0xcd/0x120
+ btrfs_remap_file_range+0x322/0x3b0 [btrfs]
+ do_clone_file_range+0xb7/0x1e0
+ vfs_clone_file_range+0x30/0xa0
+ ioctl_file_clone+0x8a/0xc0
+ do_vfs_ioctl+0x5b2/0x6f0
+ __x64_sys_ioctl+0x37/0xa0
+ do_syscall_64+0x33/0x40
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ RIP: 0033:0x7f87977fc247
+ RSP: 002b:00007ffd51a2f6d8 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
+ RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f87977fc247
+ RDX: 00007ffd51a2f710 RSI: 000000004020940d RDI: 0000000000000003
+ RBP: 0000000000000004 R08: 00007ffd51a79080 R09: 0000000000000000
+ R10: 00005621f11352f2 R11: 0000000000000206 R12: 0000000000000000
+ R13: 0000000000000000 R14: 00005621f128b958 R15: 0000000080000000
+ Kernel Offset: disabled
+ ---[ end Kernel panic - not syncing: softlockup: hung tasks ]---
+
+All of these lockup reports have the call chain btrfs_clone_files() ->
+btrfs_clone() in common. btrfs_clone_files() calls btrfs_clone() with
+both source and destination extents locked and loops over the source
+extent to create the clones.
+
+Conditionally reschedule in the btrfs_clone() loop, to give some time back
+to other processes.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/reflink.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/btrfs/reflink.c
++++ b/fs/btrfs/reflink.c
+@@ -520,6 +520,8 @@ process_slot:
+ ret = -EINTR;
+ goto out;
+ }
++
++ cond_resched();
+ }
+ ret = 0;
+
--- /dev/null
+From 98272bb77bf4cc20ed1ffca89832d713e70ebf09 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 21 Sep 2020 14:13:29 +0100
+Subject: btrfs: send, orphanize first all conflicting inodes when processing references
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 98272bb77bf4cc20ed1ffca89832d713e70ebf09 upstream.
+
+When doing an incremental send it is possible that when processing the new
+references for an inode we end up issuing rename or link operations that
+have an invalid path, which contains the orphanized name of a directory
+before we actually orphanized it, causing the receiver to fail.
+
+The following reproducer triggers such scenario:
+
+ $ cat reproducer.sh
+ #!/bin/bash
+
+ mkfs.btrfs -f /dev/sdi >/dev/null
+ mount /dev/sdi /mnt/sdi
+
+ touch /mnt/sdi/a
+ touch /mnt/sdi/b
+ mkdir /mnt/sdi/testdir
+ # We want "a" to have a lower inode number than "testdir" (257 vs 259).
+ mv /mnt/sdi/a /mnt/sdi/testdir/a
+
+ # Filesystem looks like:
+ #
+ # . (ino 256)
+ # |----- testdir/ (ino 259)
+ # | |----- a (ino 257)
+ # |
+ # |----- b (ino 258)
+
+ btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap1
+ btrfs send -f /tmp/snap1.send /mnt/sdi/snap1
+
+ # Now rename 259 to "testdir_2", then change the name of 257 to
+ # "testdir" and make it a direct descendant of the root inode (256).
+ # Also create a new link for inode 257 with the old name of inode 258.
+ # This swaps the names and locations of several inodes and creates a
+ # nasty dependency chain of rename and link operations.
+ mv /mnt/sdi/testdir/a /mnt/sdi/a2
+ touch /mnt/sdi/testdir/a
+ mv /mnt/sdi/b /mnt/sdi/b2
+ ln /mnt/sdi/a2 /mnt/sdi/b
+ mv /mnt/sdi/testdir /mnt/sdi/testdir_2
+ mv /mnt/sdi/a2 /mnt/sdi/testdir
+
+ # Filesystem now looks like:
+ #
+ # . (ino 256)
+ # |----- testdir_2/ (ino 259)
+ # | |----- a (ino 260)
+ # |
+ # |----- testdir (ino 257)
+ # |----- b (ino 257)
+ # |----- b2 (ino 258)
+
+ btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap2
+ btrfs send -f /tmp/snap2.send -p /mnt/sdi/snap1 /mnt/sdi/snap2
+
+ mkfs.btrfs -f /dev/sdj >/dev/null
+ mount /dev/sdj /mnt/sdj
+
+ btrfs receive -f /tmp/snap1.send /mnt/sdj
+ btrfs receive -f /tmp/snap2.send /mnt/sdj
+
+ umount /mnt/sdi
+ umount /mnt/sdj
+
+When running the reproducer, the receive of the incremental send stream
+fails:
+
+ $ ./reproducer.sh
+ Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1'
+ At subvol /mnt/sdi/snap1
+ Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2'
+ At subvol /mnt/sdi/snap2
+ At subvol snap1
+ At snapshot snap2
+ ERROR: link b -> o259-6-0/a failed: No such file or directory
+
+The problem happens because of the following:
+
+1) Before we start iterating the list of new references for inode 257,
+ we generate its current path and store it at @valid_path, done at
+ the very beginning of process_recorded_refs(). The generated path
+ is "o259-6-0/a", containing the orphanized name for inode 259;
+
+2) Then we iterate over the list of new references, which has the
+ references "b" and "testdir" in that specific order;
+
+3) We process reference "b" first, because it is in the list before
+ reference "testdir". We then issue a link operation to create
+ the new reference "b" using a target path corresponding to the
+ content at @valid_path, which corresponds to "o259-6-0/a".
+ However we haven't yet orphanized inode 259, its name is still
+ "testdir", and not "o259-6-0". The orphanization of 259 did not
+ happen yet because we will process the reference named "testdir"
+ for inode 257 only in the next iteration of the loop that goes
+ over the list of new references.
+
+Fix the issue by having a preliminary iteration over all the new references
+at process_recorded_refs(). This iteration is responsible only for doing
+the orphanization of other inodes that have an old reference that
+conflicts with one of the new references of the inode we are currently
+processing. The emission of rename and link operations now happens in the
+next iteration of the new references.
+
+A test case for fstests will follow soon.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/send.c | 127 ++++++++++++++++++++++++++++++++++++++------------------
+ 1 file changed, 87 insertions(+), 40 deletions(-)
+
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -3880,52 +3880,56 @@ static int process_recorded_refs(struct
+ goto out;
+ }
+
++ /*
++ * Before doing any rename and link operations, do a first pass on the
++ * new references to orphanize any unprocessed inodes that may have a
++ * reference that conflicts with one of the new references of the current
++ * inode. This needs to happen first because a new reference may conflict
++ * with the old reference of a parent directory, so we must make sure
++ * that the path used for link and rename commands don't use an
++ * orphanized name when an ancestor was not yet orphanized.
++ *
++ * Example:
++ *
++ * Parent snapshot:
++ *
++ * . (ino 256)
++ * |----- testdir/ (ino 259)
++ * | |----- a (ino 257)
++ * |
++ * |----- b (ino 258)
++ *
++ * Send snapshot:
++ *
++ * . (ino 256)
++ * |----- testdir_2/ (ino 259)
++ * | |----- a (ino 260)
++ * |
++ * |----- testdir (ino 257)
++ * |----- b (ino 257)
++ * |----- b2 (ino 258)
++ *
++ * Processing the new reference for inode 257 with name "b" may happen
++ * before processing the new reference with name "testdir". If so, we
++ * must make sure that by the time we send a link command to create the
++ * hard link "b", inode 259 was already orphanized, since the generated
++ * path in "valid_path" already contains the orphanized name for 259.
++ * We are processing inode 257, so only later when processing 259 we do
++ * the rename operation to change its temporary (orphanized) name to
++ * "testdir_2".
++ */
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+- /*
+- * We may have refs where the parent directory does not exist
+- * yet. This happens if the parent directories inum is higher
+- * than the current inum. To handle this case, we create the
+- * parent directory out of order. But we need to check if this
+- * did already happen before due to other refs in the same dir.
+- */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+- if (ret == inode_state_will_create) {
+- ret = 0;
+- /*
+- * First check if any of the current inodes refs did
+- * already create the dir.
+- */
+- list_for_each_entry(cur2, &sctx->new_refs, list) {
+- if (cur == cur2)
+- break;
+- if (cur2->dir == cur->dir) {
+- ret = 1;
+- break;
+- }
+- }
+-
+- /*
+- * If that did not happen, check if a previous inode
+- * did already create the dir.
+- */
+- if (!ret)
+- ret = did_create_dir(sctx, cur->dir);
+- if (ret < 0)
+- goto out;
+- if (!ret) {
+- ret = send_create_inode(sctx, cur->dir);
+- if (ret < 0)
+- goto out;
+- }
+- }
++ if (ret == inode_state_will_create)
++ continue;
+
+ /*
+- * Check if this new ref would overwrite the first ref of
+- * another unprocessed inode. If yes, orphanize the
+- * overwritten inode. If we find an overwritten ref that is
+- * not the first ref, simply unlink it.
++ * Check if this new ref would overwrite the first ref of another
++ * unprocessed inode. If yes, orphanize the overwritten inode.
++ * If we find an overwritten ref that is not the first ref,
++ * simply unlink it.
+ */
+ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+ cur->name, cur->name_len,
+@@ -4002,6 +4006,49 @@ static int process_recorded_refs(struct
+ if (ret < 0)
+ goto out;
+ }
++ }
++
++ }
++
++ list_for_each_entry(cur, &sctx->new_refs, list) {
++ /*
++ * We may have refs where the parent directory does not exist
++ * yet. This happens if the parent directories inum is higher
++ * than the current inum. To handle this case, we create the
++ * parent directory out of order. But we need to check if this
++ * did already happen before due to other refs in the same dir.
++ */
++ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
++ if (ret < 0)
++ goto out;
++ if (ret == inode_state_will_create) {
++ ret = 0;
++ /*
++ * First check if any of the current inodes refs did
++ * already create the dir.
++ */
++ list_for_each_entry(cur2, &sctx->new_refs, list) {
++ if (cur == cur2)
++ break;
++ if (cur2->dir == cur->dir) {
++ ret = 1;
++ break;
++ }
++ }
++
++ /*
++ * If that did not happen, check if a previous inode
++ * did already create the dir.
++ */
++ if (!ret)
++ ret = did_create_dir(sctx, cur->dir);
++ if (ret < 0)
++ goto out;
++ if (!ret) {
++ ret = send_create_inode(sctx, cur->dir);
++ if (ret < 0)
++ goto out;
++ }
+ }
+
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
--- /dev/null
+From 9c2b4e0347067396ceb3ae929d6888c81d610259 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 21 Sep 2020 14:13:30 +0100
+Subject: btrfs: send, recompute reference path after orphanization of a directory
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 9c2b4e0347067396ceb3ae929d6888c81d610259 upstream.
+
+During an incremental send, when an inode has multiple new references we
+might end up emitting rename operations for orphanizations that have a
+source path that is no longer valid due to a previous orphanization of
+some directory inode. This causes the receiver to fail since it tries
+to rename a path that does not exist.
+
+Example reproducer:
+
+ $ cat reproducer.sh
+ #!/bin/bash
+
+ mkfs.btrfs -f /dev/sdi >/dev/null
+ mount /dev/sdi /mnt/sdi
+
+ touch /mnt/sdi/f1
+ touch /mnt/sdi/f2
+ mkdir /mnt/sdi/d1
+ mkdir /mnt/sdi/d1/d2
+
+ # Filesystem looks like:
+ #
+ # . (ino 256)
+ # |----- f1 (ino 257)
+ # |----- f2 (ino 258)
+ # |----- d1/ (ino 259)
+ # |----- d2/ (ino 260)
+
+ btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap1
+ btrfs send -f /tmp/snap1.send /mnt/sdi/snap1
+
+ # Now do a series of changes such that:
+ #
+ # *) inode 258 has one new hardlink and the previous name changed
+ #
+ # *) both names conflict with the old names of two other inodes:
+ #
+ # 1) the new name "d1" conflicts with the old name of inode 259,
+ # under directory inode 256 (root)
+ #
+ # 2) the new name "d2" conflicts with the old name of inode 260
+ # under directory inode 259
+ #
+ # *) inodes 259 and 260 now have the old names of inode 258
+ #
+ # *) inode 257 is now located under inode 260 - an inode with a number
+ # smaller than the inode (258) for which we created a second hard
+ # link and swapped its names with inodes 259 and 260
+ #
+ ln /mnt/sdi/f2 /mnt/sdi/d1/f2_link
+ mv /mnt/sdi/f1 /mnt/sdi/d1/d2/f1
+
+ # Swap d1 and f2.
+ mv /mnt/sdi/d1 /mnt/sdi/tmp
+ mv /mnt/sdi/f2 /mnt/sdi/d1
+ mv /mnt/sdi/tmp /mnt/sdi/f2
+
+ # Swap d2 and f2_link
+ mv /mnt/sdi/f2/d2 /mnt/sdi/tmp
+ mv /mnt/sdi/f2/f2_link /mnt/sdi/f2/d2
+ mv /mnt/sdi/tmp /mnt/sdi/f2/f2_link
+
+ # Filesystem now looks like:
+ #
+ # . (ino 256)
+ # |----- d1 (ino 258)
+ # |----- f2/ (ino 259)
+ # |----- f2_link/ (ino 260)
+ # | |----- f1 (ino 257)
+ # |
+ # |----- d2 (ino 258)
+
+ btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap2
+ btrfs send -f /tmp/snap2.send -p /mnt/sdi/snap1 /mnt/sdi/snap2
+
+ mkfs.btrfs -f /dev/sdj >/dev/null
+ mount /dev/sdj /mnt/sdj
+
+ btrfs receive -f /tmp/snap1.send /mnt/sdj
+ btrfs receive -f /tmp/snap2.send /mnt/sdj
+
+ umount /mnt/sdi
+ umount /mnt/sdj
+
+When executed the receive of the incremental stream fails:
+
+ $ ./reproducer.sh
+ Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1'
+ At subvol /mnt/sdi/snap1
+ Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2'
+ At subvol /mnt/sdi/snap2
+ At subvol snap1
+ At snapshot snap2
+ ERROR: rename d1/d2 -> o260-6-0 failed: No such file or directory
+
+This happens because:
+
+1) When processing inode 257 we end up computing the name for inode 259
+ because it is an ancestor in the send snapshot, and at that point it
+ still has its old name, "d1", from the parent snapshot because inode
+ 259 was not yet processed. We then cache that name, which is valid
+ until we start processing inode 259 (or set the progress to 260 after
+ processing its references);
+
+2) Later we start processing inode 258 and collecting all its new
+ references into the list sctx->new_refs. The first reference in the
+ list happens to be the reference for name "d1" while the reference for
+ name "d2" is next (the last element of the list).
+ We compute the full path "d1/d2" for this second reference and store
+ it in the reference (its ->full_path member). The path used for the
+ new parent directory was "d1" and not "f2" because inode 259, the
+ new parent, was not yet processed;
+
+3) When we start processing the new references at process_recorded_refs()
+ we start with the first reference in the list, for the new name "d1".
+ Because there is a conflicting inode that was not yet processed, which
+ is directory inode 259, we orphanize it, renaming it from "d1" to
+ "o259-6-0";
+
+4) Then we start processing the new reference for name "d2", and we
+ realize it conflicts with the reference of inode 260 in the parent
+ snapshot. So we issue an orphanization operation for inode 260 by
+ emitting a rename operation with a destination path of "o260-6-0"
+ and a source path of "d1/d2" - this source path is the value we
+ stored in the reference earlier at step 2), corresponding to the
+ ->full_path member of the reference, however that path is no longer
+ valid due to the orphanization of the directory inode 259 in step 3).
+ This makes the receiver fail since the path does not exist; it should
+ have been "o259-6-0/d2".
+
+Fix this by recomputing the full path of a reference before emitting an
+orphanization if we previously orphanized any directory, since that
+directory could be a parent in the new path. This is a rare scenario so
+keeping it simple and not checking if that previously orphanized directory
+is in fact an ancestor of the inode we are trying to orphanize.
+
+A test case for fstests follows soon.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/send.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 72 insertions(+)
+
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -3813,6 +3813,72 @@ static int update_ref_path(struct send_c
+ }
+
+ /*
++ * When processing the new references for an inode we may orphanize an existing
++ * directory inode because its old name conflicts with one of the new references
++ * of the current inode. Later, when processing another new reference of our
++ * inode, we might need to orphanize another inode, but the path we have in the
++ * reference reflects the pre-orphanization name of the directory we previously
++ * orphanized. For example:
++ *
++ * parent snapshot looks like:
++ *
++ * . (ino 256)
++ * |----- f1 (ino 257)
++ * |----- f2 (ino 258)
++ * |----- d1/ (ino 259)
++ * |----- d2/ (ino 260)
++ *
++ * send snapshot looks like:
++ *
++ * . (ino 256)
++ * |----- d1 (ino 258)
++ * |----- f2/ (ino 259)
++ * |----- f2_link/ (ino 260)
++ * | |----- f1 (ino 257)
++ * |
++ * |----- d2 (ino 258)
++ *
++ * When processing inode 257 we compute the name for inode 259 as "d1", and we
++ * cache it in the name cache. Later when we start processing inode 258, when
++ * collecting all its new references we set a full path of "d1/d2" for its new
++ * reference with name "d2". When we start processing the new references we
++ * start by processing the new reference with name "d1", and this results in
++ * orphanizing inode 259, since its old reference causes a conflict. Then we
++ * move on the next new reference, with name "d2", and we find out we must
++ * orphanize inode 260, as its old reference conflicts with ours - but for the
++ * orphanization we use a source path corresponding to the path we stored in the
++ * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
++ * receiver fail since the path component "d1/" no longer exists, it was renamed
++ * to "o259-6-0/" when processing the previous new reference. So in this case we
++ * must recompute the path in the new reference and use it for the new
++ * orphanization operation.
++ */
++static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
++{
++ char *name;
++ int ret;
++
++ name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
++ if (!name)
++ return -ENOMEM;
++
++ fs_path_reset(ref->full_path);
++ ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
++ if (ret < 0)
++ goto out;
++
++ ret = fs_path_add(ref->full_path, name, ref->name_len);
++ if (ret < 0)
++ goto out;
++
++ /* Update the reference's base name pointer. */
++ set_ref_path(ref, ref->full_path);
++out:
++ kfree(name);
++ return ret;
++}
++
++/*
+ * This does all the move/link/unlink/rmdir magic.
+ */
+ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
+@@ -3946,6 +4012,12 @@ static int process_recorded_refs(struct
+ struct name_cache_entry *nce;
+ struct waiting_dir_move *wdm;
+
++ if (orphanized_dir) {
++ ret = refresh_ref_path(sctx, cur);
++ if (ret < 0)
++ goto out;
++ }
++
+ ret = orphanize_inode(sctx, ow_inode, ow_gen,
+ cur->full_path);
+ if (ret < 0)
--- /dev/null
+From 96c2e067ed3e3e004580a643c76f58729206b829 Mon Sep 17 00:00:00 2001
+From: Anand Jain <anand.jain@oracle.com>
+Date: Wed, 30 Sep 2020 21:09:52 +0800
+Subject: btrfs: skip devices without magic signature when mounting
+
+From: Anand Jain <anand.jain@oracle.com>
+
+commit 96c2e067ed3e3e004580a643c76f58729206b829 upstream.
+
+Many things can happen after the device is scanned and before the device
+is mounted. One such thing is losing the BTRFS_MAGIC on the device.
+If it happens we still won't free that device from memory, which causes
+confusion in userland.
+
+For example: As the BTRFS_IOC_DEV_INFO still carries the device path
+which does not have the BTRFS_MAGIC, 'btrfs fi show' still lists a
+device which does not belong to the filesystem anymore:
+
+ $ mkfs.btrfs -fq -draid1 -mraid1 /dev/sda /dev/sdb
+ $ wipefs -a /dev/sdb
+ # /dev/sdb does not contain magic signature
+ $ mount -o degraded /dev/sda /btrfs
+ $ btrfs fi show -m
+ Label: none uuid: 470ec6fb-646b-4464-b3cb-df1b26c527bd
+ Total devices 2 FS bytes used 128.00KiB
+ devid 1 size 3.00GiB used 571.19MiB path /dev/sda
+ devid 2 size 3.00GiB used 571.19MiB path /dev/sdb
+
+We need to distinguish the missing signature and invalid superblock, so
+add a specific error code ENODATA for that. This also fixes failure of
+fstest btrfs/198.
+
+CC: stable@vger.kernel.org # 4.19+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/disk-io.c | 8 ++++++--
+ fs/btrfs/volumes.c | 18 ++++++++++++------
+ 2 files changed, 18 insertions(+), 8 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3482,8 +3482,12 @@ struct btrfs_super_block *btrfs_read_dev
+ return ERR_CAST(page);
+
+ super = page_address(page);
+- if (btrfs_super_bytenr(super) != bytenr ||
+- btrfs_super_magic(super) != BTRFS_MAGIC) {
++ if (btrfs_super_magic(super) != BTRFS_MAGIC) {
++ btrfs_release_disk_super(super);
++ return ERR_PTR(-ENODATA);
++ }
++
++ if (btrfs_super_bytenr(super) != bytenr) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-EINVAL);
+ }
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -1200,17 +1200,23 @@ static int open_fs_devices(struct btrfs_
+ {
+ struct btrfs_device *device;
+ struct btrfs_device *latest_dev = NULL;
++ struct btrfs_device *tmp_device;
+
+ flags |= FMODE_EXCL;
+
+- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+- /* Just open everything we can; ignore failures here */
+- if (btrfs_open_one_device(fs_devices, device, flags, holder))
+- continue;
++ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
++ dev_list) {
++ int ret;
+
+- if (!latest_dev ||
+- device->generation > latest_dev->generation)
++ ret = btrfs_open_one_device(fs_devices, device, flags, holder);
++ if (ret == 0 &&
++ (!latest_dev || device->generation > latest_dev->generation)) {
+ latest_dev = device;
++ } else if (ret == -ENODATA) {
++ fs_devices->num_devices--;
++ list_del(&device->dev_list);
++ btrfs_free_device(device);
++ }
+ }
+ if (fs_devices->open_devices == 0)
+ return -EINVAL;
--- /dev/null
+From ca10845a56856fff4de3804c85e6424d0f6d0cde Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Tue, 1 Sep 2020 08:09:01 -0400
+Subject: btrfs: sysfs: init devices outside of the chunk_mutex
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit ca10845a56856fff4de3804c85e6424d0f6d0cde upstream.
+
+While running btrfs/061, btrfs/073, btrfs/078, or btrfs/178 we hit the
+following lockdep splat:
+
+ ======================================================
+ WARNING: possible circular locking dependency detected
+ 5.9.0-rc3+ #4 Not tainted
+ ------------------------------------------------------
+ kswapd0/100 is trying to acquire lock:
+ ffff96ecc22ef4a0 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330
+
+ but task is already holding lock:
+ ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #3 (fs_reclaim){+.+.}-{0:0}:
+ fs_reclaim_acquire+0x65/0x80
+ slab_pre_alloc_hook.constprop.0+0x20/0x200
+ kmem_cache_alloc+0x37/0x270
+ alloc_inode+0x82/0xb0
+ iget_locked+0x10d/0x2c0
+ kernfs_get_inode+0x1b/0x130
+ kernfs_get_tree+0x136/0x240
+ sysfs_get_tree+0x16/0x40
+ vfs_get_tree+0x28/0xc0
+ path_mount+0x434/0xc00
+ __x64_sys_mount+0xe3/0x120
+ do_syscall_64+0x33/0x40
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+ -> #2 (kernfs_mutex){+.+.}-{3:3}:
+ __mutex_lock+0x7e/0x7e0
+ kernfs_add_one+0x23/0x150
+ kernfs_create_link+0x63/0xa0
+ sysfs_do_create_link_sd+0x5e/0xd0
+ btrfs_sysfs_add_devices_dir+0x81/0x130
+ btrfs_init_new_device+0x67f/0x1250
+ btrfs_ioctl+0x1ef/0x2e20
+ __x64_sys_ioctl+0x83/0xb0
+ do_syscall_64+0x33/0x40
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+ -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}:
+ __mutex_lock+0x7e/0x7e0
+ btrfs_chunk_alloc+0x125/0x3a0
+ find_free_extent+0xdf6/0x1210
+ btrfs_reserve_extent+0xb3/0x1b0
+ btrfs_alloc_tree_block+0xb0/0x310
+ alloc_tree_block_no_bg_flush+0x4a/0x60
+ __btrfs_cow_block+0x11a/0x530
+ btrfs_cow_block+0x104/0x220
+ btrfs_search_slot+0x52e/0x9d0
+ btrfs_insert_empty_items+0x64/0xb0
+ btrfs_insert_delayed_items+0x90/0x4f0
+ btrfs_commit_inode_delayed_items+0x93/0x140
+ btrfs_log_inode+0x5de/0x2020
+ btrfs_log_inode_parent+0x429/0xc90
+ btrfs_log_new_name+0x95/0x9b
+ btrfs_rename2+0xbb9/0x1800
+ vfs_rename+0x64f/0x9f0
+ do_renameat2+0x320/0x4e0
+ __x64_sys_rename+0x1f/0x30
+ do_syscall_64+0x33/0x40
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+ -> #0 (&delayed_node->mutex){+.+.}-{3:3}:
+ __lock_acquire+0x119c/0x1fc0
+ lock_acquire+0xa7/0x3d0
+ __mutex_lock+0x7e/0x7e0
+ __btrfs_release_delayed_node.part.0+0x3f/0x330
+ btrfs_evict_inode+0x24c/0x500
+ evict+0xcf/0x1f0
+ dispose_list+0x48/0x70
+ prune_icache_sb+0x44/0x50
+ super_cache_scan+0x161/0x1e0
+ do_shrink_slab+0x178/0x3c0
+ shrink_slab+0x17c/0x290
+ shrink_node+0x2b2/0x6d0
+ balance_pgdat+0x30a/0x670
+ kswapd+0x213/0x4c0
+ kthread+0x138/0x160
+ ret_from_fork+0x1f/0x30
+
+ other info that might help us debug this:
+
+ Chain exists of:
+ &delayed_node->mutex --> kernfs_mutex --> fs_reclaim
+
+ Possible unsafe locking scenario:
+
+ CPU0 CPU1
+ ---- ----
+ lock(fs_reclaim);
+ lock(kernfs_mutex);
+ lock(fs_reclaim);
+ lock(&delayed_node->mutex);
+
+ *** DEADLOCK ***
+
+ 3 locks held by kswapd0/100:
+ #0: ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30
+ #1: ffffffff8dd65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290
+ #2: ffff96ed2ade30e0 (&type->s_umount_key#36){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0
+
+ stack backtrace:
+ CPU: 0 PID: 100 Comm: kswapd0 Not tainted 5.9.0-rc3+ #4
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
+ Call Trace:
+ dump_stack+0x8b/0xb8
+ check_noncircular+0x12d/0x150
+ __lock_acquire+0x119c/0x1fc0
+ lock_acquire+0xa7/0x3d0
+ ? __btrfs_release_delayed_node.part.0+0x3f/0x330
+ __mutex_lock+0x7e/0x7e0
+ ? __btrfs_release_delayed_node.part.0+0x3f/0x330
+ ? __btrfs_release_delayed_node.part.0+0x3f/0x330
+ ? lock_acquire+0xa7/0x3d0
+ ? find_held_lock+0x2b/0x80
+ __btrfs_release_delayed_node.part.0+0x3f/0x330
+ btrfs_evict_inode+0x24c/0x500
+ evict+0xcf/0x1f0
+ dispose_list+0x48/0x70
+ prune_icache_sb+0x44/0x50
+ super_cache_scan+0x161/0x1e0
+ do_shrink_slab+0x178/0x3c0
+ shrink_slab+0x17c/0x290
+ shrink_node+0x2b2/0x6d0
+ balance_pgdat+0x30a/0x670
+ kswapd+0x213/0x4c0
+ ? _raw_spin_unlock_irqrestore+0x41/0x50
+ ? add_wait_queue_exclusive+0x70/0x70
+ ? balance_pgdat+0x670/0x670
+ kthread+0x138/0x160
+ ? kthread_create_worker_on_cpu+0x40/0x40
+ ret_from_fork+0x1f/0x30
+
+This happens because we are holding the chunk_mutex at the time of
+adding in a new device. However we only need to hold the
+device_list_mutex, as we're going to iterate over the fs_devices
+devices. Move the sysfs init stuff outside of the chunk_mutex to get
+rid of this lockdep splat.
+
+CC: stable@vger.kernel.org # 4.4.x: f3cd2c58110dad14e: btrfs: sysfs, rename device_link add/remove functions
+CC: stable@vger.kernel.org # 4.4.x
+Reported-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/volumes.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -2613,9 +2613,6 @@ int btrfs_init_new_device(struct btrfs_f
+ btrfs_set_super_num_devices(fs_info->super_copy,
+ orig_super_num_devices + 1);
+
+- /* add sysfs device entry */
+- btrfs_sysfs_add_devices_dir(fs_devices, device);
+-
+ /*
+ * we've got more storage, clear any full flags on the space
+ * infos
+@@ -2623,6 +2620,10 @@ int btrfs_init_new_device(struct btrfs_f
+ btrfs_clear_space_info_full(fs_info);
+
+ mutex_unlock(&fs_info->chunk_mutex);
++
++ /* Add sysfs device entry */
++ btrfs_sysfs_add_devices_dir(fs_devices, device);
++
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (seeding_dev) {
--- /dev/null
+From 437490fed3b0c9ae21af8f70e0f338d34560842b Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 28 Jul 2020 09:42:49 +0800
+Subject: btrfs: tracepoints: output proper root owner for trace_find_free_extent()
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 437490fed3b0c9ae21af8f70e0f338d34560842b upstream.
+
+The current trace event always outputs results like this:
+
+ find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=4(METADATA)
+ find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=4(METADATA)
+ find_free_extent: root=2(EXTENT_TREE) len=8192 empty_size=0 flags=1(DATA)
+ find_free_extent: root=2(EXTENT_TREE) len=8192 empty_size=0 flags=1(DATA)
+ find_free_extent: root=2(EXTENT_TREE) len=4096 empty_size=0 flags=1(DATA)
+ find_free_extent: root=2(EXTENT_TREE) len=4096 empty_size=0 flags=1(DATA)
+
+It's saying we're allocating data extent for EXTENT tree, which is not
+even possible.
+
+It's because we always use EXTENT tree as the owner for
+trace_find_free_extent() without using the @root from
+btrfs_reserve_extent().
+
+This patch will change the parameter to use proper @root for
+trace_find_free_extent():
+
+Now it looks much better:
+
+ find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP)
+ find_free_extent: root=5(FS_TREE) len=8192 empty_size=0 flags=1(DATA)
+ find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=1(DATA)
+ find_free_extent: root=5(FS_TREE) len=4096 empty_size=0 flags=1(DATA)
+ find_free_extent: root=5(FS_TREE) len=8192 empty_size=0 flags=1(DATA)
+ find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP)
+ find_free_extent: root=7(CSUM_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP)
+ find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP)
+ find_free_extent: root=1(ROOT_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP)
+
+Reported-by: Hans van Kranenburg <hans@knorrie.org>
+CC: stable@vger.kernel.org # 5.4+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/extent-tree.c | 7 ++++---
+ include/trace/events/btrfs.h | 10 ++++++----
+ 2 files changed, 10 insertions(+), 7 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3918,11 +3918,12 @@ static int prepare_allocation(struct btr
+ * |- Push harder to find free extents
+ * |- If not found, re-iterate all block groups
+ */
+-static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
++static noinline int find_free_extent(struct btrfs_root *root,
+ u64 ram_bytes, u64 num_bytes, u64 empty_size,
+ u64 hint_byte_orig, struct btrfs_key *ins,
+ u64 flags, int delalloc)
+ {
++ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
+ int cache_block_group_error = 0;
+ struct btrfs_block_group *block_group = NULL;
+@@ -3954,7 +3955,7 @@ static noinline int find_free_extent(str
+ ins->objectid = 0;
+ ins->offset = 0;
+
+- trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
++ trace_find_free_extent(root, num_bytes, empty_size, flags);
+
+ space_info = btrfs_find_space_info(fs_info, flags);
+ if (!space_info) {
+@@ -4203,7 +4204,7 @@ int btrfs_reserve_extent(struct btrfs_ro
+ flags = get_alloc_profile_by_root(root, is_data);
+ again:
+ WARN_ON(num_bytes < fs_info->sectorsize);
+- ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
++ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
+ hint_byte, ins, flags, delalloc);
+ if (!ret && !is_data) {
+ btrfs_dec_block_group_reservations(fs_info, ins->objectid);
+--- a/include/trace/events/btrfs.h
++++ b/include/trace/events/btrfs.h
+@@ -1176,25 +1176,27 @@ DEFINE_EVENT(btrfs__reserved_extent, bt
+
+ TRACE_EVENT(find_free_extent,
+
+- TP_PROTO(const struct btrfs_fs_info *fs_info, u64 num_bytes,
++ TP_PROTO(const struct btrfs_root *root, u64 num_bytes,
+ u64 empty_size, u64 data),
+
+- TP_ARGS(fs_info, num_bytes, empty_size, data),
++ TP_ARGS(root, num_bytes, empty_size, data),
+
+ TP_STRUCT__entry_btrfs(
++ __field( u64, root_objectid )
+ __field( u64, num_bytes )
+ __field( u64, empty_size )
+ __field( u64, data )
+ ),
+
+- TP_fast_assign_btrfs(fs_info,
++ TP_fast_assign_btrfs(root->fs_info,
++ __entry->root_objectid = root->root_key.objectid;
+ __entry->num_bytes = num_bytes;
+ __entry->empty_size = empty_size;
+ __entry->data = data;
+ ),
+
+ TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)",
+- show_root_type(BTRFS_EXTENT_TREE_OBJECTID),
++ show_root_type(__entry->root_objectid),
+ __entry->num_bytes, __entry->empty_size, __entry->data,
+ __print_flags((unsigned long)__entry->data, "|",
+ BTRFS_GROUP_FLAGS))
--- /dev/null
+From 1465af12e254a68706e110846f59cf0f09683184 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 22 Sep 2020 10:37:01 +0800
+Subject: btrfs: tree-checker: fix false alert caused by legacy btrfs root item
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 1465af12e254a68706e110846f59cf0f09683184 upstream.
+
+Commit 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check")
+introduced btrfs root item size check, however btrfs root item has two
+versions, the legacy one which just ends before generation_v2 member, is
+smaller than current btrfs root item size.
+
+This caused btrfs kernel to reject valid but old tree root leaves.
+
+Fix this problem by also allowing legacy root item, since kernel can
+already handle them pretty well and upgrade to newer root item format
+when needed.
+
+Reported-by: Martin Steigerwald <martin@lichtvoll.de>
+Fixes: 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check")
+CC: stable@vger.kernel.org # 5.4+
+Tested-By: Martin Steigerwald <martin@lichtvoll.de>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-checker.c | 17 ++++++++++++-----
+ include/uapi/linux/btrfs_tree.h | 14 ++++++++++++++
+ 2 files changed, 26 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -1035,7 +1035,7 @@ static int check_root_item(struct extent
+ int slot)
+ {
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+- struct btrfs_root_item ri;
++ struct btrfs_root_item ri = { 0 };
+ const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
+ BTRFS_ROOT_SUBVOL_DEAD;
+ int ret;
+@@ -1044,14 +1044,21 @@ static int check_root_item(struct extent
+ if (ret < 0)
+ return ret;
+
+- if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
++ if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
++ btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) {
+ generic_err(leaf, slot,
+- "invalid root item size, have %u expect %zu",
+- btrfs_item_size_nr(leaf, slot), sizeof(ri));
++ "invalid root item size, have %u expect %zu or %u",
++ btrfs_item_size_nr(leaf, slot), sizeof(ri),
++ btrfs_legacy_root_item_size());
+ }
+
++ /*
++ * For legacy root item, the members starting at generation_v2 will be
++ * all filled with 0.
++ * And since we allow geneartion_v2 as 0, it will still pass the check.
++ */
+ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
+- sizeof(ri));
++ btrfs_item_size_nr(leaf, slot));
+
+ /* Generation related */
+ if (btrfs_root_generation(&ri) >
+--- a/include/uapi/linux/btrfs_tree.h
++++ b/include/uapi/linux/btrfs_tree.h
+@@ -4,6 +4,11 @@
+
+ #include <linux/btrfs.h>
+ #include <linux/types.h>
++#ifdef __KERNEL__
++#include <linux/stddef.h>
++#else
++#include <stddef.h>
++#endif
+
+ /*
+ * This header contains the structure definitions and constants used
+@@ -645,6 +650,15 @@ struct btrfs_root_item {
+ } __attribute__ ((__packed__));
+
+ /*
++ * Btrfs root item used to be smaller than current size. The old format ends
++ * at where member generation_v2 is.
++ */
++static inline __u32 btrfs_legacy_root_item_size(void)
++{
++ return offsetof(struct btrfs_root_item, generation_v2);
++}
++
++/*
+ * this is used for both forward and backward root refs
+ */
+ struct btrfs_root_ref {
--- /dev/null
+From 85d07fbe09efd1c529ff3e025e2f0d2c6c96a1b7 Mon Sep 17 00:00:00 2001
+From: Daniel Xu <dxu@dxuuu.xyz>
+Date: Thu, 8 Oct 2020 18:09:10 -0700
+Subject: btrfs: tree-checker: validate number of chunk stripes and parity
+
+From: Daniel Xu <dxu@dxuuu.xyz>
+
+commit 85d07fbe09efd1c529ff3e025e2f0d2c6c96a1b7 upstream.
+
+If there's no parity and num_stripes < ncopies, a crafted image can
+trigger a division by zero in calc_stripe_length().
+
+The image was generated through fuzzing.
+
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=209587
+Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-checker.c | 18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -760,18 +760,36 @@ int btrfs_check_chunk_valid(struct exten
+ u64 type;
+ u64 features;
+ bool mixed = false;
++ int raid_index;
++ int nparity;
++ int ncopies;
+
+ length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
++ raid_index = btrfs_bg_flags_to_raid_index(type);
++ ncopies = btrfs_raid_array[raid_index].ncopies;
++ nparity = btrfs_raid_array[raid_index].nparity;
+
+ if (!num_stripes) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk num_stripes, have %u", num_stripes);
+ return -EUCLEAN;
+ }
++ if (num_stripes < ncopies) {
++ chunk_err(leaf, chunk, logical,
++ "invalid chunk num_stripes < ncopies, have %u < %d",
++ num_stripes, ncopies);
++ return -EUCLEAN;
++ }
++ if (nparity && num_stripes == nparity) {
++ chunk_err(leaf, chunk, logical,
++ "invalid chunk num_stripes == nparity, have %u == %d",
++ num_stripes, nparity);
++ return -EUCLEAN;
++ }
+ if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk logical, have %llu should aligned to %u",
--- /dev/null
+From 8eb2fd00153a3a96a19c62ac9c6d48c2efebe5e8 Mon Sep 17 00:00:00 2001
+From: Denis Efremov <efremov@linux.com>
+Date: Mon, 21 Sep 2020 20:03:35 +0300
+Subject: btrfs: use kvzalloc() to allocate clone_roots in btrfs_ioctl_send()
+
+From: Denis Efremov <efremov@linux.com>
+
+commit 8eb2fd00153a3a96a19c62ac9c6d48c2efebe5e8 upstream.
+
+btrfs_ioctl_send() used open-coded kvzalloc implementation earlier.
+The code was accidentally replaced with kzalloc() call [1]. Restore
+the original code by using kvzalloc() to allocate sctx->clone_roots.
+
+[1] https://patchwork.kernel.org/patch/9757891/#20529627
+
+Fixes: 818e010bf9d0 ("btrfs: replace opencoded kvzalloc with the helper")
+CC: stable@vger.kernel.org # 4.14+
+Signed-off-by: Denis Efremov <efremov@linux.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/send.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -7300,7 +7300,7 @@ long btrfs_ioctl_send(struct file *mnt_f
+
+ alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
+
+- sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL);
++ sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL);
+ if (!sctx->clone_roots) {
+ ret = -ENOMEM;
+ goto out;
--- /dev/null
+From d12544fb2aa9944b180c35914031a8384ab082c1 Mon Sep 17 00:00:00 2001
+From: Xiang Chen <chenxiang66@hisilicon.com>
+Date: Tue, 22 Sep 2020 21:11:06 +0800
+Subject: PM: runtime: Remove link state checks in rpm_get/put_supplier()
+
+From: Xiang Chen <chenxiang66@hisilicon.com>
+
+commit d12544fb2aa9944b180c35914031a8384ab082c1 upstream.
+
+To support runtime PM for hisi SAS driver (the driver is in directory
+drivers/scsi/hisi_sas), we add device link between scsi_device->sdev_gendev
+(consumer device) and hisi_hba->dev(supplier device) with flags
+DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE.
+
+After runtime suspending the consumers and the supplier, unloading the driver
+causes a hang.
+
+We found that it called function device_release_driver_internal() to
+release the supplier device (hisi_hba->dev), as the device link was
+busy, it set the device link state to DL_STATE_SUPPLIER_UNBIND, and
+then it called device_release_driver_internal() to release the consumer
+device (scsi_device->sdev_gendev).
+
+Then it would try to call pm_runtime_get_sync() to resume the consumer
+device, but because consumer-supplier relation existed, it would try
+to resume the supplier first, but as the link state was already
+DL_STATE_SUPPLIER_UNBIND, so it skipped resuming the supplier and only
+resumed the consumer, which then hung (it sends IOs to resume scsi_device
+while the SAS controller is suspended).
+
+Simple flow is as follows:
+
+device_release_driver_internal -> (supplier device)
+ if device_links_busy ->
+ device_links_unbind_consumers ->
+ ...
+ WRITE_ONCE(link->status, DL_STATE_SUPPLIER_UNBIND)
+ device_release_driver_internal (consumer device)
+ pm_runtime_get_sync -> (consumer device)
+ ...
+ __rpm_callback ->
+ rpm_get_suppliers ->
+ if link->state == DL_STATE_SUPPLIER_UNBIND -> skip the action of resuming the supplier
+ ...
+ pm_runtime_clean_up_links
+ ...
+
+Correct suspend/resume ordering between a supplier device and its consumer
+devices (resume the supplier device before resuming consumer devices, and
+suspend consumer devices before suspending the supplier device) should be
+guaranteed by runtime PM, but the state checks in rpm_get_supplier() and
+rpm_put_supplier() break this rule, so remove them.
+
+Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com>
+[ rjw: Subject and changelog edits ]
+Cc: All applicable <stable@vger.kernel.org>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/base/power/runtime.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/drivers/base/power/runtime.c
++++ b/drivers/base/power/runtime.c
+@@ -291,8 +291,7 @@ static int rpm_get_suppliers(struct devi
+ device_links_read_lock_held()) {
+ int retval;
+
+- if (!(link->flags & DL_FLAG_PM_RUNTIME) ||
+- READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND)
++ if (!(link->flags & DL_FLAG_PM_RUNTIME))
+ continue;
+
+ retval = pm_runtime_get_sync(link->supplier);
+@@ -312,8 +311,6 @@ static void rpm_put_suppliers(struct dev
+
+ list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+ device_links_read_lock_held()) {
+- if (READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND)
+- continue;
+
+ while (refcount_dec_not_one(&link->rpm_active))
+ pm_runtime_put(link->supplier);
--- /dev/null
+From 2f4843b172c2c0360ee7792ad98025fae7baefde Mon Sep 17 00:00:00 2001
+From: Helge Deller <deller@gmx.de>
+Date: Thu, 22 Oct 2020 11:00:05 +0200
+Subject: scsi: mptfusion: Fix null pointer dereferences in mptscsih_remove()
+
+From: Helge Deller <deller@gmx.de>
+
+commit 2f4843b172c2c0360ee7792ad98025fae7baefde upstream.
+
+The mptscsih_remove() function triggers a kernel oops if the Scsi_Host
+pointer (ioc->sh) is NULL, as can be seen in this syslog:
+
+ ioc0: LSI53C1030 B2: Capabilities={Initiator,Target}
+ Begin: Waiting for root file system ...
+ scsi host2: error handler thread failed to spawn, error = -4
+ mptspi: ioc0: WARNING - Unable to register controller with SCSI subsystem
+ Backtrace:
+ [<000000001045b7cc>] mptspi_probe+0x248/0x3d0 [mptspi]
+ [<0000000040946470>] pci_device_probe+0x1ac/0x2d8
+ [<0000000040add668>] really_probe+0x1bc/0x988
+ [<0000000040ade704>] driver_probe_device+0x160/0x218
+ [<0000000040adee24>] device_driver_attach+0x160/0x188
+ [<0000000040adef90>] __driver_attach+0x144/0x320
+ [<0000000040ad7c78>] bus_for_each_dev+0xd4/0x158
+ [<0000000040adc138>] driver_attach+0x4c/0x80
+ [<0000000040adb3ec>] bus_add_driver+0x3e0/0x498
+ [<0000000040ae0130>] driver_register+0xf4/0x298
+ [<00000000409450c4>] __pci_register_driver+0x78/0xa8
+ [<000000000007d248>] mptspi_init+0x18c/0x1c4 [mptspi]
+
+This patch adds the necessary NULL-pointer checks. Successfully tested on
+a HP C8000 parisc workstation with buggy SCSI drives.
+
+Link: https://lore.kernel.org/r/20201022090005.GA9000@ls3530.fritz.box
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Helge Deller <deller@gmx.de>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/message/fusion/mptscsih.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+--- a/drivers/message/fusion/mptscsih.c
++++ b/drivers/message/fusion/mptscsih.c
+@@ -1176,8 +1176,10 @@ mptscsih_remove(struct pci_dev *pdev)
+ MPT_SCSI_HOST *hd;
+ int sz1;
+
+- if((hd = shost_priv(host)) == NULL)
+- return;
++ if (host == NULL)
++ hd = NULL;
++ else
++ hd = shost_priv(host);
+
+ mptscsih_shutdown(pdev);
+
+@@ -1193,14 +1195,15 @@ mptscsih_remove(struct pci_dev *pdev)
+ "Free'd ScsiLookup (%d) memory\n",
+ ioc->name, sz1));
+
+- kfree(hd->info_kbuf);
++ if (hd)
++ kfree(hd->info_kbuf);
+
+ /* NULL the Scsi_Host pointer
+ */
+ ioc->sh = NULL;
+
+- scsi_host_put(host);
+-
++ if (host)
++ scsi_host_put(host);
+ mpt_detach(pdev);
+
+ }
--- /dev/null
+From 50457dab670f396557e60c07f086358460876353 Mon Sep 17 00:00:00 2001
+From: Quinn Tran <qutran@marvell.com>
+Date: Tue, 29 Sep 2020 03:21:50 -0700
+Subject: scsi: qla2xxx: Fix crash on session cleanup with unload
+
+From: Quinn Tran <qutran@marvell.com>
+
+commit 50457dab670f396557e60c07f086358460876353 upstream.
+
+On unload, session cleanup prematurely gave the signal for driver unload
+path to advance.
+
+Link: https://lore.kernel.org/r/20200929102152.32278-6-njavali@marvell.com
+Fixes: 726b85487067 ("qla2xxx: Add framework for async fabric discovery")
+Cc: stable@vger.kernel.org
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Quinn Tran <qutran@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/qla2xxx/qla_target.c | 13 +++++++------
+ 1 file changed, 7 insertions(+), 6 deletions(-)
+
+--- a/drivers/scsi/qla2xxx/qla_target.c
++++ b/drivers/scsi/qla2xxx/qla_target.c
+@@ -1229,14 +1229,15 @@ void qlt_schedule_sess_for_deletion(stru
+ case DSC_DELETE_PEND:
+ return;
+ case DSC_DELETED:
+- if (tgt && tgt->tgt_stop && (tgt->sess_count == 0))
+- wake_up_all(&tgt->waitQ);
+- if (sess->vha->fcport_count == 0)
+- wake_up_all(&sess->vha->fcport_waitQ);
+-
+ if (!sess->plogi_link[QLT_PLOGI_LINK_SAME_WWN] &&
+- !sess->plogi_link[QLT_PLOGI_LINK_CONFLICT])
++ !sess->plogi_link[QLT_PLOGI_LINK_CONFLICT]) {
++ if (tgt && tgt->tgt_stop && tgt->sess_count == 0)
++ wake_up_all(&tgt->waitQ);
++
++ if (sess->vha->fcport_count == 0)
++ wake_up_all(&sess->vha->fcport_waitQ);
+ return;
++ }
+ break;
+ case DSC_UPD_FCPORT:
+ /*
--- /dev/null
+From 7a6cdbd5e87515ebf6231b762ad903c7cff87b9c Mon Sep 17 00:00:00 2001
+From: Arun Easi <aeasi@marvell.com>
+Date: Tue, 29 Sep 2020 03:21:48 -0700
+Subject: scsi: qla2xxx: Fix MPI reset needed message
+
+From: Arun Easi <aeasi@marvell.com>
+
+commit 7a6cdbd5e87515ebf6231b762ad903c7cff87b9c upstream.
+
+When printing the message:
+
+ "MPI Heartbeat stop. MPI reset is not needed.."
+
+..the wrong register was checked leading to always printing that MPI reset
+is not needed, even when it is needed. Fix the MPI reset message.
+
+Link: https://lore.kernel.org/r/20200929102152.32278-4-njavali@marvell.com
+Fixes: cbb01c2f2f63 ("scsi: qla2xxx: Fix MPI failure AEN (8200) handling")
+Cc: stable@vger.kernel.org
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/qla2xxx/qla_isr.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/scsi/qla2xxx/qla_isr.c
++++ b/drivers/scsi/qla2xxx/qla_isr.c
+@@ -767,7 +767,7 @@ qla27xx_handle_8200_aen(scsi_qla_host_t
+ ql_log(ql_log_warn, vha, 0x02f0,
+ "MPI Heartbeat stop. MPI reset is%s needed. "
+ "MB0[%xh] MB1[%xh] MB2[%xh] MB3[%xh]\n",
+- mb[0] & BIT_8 ? "" : " not",
++ mb[1] & BIT_8 ? "" : " not",
+ mb[0], mb[1], mb[2], mb[3]);
+
+ if ((mb[1] & BIT_8) == 0)
--- /dev/null
+From 3e6efab865ac943f4ec43913eb665695737112b0 Mon Sep 17 00:00:00 2001
+From: Arun Easi <aeasi@marvell.com>
+Date: Tue, 29 Sep 2020 03:21:49 -0700
+Subject: scsi: qla2xxx: Fix reset of MPI firmware
+
+From: Arun Easi <aeasi@marvell.com>
+
+commit 3e6efab865ac943f4ec43913eb665695737112b0 upstream.
+
+Normally, the MPI firmware is reset when an MPI dump is collected. If an
+unsaved MPI dump exists in the driver, though, an alternate mechanism is
+used. This mechanism, which was not fully correct, is not recommended and
+instead an MPI dump template walk is suggested to perform the MPI reset.
+
+To allow for the MPI dump template walk, extra space is reserved in the MPI
+dump buffer which gets used only when there is already an MPI dump in
+place.
+
+Link: https://lore.kernel.org/r/20200929102152.32278-5-njavali@marvell.com
+Fixes: cbb01c2f2f63 ("scsi: qla2xxx: Fix MPI failure AEN (8200) handling")
+Cc: stable@vger.kernel.org
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/qla2xxx/qla_attr.c | 10 ++++++--
+ drivers/scsi/qla2xxx/qla_gbl.h | 1
+ drivers/scsi/qla2xxx/qla_init.c | 2 +
+ drivers/scsi/qla2xxx/qla_tmpl.c | 49 ++++++++++------------------------------
+ 4 files changed, 23 insertions(+), 39 deletions(-)
+
+--- a/drivers/scsi/qla2xxx/qla_attr.c
++++ b/drivers/scsi/qla2xxx/qla_attr.c
+@@ -157,6 +157,14 @@ qla2x00_sysfs_write_fw_dump(struct file
+ vha->host_no);
+ }
+ break;
++ case 10:
++ if (IS_QLA27XX(ha) || IS_QLA28XX(ha)) {
++ ql_log(ql_log_info, vha, 0x70e9,
++ "Issuing MPI firmware dump on host#%ld.\n",
++ vha->host_no);
++ ha->isp_ops->mpi_fw_dump(vha, 0);
++ }
++ break;
+ }
+ return count;
+ }
+@@ -744,8 +752,6 @@ qla2x00_sysfs_write_reset(struct file *f
+ qla83xx_idc_audit(vha, IDC_AUDIT_TIMESTAMP);
+ qla83xx_idc_unlock(vha, 0);
+ break;
+- } else if (IS_QLA27XX(ha) || IS_QLA28XX(ha)) {
+- qla27xx_reset_mpi(vha);
+ } else {
+ /* Make sure FC side is not in reset */
+ WARN_ON_ONCE(qla2x00_wait_for_hba_online(vha) !=
+--- a/drivers/scsi/qla2xxx/qla_gbl.h
++++ b/drivers/scsi/qla2xxx/qla_gbl.h
+@@ -938,6 +938,5 @@ extern void qla24xx_process_purex_list(s
+
+ /* nvme.c */
+ void qla_nvme_unregister_remote_port(struct fc_port *fcport);
+-void qla27xx_reset_mpi(scsi_qla_host_t *vha);
+ void qla_handle_els_plogi_done(scsi_qla_host_t *vha, struct event_arg *ea);
+ #endif /* _QLA_GBL_H */
+--- a/drivers/scsi/qla2xxx/qla_init.c
++++ b/drivers/scsi/qla2xxx/qla_init.c
+@@ -3298,6 +3298,8 @@ qla2x00_alloc_fw_dump(scsi_qla_host_t *v
+ j, fwdt->dump_size);
+ dump_size += fwdt->dump_size;
+ }
++ /* Add space for spare MPI fw dump. */
++ dump_size += ha->fwdt[1].dump_size;
+ } else {
+ req_q_size = req->length * sizeof(request_t);
+ rsp_q_size = rsp->length * sizeof(response_t);
+--- a/drivers/scsi/qla2xxx/qla_tmpl.c
++++ b/drivers/scsi/qla2xxx/qla_tmpl.c
+@@ -12,33 +12,6 @@
+ #define IOBASE(vha) IOBAR(ISPREG(vha))
+ #define INVALID_ENTRY ((struct qla27xx_fwdt_entry *)0xffffffffffffffffUL)
+
+-/* hardware_lock assumed held. */
+-static void
+-qla27xx_write_remote_reg(struct scsi_qla_host *vha,
+- u32 addr, u32 data)
+-{
+- struct device_reg_24xx __iomem *reg = &vha->hw->iobase->isp24;
+-
+- ql_dbg(ql_dbg_misc, vha, 0xd300,
+- "%s: addr/data = %xh/%xh\n", __func__, addr, data);
+-
+- wrt_reg_dword(®->iobase_addr, 0x40);
+- wrt_reg_dword(®->iobase_c4, data);
+- wrt_reg_dword(®->iobase_window, addr);
+-}
+-
+-void
+-qla27xx_reset_mpi(scsi_qla_host_t *vha)
+-{
+- ql_dbg(ql_dbg_misc + ql_dbg_verbose, vha, 0xd301,
+- "Entered %s.\n", __func__);
+-
+- qla27xx_write_remote_reg(vha, 0x104050, 0x40004);
+- qla27xx_write_remote_reg(vha, 0x10405c, 0x4);
+-
+- vha->hw->stat.num_mpi_reset++;
+-}
+-
+ static inline void
+ qla27xx_insert16(uint16_t value, void *buf, ulong *len)
+ {
+@@ -1028,7 +1001,6 @@ void
+ qla27xx_mpi_fwdump(scsi_qla_host_t *vha, int hardware_locked)
+ {
+ ulong flags = 0;
+- bool need_mpi_reset = true;
+
+ #ifndef __CHECKER__
+ if (!hardware_locked)
+@@ -1036,14 +1008,20 @@ qla27xx_mpi_fwdump(scsi_qla_host_t *vha,
+ #endif
+ if (!vha->hw->mpi_fw_dump) {
+ ql_log(ql_log_warn, vha, 0x02f3, "-> mpi_fwdump no buffer\n");
+- } else if (vha->hw->mpi_fw_dumped) {
+- ql_log(ql_log_warn, vha, 0x02f4,
+- "-> MPI firmware already dumped (%p) -- ignoring request\n",
+- vha->hw->mpi_fw_dump);
+ } else {
+ struct fwdt *fwdt = &vha->hw->fwdt[1];
+ ulong len;
+ void *buf = vha->hw->mpi_fw_dump;
++ bool walk_template_only = false;
++
++ if (vha->hw->mpi_fw_dumped) {
++ /* Use the spare area for any further dumps. */
++ buf += fwdt->dump_size;
++ walk_template_only = true;
++ ql_log(ql_log_warn, vha, 0x02f4,
++ "-> MPI firmware already dumped -- dump saving to temporary buffer %p.\n",
++ buf);
++ }
+
+ ql_log(ql_log_warn, vha, 0x02f5, "-> fwdt1 running...\n");
+ if (!fwdt->template) {
+@@ -1058,9 +1036,10 @@ qla27xx_mpi_fwdump(scsi_qla_host_t *vha,
+ ql_log(ql_log_warn, vha, 0x02f7,
+ "-> fwdt1 fwdump residual=%+ld\n",
+ fwdt->dump_size - len);
+- } else {
+- need_mpi_reset = false;
+ }
++ vha->hw->stat.num_mpi_reset++;
++ if (walk_template_only)
++ goto bailout;
+
+ vha->hw->mpi_fw_dump_len = len;
+ vha->hw->mpi_fw_dumped = 1;
+@@ -1072,8 +1051,6 @@ qla27xx_mpi_fwdump(scsi_qla_host_t *vha,
+ }
+
+ bailout:
+- if (need_mpi_reset)
+- qla27xx_reset_mpi(vha);
+ #ifndef __CHECKER__
+ if (!hardware_locked)
+ spin_unlock_irqrestore(&vha->hw->hardware_lock, flags);
io-wq-assign-numa-node-locality-if-appropriate.patch
w1-mxc_w1-fix-timeout-resolution-problem-leading-to-bus-error.patch
fs-kernel_read_file-remove-firmware_prealloc_buffer-enum.patch
+scsi-mptfusion-fix-null-pointer-dereferences-in-mptscsih_remove.patch
+scsi-qla2xxx-fix-mpi-reset-needed-message.patch
+scsi-qla2xxx-fix-reset-of-mpi-firmware.patch
+scsi-qla2xxx-fix-crash-on-session-cleanup-with-unload.patch
+pm-runtime-remove-link-state-checks-in-rpm_get-put_supplier.patch
+btrfs-qgroup-fix-wrong-qgroup-metadata-reserve-for-delayed-inode.patch
+btrfs-improve-device-scanning-messages.patch
+btrfs-qgroup-fix-qgroup-meta-rsv-leak-for-subvolume-operations.patch
+btrfs-sysfs-init-devices-outside-of-the-chunk_mutex.patch
+btrfs-tracepoints-output-proper-root-owner-for-trace_find_free_extent.patch
+btrfs-reschedule-if-necessary-when-logging-directory-items.patch
+btrfs-send-orphanize-first-all-conflicting-inodes-when-processing-references.patch
+btrfs-send-recompute-reference-path-after-orphanization-of-a-directory.patch
+btrfs-use-kvzalloc-to-allocate-clone_roots-in-btrfs_ioctl_send.patch
+btrfs-tree-checker-fix-false-alert-caused-by-legacy-btrfs-root-item.patch
+btrfs-reschedule-when-cloning-lots-of-extents.patch
+btrfs-cleanup-cow-block-on-error.patch
+btrfs-skip-devices-without-magic-signature-when-mounting.patch
+btrfs-tree-checker-validate-number-of-chunk-stripes-and-parity.patch
+btrfs-fix-use-after-free-on-readahead-extent-after-failure-to-create-it.patch
+btrfs-fix-readahead-hang-and-use-after-free-after-removing-a-device.patch
+btrfs-drop-the-path-before-adding-block-group-sysfs-files.patch